Commit 400e479a authored by Eike Cochu

added startDate, endDate to Sequence; needs update for DTMSequenceIndex

parent 90ef5efd
@@ -88,6 +88,16 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> {
         return windowSizes.size();
     }
 
+    public Date getStartDate(int index) {
+        // TODO implement
+        return null;
+    }
+
+    public Date getEndDate(int index) {
+        // TODO implement
+        return null;
+    }
+
     @Override
     public Iterator<DTMDateIndexEntry> iterator() {
         Collections.sort(entries);
......
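The two new accessors are left as stubs; per the commit message, DTMSequenceIndex still needs an update before they can return real dates. One possible shape for them, as a minimal sketch only: it assumes entries is a date-sorted List<DTMDateIndexEntry>, that each entry exposes a getDate() accessor, and that windowSizes.get(i) is the number of entries in sequence window i. None of these signatures beyond what the diff shows are confirmed by the commit.

    // Hypothetical implementation sketch for the two stubs above.
    // Assumes: entries sorts by date, each DTMDateIndexEntry has
    // getDate(), and windowSizes.get(i) counts the entries in window i.
    public Date getStartDate(int index) {
        Collections.sort(entries);
        int offset = 0;
        for (int i = 0; i < index; i++)
            offset += windowSizes.get(i); // skip all earlier windows
        return entries.get(offset).getDate(); // first entry of this window
    }

    public Date getEndDate(int index) {
        Collections.sort(entries);
        int offset = 0;
        for (int i = 0; i <= index; i++)
            offset += windowSizes.get(i); // advance past this window
        return entries.get(offset - 1).getDate(); // last entry of this window
    }

Until something along these lines lands, both accessors return null, and the setStartDate/setEndDate calls added in DTMAnalyzer below will store null dates.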
@@ -11,10 +11,7 @@ import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map.Entry;
 import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -27,9 +24,7 @@ import de.vipra.cmd.file.FilebaseIndex;
 import de.vipra.util.ArrayUtils;
 import de.vipra.util.Config;
 import de.vipra.util.Constants;
-import de.vipra.util.CountMap;
 import de.vipra.util.StringUtils;
-import de.vipra.util.Tuple;
 import de.vipra.util.ex.ConfigException;
 import de.vipra.util.ex.DatabaseException;
 import de.vipra.util.model.ArticleFull;
@@ -146,9 +141,6 @@ public class DTMAnalyzer extends Analyzer {
         final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS);
         // collects created words
         final Set<Word> newWords = new HashSet<>(wordCount);
-        // collect mapping between words and topics
-        @SuppressWarnings("unchecked")
-        final Tuple<Double, Integer>[] wordTopicMapping = new Tuple[wordCount];
 
         log.info("vocabulary size: " + wordCount);
         log.info("sequences: " + sequencesCount);
@@ -162,7 +154,6 @@ public class DTMAnalyzer extends Analyzer {
             // create new topic
             final TopicFull newTopic = new TopicFull();
             final List<Sequence> newSequences = new ArrayList<>(sequencesCount);
-            final List<TopicWord> newTopicWords = new ArrayList<>(wordCount);
             newTopic.setSequences(newSequences);
 
             in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile)));
@@ -178,41 +169,9 @@ public class DTMAnalyzer extends Analyzer {
             in.close();
 
-            // find maximums
-            final double[] maxWordLikelinesses = ArrayUtils.findRowMaximum(likelinesses);
             // find maximum
             final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses);
             final double maxOverallLikeliness = ArrayUtils.findMaximum(maxSeqLikelinesses);
             final double minAcceptableLikeliness = (maxOverallLikeliness >= 0 ? 1
                     : 2 - Constants.MINIMUM_RELATIVE_PROB) * maxOverallLikeliness;
 
-            // static topic and word topic mapping
-            // most likely words form the static topic over all sequences
-            for (int idxWord = 0; idxWord < wordCount; idxWord++) {
-                if (maxWordLikelinesses[idxWord] >= minAcceptableLikeliness) {
-                    // add word to static topic
-                    final Word word = vocab.getWord(idxWord);
-                    newWords.add(word);
-                    final TopicWord topicWord = new TopicWord(word, maxWordLikelinesses[idxWord]);
-                    newTopicWords.add(topicWord);
-                    // check if better word topic mapping than previous
-                    final Tuple<Double, Integer> tuple = wordTopicMapping[idxWord];
-                    if (tuple == null)
-                        wordTopicMapping[idxWord] = new Tuple<>(maxWordLikelinesses[idxWord], idxTopic);
-                    else if (maxWordLikelinesses[idxWord] > tuple.first())
-                        tuple.setSecond(idxTopic);
-                }
-            }
-
-            if (newTopicWords.isEmpty())
-                continue;
-
-            newTopic.setWords(newTopicWords);
-
-            String msg = "topic with " + newTopicWords.size() + " "
-                    + StringUtils.quantity(newTopicWords.size(), "word") + ", sequences: [";
             // dynamic topics
             // go through each sequence and gather all words that are above
             // the minimum relative word likeliness
             for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) {
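Likelinesses read from the DTM output are log values and therefore usually negative. The cutoff kept above is relative to the best score: a negative maximum is scaled by (2 - Constants.MINIMUM_RELATIVE_PROB), which places the cutoff below the maximum, while for a non-negative maximum the multiplier shown is 1, so the cutoff coincides with the maximum itself. A worked example, assuming a MINIMUM_RELATIVE_PROB of 0.8 (the actual constant's value is not visible in this diff):

    // Worked example of the relative cutoff, with an assumed
    // MINIMUM_RELATIVE_PROB of 0.8; the real value lives in Constants.
    double maxOverall = -5.0; // log likelinesses are negative
    double cutoff = (maxOverall >= 0 ? 1 : 2 - 0.8) * maxOverall; // 1.2 * -5.0 = -6.0
    // words with likeliness >= -6.0, i.e. within 20% of the best
    // log score, are accepted for this topic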
@@ -234,13 +193,11 @@ public class DTMAnalyzer extends Analyzer {
                 final Sequence newSequence = new Sequence();
                 newSequence.setNumber(idxSeq);
                 newSequence.setWords(newSeqTopicWords);
+                newSequence.setStartDate(sequences.getStartDate(idxSeq));
+                newSequence.setEndDate(sequences.getEndDate(idxSeq));
                 newSequences.add(newSequence);
-
-                msg += " " + newSeqTopicWords.size();
             }
 
-            log.info(msg + " ]");
             newTopics.add(newTopic);
         }
@@ -265,60 +222,36 @@ public class DTMAnalyzer extends Analyzer {
-        // create topic references
-        final File multFile = new File(modelDir, "dtm-mult.dat");
-        in = new BufferedReader(new InputStreamReader(new FileInputStream(multFile)));
-        final Pattern wordCountPattern = Pattern.compile("(\\d+):(\\d+)");
-        int articleIndex = 0;
-
-        // for each article in the model file
-        while ((line = in.readLine()) != null) {
-            // get topic id from word id, count topics
-            final CountMap<Integer> countMap = new CountMap<>();
-            final Matcher matcher = wordCountPattern.matcher(line);
-            double totalCount = 0;
-            while (matcher.find()) {
-                final Integer wordId = Integer.parseInt(matcher.group(1));
-                final Tuple<Double, Integer> wordTopicTuple = wordTopicMapping[wordId];
-                if (wordTopicTuple != null) {
-                    final int count = Integer.parseInt(matcher.group(2));
-                    countMap.count(wordTopicTuple.second(), count);
-                    totalCount += count;
-                }
-            }
+        final File gamFile = new File(outDirSeq, "gam.dat");
+        in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile)));
+
+        for (String articleId : index) {
+            // normalize topic proportions
+            double totalTopicProportions = 0;
+            double[] topicProportions = new double[Constants.K_TOPICS];
+            for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
+                double topicProportion = Double.parseDouble(in.readLine());
+                topicProportions[idxTopic] = topicProportion;
+                totalTopicProportions += topicProportion;
+            }
 
-            // create list of topic refs referencing topics with counted
-            // occurrences, sum accepted topic word count
-            long reducedCount = 0;
-            final List<TopicRef> newTopicRefs = new ArrayList<>(countMap.size());
-            for (final Entry<Integer, Integer> entry : countMap.entrySet()) {
-                // check if topic above threshold
-                if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) {
-                    reducedCount += entry.getValue();
-                    final TopicFull topic = newTopics.get(entry.getKey());
-                    // TODO words with low relative likeliness are ignored.
-                    // topic references from this file are possibly wrong.
-                    // fix this by checking if the word is actually accepted
-                    // by the referenced topic.
-                    final TopicRef ref = new TopicRef();
-                    ref.setCount(entry.getValue());
-                    ref.setTopic(new Topic(topic.getId()));
-                    newTopicRefs.add(ref);
-                }
-            }
+            // create topic references
+            final List<TopicRef> newTopicRefs = new ArrayList<>(Constants.K_TOPICS);
+            for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
+                TopicRef newTopicRef = new TopicRef();
+                TopicFull topicFull = newTopics.get(idxTopic);
+                newTopicRef.setTopic(new Topic(topicFull.getId()));
+                newTopicRef.setShare(topicProportions[idxTopic] / totalTopicProportions);
+                newTopicRefs.add(newTopicRef);
+            }
 
-            log.info("article with " + totalCount + " topic refs and " + reducedCount + " reduced topic refs ("
-                    + (reducedCount - totalCount) + ")");
-
-            // calculate each accepted topic share
-            for (final TopicRef ref : newTopicRefs)
-                ref.setShare((double) ref.getCount() / reducedCount);
-
-            // update article
+            // update article with topic references (partial update)
             if (!newTopicRefs.isEmpty()) {
                 Collections.sort(newTopicRefs, Comparator.reverseOrder());
                 final ArticleFull article = new ArticleFull();
-                article.setId(index.get(articleIndex++));
+                article.setId(articleId);
                 article.setTopics(newTopicRefs);
                 try {
......
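Beyond the date wiring, this hunk swaps the source of topic shares. The removed code re-counted word-topic assignments from dtm-mult.dat and turned counts into shares; the replacement reads gam.dat, which holds Constants.K_TOPICS weight values per article, one per line (these appear to be the model's per-document gamma parameters), and normalizes each row so the shares sum to one. A standalone sketch of that normalization, with purely illustrative numbers:

    // Sketch of the per-article normalization: gamma weights are
    // non-negative but do not sum to one, so each is divided by the
    // row total to obtain a share. Values here are illustrative only.
    double[] gamma = { 12.4, 3.1, 0.5 }; // one weight per topic
    double total = 0;
    for (double g : gamma)
        total += g; // row total: 16.0
    for (int k = 0; k < gamma.length; k++)
        System.out.printf("topic %d share: %.3f%n", k, gamma[k] / total);
    // prints shares that sum to 1.0: 0.775, 0.194, 0.031

Unlike the count-based path, every topic receives a share here, so the TOPIC_THRESHOLD filtering and the TODO about misattributed words disappear along with the old block.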
@@ -10,19 +10,28 @@ import org.mongodb.morphia.annotations.Embedded;
 
 @Embedded
 public class Sequence implements Comparable<Sequence>, Serializable {
 
-    private Date date;
+    private Date startDate;
+    private Date endDate;
     private Integer number;
 
     @Embedded
     private List<TopicWord> words;
 
-    public Date getDate() {
-        return date;
+    public Date getStartDate() {
+        return startDate;
     }
 
-    public void setDate(final Date date) {
-        this.date = date;
+    public void setStartDate(Date startDate) {
+        this.startDate = startDate;
     }
 
+    public Date getEndDate() {
+        return endDate;
+    }
+
+    public void setEndDate(Date endDate) {
+        this.endDate = endDate;
+    }
+
     public Integer getNumber() {
@@ -48,7 +57,8 @@ public class Sequence implements Comparable<Sequence>, Serializable {
 
     @Override
     public String toString() {
-        return "Sequence [date=" + date + ", number=" + number + ", words=" + words + "]";
+        return "Sequence [startDate=" + startDate + ", endDate=" + endDate + ", number=" + number + ", words=" + words
+                + "]";
     }
 }