Skip to content
Snippets Groups Projects
Commit 400e479a authored by Eike Cochu's avatar Eike Cochu
Browse files

added start,enddate to sequence, needs update for sequenceindex

parent 90ef5efd
No related branches found
No related tags found
No related merge requests found
...@@ -88,6 +88,16 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT ...@@ -88,6 +88,16 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT
return windowSizes.size(); return windowSizes.size();
} }
/**
 * Returns the start date of the sequence window at the given index.
 *
 * NOTE(review): stub — always returns {@code null} until implemented
 * (commit message says "needs update for sequenceindex"). Callers such as
 * {@code Sequence.setStartDate(...)} will currently receive {@code null}.
 *
 * @param index zero-based sequence/window index
 * @return the start date of the indexed sequence, or {@code null} (unimplemented)
 */
public Date getStartDate(int index) {
// TODO implement
// presumably derived from the date entries backing this index — confirm against windowSizes/entries
return null;
}
/**
 * Returns the end date of the sequence window at the given index.
 *
 * NOTE(review): stub — always returns {@code null} until implemented
 * (commit message says "needs update for sequenceindex"). Callers such as
 * {@code Sequence.setEndDate(...)} will currently receive {@code null}.
 *
 * @param index zero-based sequence/window index
 * @return the end date of the indexed sequence, or {@code null} (unimplemented)
 */
public Date getEndDate(int index) {
// TODO implement
// presumably derived from the date entries backing this index — confirm against windowSizes/entries
return null;
}
@Override @Override
public Iterator<DTMDateIndexEntry> iterator() { public Iterator<DTMDateIndexEntry> iterator() {
Collections.sort(entries); Collections.sort(entries);
......
...@@ -11,10 +11,7 @@ import java.util.Collections; ...@@ -11,10 +11,7 @@ import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -27,9 +24,7 @@ import de.vipra.cmd.file.FilebaseIndex; ...@@ -27,9 +24,7 @@ import de.vipra.cmd.file.FilebaseIndex;
import de.vipra.util.ArrayUtils; import de.vipra.util.ArrayUtils;
import de.vipra.util.Config; import de.vipra.util.Config;
import de.vipra.util.Constants; import de.vipra.util.Constants;
import de.vipra.util.CountMap;
import de.vipra.util.StringUtils; import de.vipra.util.StringUtils;
import de.vipra.util.Tuple;
import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.ConfigException;
import de.vipra.util.ex.DatabaseException; import de.vipra.util.ex.DatabaseException;
import de.vipra.util.model.ArticleFull; import de.vipra.util.model.ArticleFull;
...@@ -146,9 +141,6 @@ public class DTMAnalyzer extends Analyzer { ...@@ -146,9 +141,6 @@ public class DTMAnalyzer extends Analyzer {
final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS); final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS);
// collects created words // collects created words
final Set<Word> newWords = new HashSet<>(wordCount); final Set<Word> newWords = new HashSet<>(wordCount);
// collect mapping between words and topics
@SuppressWarnings("unchecked")
final Tuple<Double, Integer>[] wordTopicMapping = new Tuple[wordCount];
log.info("vocabulary size: " + wordCount); log.info("vocabulary size: " + wordCount);
log.info("sequences: " + sequencesCount); log.info("sequences: " + sequencesCount);
...@@ -162,7 +154,6 @@ public class DTMAnalyzer extends Analyzer { ...@@ -162,7 +154,6 @@ public class DTMAnalyzer extends Analyzer {
// create new topic // create new topic
final TopicFull newTopic = new TopicFull(); final TopicFull newTopic = new TopicFull();
final List<Sequence> newSequences = new ArrayList<>(sequencesCount); final List<Sequence> newSequences = new ArrayList<>(sequencesCount);
final List<TopicWord> newTopicWords = new ArrayList<>(wordCount);
newTopic.setSequences(newSequences); newTopic.setSequences(newSequences);
in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile))); in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile)));
...@@ -178,41 +169,9 @@ public class DTMAnalyzer extends Analyzer { ...@@ -178,41 +169,9 @@ public class DTMAnalyzer extends Analyzer {
in.close(); in.close();
// find maximums // find maximum
final double[] maxWordLikelinesses = ArrayUtils.findRowMaximum(likelinesses);
final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses); final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses);
final double maxOverallLikeliness = ArrayUtils.findMaximum(maxSeqLikelinesses);
final double minAcceptableLikeliness = (maxOverallLikeliness >= 0 ? 1
: 2 - Constants.MINIMUM_RELATIVE_PROB) * maxOverallLikeliness;
// static topic and word topic mapping
// most likely words form the static topic over all sequences
for (int idxWord = 0; idxWord < wordCount; idxWord++) {
if (maxWordLikelinesses[idxWord] >= minAcceptableLikeliness) {
// add word to static topic
final Word word = vocab.getWord(idxWord);
newWords.add(word);
final TopicWord topicWord = new TopicWord(word, maxWordLikelinesses[idxWord]);
newTopicWords.add(topicWord);
// check if better word topic mapping than previous
final Tuple<Double, Integer> tuple = wordTopicMapping[idxWord];
if (tuple == null)
wordTopicMapping[idxWord] = new Tuple<>(maxWordLikelinesses[idxWord], idxTopic);
else if (maxWordLikelinesses[idxWord] > tuple.first())
tuple.setSecond(idxTopic);
}
}
if (newTopicWords.isEmpty())
continue;
newTopic.setWords(newTopicWords);
String msg = "topic with " + newTopicWords.size() + " "
+ StringUtils.quantity(newTopicWords.size(), "word") + ", sequences: [";
// dynamic topics
// go through each sequence and gather all words that are above // go through each sequence and gather all words that are above
// the minimum relative word likeliness // the minimum relative word likeliness
for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) {
...@@ -234,13 +193,11 @@ public class DTMAnalyzer extends Analyzer { ...@@ -234,13 +193,11 @@ public class DTMAnalyzer extends Analyzer {
final Sequence newSequence = new Sequence(); final Sequence newSequence = new Sequence();
newSequence.setNumber(idxSeq); newSequence.setNumber(idxSeq);
newSequence.setWords(newSeqTopicWords); newSequence.setWords(newSeqTopicWords);
newSequence.setStartDate(sequences.getStartDate(idxSeq));
newSequence.setEndDate(sequences.getEndDate(idxSeq));
newSequences.add(newSequence); newSequences.add(newSequence);
msg += " " + newSeqTopicWords.size();
} }
log.info(msg + " ]");
newTopics.add(newTopic); newTopics.add(newTopic);
} }
...@@ -265,60 +222,36 @@ public class DTMAnalyzer extends Analyzer { ...@@ -265,60 +222,36 @@ public class DTMAnalyzer extends Analyzer {
// create topic references // create topic references
final File multFile = new File(modelDir, "dtm-mult.dat"); final File gamFile = new File(outDirSeq, "gam.dat");
in = new BufferedReader(new InputStreamReader(new FileInputStream(multFile))); in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile)));
final Pattern wordCountPattern = Pattern.compile("(\\d+):(\\d+)");
int articleIndex = 0;
// for each article in the model file for (String articleId : index) {
while ((line = in.readLine()) != null) { // normalize topic proportions
// get topic id from word id, count topics double totalTopicProportions = 0;
final CountMap<Integer> countMap = new CountMap<>(); double[] topicProportions = new double[Constants.K_TOPICS];
final Matcher matcher = wordCountPattern.matcher(line); for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
double totalCount = 0; double topicProportion = Double.parseDouble(in.readLine());
while (matcher.find()) { topicProportions[idxTopic] = topicProportion;
final Integer wordId = Integer.parseInt(matcher.group(1)); totalTopicProportions += topicProportion;
final Tuple<Double, Integer> wordTopicTuple = wordTopicMapping[wordId];
if (wordTopicTuple != null) {
final int count = Integer.parseInt(matcher.group(2));
countMap.count(wordTopicTuple.second(), count);
totalCount += count;
}
} }
// create list of topics refs referencing topics with counted // create topic references
// occurrences, sum accepted topic word count final List<TopicRef> newTopicRefs = new ArrayList<>(Constants.K_TOPICS);
long reducedCount = 0; for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
final List<TopicRef> newTopicRefs = new ArrayList<>(countMap.size()); TopicRef newTopicRef = new TopicRef();
for (final Entry<Integer, Integer> entry : countMap.entrySet()) { TopicFull topicFull = newTopics.get(idxTopic);
// check if topic above threshold newTopicRef.setTopic(new Topic(topicFull.getId()));
if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) { newTopicRef.setShare(topicProportions[idxTopic] / totalTopicProportions);
reducedCount += entry.getValue(); newTopicRefs.add(newTopicRef);
final TopicFull topic = newTopics.get(entry.getKey());
// TODO words with low relative likeliness are ignored.
// topic references from this file are possibly wrong.
// fix this by checking if the word is actually accepted
// by the referenced topic.
final TopicRef ref = new TopicRef();
ref.setCount(entry.getValue());
ref.setTopic(new Topic(topic.getId()));
newTopicRefs.add(ref);
}
} }
log.info("article with " + totalCount + " topic refs and " + reducedCount + " reduced topic refs (" // update article
+ (reducedCount - totalCount) + ")");
// calculate each accepted topic share
for (final TopicRef ref : newTopicRefs)
ref.setShare((double) ref.getCount() / reducedCount);
if (!newTopicRefs.isEmpty()) { if (!newTopicRefs.isEmpty()) {
Collections.sort(newTopicRefs, Comparator.reverseOrder()); Collections.sort(newTopicRefs, Comparator.reverseOrder());
// update article with topic references (partial update) // update article with topic references (partial update)
final ArticleFull article = new ArticleFull(); final ArticleFull article = new ArticleFull();
article.setId(index.get(articleIndex++)); article.setId(articleId);
article.setTopics(newTopicRefs); article.setTopics(newTopicRefs);
try { try {
......
...@@ -10,19 +10,28 @@ import org.mongodb.morphia.annotations.Embedded; ...@@ -10,19 +10,28 @@ import org.mongodb.morphia.annotations.Embedded;
@Embedded @Embedded
public class Sequence implements Comparable<Sequence>, Serializable { public class Sequence implements Comparable<Sequence>, Serializable {
private Date date; private Date startDate;
private Date endDate;
private Integer number; private Integer number;
@Embedded @Embedded
private List<TopicWord> words; private List<TopicWord> words;
public Date getDate() { public Date getStartDate() {
return date; return startDate;
} }
public void setDate(final Date date) { public void setStartDate(Date startDate) {
this.date = date; this.startDate = startDate;
}
/**
 * Returns the end date of this sequence window.
 *
 * @return the end date, or {@code null} if not set
 *         (currently populated from {@code DTMSequenceIndex.getEndDate(int)},
 *         which is still a stub — so this may be {@code null})
 */
public Date getEndDate() {
return endDate;
}
/**
 * Sets the end date of this sequence window.
 *
 * @param endDate the end date; {@code null} is accepted (no validation performed)
 */
public void setEndDate(Date endDate) {
this.endDate = endDate;
}
public Integer getNumber() { public Integer getNumber() {
...@@ -48,7 +57,8 @@ public class Sequence implements Comparable<Sequence>, Serializable { ...@@ -48,7 +57,8 @@ public class Sequence implements Comparable<Sequence>, Serializable {
@Override @Override
public String toString() { public String toString() {
return "Sequence [date=" + date + ", number=" + number + ", words=" + words + "]"; return "Sequence [startDate=" + startDate + ", endDate=" + endDate + ", number=" + number + ", words=" + words
+ "]";
} }
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment