diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java index fabdb6e0b22d34a1e64c888aac6527d96189ed47..bd6b19eb5c8b75b99019fcc46df7c64d45dc8b3d 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java @@ -88,6 +88,16 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT return windowSizes.size(); } + public Date getStartDate(int index) { + // TODO implement + return null; + } + + public Date getEndDate(int index) { + // TODO implement + return null; + } + @Override public Iterator<DTMDateIndexEntry> iterator() { Collections.sort(entries); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java index 05e5a3a2b92d70474a2af0cddcf10b37f9f1ecac..188d49f3467374e49995b9742d4e382bdc992c4c 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java @@ -11,10 +11,7 @@ import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; -import java.util.Map.Entry; import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -27,9 +24,7 @@ import de.vipra.cmd.file.FilebaseIndex; import de.vipra.util.ArrayUtils; import de.vipra.util.Config; import de.vipra.util.Constants; -import de.vipra.util.CountMap; import de.vipra.util.StringUtils; -import de.vipra.util.Tuple; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleFull; @@ -146,9 +141,6 @@ public class DTMAnalyzer extends Analyzer { final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS); // collects created words final Set<Word> newWords = new HashSet<>(wordCount); - // collect mapping between words and topics - @SuppressWarnings("unchecked") - final Tuple<Double, Integer>[] wordTopicMapping = new Tuple[wordCount]; log.info("vocabulary size: " + wordCount); log.info("sequences: " + sequencesCount); @@ -162,7 +154,6 @@ public class DTMAnalyzer extends Analyzer { // create new topic final TopicFull newTopic = new TopicFull(); final List<Sequence> newSequences = new ArrayList<>(sequencesCount); - final List<TopicWord> newTopicWords = new ArrayList<>(wordCount); newTopic.setSequences(newSequences); in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile))); @@ -178,41 +169,9 @@ public class DTMAnalyzer extends Analyzer { in.close(); - // find maximums - final double[] maxWordLikelinesses = ArrayUtils.findRowMaximum(likelinesses); + // find maximum final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses); - final double maxOverallLikeliness = ArrayUtils.findMaximum(maxSeqLikelinesses); - final double minAcceptableLikeliness = (maxOverallLikeliness >= 0 ? 1 - : 2 - Constants.MINIMUM_RELATIVE_PROB) * maxOverallLikeliness; - // static topic and word topic mapping - // most likely words form the static topic over all sequences - for (int idxWord = 0; idxWord < wordCount; idxWord++) { - if (maxWordLikelinesses[idxWord] >= minAcceptableLikeliness) { - // add word to static topic - final Word word = vocab.getWord(idxWord); - newWords.add(word); - final TopicWord topicWord = new TopicWord(word, maxWordLikelinesses[idxWord]); - newTopicWords.add(topicWord); - - // check if better word topic mapping than previous - final Tuple<Double, Integer> tuple = wordTopicMapping[idxWord]; - if (tuple == null) - wordTopicMapping[idxWord] = new Tuple<>(maxWordLikelinesses[idxWord], idxTopic); - else if (maxWordLikelinesses[idxWord] > tuple.first()) - tuple.setSecond(idxTopic); - } - } - - if (newTopicWords.isEmpty()) - continue; - - newTopic.setWords(newTopicWords); - - String msg = "topic with " + newTopicWords.size() + " " - + StringUtils.quantity(newTopicWords.size(), "word") + ", sequences: ["; - - // dynamic topics // go through each sequence and gather all words that are above // the minimum relative word likeliness for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { @@ -234,13 +193,11 @@ public class DTMAnalyzer extends Analyzer { final Sequence newSequence = new Sequence(); newSequence.setNumber(idxSeq); newSequence.setWords(newSeqTopicWords); + newSequence.setStartDate(sequences.getStartDate(idxSeq)); + newSequence.setEndDate(sequences.getEndDate(idxSeq)); newSequences.add(newSequence); - - msg += " " + newSeqTopicWords.size(); } - log.info(msg + " ]"); - newTopics.add(newTopic); } @@ -265,60 +222,36 @@ public class DTMAnalyzer extends Analyzer { // create topic references - final File multFile = new File(modelDir, "dtm-mult.dat"); - in = new BufferedReader(new InputStreamReader(new FileInputStream(multFile))); - final Pattern wordCountPattern = Pattern.compile("(\\d+):(\\d+)"); - int articleIndex = 0; - - // for each article in the model file - while ((line = in.readLine()) != null) { - // get topic id from word id, count topics - final CountMap<Integer> countMap = new CountMap<>(); - final Matcher matcher = wordCountPattern.matcher(line); - double totalCount = 0; - while (matcher.find()) { - final Integer wordId = Integer.parseInt(matcher.group(1)); - final Tuple<Double, Integer> wordTopicTuple = wordTopicMapping[wordId]; - if (wordTopicTuple != null) { - final int count = Integer.parseInt(matcher.group(2)); - countMap.count(wordTopicTuple.second(), count); - totalCount += count; - } + final File gamFile = new File(outDirSeq, "gam.dat"); + in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile))); + + for (String articleId : index) { + // normalize topic proportions + double totalTopicProportions = 0; + double[] topicProportions = new double[Constants.K_TOPICS]; + for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + double topicProportion = Double.parseDouble(in.readLine()); + topicProportions[idxTopic] = topicProportion; + totalTopicProportions += topicProportion; } - // create list of topics refs referencing topics with counted - // occurrences, sum accepted topic word count - long reducedCount = 0; - final List<TopicRef> newTopicRefs = new ArrayList<>(countMap.size()); - for (final Entry<Integer, Integer> entry : countMap.entrySet()) { - // check if topic above threshold - if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) { - reducedCount += entry.getValue(); - final TopicFull topic = newTopics.get(entry.getKey()); - // TODO words with low relative likeliness are ignored. - // topic references from this file are possibly wrong. - // fix this by checking if the word is actually accepted - // by the referenced topic. - final TopicRef ref = new TopicRef(); - ref.setCount(entry.getValue()); - ref.setTopic(new Topic(topic.getId())); - newTopicRefs.add(ref); - } + // create topic references + final List<TopicRef> newTopicRefs = new ArrayList<>(Constants.K_TOPICS); + for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + TopicRef newTopicRef = new TopicRef(); + TopicFull topicFull = newTopics.get(idxTopic); + newTopicRef.setTopic(new Topic(topicFull.getId())); + newTopicRef.setShare(topicProportions[idxTopic] / totalTopicProportions); + newTopicRefs.add(newTopicRef); } - log.info("article with " + totalCount + " topic refs and " + reducedCount + " reduced topic refs (" - + (reducedCount - totalCount) + ")"); - - // calculate each accepted topic share - for (final TopicRef ref : newTopicRefs) - ref.setShare((double) ref.getCount() / reducedCount); - + // update article if (!newTopicRefs.isEmpty()) { Collections.sort(newTopicRefs, Comparator.reverseOrder()); // update article with topic references (partial update) final ArticleFull article = new ArticleFull(); - article.setId(index.get(articleIndex++)); + article.setId(articleId); article.setTopics(newTopicRefs); try { diff --git a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java index 0a7a4f03c4ae0115a764b805416e6b3a1c8ff26a..36be515e2596f09e06f869df9ba0ffa593d9733d 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java @@ -10,19 +10,28 @@ import org.mongodb.morphia.annotations.Embedded; @Embedded public class Sequence implements Comparable<Sequence>, Serializable { - private Date date; + private Date startDate; + private Date endDate; private Integer number; @Embedded private List<TopicWord> words; - public Date getDate() { - return date; + public Date getStartDate() { + return startDate; } - public void setDate(final Date date) { - this.date = date; + public void setStartDate(Date startDate) { + this.startDate = startDate; + } + + public Date getEndDate() { + return endDate; + } + + public void setEndDate(Date endDate) { + this.endDate = endDate; } public Integer getNumber() { @@ -48,7 +57,8 @@ public class Sequence implements Comparable<Sequence>, Serializable { @Override public String toString() { - return "Sequence [date=" + date + ", number=" + number + ", words=" + words + "]"; + return "Sequence [startDate=" + startDate + ", endDate=" + endDate + ", number=" + number + ", words=" + words + + "]"; } }