From 400e479aba31d6b83ae8aa9f227853a4c6f86d13 Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Thu, 3 Mar 2016 20:57:05 +0100
Subject: [PATCH] added start,enddate to sequence, needs update for
 sequenceindex

---
 .../de/vipra/cmd/file/DTMSequenceIndex.java   |  10 ++
 .../java/de/vipra/cmd/lda/DTMAnalyzer.java    | 115 ++++--------------
 .../java/de/vipra/util/model/Sequence.java    |  22 +++-
 3 files changed, 50 insertions(+), 97 deletions(-)

diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java
index fabdb6e0..bd6b19eb 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java
@@ -88,6 +88,16 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT
 		return windowSizes.size();
 	}
 
+	public Date getStartDate(int index) {
+		// TODO implement; placeholder stub that ignores index and always returns null
+		return null;
+	}
+
+	public Date getEndDate(int index) {
+		// TODO implement; placeholder stub that ignores index and always returns null
+		return null;
+	}
+
 	@Override
 	public Iterator<DTMDateIndexEntry> iterator() {
 		Collections.sort(entries);
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
index 05e5a3a2..188d49f3 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
@@ -11,10 +11,7 @@ import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map.Entry;
 import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -27,9 +24,7 @@ import de.vipra.cmd.file.FilebaseIndex;
 import de.vipra.util.ArrayUtils;
 import de.vipra.util.Config;
 import de.vipra.util.Constants;
-import de.vipra.util.CountMap;
 import de.vipra.util.StringUtils;
-import de.vipra.util.Tuple;
 import de.vipra.util.ex.ConfigException;
 import de.vipra.util.ex.DatabaseException;
 import de.vipra.util.model.ArticleFull;
@@ -146,9 +141,6 @@ public class DTMAnalyzer extends Analyzer {
 			final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS);
 			// collects created words
 			final Set<Word> newWords = new HashSet<>(wordCount);
-			// collect mapping between words and topics
-			@SuppressWarnings("unchecked")
-			final Tuple<Double, Integer>[] wordTopicMapping = new Tuple[wordCount];
 
 			log.info("vocabulary size: " + wordCount);
 			log.info("sequences: " + sequencesCount);
@@ -162,7 +154,6 @@ public class DTMAnalyzer extends Analyzer {
 				// create new topic
 				final TopicFull newTopic = new TopicFull();
 				final List<Sequence> newSequences = new ArrayList<>(sequencesCount);
-				final List<TopicWord> newTopicWords = new ArrayList<>(wordCount);
 				newTopic.setSequences(newSequences);
 
 				in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile)));
@@ -178,41 +169,9 @@ public class DTMAnalyzer extends Analyzer {
 
 				in.close();
 
-				// find maximums
-				final double[] maxWordLikelinesses = ArrayUtils.findRowMaximum(likelinesses);
+				// find maximum
 				final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses);
-				final double maxOverallLikeliness = ArrayUtils.findMaximum(maxSeqLikelinesses);
-				final double minAcceptableLikeliness = (maxOverallLikeliness >= 0 ? 1
-						: 2 - Constants.MINIMUM_RELATIVE_PROB) * maxOverallLikeliness;
 
-				// static topic and word topic mapping
-				// most likely words form the static topic over all sequences
-				for (int idxWord = 0; idxWord < wordCount; idxWord++) {
-					if (maxWordLikelinesses[idxWord] >= minAcceptableLikeliness) {
-						// add word to static topic
-						final Word word = vocab.getWord(idxWord);
-						newWords.add(word);
-						final TopicWord topicWord = new TopicWord(word, maxWordLikelinesses[idxWord]);
-						newTopicWords.add(topicWord);
-
-						// check if better word topic mapping than previous
-						final Tuple<Double, Integer> tuple = wordTopicMapping[idxWord];
-						if (tuple == null)
-							wordTopicMapping[idxWord] = new Tuple<>(maxWordLikelinesses[idxWord], idxTopic);
-						else if (maxWordLikelinesses[idxWord] > tuple.first())
-							tuple.setSecond(idxTopic);
-					}
-				}
-
-				if (newTopicWords.isEmpty())
-					continue;
-
-				newTopic.setWords(newTopicWords);
-
-				String msg = "topic with " + newTopicWords.size() + " "
-						+ StringUtils.quantity(newTopicWords.size(), "word") + ", sequences: [";
-
-				// dynamic topics
 				// go through each sequence and gather all words that are above
 				// the minimum relative word likeliness
 				for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) {
@@ -234,13 +193,11 @@ public class DTMAnalyzer extends Analyzer {
 					final Sequence newSequence = new Sequence();
 					newSequence.setNumber(idxSeq);
 					newSequence.setWords(newSeqTopicWords);
+					newSequence.setStartDate(sequences.getStartDate(idxSeq));
+					newSequence.setEndDate(sequences.getEndDate(idxSeq));
 					newSequences.add(newSequence);
-
-					msg += " " + newSeqTopicWords.size();
 				}
 
-				log.info(msg + " ]");
-
 				newTopics.add(newTopic);
 			}
 
@@ -265,60 +222,36 @@ public class DTMAnalyzer extends Analyzer {
 
 			// create topic references
 
-			final File multFile = new File(modelDir, "dtm-mult.dat");
-			in = new BufferedReader(new InputStreamReader(new FileInputStream(multFile)));
-			final Pattern wordCountPattern = Pattern.compile("(\\d+):(\\d+)");
-			int articleIndex = 0;
-
-			// for each article in the model file
-			while ((line = in.readLine()) != null) {
-				// get topic id from word id, count topics
-				final CountMap<Integer> countMap = new CountMap<>();
-				final Matcher matcher = wordCountPattern.matcher(line);
-				double totalCount = 0;
-				while (matcher.find()) {
-					final Integer wordId = Integer.parseInt(matcher.group(1));
-					final Tuple<Double, Integer> wordTopicTuple = wordTopicMapping[wordId];
-					if (wordTopicTuple != null) {
-						final int count = Integer.parseInt(matcher.group(2));
-						countMap.count(wordTopicTuple.second(), count);
-						totalCount += count;
-					}
+			final File gamFile = new File(outDirSeq, "gam.dat");
+			in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile)));
+
+			for (String articleId : index) {
+				// normalize topic proportions
+				double totalTopicProportions = 0;
+				double[] topicProportions = new double[Constants.K_TOPICS];
+				for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
+					double topicProportion = Double.parseDouble(in.readLine());
+					topicProportions[idxTopic] = topicProportion;
+					totalTopicProportions += topicProportion;
 				}
 
-				// create list of topics refs referencing topics with counted
-				// occurrences, sum accepted topic word count
-				long reducedCount = 0;
-				final List<TopicRef> newTopicRefs = new ArrayList<>(countMap.size());
-				for (final Entry<Integer, Integer> entry : countMap.entrySet()) {
-					// check if topic above threshold
-					if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) {
-						reducedCount += entry.getValue();
-						final TopicFull topic = newTopics.get(entry.getKey());
-						// TODO words with low relative likeliness are ignored.
-						// topic references from this file are possibly wrong.
-						// fix this by checking if the word is actually accepted
-						// by the referenced topic.
-						final TopicRef ref = new TopicRef();
-						ref.setCount(entry.getValue());
-						ref.setTopic(new Topic(topic.getId()));
-						newTopicRefs.add(ref);
-					}
+				// create topic references
+				final List<TopicRef> newTopicRefs = new ArrayList<>(Constants.K_TOPICS);
+				for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
+					TopicRef newTopicRef = new TopicRef();
+					TopicFull topicFull = newTopics.get(idxTopic);
+					newTopicRef.setTopic(new Topic(topicFull.getId()));
+					newTopicRef.setShare(topicProportions[idxTopic] / totalTopicProportions);
+					newTopicRefs.add(newTopicRef);
 				}
 
-				log.info("article with " + totalCount + " topic refs and " + reducedCount + " reduced topic refs ("
-						+ (reducedCount - totalCount) + ")");
-
-				// calculate each accepted topic share
-				for (final TopicRef ref : newTopicRefs)
-					ref.setShare((double) ref.getCount() / reducedCount);
-
+				// update article
 				if (!newTopicRefs.isEmpty()) {
 					Collections.sort(newTopicRefs, Comparator.reverseOrder());
 
 					// update article with topic references (partial update)
 					final ArticleFull article = new ArticleFull();
-					article.setId(index.get(articleIndex++));
+					article.setId(articleId);
 					article.setTopics(newTopicRefs);
 
 					try {
diff --git a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java
index 0a7a4f03..36be515e 100644
--- a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java
+++ b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java
@@ -10,19 +10,28 @@ import org.mongodb.morphia.annotations.Embedded;
 @Embedded
 public class Sequence implements Comparable<Sequence>, Serializable {
 
-	private Date date;
+	private Date startDate;
+	private Date endDate;
 
 	private Integer number;
 
 	@Embedded
 	private List<TopicWord> words;
 
-	public Date getDate() {
-		return date;
+	public Date getStartDate() {
+		return startDate;
 	}
 
-	public void setDate(final Date date) {
-		this.date = date;
+	public void setStartDate(Date startDate) {
+		this.startDate = startDate;
+	}
+
+	public Date getEndDate() {
+		return endDate;
+	}
+
+	public void setEndDate(Date endDate) {
+		this.endDate = endDate;
 	}
 
 	public Integer getNumber() {
@@ -48,7 +57,8 @@ public class Sequence implements Comparable<Sequence>, Serializable {
 
 	@Override
 	public String toString() {
-		return "Sequence [date=" + date + ", number=" + number + ", words=" + words + "]";
+		return "Sequence [startDate=" + startDate + ", endDate=" + endDate + ", number=" + number + ", words=" + words
+				+ "]";
 	}
 
 }
-- 
GitLab