From 6f6a13a48b308ee8319eb55aebbe1fd54cb8191c Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Sun, 6 Mar 2016 18:19:04 +0100
Subject: [PATCH] apply top-percent cutoff in topic name generation

---
 .../java/de/vipra/cmd/lda/DTMAnalyzer.java    |  4 ++--
 .../java/de/vipra/cmd/lda/JGibbAnalyzer.java  | 21 +++++++++++++------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
index ecfc02ad..25fe7de0 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
@@ -202,16 +202,16 @@ public class DTMAnalyzer extends Analyzer {
 						}
 					}
 
-					// collect top n words
 					if (!newSeqTopicWords.isEmpty()) {
 						Collections.sort(newSeqTopicWords, Comparator.reverseOrder());
 
 						// top n percent cutoff
 						if (seqPercentCutoff) {
 							final int fromIndex = (int) Math.round(newSeqTopicWords.size() * Constants.PERCENT_PROB);
-							newSeqTopicWords.subList(fromIndex, newSeqTopicWords.size());
+							newSeqTopicWords.subList(fromIndex, newSeqTopicWords.size()).clear();
 						}
 
+						// collect top words
 						topTopicWords.addAll(newSeqTopicWords.subList(0,
 								Math.min(newSeqTopicWords.size(), Constants.TOPIC_AUTO_NAMING_WORDS)));
 					}
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java
index 23064625..a9814c61 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java
@@ -98,6 +98,9 @@ public class JGibbAnalyzer extends Analyzer {
 		estimator.init(options);
 		estimator.estimate();
 
+		final boolean seqRelativeCutoff = Constants.MINIMUM_RELATIVE_PROB > 0;
+		final boolean seqPercentCutoff = Constants.PERCENT_PROB < 1;
+
 		// read topic definitions and save
 
 		final File twords = new File(modelDir, NAME + ".twords");
@@ -146,18 +149,24 @@ public class JGibbAnalyzer extends Analyzer {
 			final double maxLikeliness = maxLikelinesses[topicIndex];
 			final ArrayList<TopicWord> filteredTopicWords = new ArrayList<>(topic.getWords().size());
 			for (final TopicWord word : topic.getWords()) {
-				if (word.getLikeliness() >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) {
+				if (!seqRelativeCutoff || word.getLikeliness() >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) {
 					filteredTopicWords.add(word);
 					newWords.add(word.getWord());
 				}
 			}
 			topic.setWords(filteredTopicWords);
-		}
 
-		// sort topic words and generate topic name
-		for (final TopicFull topic : newTopics) {
-			Collections.sort(topic.getWords(), Collections.reverseOrder());
-			topic.setName(TopicFull.getNameFromWords(topic.getWords()));
+			if (!filteredTopicWords.isEmpty()) {
+				Collections.sort(filteredTopicWords, Collections.reverseOrder());
+
+				// top n percent cutoff
+				if (seqPercentCutoff) {
+					final int fromIndex = (int) Math.round(filteredTopicWords.size() * Constants.PERCENT_PROB);
+					filteredTopicWords.subList(fromIndex, filteredTopicWords.size()).clear();
+				}
+
+				topic.setName(TopicFull.getNameFromWords(filteredTopicWords));
+			}
 		}
 
 		// recreate topics and words
-- 
GitLab