diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java index ecfc02ad7ef90655594efba8a4d61c8f7b0d7823..25fe7de094f1f0032c07e98f36f3642b821b0918 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java @@ -202,16 +202,16 @@ public class DTMAnalyzer extends Analyzer { } } - // collect top n words if (!newSeqTopicWords.isEmpty()) { Collections.sort(newSeqTopicWords, Comparator.reverseOrder()); // top n percent cutoff if (seqPercentCutoff) { final int fromIndex = (int) Math.round(newSeqTopicWords.size() * Constants.PERCENT_PROB); - newSeqTopicWords.subList(fromIndex, newSeqTopicWords.size()); + newSeqTopicWords.subList(fromIndex, newSeqTopicWords.size()).clear(); } + // collect top words topTopicWords.addAll(newSeqTopicWords.subList(0, Math.min(newSeqTopicWords.size(), Constants.TOPIC_AUTO_NAMING_WORDS))); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java index 23064625d1fa20889dd17f9e4a9a54408f93c3e8..a9814c61ae0ac95fe3e1a6a71b69de0c5e7d5a5a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java @@ -98,6 +98,9 @@ public class JGibbAnalyzer extends Analyzer { estimator.init(options); estimator.estimate(); + final boolean seqRelativeCutoff = Constants.MINIMUM_RELATIVE_PROB > 0; + final boolean seqPercentCutoff = Constants.PERCENT_PROB < 1; + // read topic definitions and save final File twords = new File(modelDir, NAME + ".twords"); @@ -146,18 +149,24 @@ public class JGibbAnalyzer extends Analyzer { final double maxLikeliness = maxLikelinesses[topicIndex]; final ArrayList<TopicWord> filteredTopicWords = new ArrayList<>(topic.getWords().size()); for (final TopicWord word : topic.getWords()) { - if (word.getLikeliness() >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) { + if (!seqRelativeCutoff || word.getLikeliness() >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) { filteredTopicWords.add(word); newWords.add(word.getWord()); } } topic.setWords(filteredTopicWords); - } - // sort topic words and generate topic name - for (final TopicFull topic : newTopics) { - Collections.sort(topic.getWords(), Collections.reverseOrder()); - topic.setName(TopicFull.getNameFromWords(topic.getWords())); + if (!filteredTopicWords.isEmpty()) { + Collections.sort(filteredTopicWords, Collections.reverseOrder()); + + // top n percent cutoff + if (seqPercentCutoff) { + final int fromIndex = (int) Math.round(filteredTopicWords.size() * Constants.PERCENT_PROB); + filteredTopicWords.subList(fromIndex, filteredTopicWords.size()).clear(); + } + + topic.setName(TopicFull.getNameFromWords(filteredTopicWords)); + } } // recreate topics and words