From 6f6a13a48b308ee8319eb55aebbe1fd54cb8191c Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Sun, 6 Mar 2016 18:19:04 +0100 Subject: [PATCH] added missing lines to topic name generation --- .../java/de/vipra/cmd/lda/DTMAnalyzer.java | 4 ++-- .../java/de/vipra/cmd/lda/JGibbAnalyzer.java | 21 +++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java index ecfc02ad..25fe7de0 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java @@ -202,16 +202,16 @@ public class DTMAnalyzer extends Analyzer { } } - // collect top n words if (!newSeqTopicWords.isEmpty()) { Collections.sort(newSeqTopicWords, Comparator.reverseOrder()); // top n percent cutoff if (seqPercentCutoff) { final int fromIndex = (int) Math.round(newSeqTopicWords.size() * Constants.PERCENT_PROB); - newSeqTopicWords.subList(fromIndex, newSeqTopicWords.size()); + newSeqTopicWords.subList(fromIndex, newSeqTopicWords.size()).clear(); } + // collect top words topTopicWords.addAll(newSeqTopicWords.subList(0, Math.min(newSeqTopicWords.size(), Constants.TOPIC_AUTO_NAMING_WORDS))); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java index 23064625..a9814c61 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java @@ -98,6 +98,9 @@ public class JGibbAnalyzer extends Analyzer { estimator.init(options); estimator.estimate(); + final boolean seqRelativeCutoff = Constants.MINIMUM_RELATIVE_PROB > 0; + final boolean seqPercentCutoff = Constants.PERCENT_PROB < 1; + // read topic definitions and save final File twords = new File(modelDir, NAME + ".twords"); @@ -146,18 +149,24 @@ public class JGibbAnalyzer extends Analyzer { final double maxLikeliness = maxLikelinesses[topicIndex]; final ArrayList<TopicWord> filteredTopicWords = new ArrayList<>(topic.getWords().size()); for (final TopicWord word : topic.getWords()) { - if (word.getLikeliness() >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) { + if (!seqRelativeCutoff || word.getLikeliness() >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) { filteredTopicWords.add(word); newWords.add(word.getWord()); } } topic.setWords(filteredTopicWords); - } - // sort topic words and generate topic name - for (final TopicFull topic : newTopics) { - Collections.sort(topic.getWords(), Collections.reverseOrder()); - topic.setName(TopicFull.getNameFromWords(topic.getWords())); + if (!filteredTopicWords.isEmpty()) { + Collections.sort(filteredTopicWords, Collections.reverseOrder()); + + // top n percent cutoff + if (seqPercentCutoff) { + final int fromIndex = (int) Math.round(filteredTopicWords.size() * Constants.PERCENT_PROB); + filteredTopicWords.subList(fromIndex, filteredTopicWords.size()).clear(); + } + + topic.setName(TopicFull.getNameFromWords(filteredTopicWords)); + } } // recreate topics and words -- GitLab