From fd5bc11db17d5422a5836c7e7e08de57742aeef7 Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Thu, 25 Feb 2016 00:03:22 +0100 Subject: [PATCH] removed topic similarity, too complex, meh --- .../main/java/de/vipra/cmd/lda/Analyzer.java | 7 +- .../java/de/vipra/cmd/lda/DTMAnalyzer.java | 3 +- .../java/de/vipra/cmd/lda/JGibbAnalyzer.java | 12 ++- .../de/vipra/cmd/option/ModelingCommand.java | 9 +- vipra-ui/app/html/topics/show.html | 12 +-- vipra-ui/app/html/topics/similar.html | 5 -- vipra-ui/app/js/app.js | 11 +-- .../src/main/java/de/vipra/util/CountMap.java | 4 + .../src/main/java/de/vipra/util/MultiMap.java | 46 ++++++++++ .../java/de/vipra/util/TopicSimilarity.java | 27 ++++++ .../src/main/java/de/vipra/util/WordMap.java | 85 ------------------- .../java/de/vipra/util/model/TopicFull.java | 12 --- 12 files changed, 92 insertions(+), 141 deletions(-) delete mode 100644 vipra-ui/app/html/topics/similar.html create mode 100644 vipra-util/src/main/java/de/vipra/util/MultiMap.java create mode 100644 vipra-util/src/main/java/de/vipra/util/TopicSimilarity.java delete mode 100644 vipra-util/src/main/java/de/vipra/util/WordMap.java diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java index 1d8e0f20..77b7f29f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java @@ -2,7 +2,6 @@ package de.vipra.cmd.lda; import de.vipra.cmd.ex.AnalyzerException; import de.vipra.util.Config; -import de.vipra.util.WordMap; public abstract class Analyzer { @@ -16,11 +15,11 @@ public abstract class Analyzer { return name; } - public abstract void init(Config config, WordMap wordMap) throws AnalyzerException; + public abstract void init(Config config) throws AnalyzerException; public abstract void analyze() throws AnalyzerException; - public static Analyzer getAnalyzer(Config config, WordMap wordMap) throws AnalyzerException { + public static Analyzer getAnalyzer(Config config) throws AnalyzerException { Analyzer analyzer = null; switch (config.analyzer) { case DTM: @@ -32,7 +31,7 @@ public abstract class Analyzer { default: return null; } - analyzer.init(config, wordMap); + analyzer.init(config); return analyzer; } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java index 306ed272..1e922e5e 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java @@ -12,7 +12,6 @@ import de.vipra.cmd.ex.AnalyzerException; import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.StringUtils; -import de.vipra.util.WordMap; import de.vipra.util.ex.ConfigException; public class DTMAnalyzer extends Analyzer { @@ -33,7 +32,7 @@ public class DTMAnalyzer extends Analyzer { } @Override - public void init(Config config, WordMap wordMap) throws AnalyzerException { + public void init(Config config) throws AnalyzerException { try { File dataDir = config.getDataDirectory(); this.modelDir = new File(dataDir, NAME); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java index d6b339ca..fd8ce1ba 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java @@ -26,7 +26,6 @@ import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.CountMap; import de.vipra.util.FileUtils; -import de.vipra.util.WordMap; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleFull; @@ -58,7 +57,7 @@ public class JGibbAnalyzer extends Analyzer { } @Override - public void init(Config config, WordMap wordMap) throws AnalyzerException { + public void init(Config config) throws AnalyzerException { options = new LDACmdOption(); try { @@ -109,8 +108,11 @@ public class JGibbAnalyzer extends Analyzer { throw new AnalyzerException(e); } + // the list of new topics List<TopicFull> newTopics = new ArrayList<>(options.K); + // a map of topic index -> topic. resolves topic ids from tassign file Map<Integer, Topic> newTopicsMap = new HashMap<>(options.K); + // set of new words Set<Word> newWords = new HashSet<>(); TopicFull newTopic = null; @@ -182,14 +184,15 @@ public class JGibbAnalyzer extends Analyzer { // create list of topics refs referencing topics with counted // occurrences, sum accepted topic word count long reducedCount = 0; - List<TopicRef> newTopicRefs = new ArrayList<>(); + List<TopicRef> newTopicRefs = new ArrayList<>(countMap.size()); for (Entry<String, Integer> entry : countMap.entrySet()) { // check if topic above threshold if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) { reducedCount += entry.getValue(); + Topic topic = newTopicsMap.get(Integer.parseInt(entry.getKey())); TopicRef ref = new TopicRef(); ref.setCount(entry.getValue()); - ref.setTopic(newTopicsMap.get(Integer.parseInt(entry.getKey()))); + ref.setTopic(topic); newTopicRefs.add(ref); } } @@ -203,6 +206,7 @@ public class JGibbAnalyzer extends Analyzer { ArticleFull article = new ArticleFull(); article.setId(index.get(articleIndex++)); article.setTopics(newTopicRefs); + try { // TODO: using field name here. Hard to refactor dbArticles.updateSingle(article, "topics"); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java index fd5a5a6e..09557333 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java @@ -7,25 +7,18 @@ import de.vipra.cmd.lda.Analyzer; import de.vipra.util.Config; import de.vipra.util.StringUtils; import de.vipra.util.Timer; -import de.vipra.util.WordMap; -import de.vipra.util.model.Word; -import de.vipra.util.service.MongoService; public class ModelingCommand implements Command { public static final Logger log = LogManager.getLogger(ModelingCommand.class); private Config config; - private MongoService<Word, String> dbWords; - private WordMap wordMap; private Analyzer analyzer; @Override public void run() throws Exception { config = Config.getConfig(); - dbWords = MongoService.getDatabaseService(config, Word.class); - wordMap = new WordMap(dbWords); - analyzer = Analyzer.getAnalyzer(config, wordMap); + analyzer = Analyzer.getAnalyzer(config); log.info("using analyzer: " + analyzer.getName()); diff --git a/vipra-ui/app/html/topics/show.html b/vipra-ui/app/html/topics/show.html index 2875d8c7..0d31e783 100644 --- a/vipra-ui/app/html/topics/show.html +++ b/vipra-ui/app/html/topics/show.html @@ -32,12 +32,6 @@ <td> <a class="btn btn-default" ui-sref="topics.show.articles({id:topic.id})">Articles</a> </td> - <td> - <bs-dropdown label="Similar Topics"> - <li><a ui-sref="topics.show.similar({id:topic.id, type:'by-words'})">By word share</a></li> - <li><a ui-sref="topics.show.similar({id:topic.id, type:'by-articles'})">By article share</a></li> - </bs-dropdown> - </td> </tr> </table> </div> @@ -52,10 +46,6 @@ <th>ID</th> <td ng-bind="::topic.id"></td> </tr> - <tr> - <th>Index</th> - <td ng-bind="::topic.index"></td> - </tr> <tr> <th>Created</th> <td ng-bind="::topicCreated"></td> @@ -87,7 +77,7 @@ <tbody> <tr ng-repeat="word in topic.words | orderBy:wordSort:wordSortRev"> <td><a ui-sref="words.show({id:word.id})" ng-bind="word.id"></a></td> - <td ng-bind="word.likeliness"></td> + <td ng-bind-template="{{word.likeliness.toFixed(6)}}"></td> </tr> </tbody> </table> diff --git a/vipra-ui/app/html/topics/similar.html b/vipra-ui/app/html/topics/similar.html deleted file mode 100644 index e868dc6e..00000000 --- a/vipra-ui/app/html/topics/similar.html +++ /dev/null @@ -1,5 +0,0 @@ -<div ng-cloak ng-hide="$state.current.name !== 'topics.show.similar'"> - -</div> - -<div ng-cloak ui-view></div> \ No newline at end of file diff --git a/vipra-ui/app/js/app.js b/vipra-ui/app/js/app.js index f348a0cb..58fa1c0a 100644 --- a/vipra-ui/app/js/app.js +++ b/vipra-ui/app/js/app.js @@ -96,16 +96,7 @@ templateUrl: 'html/topics/articles.html', controller: 'TopicsArticlesController', ncyBreadcrumb: { - label: 'Topic Articles' - } - }); - - $stateProvider.state('topics.show.similar', { - url: '/similar/:type', - templateUrl: 'html/topics/similar.html', - controller: 'TopicsSimilarController', - ncyBreadcrumb: { - label: 'Similar Topics (by {{typeLabel}})' + label: 'Articles' } }); diff --git a/vipra-util/src/main/java/de/vipra/util/CountMap.java b/vipra-util/src/main/java/de/vipra/util/CountMap.java index 45b3d48d..5200660a 100644 --- a/vipra-util/src/main/java/de/vipra/util/CountMap.java +++ b/vipra-util/src/main/java/de/vipra/util/CountMap.java @@ -29,4 +29,8 @@ public class CountMap<T> { return map.entrySet(); } + public int size() { + return map.size(); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/MultiMap.java b/vipra-util/src/main/java/de/vipra/util/MultiMap.java new file mode 100644 index 00000000..bcd8411d --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/MultiMap.java @@ -0,0 +1,46 @@ +package de.vipra.util; + +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +public class MultiMap<K, V> { + + private final Map<K, Set<V>> map; + + public MultiMap() { + this.map = new HashMap<K, Set<V>>(); + } + + public void put(K key, V value) { + Set<V> set = map.get(key); + if (set == null) + set = new HashSet<>(); + set.add(value); + map.put(key, set); + } + + public void put(K key, Collection<V> values) { + Set<V> set = map.get(key); + if (set == null) + set = new HashSet<>(); + set.addAll(values); + map.put(key, set); + } + + public Set<V> get(K key) { + return map.get(key); + } + + public Set<Entry<K, Set<V>>> entrySet() { + return map.entrySet(); + } + + public int size() { + return map.size(); + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/TopicSimilarity.java b/vipra-util/src/main/java/de/vipra/util/TopicSimilarity.java new file mode 100644 index 00000000..f68e5e8e --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/TopicSimilarity.java @@ -0,0 +1,27 @@ +package de.vipra.util; + +import de.vipra.util.model.Topic; + +public class TopicSimilarity { + + private Topic topic; + + private int shareCount; + + public Topic getTopic() { + return topic; + } + + public void setTopic(Topic topic) { + this.topic = topic; + } + + public int getShareCount() { + return shareCount; + } + + public void setShareCount(int shareCount) { + this.shareCount = shareCount; + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/WordMap.java b/vipra-util/src/main/java/de/vipra/util/WordMap.java deleted file mode 100644 index ba1a02b2..00000000 --- a/vipra-util/src/main/java/de/vipra/util/WordMap.java +++ /dev/null @@ -1,85 +0,0 @@ -package de.vipra.util; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import de.vipra.util.ex.DatabaseException; -import de.vipra.util.model.Word; -import de.vipra.util.service.MongoService; - -public class WordMap { - - public static final Logger log = LoggerFactory.getLogger(WordMap.class); - - private final MongoService<Word, String> dbWords; - private final Map<String, Word> wordMap; - private final Set<Word> newWords; - private boolean createNow = false; - - public WordMap(MongoService<Word, String> dbWords) { - this.dbWords = dbWords; - this.wordMap = new HashMap<>(); - this.newWords = new HashSet<>(); - List<Word> words = dbWords.getAll(); - for (Word word : words) - wordMap.put(word.getId().toLowerCase(), word); - } - - public Word get(Object w) { - String strWord = w.toString().toLowerCase(); - Word word = wordMap.get(strWord); - if (word == null) { - word = new Word(strWord); - createWord(word); - wordMap.put(strWord, word); - } - return word; - } - - public void add(Object w) { - get(w); - } - - private Word createWord(Word word) { - if (createNow) { - try { - dbWords.createSingle(word); - newWords.add(word); - } catch (DatabaseException e) { - log.error("could not create word in database", e); - throw new RuntimeException(e); - } - } - return word; - } - - public void create() throws DatabaseException { - List<Word> newWords = new ArrayList<>(); - for (Entry<String, Word> e : wordMap.entrySet()) - if (!e.getValue().isCreated()) - newWords.add(e.getValue()); - dbWords.createMultiple(newWords); - this.newWords.addAll(newWords); - } - - public boolean isCreateNow() { - return createNow; - } - - public void setCreateNow(boolean createNow) { - this.createNow = createNow; - } - - public Set<Word> getNewWords() { - return newWords; - } - -} diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java index c46f5a25..177a791d 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java @@ -10,7 +10,6 @@ import org.mongodb.morphia.annotations.Embedded; import org.mongodb.morphia.annotations.Entity; import org.mongodb.morphia.annotations.Id; import org.mongodb.morphia.annotations.PrePersist; -import org.mongodb.morphia.annotations.Transient; import de.vipra.util.Constants; import de.vipra.util.MongoUtils; @@ -32,9 +31,6 @@ public class TopicFull implements Model<ObjectId>, Serializable { @QueryIgnore(multi = true) private List<TopicWord> words; - @Transient - private List<ArticleFull> articles; - private Date created; private Date modified; @@ -77,14 +73,6 @@ public class TopicFull implements Model<ObjectId>, Serializable { this.words = topicWords; } - public List<ArticleFull> getArticles() { - return articles; - } - - public void setArticles(List<ArticleFull> articles) { - this.articles = articles; - } - public Date getCreated() { return created; } -- GitLab