From fd5bc11db17d5422a5836c7e7e08de57742aeef7 Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Thu, 25 Feb 2016 00:03:22 +0100
Subject: [PATCH] removed topic similarity, too complex, meh

---
 .../main/java/de/vipra/cmd/lda/Analyzer.java  |  7 +-
 .../java/de/vipra/cmd/lda/DTMAnalyzer.java    |  3 +-
 .../java/de/vipra/cmd/lda/JGibbAnalyzer.java  | 12 ++-
 .../de/vipra/cmd/option/ModelingCommand.java  |  9 +-
 vipra-ui/app/html/topics/show.html            | 12 +--
 vipra-ui/app/html/topics/similar.html         |  5 --
 vipra-ui/app/js/app.js                        | 11 +--
 .../src/main/java/de/vipra/util/CountMap.java |  4 +
 .../src/main/java/de/vipra/util/MultiMap.java | 46 ++++++++++
 .../java/de/vipra/util/TopicSimilarity.java   | 27 ++++++
 .../src/main/java/de/vipra/util/WordMap.java  | 85 -------------------
 .../java/de/vipra/util/model/TopicFull.java   | 12 ---
 12 files changed, 92 insertions(+), 141 deletions(-)
 delete mode 100644 vipra-ui/app/html/topics/similar.html
 create mode 100644 vipra-util/src/main/java/de/vipra/util/MultiMap.java
 create mode 100644 vipra-util/src/main/java/de/vipra/util/TopicSimilarity.java
 delete mode 100644 vipra-util/src/main/java/de/vipra/util/WordMap.java

diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java
index 1d8e0f20..77b7f29f 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java
@@ -2,7 +2,6 @@ package de.vipra.cmd.lda;
 
 import de.vipra.cmd.ex.AnalyzerException;
 import de.vipra.util.Config;
-import de.vipra.util.WordMap;
 
 public abstract class Analyzer {
 
@@ -16,11 +15,11 @@ public abstract class Analyzer {
 		return name;
 	}
 
-	public abstract void init(Config config, WordMap wordMap) throws AnalyzerException;
+	public abstract void init(Config config) throws AnalyzerException;
 
 	public abstract void analyze() throws AnalyzerException;
 
-	public static Analyzer getAnalyzer(Config config, WordMap wordMap) throws AnalyzerException {
+	public static Analyzer getAnalyzer(Config config) throws AnalyzerException {
 		Analyzer analyzer = null;
 		switch (config.analyzer) {
 			case DTM:
@@ -32,7 +31,7 @@ public abstract class Analyzer {
 			default:
 				return null;
 		}
-		analyzer.init(config, wordMap);
+		analyzer.init(config);
 		return analyzer;
 	}
 
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
index 306ed272..1e922e5e 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
@@ -12,7 +12,6 @@ import de.vipra.cmd.ex.AnalyzerException;
 import de.vipra.util.Config;
 import de.vipra.util.Constants;
 import de.vipra.util.StringUtils;
-import de.vipra.util.WordMap;
 import de.vipra.util.ex.ConfigException;
 
 public class DTMAnalyzer extends Analyzer {
@@ -33,7 +32,7 @@ public class DTMAnalyzer extends Analyzer {
 	}
 
 	@Override
-	public void init(Config config, WordMap wordMap) throws AnalyzerException {
+	public void init(Config config) throws AnalyzerException {
 		try {
 			File dataDir = config.getDataDirectory();
 			this.modelDir = new File(dataDir, NAME);
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java
index d6b339ca..fd8ce1ba 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java
@@ -26,7 +26,6 @@ import de.vipra.util.Config;
 import de.vipra.util.Constants;
 import de.vipra.util.CountMap;
 import de.vipra.util.FileUtils;
-import de.vipra.util.WordMap;
 import de.vipra.util.ex.ConfigException;
 import de.vipra.util.ex.DatabaseException;
 import de.vipra.util.model.ArticleFull;
@@ -58,7 +57,7 @@ public class JGibbAnalyzer extends Analyzer {
 	}
 
 	@Override
-	public void init(Config config, WordMap wordMap) throws AnalyzerException {
+	public void init(Config config) throws AnalyzerException {
 		options = new LDACmdOption();
 
 		try {
@@ -109,8 +108,11 @@ public class JGibbAnalyzer extends Analyzer {
 			throw new AnalyzerException(e);
 		}
 
+		// the list of new topics
 		List<TopicFull> newTopics = new ArrayList<>(options.K);
+		// a map of topic index -> topic. resolves topic ids from tassign file
 		Map<Integer, Topic> newTopicsMap = new HashMap<>(options.K);
+		// set of new words
 		Set<Word> newWords = new HashSet<>();
 
 		TopicFull newTopic = null;
@@ -182,14 +184,15 @@ public class JGibbAnalyzer extends Analyzer {
 				// create list of topics refs referencing topics with counted
 				// occurrences, sum accepted topic word count
 				long reducedCount = 0;
-				List<TopicRef> newTopicRefs = new ArrayList<>();
+				List<TopicRef> newTopicRefs = new ArrayList<>(countMap.size());
 				for (Entry<String, Integer> entry : countMap.entrySet()) {
 					// check if topic above threshold
 					if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) {
 						reducedCount += entry.getValue();
+						Topic topic = newTopicsMap.get(Integer.parseInt(entry.getKey()));
 						TopicRef ref = new TopicRef();
 						ref.setCount(entry.getValue());
-						ref.setTopic(newTopicsMap.get(Integer.parseInt(entry.getKey())));
+						ref.setTopic(topic);
 						newTopicRefs.add(ref);
 					}
 				}
@@ -203,6 +206,7 @@ public class JGibbAnalyzer extends Analyzer {
 					ArticleFull article = new ArticleFull();
 					article.setId(index.get(articleIndex++));
 					article.setTopics(newTopicRefs);
+
 					try {
 						// TODO: using field name here. Hard to refactor
 						dbArticles.updateSingle(article, "topics");
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java
index fd5a5a6e..09557333 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java
@@ -7,25 +7,18 @@ import de.vipra.cmd.lda.Analyzer;
 import de.vipra.util.Config;
 import de.vipra.util.StringUtils;
 import de.vipra.util.Timer;
-import de.vipra.util.WordMap;
-import de.vipra.util.model.Word;
-import de.vipra.util.service.MongoService;
 
 public class ModelingCommand implements Command {
 
 	public static final Logger log = LogManager.getLogger(ModelingCommand.class);
 
 	private Config config;
-	private MongoService<Word, String> dbWords;
-	private WordMap wordMap;
 	private Analyzer analyzer;
 
 	@Override
 	public void run() throws Exception {
 		config = Config.getConfig();
-		dbWords = MongoService.getDatabaseService(config, Word.class);
-		wordMap = new WordMap(dbWords);
-		analyzer = Analyzer.getAnalyzer(config, wordMap);
+		analyzer = Analyzer.getAnalyzer(config);
 
 		log.info("using analyzer: " + analyzer.getName());
 
diff --git a/vipra-ui/app/html/topics/show.html b/vipra-ui/app/html/topics/show.html
index 2875d8c7..0d31e783 100644
--- a/vipra-ui/app/html/topics/show.html
+++ b/vipra-ui/app/html/topics/show.html
@@ -32,12 +32,6 @@
         <td>
           <a class="btn btn-default" ui-sref="topics.show.articles({id:topic.id})">Articles</a>
         </td>
-        <td>
-          <bs-dropdown label="Similar Topics">
-            <li><a ui-sref="topics.show.similar({id:topic.id, type:'by-words'})">By word share</a></li>
-            <li><a ui-sref="topics.show.similar({id:topic.id, type:'by-articles'})">By article share</a></li>
-          </bs-dropdown>
-        </td>
       </tr>
     </table>
   </div>
@@ -52,10 +46,6 @@
             <th>ID</th>
             <td ng-bind="::topic.id"></td>
           </tr>
-          <tr>
-            <th>Index</th>
-            <td ng-bind="::topic.index"></td>
-          </tr>
           <tr>
             <th>Created</th>
             <td ng-bind="::topicCreated"></td>
@@ -87,7 +77,7 @@
         <tbody>
           <tr ng-repeat="word in topic.words | orderBy:wordSort:wordSortRev">
             <td><a ui-sref="words.show({id:word.id})" ng-bind="word.id"></a></td>
-            <td ng-bind="word.likeliness"></td>
+            <td ng-bind-template="{{word.likeliness.toFixed(6)}}"></td>
           </tr>
         </tbody>
       </table>
diff --git a/vipra-ui/app/html/topics/similar.html b/vipra-ui/app/html/topics/similar.html
deleted file mode 100644
index e868dc6e..00000000
--- a/vipra-ui/app/html/topics/similar.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<div ng-cloak ng-hide="$state.current.name !== 'topics.show.similar'">
-
-</div>
-
-<div ng-cloak ui-view></div>
\ No newline at end of file
diff --git a/vipra-ui/app/js/app.js b/vipra-ui/app/js/app.js
index f348a0cb..58fa1c0a 100644
--- a/vipra-ui/app/js/app.js
+++ b/vipra-ui/app/js/app.js
@@ -96,16 +96,7 @@
       templateUrl: 'html/topics/articles.html',
       controller: 'TopicsArticlesController',
       ncyBreadcrumb: {
-        label: 'Topic Articles'
-      }
-    });
-
-    $stateProvider.state('topics.show.similar', {
-      url: '/similar/:type',
-      templateUrl: 'html/topics/similar.html',
-      controller: 'TopicsSimilarController',
-      ncyBreadcrumb: {
-        label: 'Similar Topics (by {{typeLabel}})'
+        label: 'Articles'
       }
     });
 
diff --git a/vipra-util/src/main/java/de/vipra/util/CountMap.java b/vipra-util/src/main/java/de/vipra/util/CountMap.java
index 45b3d48d..5200660a 100644
--- a/vipra-util/src/main/java/de/vipra/util/CountMap.java
+++ b/vipra-util/src/main/java/de/vipra/util/CountMap.java
@@ -29,4 +29,8 @@ public class CountMap<T> {
 		return map.entrySet();
 	}
 
+	public int size() { // number of entries currently held by the backing map
+		return map.size();
+	}
+
 }
diff --git a/vipra-util/src/main/java/de/vipra/util/MultiMap.java b/vipra-util/src/main/java/de/vipra/util/MultiMap.java
new file mode 100644
index 00000000..bcd8411d
--- /dev/null
+++ b/vipra-util/src/main/java/de/vipra/util/MultiMap.java
@@ -0,0 +1,46 @@
+package de.vipra.util;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+public class MultiMap<K, V> { // associates each key with a set of values
+
+	private final Map<K, Set<V>> map; // backing store: key -> set of values put under that key
+
+	public MultiMap() { // creates an empty multi map backed by a HashMap
+		this.map = new HashMap<K, Set<V>>();
+	}
+
+	public void put(K key, V value) { // adds a single value to the set stored under key
+		Set<V> set = map.get(key);
+		if (set == null) // no set stored for this key yet: start a fresh one
+			set = new HashSet<>();
+		set.add(value);
+		map.put(key, set); // stores the set under key (same instance again if it already existed)
+	}
+
+	public void put(K key, Collection<V> values) { // adds every element of values to the set stored under key
+		Set<V> set = map.get(key);
+		if (set == null) // no set stored for this key yet: start a fresh one
+			set = new HashSet<>();
+		set.addAll(values);
+		map.put(key, set); // stores the set under key (same instance again if it already existed)
+	}
+
+	public Set<V> get(K key) { // returns the stored set itself (not a copy), or null when the map has no entry for key
+		return map.get(key);
+	}
+
+	public Set<Entry<K, Set<V>>> entrySet() { // entry set of the backing map (key -> value set)
+		return map.entrySet();
+	}
+
+	public int size() { // number of keys in the backing map, not the total number of values
+		return map.size();
+	}
+
+}
diff --git a/vipra-util/src/main/java/de/vipra/util/TopicSimilarity.java b/vipra-util/src/main/java/de/vipra/util/TopicSimilarity.java
new file mode 100644
index 00000000..f68e5e8e
--- /dev/null
+++ b/vipra-util/src/main/java/de/vipra/util/TopicSimilarity.java
@@ -0,0 +1,27 @@
+package de.vipra.util;
+
+import de.vipra.util.model.Topic;
+
+public class TopicSimilarity { // mutable holder pairing a topic with an integer share count
+
+	private Topic topic; // the topic this entry refers to
+
+	private int shareCount; // NOTE(review): presumably how many items are shared with another topic - confirm against callers
+
+	public Topic getTopic() { // returns the stored topic (null until setTopic is called)
+		return topic;
+	}
+
+	public void setTopic(Topic topic) { // replaces the stored topic
+		this.topic = topic;
+	}
+
+	public int getShareCount() { // returns the stored share count (0 until setShareCount is called)
+		return shareCount;
+	}
+
+	public void setShareCount(int shareCount) { // replaces the stored share count
+		this.shareCount = shareCount;
+	}
+
+}
diff --git a/vipra-util/src/main/java/de/vipra/util/WordMap.java b/vipra-util/src/main/java/de/vipra/util/WordMap.java
deleted file mode 100644
index ba1a02b2..00000000
--- a/vipra-util/src/main/java/de/vipra/util/WordMap.java
+++ /dev/null
@@ -1,85 +0,0 @@
-package de.vipra.util;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import de.vipra.util.ex.DatabaseException;
-import de.vipra.util.model.Word;
-import de.vipra.util.service.MongoService;
-
-public class WordMap {
-
-	public static final Logger log = LoggerFactory.getLogger(WordMap.class);
-
-	private final MongoService<Word, String> dbWords;
-	private final Map<String, Word> wordMap;
-	private final Set<Word> newWords;
-	private boolean createNow = false;
-
-	public WordMap(MongoService<Word, String> dbWords) {
-		this.dbWords = dbWords;
-		this.wordMap = new HashMap<>();
-		this.newWords = new HashSet<>();
-		List<Word> words = dbWords.getAll();
-		for (Word word : words)
-			wordMap.put(word.getId().toLowerCase(), word);
-	}
-
-	public Word get(Object w) {
-		String strWord = w.toString().toLowerCase();
-		Word word = wordMap.get(strWord);
-		if (word == null) {
-			word = new Word(strWord);
-			createWord(word);
-			wordMap.put(strWord, word);
-		}
-		return word;
-	}
-
-	public void add(Object w) {
-		get(w);
-	}
-
-	private Word createWord(Word word) {
-		if (createNow) {
-			try {
-				dbWords.createSingle(word);
-				newWords.add(word);
-			} catch (DatabaseException e) {
-				log.error("could not create word in database", e);
-				throw new RuntimeException(e);
-			}
-		}
-		return word;
-	}
-
-	public void create() throws DatabaseException {
-		List<Word> newWords = new ArrayList<>();
-		for (Entry<String, Word> e : wordMap.entrySet())
-			if (!e.getValue().isCreated())
-				newWords.add(e.getValue());
-		dbWords.createMultiple(newWords);
-		this.newWords.addAll(newWords);
-	}
-
-	public boolean isCreateNow() {
-		return createNow;
-	}
-
-	public void setCreateNow(boolean createNow) {
-		this.createNow = createNow;
-	}
-
-	public Set<Word> getNewWords() {
-		return newWords;
-	}
-
-}
diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java
index c46f5a25..177a791d 100644
--- a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java
+++ b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java
@@ -10,7 +10,6 @@ import org.mongodb.morphia.annotations.Embedded;
 import org.mongodb.morphia.annotations.Entity;
 import org.mongodb.morphia.annotations.Id;
 import org.mongodb.morphia.annotations.PrePersist;
-import org.mongodb.morphia.annotations.Transient;
 
 import de.vipra.util.Constants;
 import de.vipra.util.MongoUtils;
@@ -32,9 +31,6 @@ public class TopicFull implements Model<ObjectId>, Serializable {
 	@QueryIgnore(multi = true)
 	private List<TopicWord> words;
 
-	@Transient
-	private List<ArticleFull> articles;
-
 	private Date created;
 
 	private Date modified;
@@ -77,14 +73,6 @@ public class TopicFull implements Model<ObjectId>, Serializable {
 		this.words = topicWords;
 	}
 
-	public List<ArticleFull> getArticles() {
-		return articles;
-	}
-
-	public void setArticles(List<ArticleFull> articles) {
-		this.articles = articles;
-	}
-
 	public Date getCreated() {
 		return created;
 	}
-- 
GitLab