From 42db9bc3c8517550611e1baea23a74f399962015 Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Wed, 24 Feb 2016 20:29:49 +0100
Subject: [PATCH] updated build script, fixed jgibb modeling

fixed topic name generation
fixed topic word saving
public subdirectory added to gitignore
topic share is now calculated during modeling
---
 build.sh                                      |  2 +-
 vipra-backend/.gitignore                      |  1 +
 .../de/vipra/rest/resource/InfoResource.java  |  1 -
 vipra-backend/src/main/webapp/WEB-INF/web.xml |  2 +-
 .../java/de/vipra/cmd/lda/JGibbAnalyzer.java  | 60 ++++++++++++++-----
 vipra-ui/app/js/controllers.js                |  7 +--
 .../main/java/de/vipra/util/Constants.java    |  6 --
 .../java/de/vipra/util/model/TopicRef.java    | 11 ++++
 8 files changed, 60 insertions(+), 30 deletions(-)

diff --git a/build.sh b/build.sh
index 63f31a0d..3c49ceb7 100755
--- a/build.sh
+++ b/build.sh
@@ -57,7 +57,8 @@ echo "-------------------------------" >> $LOG
 cd ./vipra-ui
 ./build.sh >> $LOG 2>&1
 cd ..
-cp -r ./vipra-ui/public/* ./vipra-backend/src/main/webapp
+# remove the previous copy first; otherwise cp -r nests public/ inside the existing directory on rebuilds
+rm -rf ./vipra-backend/src/main/webapp/public && cp -r ./vipra-ui/public ./vipra-backend/src/main/webapp/public
 if [ $? -ne 0 ]; then
         echo "error"
         exit 1
diff --git a/vipra-backend/.gitignore b/vipra-backend/.gitignore
index 988b884b..04c8f6de 100644
--- a/vipra-backend/.gitignore
+++ b/vipra-backend/.gitignore
@@ -1,2 +1,3 @@
 *.class
 /target/
+/src/main/webapp/public/
diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java
index fbcd8576..9bd18963 100644
--- a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java
+++ b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java
@@ -81,7 +81,6 @@ public class InfoResource {
 			info.put("const.topicautoname", Constants.TOPIC_AUTO_NAMING_WORDS);
 			info.put("const.ktopics", Constants.K_TOPICS);
 			info.put("const.ktopicwords", Constants.K_TOPIC_WORDS);
-			info.put("const.likeprecision", Constants.LIKELINESS_PRECISION);
 			info.put("const.minimumlike", Constants.MINIMUM_LIKELINESS);
 			info.put("const.topicthresh", Constants.TOPIC_THRESHOLD);
 			info.put("const.docminfreq", Constants.DOCUMENT_MIN_WORD_FREQ);
diff --git a/vipra-backend/src/main/webapp/WEB-INF/web.xml b/vipra-backend/src/main/webapp/WEB-INF/web.xml
index d28582b4..90ba036b 100644
--- a/vipra-backend/src/main/webapp/WEB-INF/web.xml
+++ b/vipra-backend/src/main/webapp/WEB-INF/web.xml
@@ -4,7 +4,7 @@
 	xsi:schemaLocation="http://xmlns.jcp.org/xml/ns/javaee http://xmlns.jcp.org/xml/ns/javaee/web-app_3_1.xsd"
 	version="3.1">
 	<welcome-file-list>
-		<welcome-file>index.html</welcome-file>
+		<welcome-file>public/index.html</welcome-file>
 	</welcome-file-list>
 	<servlet>
 		<servlet-name>jersey</servlet-name>
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java
index fb8a117e..c2d35fb7 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java
@@ -6,10 +6,13 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -47,6 +50,7 @@ public class JGibbAnalyzer extends Analyzer {
 	private LDACmdOption options;
 	private MongoService<ArticleFull, ObjectId> dbArticles;
 	private MongoService<TopicFull, ObjectId> dbTopics;
+	private MongoService<Word, String> dbWords;
 	private FilebaseIndex index;
 
 	protected JGibbAnalyzer() {
@@ -79,6 +83,7 @@ public class JGibbAnalyzer extends Analyzer {
 			config = Config.getConfig();
 			dbArticles = MongoService.getDatabaseService(config, ArticleFull.class);
 			dbTopics = MongoService.getDatabaseService(config, TopicFull.class);
+			dbWords = MongoService.getDatabaseService(config, Word.class);
 			index = new FilebaseIndex(new File(modelDir, "index"));
 		} catch (Exception e) {
 			throw new AnalyzerException(e);
@@ -106,6 +111,7 @@ public class JGibbAnalyzer extends Analyzer {
 
 		List<TopicFull> newTopics = new ArrayList<>(options.K);
 		Map<Integer, Topic> newTopicsMap = new HashMap<>(options.K);
+		Set<Word> newWords = new HashSet<>();
 
 		TopicFull newTopic = null;
 		List<TopicWord> topicWords = null;
@@ -123,13 +129,29 @@ public class JGibbAnalyzer extends Analyzer {
 			}
 
 			String[] parts = line.trim().split("\\s+");
-			TopicWord topicWord = new TopicWord(new Word(parts[0]), Double.parseDouble(parts[1]));
-			topicWords.add(topicWord);
+			double likeliness = Double.parseDouble(parts[1]);
+
+			// check word likeliness
+			if (likeliness >= Constants.MINIMUM_LIKELINESS) {
+				Word newWord = new Word(parts[0]);
+				TopicWord topicWord = new TopicWord(newWord, likeliness);
+				topicWords.add(topicWord);
+				newWords.add(newWord);
+			}
+		}
+
+		// sort topic words and generate topic name
+		for (TopicFull topic : newTopics) {
+			Collections.sort(topic.getWords(), Collections.reverseOrder());
+			topic.setName(TopicFull.getNameFromWords(topic.getWords()));
 		}
 
+		// recreate topics and words
 		dbTopics.drop();
+		dbWords.drop();
 		try {
 			dbTopics.createMultiple(newTopics);
+			dbWords.createMultiple(newWords);
 		} catch (DatabaseException e) {
 			throw new AnalyzerException(e);
 		}
@@ -151,29 +173,44 @@ public class JGibbAnalyzer extends Analyzer {
 				// extract topic ids and count them
 				CountMap<String> countMap = new CountMap<>();
 				Matcher matcher = topicIndexPattern.matcher(line);
+				double totalCount = 0;
 				while (matcher.find()) {
 					countMap.count(matcher.group(1));
+					totalCount++;
 				}
 
 				// create list of topics refs referencing topics with counted
 				// occurrences
 				List<TopicRef> newTopicRefs = new ArrayList<>();
 				for (Entry<String, Integer> entry : countMap.entrySet()) {
-					TopicRef ref = new TopicRef();
-					ref.setCount(entry.getValue());
-					ref.setTopic(newTopicsMap.get(Integer.parseInt(entry.getKey())));
-					newTopicRefs.add(ref);
+					// check if topic above threshold
+					double topicShare = entry.getValue() / totalCount;
+					if (topicShare >= Constants.TOPIC_THRESHOLD) {
+						TopicRef ref = new TopicRef();
+						ref.setCount(entry.getValue());
+						// store the share so clients can read it directly
+						// NOTE(review): this is a fraction of all topic occurrences in
+						// the line, not a percentage - confirm the UI expects that
+						ref.setShare(topicShare);
+						ref.setTopic(newTopicsMap.get(Integer.parseInt(entry.getKey())));
+						newTopicRefs.add(ref);
+					}
 				}
 
-				// update article with topic references (partial update)
-				ArticleFull article = new ArticleFull();
-				article.setId(index.get(articleIndex++));
-				article.setTopics(newTopicRefs);
-				try {
-					// TODO: using field name here. Hard to refactor
-					dbArticles.updateSingle(article, "topics");
-				} catch (DatabaseException e) {
-					log.error(e);
+				if (!newTopicRefs.isEmpty()) {
+					// update article with topic references (partial update)
+					ArticleFull article = new ArticleFull();
+					article.setId(index.get(articleIndex));
+					article.setTopics(newTopicRefs);
+					try {
+						// TODO: using field name here. Hard to refactor
+						dbArticles.updateSingle(article, "topics");
+					} catch (DatabaseException e) {
+						log.error(e);
+					}
 				}
+				// advance once per processed line, even when no topic passed the
+				// threshold, so later lines stay aligned with the article index
+				articleIndex++;
 			}
 			in.close();
diff --git a/vipra-ui/app/js/controllers.js b/vipra-ui/app/js/controllers.js
index daa5d471..71e13c62 100644
--- a/vipra-ui/app/js/controllers.js
+++ b/vipra-ui/app/js/controllers.js
@@ -302,13 +302,8 @@
       if($scope.article.topics) {
         var topicShareSeries = [],
             topics = $scope.article.topics;
-            topicsCount = 0;
-        for(var i = 0; i < topics.length; i++)
-          topicsCount += topics[i].count;
         for(var i = 0; i < topics.length; i++) {
-          var share = Vipra.toPercent(topics[i].count / topicsCount);
-          topics[i].share = share;
-          topicShareSeries.push({name: topics[i].topic.name.ellipsize(20), y: share});
+          topicShareSeries.push({name: topics[i].topic.name.ellipsize(20), y: topics[i].share});
         }
 
         // highcharts data
diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java
index 32d35b3e..c5bd5390 100644
--- a/vipra-util/src/main/java/de/vipra/util/Constants.java
+++ b/vipra-util/src/main/java/de/vipra/util/Constants.java
@@ -75,12 +75,6 @@ public class Constants {
 	 * library supports this parameter.
 	 */
 	public static final int K_TOPIC_WORDS = 50;
-
-	/**
-	 * Precision of likeliness numbers. Likeliness is calculated for words to
-	 * belong to topics.
-	 */
-	public static final int LIKELINESS_PRECISION = 6;
 	
 	/**
 	 * Minimum likeliness of words. Words with lower likeliness are ignored
diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java
index 00837669..3277e56e 100644
--- a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java
+++ b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java
@@ -11,8 +11,11 @@ public class TopicRef implements Comparable<TopicRef>, Serializable {
 
 	@Reference(ignoreMissing = true)
 	private Topic topic;
+
 	private Integer count;
 
+	private Double share;
+
 	public Integer getCount() {
 		return count;
 	}
@@ -29,6 +32,14 @@ public class TopicRef implements Comparable<TopicRef>, Serializable {
 		this.topic = topic;
 	}
 
+	public Double getShare() {
+		return share;
+	}
+
+	public void setShare(Double share) {
+		this.share = share;
+	}
+
 	@Override
 	public int compareTo(TopicRef arg0) {
 		return count - arg0.getCount();
-- 
GitLab