From fa749921554a00bd204ca11a67f1e57a6a290339 Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Thu, 28 Jan 2016 21:25:20 +0100
Subject: [PATCH] added index clearing to clear command

The clear command now deletes all Elasticsearch indices.
Fixed the paginator's current-page lookup when no page is set (defaults to 1).
Article topics with low relevance are now ignored on import.
---
 ma-impl.sublime-workspace                     | 36 +++++++++++++
 .../de/vipra/cmd/lda/JGibbLDAAnalyzer.java    |  2 +-
 .../de/vipra/cmd/option/ClearCommand.java     |  7 +++
 .../de/vipra/cmd/option/ImportCommand.java    | 51 ++++++++++++-------
 vipra-ui/app/components/pagination-bar.js     | 18 ++++---
 .../main/java/de/vipra/util/Constants.java    |  6 +++
 .../java/de/vipra/util/model/TopicRef.java    | 19 ++++---
 7 files changed, 107 insertions(+), 32 deletions(-)

diff --git a/ma-impl.sublime-workspace b/ma-impl.sublime-workspace
index 975b92e6..19022d5e 100644
--- a/ma-impl.sublime-workspace
+++ b/ma-impl.sublime-workspace
@@ -279,6 +279,14 @@
 	},
 	"buffers":
 	[
+		{
+			"contents": "curl -XPOST 'http://localhost:9200/articles/_search' -d '{\"query\":{\"match\":{\"_all\":\"ibm\"}},\"_source\":{\"exclude\":[\"text\"]}}'",
+			"settings":
+			{
+				"buffer_size": 123,
+				"line_ending": "Unix"
+			}
+		}
 	],
 	"build_system": "",
 	"build_system_choices":
@@ -915,8 +923,36 @@
 	"groups":
 	[
 		{
+			"selected": 0,
 			"sheets":
 			[
+				{
+					"buffer": 0,
+					"semi_transient": false,
+					"settings":
+					{
+						"buffer_size": 123,
+						"regions":
+						{
+						},
+						"selection":
+						[
+							[
+								123,
+								123
+							]
+						],
+						"settings":
+						{
+							"syntax": "Packages/Text/Plain text.tmLanguage"
+						},
+						"translation.x": 0.0,
+						"translation.y": 0.0,
+						"zoom_level": 1.0
+					},
+					"stack_index": 0,
+					"type": "text"
+				}
 			]
 		}
 	],
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java
index 5fe15001..65a012c8 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java
@@ -166,7 +166,7 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer {
 					List<TopicRef> topicCount = new ArrayList<>(countMap.size());
 					for (Entry<String, Integer> e : countMap.entrySet()) {
 						TopicRef tc = new TopicRef();
-						tc.setTopicId(e.getKey());
+						tc.setTopicIndex(e.getKey());
 						tc.setCount(e.getValue());
 						topicCount.add(tc);
 					}
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java
index 5a9e7b54..6625adda 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java
@@ -7,7 +7,9 @@ import org.apache.commons.io.FileUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.bson.types.ObjectId;
+import org.elasticsearch.client.Client;
 
+import de.vipra.cmd.es.ESClient;
 import de.vipra.cmd.model.ProcessedArticle;
 import de.vipra.util.Config;
 import de.vipra.util.ConsoleUtils;
@@ -27,6 +29,7 @@ public class ClearCommand implements Command {
 	private DatabaseService<TopicFull, ObjectId> dbTopics;
 	private DatabaseService<Word, String> dbWords;
 	private DatabaseService<Import, ObjectId> dbImports;
+	private Client elasticClient;
 
 	public ClearCommand(boolean defaults) {
 		this.defaults = defaults;
@@ -38,6 +41,7 @@ public class ClearCommand implements Command {
 		dbTopics = DatabaseService.getDatabaseService(config, TopicFull.class);
 		dbWords = DatabaseService.getDatabaseService(config, Word.class);
 		dbImports = DatabaseService.getDatabaseService(config, Import.class);
+		elasticClient = ESClient.getClient(config);
 
 		out.info("clearing database");
 		dbArticles.drop();
@@ -45,6 +49,9 @@ public class ClearCommand implements Command {
 		dbWords.drop();
 		dbImports.drop();
 
+		out.info("clearing index");
+		elasticClient.admin().indices().prepareDelete("_all").get();
+
 		try {
 			out.info("clearing filebase");
 			File dataDir = config.getDataDirectory();
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java
index 382cc4a0..0e43bd17 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java
@@ -7,6 +7,7 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
+import java.util.ListIterator;
 import java.util.Map;
 
 import org.apache.logging.log4j.LogManager;
@@ -25,6 +26,7 @@ import de.vipra.cmd.model.ProcessedArticle;
 import de.vipra.cmd.text.ProcessedText;
 import de.vipra.cmd.text.Processor;
 import de.vipra.util.Config;
+import de.vipra.util.Constants;
 import de.vipra.util.ConvertStream;
 import de.vipra.util.ElasticSerializer;
 import de.vipra.util.MongoUtils;
@@ -108,7 +110,7 @@ public class ImportCommand implements Command {
 	 * @throws Exception
 	 */
 	private Article importArticle(JSONObject obj) throws Exception {
-		out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\"");
+		out.info("importing \"" + obj.get("title") + "\"");
 		ProcessedArticle article = new ProcessedArticle();
 		article.fromJSON(obj);
 
@@ -238,28 +240,42 @@ public class ImportCommand implements Command {
 		 * save topic refs
 		 */
 		out.info("saving document topics");
-		ConvertStream<List<TopicRef>> topics = analyzer.getTopics();
+		ConvertStream<List<TopicRef>> topicStream = analyzer.getTopics();
 		FilebaseIndex index = filebase.getIndex();
 		Iterator<String> indexIter = index.iterator();
-		Iterator<List<TopicRef>> topicIter = topics.iterator();
-		while (indexIter.hasNext() && topicIter.hasNext()) {
-			List<TopicRef> topicCount = topicIter.next();
-			for (TopicRef tc : topicCount) {
-				String oid = topicIndexMap.get(tc.getTopicId());
-				tc.setTopicId(oid);
-				if (oid == null)
-					log.error("no object id for topic index " + tc.getTopicId());
-			}
+		Iterator<List<TopicRef>> topicRefsListIter = topicStream.iterator();
+		while (indexIter.hasNext() && topicRefsListIter.hasNext()) {
+			// Advance both iterators in lockstep, even when the article is
+			// missing, so later ids stay paired with their own topic refs.
 			String id = indexIter.next();
-			ProcessedArticle a = dbArticles.getSingle(MongoUtils.objectId(id));
-			if (a != null)
-				a.setTopics(topicCount);
-			else
+			List<TopicRef> topicRefs = topicRefsListIter.next();
+			ProcessedArticle article = dbArticles.getSingle(MongoUtils.objectId(id));
+			if (article == null) {
 				log.error("no article found in db for id " + id);
+				continue;
+			}
+
+			double wordCount = article.getStats().getWordCount();
+			// insert topic references into article, ignoring low refs
+			for (ListIterator<TopicRef> topicRefsIter = topicRefs.listIterator(); topicRefsIter.hasNext();) {
+				TopicRef topicRef = topicRefsIter.next();
+				if ((topicRef.getCount() / wordCount) < Constants.TOPIC_THRESHOLD) {
+					topicRefsIter.remove();
+					continue;
+				}
+				String topicObjectId = topicIndexMap.get(topicRef.getTopicIndex());
+				if (topicObjectId != null)
+					topicRef.setTopicId(topicObjectId);
+				else
+					log.error("no object id for topic index " + topicRef.getTopicIndex());
+			}
+
+			article.setTopics(topicRefs);
+
 			try {
-				dbArticles.updateSingle(a);
+				dbArticles.updateSingle(article);
 			} catch (DatabaseException e) {
-				log.error("could not update article: " + a.getTitle() + " (" + a.getId() + ")");
+				log.error("could not update article: " + article.getTitle() + " (" + article.getId() + ")");
 			}
 		}
 		List<Word> importedWords = wordMap.getNewWords();
@@ -289,6 +305,7 @@ public class ImportCommand implements Command {
 		out.info("imported " + newArticlesCount + " new " + StringUtils.quantity(newArticlesCount, "article"));
 		out.info("imported " + newWordsCount + " new " + StringUtils.quantity(newWordsCount, "word"));
 		out.info(timer.toString());
+		out.info("done in " + StringUtils.timeString(timer.total()));
 	}
 
 }
diff --git a/vipra-ui/app/components/pagination-bar.js b/vipra-ui/app/components/pagination-bar.js
index d6654db9..42d7c324 100644
--- a/vipra-ui/app/components/pagination-bar.js
+++ b/vipra-ui/app/components/pagination-bar.js
@@ -4,28 +4,32 @@ export default Ember.Component.extend({
 
   elements: 2,
 
+  currentPage: Ember.computed('page', function() {
+    return parseInt(this.get('page') || 1);
+  }),
+
   prev: Ember.computed('page', function() {
-    return this.page > 1;
+    return this.get('currentPage') > 1;
   }),
 
   prevPrev: Ember.computed('page', function() {
-    return this.page > this.elements + 1;
+    return this.get('currentPage') > this.elements + 1;
   }),
 
   prevPage: Ember.computed('page', function() {
-    return this.page - 1;
+    return this.get('currentPage') - 1;
   }),
 
   next: Ember.computed('page', function() {
-    return this.page < Math.ceil(this.total/this.limit*1.0);
+    return this.get('currentPage') < Math.ceil(this.total/this.limit*1.0);
   }),
 
   nextNext: Ember.computed('page', function() {
-    return this.page < Math.ceil(this.total/this.limit*1.0) - this.elements;
+    return this.get('currentPage') < Math.ceil(this.total/this.limit*1.0) - this.elements;
   }),
 
   nextPage: Ember.computed('page', function() {
-    return this.page + 1;
+    return this.get('currentPage') + 1;
   }),
 
   lastPage: Ember.computed('page', function() {
@@ -34,7 +38,7 @@ export default Ember.Component.extend({
 
   pages: Ember.computed('total', 'page', 'limit', 'elements', function() {
     let pages = [],
-        page  = parseInt(this.page || 1),
+        page  = this.get('currentPage'),
         max   = Math.ceil(this.total/this.limit*1.0),
         start = Math.max(page - this.elements, 1),
         end   = Math.min(Math.max(page + this.elements, start + this.elements * 2), max);
diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java
index b4440ea1..8c3e176f 100644
--- a/vipra-util/src/main/java/de/vipra/util/Constants.java
+++ b/vipra-util/src/main/java/de/vipra/util/Constants.java
@@ -64,6 +64,12 @@ public class Constants {
 	 */
 	public static final int LIKELINESS_PRECISION = 6;
 
+	/**
+	 * Minimum share of an article's words (topic count / word count) a topic
+	 * needs to be kept as a topic of that article. Value range: [0.0, 1.0]
+	 */
+	public static final double TOPIC_THRESHOLD = 0.01;
+
 	/**
 	 * Stopwords list. Extensive list of stopwords used to clean imported
 	 * articles of the most common words before topic modeling is applied.
diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java
index 7d84a4d6..35d8755e 100644
--- a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java
+++ b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java
@@ -6,24 +6,28 @@ import org.mongodb.morphia.annotations.Embedded;
 import org.mongodb.morphia.annotations.Reference;
 import org.mongodb.morphia.annotations.Transient;
 
+import de.vipra.util.MongoUtils;
+
 @SuppressWarnings("serial")
 @Embedded
 public class TopicRef implements Comparable<TopicRef>, Serializable {
 
 	@Transient
-	private String topicId;
+	private String topicIndex;
 	@Reference(ignoreMissing = true)
 	private Topic topic;
 	private int count;
 
-	public String getTopicId() {
-		return topicId;
+	public String getTopicIndex() {
+		return topicIndex;
+	}
+
+	public void setTopicIndex(String index) {
+		this.topicIndex = index;
 	}
 
 	public void setTopicId(String id) {
-		this.topicId = id;
-		this.topic = new Topic();
-		this.topic.setId(id);
+		this.topic = new Topic(MongoUtils.objectId(id));
 	}
 
 	public int getCount() {
@@ -49,7 +53,8 @@ public class TopicRef implements Comparable<TopicRef>, Serializable {
 
 	@Override
 	public String toString() {
-		return TopicRef.class.getSimpleName() + "[topicId:" + topicId + ",count:" + count + "]";
+		return TopicRef.class.getSimpleName() + "[topicIndex:" + topicIndex + ", topic: " + topic + ", count:" + count
+				+ "]";
 	}
 
 }
-- 
GitLab