From b3f14f063c906fcc00bd767a0b275a77b05bd24a Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Wed, 23 Dec 2015 20:04:09 +0100
Subject: [PATCH] added article stats (needs serialization)

---
 .../de/vipra/cmd/option/ImportCommand.java    | 58 ++++++++++-------
 .../de/vipra/cmd/option/ImportException.java  | 23 -------
 vipra-cmd/src/main/resources/log4j2.xml       |  1 +
 vipra-cmd/src/main/resources/log4j2dev.xml    |  2 +
 .../vipra/rest/resource/ArticleResource.java  | 11 +++-
 .../de/vipra/rest/service/ArticleService.java |  2 +-
 .../java/de/vipra/util/model/Article.java     | 18 ++++++
 .../de/vipra/util/model/ArticleStats.java     | 64 +++++++++++++++++++
 .../de/vipra/util/model/TermFrequency.java    | 41 ++++++++++++
 9 files changed, 171 insertions(+), 49 deletions(-)
 delete mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java
 create mode 100644 vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java
 create mode 100644 vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java

diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java
index 53dee5ce..36fcf9c7 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java
@@ -14,8 +14,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import de.vipra.cmd.ExecutionException;
-import de.vipra.cmd.lda.JGibbLDAAnalyzer;
-import de.vipra.cmd.lda.LDAAnalyzer;
 import de.vipra.cmd.model.Article;
 import de.vipra.cmd.text.LucenePreprocessor;
 import de.vipra.cmd.text.Preprocessor;
@@ -23,14 +21,35 @@ import de.vipra.util.Config;
 import de.vipra.util.ConfigException;
 import de.vipra.util.Constants;
 import de.vipra.util.StringUtils;
-import de.vipra.util.ex.DatabaseException;
+import de.vipra.util.model.ArticleStats;
 import de.vipra.util.service.DatabaseService;
 import de.vipra.util.service.FilebaseService;
 
 public class ImportCommand implements Command {
 
+	public static class ImportException extends Exception {
+		// static nested: carries no hidden ImportCommand reference (matters for serialization)
+		private static final long serialVersionUID = 1L;
+
+		private final String id;
+
+		public ImportException(String msg, String id) {
+			super(msg);
+			this.id = id;
+		}
+
+		public ImportException(Exception e, String id) {
+			super(e);
+			this.id = id;
+		}
+
+		public String getId() {
+			return id;
+		}
+
+	}
+
 	public static final Logger log = LoggerFactory.getLogger(ImportCommand.class);
-	public static final Logger out = LoggerFactory.getLogger("shellout");
 
 	private ArrayList<File> files = new ArrayList<>();
 	private JSONParser parser = new JSONParser();
@@ -106,35 +125,28 @@ public class ImportCommand implements Command {
 		}
 	}
 
-	void importArticle(JSONObject obj) throws DatabaseException, ImportException {
-		out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\"");
+	void importArticle(JSONObject obj) throws ImportException {
+		log.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\"");
 		Article article = new Article();
 		article.fromJSON(obj);
-		String originalText = article.getText();
-
-		// 1. add article to mongodb
-		// this generates a unique object id
-		article = dbArticles.createSingle(article);
 
 		try {
-			// 2. preprocess text
+			// 1. preprocess text
 			// process text before topic modeling
 			Preprocessor preprocessor = new LucenePreprocessor();
-			String processedText = preprocessor.preprocess(originalText);
+			String processedText = preprocessor.preprocess(article.getText());
 
-			// 3. add article to filebase
+			// 2. generate word statistics
+			article.setStats(ArticleStats.generateFromText(processedText));
+
+			// 3. add article to mongodb
+			// this generates a unique object id
+			article = dbArticles.createSingle(article);
+
+			// 4. add article to filebase
 			// topic modeling works on files
 			article.setText(processedText);
 			fbArticles.createSingle(article);
-
-			// 4. topic modeling
-			// extract topics from processed text
-			LDAAnalyzer analyzer = new JGibbLDAAnalyzer();
-			Object what = analyzer.analyze(article);
-			// TODO implement
-
-			// 5. index article via elasticsearch
-			// fulltext index, include topics
 		} catch (Exception e) {
 			throw new ImportException(e, article.getId());
 		}
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java
deleted file mode 100644
index db2875e4..00000000
--- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java
+++ /dev/null
@@ -1,23 +0,0 @@
-package de.vipra.cmd.option;
-
-public class ImportException extends Exception {
-
-	private static final long serialVersionUID = 1L;
-
-	private final String id;
-
-	public ImportException(String msg, String id) {
-		super(msg);
-		this.id = id;
-	}
-
-	public ImportException(Exception e, String id) {
-		super(e);
-		this.id = id;
-	}
-
-	public String getId() {
-		return id;
-	}
-
-}
diff --git a/vipra-cmd/src/main/resources/log4j2.xml b/vipra-cmd/src/main/resources/log4j2.xml
index c28b8da1..3a6e439c 100644
--- a/vipra-cmd/src/main/resources/log4j2.xml
+++ b/vipra-cmd/src/main/resources/log4j2.xml
@@ -10,5 +10,6 @@
 			<AppenderRef ref="Console" />
 		</Root>
 		<Logger name="shellout" level="ALL"/>
+		<Logger name="org.mongodb" level="ERROR"/>
 	</Loggers>
 </Configuration>
\ No newline at end of file
diff --git a/vipra-cmd/src/main/resources/log4j2dev.xml b/vipra-cmd/src/main/resources/log4j2dev.xml
index 5fb48b4d..bb9050cf 100644
--- a/vipra-cmd/src/main/resources/log4j2dev.xml
+++ b/vipra-cmd/src/main/resources/log4j2dev.xml
@@ -9,5 +9,7 @@
 		<Root level="ALL">
 			<AppenderRef ref="Console" />
 		</Root>
+		<Logger name="shellout" level="ALL"/>
+		<Logger name="org.mongodb" level="ERROR"/>
 	</Loggers>
 </Configuration>
\ No newline at end of file
diff --git a/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java b/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java
index 526d14d3..2fbab02b 100644
--- a/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java
+++ b/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java
@@ -95,11 +95,18 @@ public class ArticleResource {
 	@DELETE
 	@Path("{id}")
 	public Response deleteArticle(@PathParam("id") String id) {
-		long deleted = service.deleteArticle(id);
+		ResponseWrapper<Article> res = new ResponseWrapper<>();
+		long deleted;
+		try {
+			deleted = service.deleteArticle(id);
+		} catch (DatabaseException e) { // deletion failed in the database layer: answer with a server error
+			res = new ResponseWrapper<>(new APIError(Response.Status.INTERNAL_SERVER_ERROR, "item could not be deleted",
+					"item could not be deleted due to an internal server error"));
+			return Response.serverError().entity(res).build();
+		}
 		int del = deleted > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) deleted;
 		switch (del) {
 		case 0:
-			ResponseWrapper<Article> res = new ResponseWrapper<>();
 			res.addError(new APIError(Response.Status.NOT_FOUND, "Article not found",
 					String.format(Messages.NOT_FOUND, "article", id)));
 			return Response.status(Response.Status.NOT_FOUND).entity(res).build();
diff --git a/vipra-rest/src/main/java/de/vipra/rest/service/ArticleService.java b/vipra-rest/src/main/java/de/vipra/rest/service/ArticleService.java
index 9f432a2a..3ed4a615 100644
--- a/vipra-rest/src/main/java/de/vipra/rest/service/ArticleService.java
+++ b/vipra-rest/src/main/java/de/vipra/rest/service/ArticleService.java
@@ -40,7 +40,7 @@ public class ArticleService extends DatabaseService<Article> {
 		return article;
 	}
 
-	public long deleteArticle(String id) {
+	public long deleteArticle(String id) throws DatabaseException {
 		return super.deleteSingle(id);
 	}
 
diff --git a/vipra-util/src/main/java/de/vipra/util/model/Article.java b/vipra-util/src/main/java/de/vipra/util/model/Article.java
index 60f2e6ed..a121c216 100644
--- a/vipra-util/src/main/java/de/vipra/util/model/Article.java
+++ b/vipra-util/src/main/java/de/vipra/util/model/Article.java
@@ -19,6 +19,8 @@ public class Article extends Model {
 	private String text;
 	private String url;
 	private Date date;
+	private boolean complete;
+	private ArticleStats stats;
 
 	public String getTitle() {
 		return title;
@@ -52,6 +54,22 @@ public class Article extends Model {
 		this.date = date;
 	}
 
+	public boolean isComplete() {
+		return complete;
+	}
+
+	public void setComplete(boolean complete) {
+		this.complete = complete;
+	}
+
+	public ArticleStats getStats() {
+		return stats;
+	}
+
+	public void setStats(ArticleStats stats) {
+		this.stats = stats;
+	}
+
 	public void setDate(String date) {
 		SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
 		try {
diff --git a/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java b/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java
new file mode 100644
index 00000000..dc06372e
--- /dev/null
+++ b/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java
@@ -0,0 +1,64 @@
+package de.vipra.util.model;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class ArticleStats {
+
+	private long wordCount;
+	private long uniqueWordCount;
+	private Map<String, TermFrequency> uniqueWords;
+
+	public long getWordCount() {
+		return wordCount;
+	}
+
+	public void setWordCount(long wordCount) {
+		this.wordCount = wordCount;
+	}
+
+	public long getUniqueWordCount() {
+		return uniqueWordCount;
+	}
+
+	public void setUniqueWordCount(long uniqueWordCount) {
+		this.uniqueWordCount = uniqueWordCount;
+	}
+
+	public Map<String, TermFrequency> getUniqueWords() {
+		return uniqueWords;
+	}
+
+	public void setUniqueWords(Map<String, TermFrequency> uniqueWords) {
+		this.uniqueWords = uniqueWords;
+	}
+
+	public static ArticleStats generateFromText(final String text) {
+		// null or blank text has no words; trimming first keeps split() from
+		// emitting an empty leading token that would be counted as a word
+		final String trimmed = text == null ? "" : text.trim();
+		final String[] words = trimmed.isEmpty() ? new String[0] : trimmed.split("\\s+");
+		final Map<String, TermFrequency> uniqueWords = new HashMap<>();
+		long maxFrequency = 0;
+		// count occurrences per distinct word and remember the highest count
+		for (String word : words) {
+			TermFrequency tf = uniqueWords.get(word);
+			if (tf == null) {
+				tf = new TermFrequency();
+				uniqueWords.put(word, tf);
+			}
+			tf.incrementTermFrequency();
+			maxFrequency = Math.max(maxFrequency, tf.getTermFrequency());
+		}
+		// normalize every count against the most frequent word (no-op when there are no words)
+		for (TermFrequency tf : uniqueWords.values()) {
+			tf.normalizeTermFrequency(maxFrequency);
+		}
+		final ArticleStats stats = new ArticleStats();
+		stats.setWordCount(words.length);
+		stats.setUniqueWordCount(uniqueWords.size());
+		stats.setUniqueWords(uniqueWords);
+		return stats;
+	}
+
+}
diff --git a/vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java b/vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java
new file mode 100644
index 00000000..8008d7ec
--- /dev/null
+++ b/vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java
@@ -0,0 +1,41 @@
+package de.vipra.util.model;
+
+public class TermFrequency {
+
+	private long termFrequency = 0;
+	private long normalizedTermFrequency = 0;
+	private long inverseDocumentFrequency = 0;
+
+	public long getTermFrequency() {
+		return termFrequency;
+	}
+
+	public void setTermFrequency(long termFrequency) {
+		this.termFrequency = termFrequency;
+	}
+
+	public long getNormalizedTermFrequency() {
+		return normalizedTermFrequency;
+	}
+
+	public void setNormalizedTermFrequency(long normalizedTermFrequency) {
+		this.normalizedTermFrequency = normalizedTermFrequency;
+	}
+
+	public void normalizeTermFrequency(long max) {
+		// divide the raw count (not the still-zero normalized field); a zero max yields 0
+		// NOTE: both operands are long, so the quotient truncates toward zero
+		setNormalizedTermFrequency(max == 0 ? 0 : getTermFrequency() / max);
+	}
+
+	public void incrementTermFrequency() {
+		setTermFrequency(getTermFrequency() + 1);
+	}
+
+	public long getInverseDocumentFrequency() {
+		return inverseDocumentFrequency;
+	}
+
+	public void setInverseDocumentFrequency(long inverseDocumentFrequency) {
+		this.inverseDocumentFrequency = inverseDocumentFrequency;
+	}
+}
-- 
GitLab