From b3f14f063c906fcc00bd767a0b275a77b05bd24a Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Wed, 23 Dec 2015 20:04:09 +0100 Subject: [PATCH] added article stats (needs serialization) --- .../de/vipra/cmd/option/ImportCommand.java | 58 ++++++++++------- .../de/vipra/cmd/option/ImportException.java | 23 ------- vipra-cmd/src/main/resources/log4j2.xml | 1 + vipra-cmd/src/main/resources/log4j2dev.xml | 2 + .../vipra/rest/resource/ArticleResource.java | 11 +++- .../de/vipra/rest/service/ArticleService.java | 2 +- .../java/de/vipra/util/model/Article.java | 18 ++++++ .../de/vipra/util/model/ArticleStats.java | 64 +++++++++++++++++++ .../de/vipra/util/model/TermFrequency.java | 41 ++++++++++++ 9 files changed, 171 insertions(+), 49 deletions(-) delete mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java create mode 100644 vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java create mode 100644 vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 53dee5ce..36fcf9c7 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -14,8 +14,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import de.vipra.cmd.ExecutionException; -import de.vipra.cmd.lda.JGibbLDAAnalyzer; -import de.vipra.cmd.lda.LDAAnalyzer; import de.vipra.cmd.model.Article; import de.vipra.cmd.text.LucenePreprocessor; import de.vipra.cmd.text.Preprocessor; @@ -23,14 +21,35 @@ import de.vipra.util.Config; import de.vipra.util.ConfigException; import de.vipra.util.Constants; import de.vipra.util.StringUtils; -import de.vipra.util.ex.DatabaseException; +import de.vipra.util.model.ArticleStats; import de.vipra.util.service.DatabaseService; import de.vipra.util.service.FilebaseService; public class ImportCommand implements Command { + public class ImportException extends Exception { + + private static final long serialVersionUID = 1L; + + private final String id; + + public ImportException(String msg, String id) { + super(msg); + this.id = id; + } + + public ImportException(Exception e, String id) { + super(e); + this.id = id; + } + + public String getId() { + return id; + } + + } + public static final Logger log = LoggerFactory.getLogger(ImportCommand.class); - public static final Logger out = LoggerFactory.getLogger("shellout"); private ArrayList<File> files = new ArrayList<>(); private JSONParser parser = new JSONParser(); @@ -106,35 +125,28 @@ public class ImportCommand implements Command { } } - void importArticle(JSONObject obj) throws DatabaseException, ImportException { - out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\""); + void importArticle(JSONObject obj) throws ImportException { + log.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\""); Article article = new Article(); article.fromJSON(obj); - String originalText = article.getText(); - - // 1. add article to mongodb - // this generates a unique object id - article = dbArticles.createSingle(article); try { - // 2. preprocess text + // 1. preprocess text // process text before topic modeling Preprocessor preprocessor = new LucenePreprocessor(); - String processedText = preprocessor.preprocess(originalText); + String processedText = preprocessor.preprocess(article.getText()); - // 3. add article to filebase + // 2. generate word statistics + article.setStats(ArticleStats.generateFromText(processedText)); + + // 3. add article to mongodb + // this generates a unique object id + article = dbArticles.createSingle(article); + + // 4. add article to filebase // topic modeling works on files article.setText(processedText); fbArticles.createSingle(article); - - // 4. topic modeling - // extract topics from processed text - LDAAnalyzer analyzer = new JGibbLDAAnalyzer(); - Object what = analyzer.analyze(article); - // TODO implement - - // 5. index article via elasticsearch - // fulltext index, include topics } catch (Exception e) { throw new ImportException(e, article.getId()); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java deleted file mode 100644 index db2875e4..00000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java +++ /dev/null @@ -1,23 +0,0 @@ -package de.vipra.cmd.option; - -public class ImportException extends Exception { - - private static final long serialVersionUID = 1L; - - private final String id; - - public ImportException(String msg, String id) { - super(msg); - this.id = id; - } - - public ImportException(Exception e, String id) { - super(e); - this.id = id; - } - - public String getId() { - return id; - } - -} diff --git a/vipra-cmd/src/main/resources/log4j2.xml b/vipra-cmd/src/main/resources/log4j2.xml index c28b8da1..3a6e439c 100644 --- a/vipra-cmd/src/main/resources/log4j2.xml +++ b/vipra-cmd/src/main/resources/log4j2.xml @@ -10,5 +10,6 @@ <AppenderRef ref="Console" /> </Root> <Logger name="shellout" level="ALL"/> + <Logger name="org.mongodb" level="ERROR"/> </Loggers> </Configuration> \ No newline at end of file diff --git a/vipra-cmd/src/main/resources/log4j2dev.xml b/vipra-cmd/src/main/resources/log4j2dev.xml index 5fb48b4d..bb9050cf 100644 --- a/vipra-cmd/src/main/resources/log4j2dev.xml +++ b/vipra-cmd/src/main/resources/log4j2dev.xml @@ -9,5 +9,7 @@ <Root level="ALL"> <AppenderRef ref="Console" /> </Root> + <Logger name="shellout" level="ALL"/> + <Logger name="org.mongodb" level="ERROR"/> </Loggers> </Configuration> \ No newline at end of file diff --git a/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java b/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java index 526d14d3..2fbab02b 100644 --- a/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java +++ b/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java @@ -95,11 +95,18 @@ public class ArticleResource { @DELETE @Path("{id}") public Response deleteArticle(@PathParam("id") String id) { - long deleted = service.deleteArticle(id); + ResponseWrapper<Article> res = new ResponseWrapper<>(); + long deleted; + try { + deleted = service.deleteArticle(id); + } catch (DatabaseException e) { + res = new ResponseWrapper<>(new APIError(Response.Status.INTERNAL_SERVER_ERROR, "item could not be deleted", + "item could not be created due to an internal server error")); + return Response.serverError().entity(res).build(); + } int del = deleted > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) deleted; switch (del) { case 0: - ResponseWrapper<Article> res = new ResponseWrapper<>(); res.addError(new APIError(Response.Status.NOT_FOUND, "Article not found", String.format(Messages.NOT_FOUND, "article", id))); return Response.status(Response.Status.NOT_FOUND).entity(res).build(); diff --git a/vipra-rest/src/main/java/de/vipra/rest/service/ArticleService.java b/vipra-rest/src/main/java/de/vipra/rest/service/ArticleService.java index 9f432a2a..3ed4a615 100644 --- a/vipra-rest/src/main/java/de/vipra/rest/service/ArticleService.java +++ b/vipra-rest/src/main/java/de/vipra/rest/service/ArticleService.java @@ -40,7 +40,7 @@ public class ArticleService extends DatabaseService<Article> { return article; } - public long deleteArticle(String id) { + public long deleteArticle(String id) throws DatabaseException { return super.deleteSingle(id); } diff --git a/vipra-util/src/main/java/de/vipra/util/model/Article.java b/vipra-util/src/main/java/de/vipra/util/model/Article.java index 60f2e6ed..a121c216 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Article.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Article.java @@ -19,6 +19,8 @@ public class Article extends Model { private String text; private String url; private Date date; + private boolean complete; + private ArticleStats stats; public String getTitle() { return title; @@ -52,6 +54,22 @@ public class Article extends Model { this.date = date; } + public boolean isComplete() { + return complete; + } + + public void setComplete(boolean complete) { + this.complete = complete; + } + + public ArticleStats getStats() { + return stats; + } + + public void setStats(ArticleStats stats) { + this.stats = stats; + } + public void setDate(String date) { SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); try { diff --git a/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java b/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java new file mode 100644 index 00000000..dc06372e --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java @@ -0,0 +1,64 @@ +package de.vipra.util.model; + +import java.util.HashMap; +import java.util.Map; + +public class ArticleStats { + + private long wordCount; + private long uniqueWordCount; + private Map<String, TermFrequency> uniqueWords; + + public long getWordCount() { + return wordCount; + } + + public void setWordCount(long wordCount) { + this.wordCount = wordCount; + } + + public long getUniqueWordCount() { + return uniqueWordCount; + } + + public void setUniqueWordCount(long uniqueWordCount) { + this.uniqueWordCount = uniqueWordCount; + } + + public Map<String, TermFrequency> getUniqueWords() { + return uniqueWords; + } + + public void setUniqueWords(Map<String, TermFrequency> uniqueWords) { + this.uniqueWords = uniqueWords; + } + + public static ArticleStats generateFromText(final String text) { + ArticleStats stats = new ArticleStats(); + String[] words = text.split("\\s+"); + stats.setWordCount(words.length); + Map<String, TermFrequency> uniqueWords = new HashMap<>(); + long maxFrequency = 0; + // loop and count unique words + // also remember maximum frequency + for (String word : words) { + TermFrequency tf = uniqueWords.get(word); + if (tf == null) { + tf = new TermFrequency(); + } + tf.incrementTermFrequency(); + if (tf.getTermFrequency() > maxFrequency) { + maxFrequency = tf.getTermFrequency(); + } + uniqueWords.put(word, tf); + } + // normalize frequencies + for (Map.Entry<String, TermFrequency> entry : uniqueWords.entrySet()) { + entry.getValue().normalizeTermFrequency(maxFrequency); + } + stats.setUniqueWordCount(uniqueWords.size()); + stats.setUniqueWords(uniqueWords); + return stats; + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java b/vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java new file mode 100644 index 00000000..8008d7ec --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java @@ -0,0 +1,41 @@ +package de.vipra.util.model; + +public class TermFrequency { + + private long termFrequency = 0; + private long normalizedTermFrequency = 0; + private long inverseDocumentFrequency = 0; + + public long getTermFrequency() { + return termFrequency; + } + + public void setTermFrequency(long termFrequency) { + this.termFrequency = termFrequency; + } + + public long getNormalizedTermFrequency() { + return normalizedTermFrequency; + } + + public void setNormalizedTermFrequency(long normalizedTermFrequency) { + this.normalizedTermFrequency = normalizedTermFrequency; + } + + public void normalizeTermFrequency(long max) { + setNormalizedTermFrequency(getNormalizedTermFrequency() / max); + } + + public void incrementTermFrequency() { + setTermFrequency(getTermFrequency() + 1); + } + + public long getInverseDocumentFrequency() { + return inverseDocumentFrequency; + } + + public void setInverseDocumentFrequency(long inverseDocumentFrequency) { + this.inverseDocumentFrequency = inverseDocumentFrequency; + } + +} -- GitLab