Skip to content
Snippets Groups Projects
Commit b3f14f06 authored by Eike Cochu's avatar Eike Cochu
Browse files

added article stats (needs serialization)

parent d5d79b9e
Branches
No related tags found
No related merge requests found
...@@ -14,8 +14,6 @@ import org.slf4j.Logger; ...@@ -14,8 +14,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import de.vipra.cmd.ExecutionException; import de.vipra.cmd.ExecutionException;
import de.vipra.cmd.lda.JGibbLDAAnalyzer;
import de.vipra.cmd.lda.LDAAnalyzer;
import de.vipra.cmd.model.Article; import de.vipra.cmd.model.Article;
import de.vipra.cmd.text.LucenePreprocessor; import de.vipra.cmd.text.LucenePreprocessor;
import de.vipra.cmd.text.Preprocessor; import de.vipra.cmd.text.Preprocessor;
...@@ -23,14 +21,35 @@ import de.vipra.util.Config; ...@@ -23,14 +21,35 @@ import de.vipra.util.Config;
import de.vipra.util.ConfigException; import de.vipra.util.ConfigException;
import de.vipra.util.Constants; import de.vipra.util.Constants;
import de.vipra.util.StringUtils; import de.vipra.util.StringUtils;
import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleStats;
import de.vipra.util.service.DatabaseService; import de.vipra.util.service.DatabaseService;
import de.vipra.util.service.FilebaseService; import de.vipra.util.service.FilebaseService;
public class ImportCommand implements Command { public class ImportCommand implements Command {
public class ImportException extends Exception {
private static final long serialVersionUID = 1L;
private final String id;
public ImportException(String msg, String id) {
super(msg);
this.id = id;
}
public ImportException(Exception e, String id) {
super(e);
this.id = id;
}
public String getId() {
return id;
}
}
public static final Logger log = LoggerFactory.getLogger(ImportCommand.class); public static final Logger log = LoggerFactory.getLogger(ImportCommand.class);
public static final Logger out = LoggerFactory.getLogger("shellout");
private ArrayList<File> files = new ArrayList<>(); private ArrayList<File> files = new ArrayList<>();
private JSONParser parser = new JSONParser(); private JSONParser parser = new JSONParser();
...@@ -106,35 +125,28 @@ public class ImportCommand implements Command { ...@@ -106,35 +125,28 @@ public class ImportCommand implements Command {
} }
} }
void importArticle(JSONObject obj) throws DatabaseException, ImportException { void importArticle(JSONObject obj) throws ImportException {
out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\""); log.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\"");
Article article = new Article(); Article article = new Article();
article.fromJSON(obj); article.fromJSON(obj);
String originalText = article.getText();
// 1. add article to mongodb
// this generates a unique object id
article = dbArticles.createSingle(article);
try { try {
// 2. preprocess text // 1. preprocess text
// process text before topic modeling // process text before topic modeling
Preprocessor preprocessor = new LucenePreprocessor(); Preprocessor preprocessor = new LucenePreprocessor();
String processedText = preprocessor.preprocess(originalText); String processedText = preprocessor.preprocess(article.getText());
// 3. add article to filebase // 2. generate word statistics
article.setStats(ArticleStats.generateFromText(processedText));
// 3. add article to mongodb
// this generates a unique object id
article = dbArticles.createSingle(article);
// 4. add article to filebase
// topic modeling works on files // topic modeling works on files
article.setText(processedText); article.setText(processedText);
fbArticles.createSingle(article); fbArticles.createSingle(article);
// 4. topic modeling
// extract topics from processed text
LDAAnalyzer analyzer = new JGibbLDAAnalyzer();
Object what = analyzer.analyze(article);
// TODO implement
// 5. index article via elasticsearch
// fulltext index, include topics
} catch (Exception e) { } catch (Exception e) {
throw new ImportException(e, article.getId()); throw new ImportException(e, article.getId());
} }
......
package de.vipra.cmd.option;
public class ImportException extends Exception {
private static final long serialVersionUID = 1L;
private final String id;
public ImportException(String msg, String id) {
super(msg);
this.id = id;
}
public ImportException(Exception e, String id) {
super(e);
this.id = id;
}
public String getId() {
return id;
}
}
...@@ -10,5 +10,6 @@ ...@@ -10,5 +10,6 @@
<AppenderRef ref="Console" /> <AppenderRef ref="Console" />
</Root> </Root>
<Logger name="shellout" level="ALL"/> <Logger name="shellout" level="ALL"/>
<Logger name="org.mongodb" level="ERROR"/>
</Loggers> </Loggers>
</Configuration> </Configuration>
\ No newline at end of file
...@@ -9,5 +9,7 @@ ...@@ -9,5 +9,7 @@
<Root level="ALL"> <Root level="ALL">
<AppenderRef ref="Console" /> <AppenderRef ref="Console" />
</Root> </Root>
<Logger name="shellout" level="ALL"/>
<Logger name="org.mongodb" level="ERROR"/>
</Loggers> </Loggers>
</Configuration> </Configuration>
\ No newline at end of file
...@@ -95,11 +95,18 @@ public class ArticleResource { ...@@ -95,11 +95,18 @@ public class ArticleResource {
@DELETE @DELETE
@Path("{id}") @Path("{id}")
public Response deleteArticle(@PathParam("id") String id) { public Response deleteArticle(@PathParam("id") String id) {
long deleted = service.deleteArticle(id); ResponseWrapper<Article> res = new ResponseWrapper<>();
long deleted;
try {
deleted = service.deleteArticle(id);
} catch (DatabaseException e) {
res = new ResponseWrapper<>(new APIError(Response.Status.INTERNAL_SERVER_ERROR, "item could not be deleted",
"item could not be created due to an internal server error"));
return Response.serverError().entity(res).build();
}
int del = deleted > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) deleted; int del = deleted > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) deleted;
switch (del) { switch (del) {
case 0: case 0:
ResponseWrapper<Article> res = new ResponseWrapper<>();
res.addError(new APIError(Response.Status.NOT_FOUND, "Article not found", res.addError(new APIError(Response.Status.NOT_FOUND, "Article not found",
String.format(Messages.NOT_FOUND, "article", id))); String.format(Messages.NOT_FOUND, "article", id)));
return Response.status(Response.Status.NOT_FOUND).entity(res).build(); return Response.status(Response.Status.NOT_FOUND).entity(res).build();
......
...@@ -40,7 +40,7 @@ public class ArticleService extends DatabaseService<Article> { ...@@ -40,7 +40,7 @@ public class ArticleService extends DatabaseService<Article> {
return article; return article;
} }
public long deleteArticle(String id) { public long deleteArticle(String id) throws DatabaseException {
return super.deleteSingle(id); return super.deleteSingle(id);
} }
......
...@@ -19,6 +19,8 @@ public class Article extends Model { ...@@ -19,6 +19,8 @@ public class Article extends Model {
private String text; private String text;
private String url; private String url;
private Date date; private Date date;
private boolean complete;
private ArticleStats stats;
public String getTitle() { public String getTitle() {
return title; return title;
...@@ -52,6 +54,22 @@ public class Article extends Model { ...@@ -52,6 +54,22 @@ public class Article extends Model {
this.date = date; this.date = date;
} }
public boolean isComplete() {
return complete;
}
public void setComplete(boolean complete) {
this.complete = complete;
}
public ArticleStats getStats() {
return stats;
}
public void setStats(ArticleStats stats) {
this.stats = stats;
}
public void setDate(String date) { public void setDate(String date) {
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
try { try {
......
package de.vipra.util.model;
import java.util.HashMap;
import java.util.Map;
/**
 * Word statistics of a text: the total number of words, the number of distinct
 * words, and a {@link TermFrequency} per distinct word.
 */
public class ArticleStats {

    private long wordCount;
    private long uniqueWordCount;
    private Map<String, TermFrequency> uniqueWords;

    /** @return total number of words */
    public long getWordCount() {
        return wordCount;
    }

    /** @param wordCount total number of words */
    public void setWordCount(long wordCount) {
        this.wordCount = wordCount;
    }

    /** @return number of distinct words */
    public long getUniqueWordCount() {
        return uniqueWordCount;
    }

    /** @param uniqueWordCount number of distinct words */
    public void setUniqueWordCount(long uniqueWordCount) {
        this.uniqueWordCount = uniqueWordCount;
    }

    /** @return frequency record per distinct word, keyed by the word itself */
    public Map<String, TermFrequency> getUniqueWords() {
        return uniqueWords;
    }

    /** @param uniqueWords frequency record per distinct word, keyed by the word */
    public void setUniqueWords(Map<String, TermFrequency> uniqueWords) {
        this.uniqueWords = uniqueWords;
    }

    /**
     * Builds statistics for a text by splitting it on runs of whitespace and
     * counting each resulting word. Words are compared exactly (case-sensitive).
     *
     * <p>A {@code null}, empty or whitespace-only text produces statistics with
     * zero words and an empty word map.
     *
     * @param text the text to analyze; may be {@code null}
     * @return a new, fully populated {@code ArticleStats}
     */
    public static ArticleStats generateFromText(final String text) {
        // Trim before splitting: split("\\s+") emits an empty first token when
        // the text starts with whitespace, and "" splits into a single empty
        // token, both of which would be counted as a word.
        final String trimmed = text == null ? "" : text.trim();
        final String[] words = trimmed.isEmpty() ? new String[0] : trimmed.split("\\s+");

        final Map<String, TermFrequency> uniqueWords = new HashMap<>();
        long maxFrequency = 0;

        // count occurrences per distinct word and remember the highest count
        for (String word : words) {
            TermFrequency tf = uniqueWords.get(word);
            if (tf == null) {
                tf = new TermFrequency();
                uniqueWords.put(word, tf);
            }
            tf.incrementTermFrequency();
            if (tf.getTermFrequency() > maxFrequency) {
                maxFrequency = tf.getTermFrequency();
            }
        }

        // normalize against the highest count; the map is empty when there are
        // no words, so maxFrequency is never 0 inside this loop
        for (TermFrequency tf : uniqueWords.values()) {
            tf.normalizeTermFrequency(maxFrequency);
        }

        final ArticleStats stats = new ArticleStats();
        stats.setWordCount(words.length);
        stats.setUniqueWordCount(uniqueWords.size());
        stats.setUniqueWords(uniqueWords);
        return stats;
    }
}
package de.vipra.util.model;
/**
 * Frequency counters for a single term: the raw occurrence count, a normalized
 * count, and an inverse document frequency. This class only computes the first
 * two; the inverse document frequency is stored as set by the caller.
 */
public class TermFrequency {

    private long termFrequency = 0;
    private long normalizedTermFrequency = 0;
    private long inverseDocumentFrequency = 0;

    /** @return raw number of occurrences of the term */
    public long getTermFrequency() {
        return termFrequency;
    }

    /** @param termFrequency raw number of occurrences of the term */
    public void setTermFrequency(long termFrequency) {
        this.termFrequency = termFrequency;
    }

    /** @return the value last stored by normalization or the setter */
    public long getNormalizedTermFrequency() {
        return normalizedTermFrequency;
    }

    /** @param normalizedTermFrequency normalized frequency to store */
    public void setNormalizedTermFrequency(long normalizedTermFrequency) {
        this.normalizedTermFrequency = normalizedTermFrequency;
    }

    /**
     * Stores the raw term frequency divided by {@code max} as the normalized
     * term frequency.
     *
     * <p>NOTE(review): both values are {@code long}, so the quotient is
     * truncated by integer division (1 when the term frequency equals
     * {@code max}, 0 for anything smaller). If fractional values are wanted the
     * field type has to change to a floating-point type.
     *
     * @param max the divisor, expected to be the highest term frequency in the text
     * @throws ArithmeticException if {@code max} is 0
     */
    public void normalizeTermFrequency(long max) {
        // divide the raw count, not the (still zero) normalized value
        setNormalizedTermFrequency(getTermFrequency() / max);
    }

    /** Increases the raw term frequency by one. */
    public void incrementTermFrequency() {
        setTermFrequency(getTermFrequency() + 1);
    }

    /** @return the stored inverse document frequency */
    public long getInverseDocumentFrequency() {
        return inverseDocumentFrequency;
    }

    /** @param inverseDocumentFrequency inverse document frequency to store */
    public void setInverseDocumentFrequency(long inverseDocumentFrequency) {
        this.inverseDocumentFrequency = inverseDocumentFrequency;
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment