Skip to content
Snippets Groups Projects
Commit b3f14f06 authored by Eike Cochu's avatar Eike Cochu
Browse files

added article stats (needs serialization)

parent d5d79b9e
Branches
No related tags found
No related merge requests found
......@@ -14,8 +14,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.vipra.cmd.ExecutionException;
import de.vipra.cmd.lda.JGibbLDAAnalyzer;
import de.vipra.cmd.lda.LDAAnalyzer;
import de.vipra.cmd.model.Article;
import de.vipra.cmd.text.LucenePreprocessor;
import de.vipra.cmd.text.Preprocessor;
......@@ -23,14 +21,35 @@ import de.vipra.util.Config;
import de.vipra.util.ConfigException;
import de.vipra.util.Constants;
import de.vipra.util.StringUtils;
import de.vipra.util.ex.DatabaseException;
import de.vipra.util.model.ArticleStats;
import de.vipra.util.service.DatabaseService;
import de.vipra.util.service.FilebaseService;
public class ImportCommand implements Command {
/**
 * Signals that importing a single article failed. Carries the id of the
 * affected article so the caller can report which item could not be imported.
 * NOTE(review): an identical ImportException class also appears in
 * de.vipra.cmd.option in this commit — confirm only one copy is intended.
 */
public class ImportException extends Exception {
// required by Serializable (Exception implements it)
private static final long serialVersionUID = 1L;
// id of the article whose import failed; may be null if the failure happened before an id was assigned
private final String id;
/** Creates an exception with a descriptive message and the failing article's id. */
public ImportException(String msg, String id) {
super(msg);
this.id = id;
}
/** Wraps an underlying cause and records the failing article's id. */
public ImportException(Exception e, String id) {
super(e);
this.id = id;
}
/** @return the id of the article whose import failed */
public String getId() {
return id;
}
}
public static final Logger log = LoggerFactory.getLogger(ImportCommand.class);
public static final Logger out = LoggerFactory.getLogger("shellout");
private ArrayList<File> files = new ArrayList<>();
private JSONParser parser = new JSONParser();
......@@ -106,35 +125,28 @@ public class ImportCommand implements Command {
}
}
void importArticle(JSONObject obj) throws DatabaseException, ImportException {
out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\"");
void importArticle(JSONObject obj) throws ImportException {
log.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\"");
Article article = new Article();
article.fromJSON(obj);
String originalText = article.getText();
// 1. add article to mongodb
// this generates a unique object id
article = dbArticles.createSingle(article);
try {
// 2. preprocess text
// 1. preprocess text
// process text before topic modeling
Preprocessor preprocessor = new LucenePreprocessor();
String processedText = preprocessor.preprocess(originalText);
String processedText = preprocessor.preprocess(article.getText());
// 3. add article to filebase
// 2. generate word statistics
article.setStats(ArticleStats.generateFromText(processedText));
// 3. add article to mongodb
// this generates a unique object id
article = dbArticles.createSingle(article);
// 4. add article to filebase
// topic modeling works on files
article.setText(processedText);
fbArticles.createSingle(article);
// 4. topic modeling
// extract topics from processed text
LDAAnalyzer analyzer = new JGibbLDAAnalyzer();
Object what = analyzer.analyze(article);
// TODO implement
// 5. index article via elasticsearch
// fulltext index, include topics
} catch (Exception e) {
throw new ImportException(e, article.getId());
}
......
package de.vipra.cmd.option;
/**
 * Raised when the import of one article fails.
 *
 * <p>Beyond the usual message/cause carried by {@link Exception}, this type
 * remembers the id of the article that could not be imported, letting callers
 * report exactly which item failed.</p>
 */
public class ImportException extends Exception {

	private static final long serialVersionUID = 1L;

	/** Id of the article that failed to import. */
	private final String id;

	/**
	 * @param msg human-readable description of the failure
	 * @param id  id of the failing article
	 */
	public ImportException(String msg, String id) {
		super(msg);
		this.id = id;
	}

	/**
	 * @param e  underlying cause of the failure
	 * @param id id of the failing article
	 */
	public ImportException(Exception e, String id) {
		super(e);
		this.id = id;
	}

	/** @return id of the article whose import failed */
	public String getId() {
		return id;
	}
}
......@@ -10,5 +10,6 @@
<AppenderRef ref="Console" />
</Root>
<Logger name="shellout" level="ALL"/>
<Logger name="org.mongodb" level="ERROR"/>
</Loggers>
</Configuration>
\ No newline at end of file
......@@ -9,5 +9,7 @@
<Root level="ALL">
<AppenderRef ref="Console" />
</Root>
<Logger name="shellout" level="ALL"/>
<Logger name="org.mongodb" level="ERROR"/>
</Loggers>
</Configuration>
\ No newline at end of file
......@@ -95,11 +95,18 @@ public class ArticleResource {
@DELETE
@Path("{id}")
public Response deleteArticle(@PathParam("id") String id) {
long deleted = service.deleteArticle(id);
ResponseWrapper<Article> res = new ResponseWrapper<>();
long deleted;
try {
deleted = service.deleteArticle(id);
} catch (DatabaseException e) {
res = new ResponseWrapper<>(new APIError(Response.Status.INTERNAL_SERVER_ERROR, "item could not be deleted",
"item could not be created due to an internal server error"));
return Response.serverError().entity(res).build();
}
int del = deleted > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) deleted;
switch (del) {
case 0:
ResponseWrapper<Article> res = new ResponseWrapper<>();
res.addError(new APIError(Response.Status.NOT_FOUND, "Article not found",
String.format(Messages.NOT_FOUND, "article", id)));
return Response.status(Response.Status.NOT_FOUND).entity(res).build();
......
......@@ -40,7 +40,7 @@ public class ArticleService extends DatabaseService<Article> {
return article;
}
public long deleteArticle(String id) {
public long deleteArticle(String id) throws DatabaseException {
return super.deleteSingle(id);
}
......
......@@ -19,6 +19,8 @@ public class Article extends Model {
private String text;
private String url;
private Date date;
private boolean complete;
private ArticleStats stats;
public String getTitle() {
return title;
......@@ -52,6 +54,22 @@ public class Article extends Model {
this.date = date;
}
// Whether this article has been fully processed by the import pipeline —
// presumably set once all import steps succeed; TODO confirm against callers.
public boolean isComplete() {
return complete;
}
// Marks the article as fully processed (or not) by the import pipeline.
public void setComplete(boolean complete) {
this.complete = complete;
}
// @return per-article word statistics (word counts, term frequencies); may be
// null until stats have been generated from the preprocessed text.
public ArticleStats getStats() {
return stats;
}
// Attaches generated word statistics to this article.
public void setStats(ArticleStats stats) {
this.stats = stats;
}
public void setDate(String date) {
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
try {
......
package de.vipra.util.model;
import java.util.HashMap;
import java.util.Map;
/**
 * Word statistics for a single article: total word count, number of distinct
 * words, and a per-word {@link TermFrequency} map generated from the article's
 * preprocessed text.
 */
public class ArticleStats {

	/** Total number of whitespace-separated words in the text. */
	private long wordCount;

	/** Number of distinct words in the text. */
	private long uniqueWordCount;

	/** Distinct word -> its frequency statistics. */
	private Map<String, TermFrequency> uniqueWords;

	public long getWordCount() {
		return wordCount;
	}

	public void setWordCount(long wordCount) {
		this.wordCount = wordCount;
	}

	public long getUniqueWordCount() {
		return uniqueWordCount;
	}

	public void setUniqueWordCount(long uniqueWordCount) {
		this.uniqueWordCount = uniqueWordCount;
	}

	public Map<String, TermFrequency> getUniqueWords() {
		return uniqueWords;
	}

	public void setUniqueWords(Map<String, TermFrequency> uniqueWords) {
		this.uniqueWords = uniqueWords;
	}

	/**
	 * Builds word statistics from a whitespace-separated text.
	 *
	 * <p>Fixes two edge cases of the original implementation: a null/blank text
	 * no longer produces a phantom empty-string "word" ({@code "".split("\\s+")}
	 * returns {@code [""]}), and leading whitespace no longer yields a spurious
	 * empty token. The normalization pass is skipped when no words were counted,
	 * avoiding a division by zero in {@link TermFrequency#normalizeTermFrequency}.</p>
	 *
	 * @param text preprocessed article text; may be null or blank
	 * @return statistics object; for blank input all counts are 0 and the
	 *         unique-word map is empty (never null)
	 */
	public static ArticleStats generateFromText(final String text) {
		ArticleStats stats = new ArticleStats();
		Map<String, TermFrequency> uniqueWords = new HashMap<>();

		// blank input -> empty stats
		if (text == null || text.trim().isEmpty()) {
			stats.setUniqueWords(uniqueWords);
			return stats;
		}

		String[] words = text.trim().split("\\s+");
		stats.setWordCount(words.length);

		// count occurrences per distinct word, tracking the maximum frequency
		// needed for normalization afterwards
		long maxFrequency = 0;
		for (String word : words) {
			TermFrequency tf = uniqueWords.get(word);
			if (tf == null) {
				tf = new TermFrequency();
				uniqueWords.put(word, tf);
			}
			tf.incrementTermFrequency();
			if (tf.getTermFrequency() > maxFrequency) {
				maxFrequency = tf.getTermFrequency();
			}
		}

		// normalize each term's frequency against the article maximum
		if (maxFrequency > 0) {
			for (TermFrequency tf : uniqueWords.values()) {
				tf.normalizeTermFrequency(maxFrequency);
			}
		}

		stats.setUniqueWordCount(uniqueWords.size());
		stats.setUniqueWords(uniqueWords);
		return stats;
	}
}
package de.vipra.util.model;
/**
 * Frequency statistics for a single term within one article: the raw term
 * frequency, a frequency normalized against the article's most frequent term,
 * and an inverse-document-frequency slot filled in elsewhere.
 */
public class TermFrequency {

	/** Raw number of occurrences of the term in the article. */
	private long termFrequency = 0;

	/** termFrequency divided by the article's maximum term frequency. */
	private long normalizedTermFrequency = 0;

	/** IDF value; computed externally, defaults to 0. */
	private long inverseDocumentFrequency = 0;

	public long getTermFrequency() {
		return termFrequency;
	}

	public void setTermFrequency(long termFrequency) {
		this.termFrequency = termFrequency;
	}

	public long getNormalizedTermFrequency() {
		return normalizedTermFrequency;
	}

	public void setNormalizedTermFrequency(long normalizedTermFrequency) {
		this.normalizedTermFrequency = normalizedTermFrequency;
	}

	/**
	 * Normalizes the raw term frequency against the given maximum frequency.
	 *
	 * <p>Bug fix: the original divided {@code normalizedTermFrequency} (still 0
	 * at this point) by {@code max}, so the result was always 0 — the raw
	 * {@code termFrequency} is the correct numerator. A non-positive {@code max}
	 * is ignored to avoid division by zero.</p>
	 *
	 * <p>NOTE(review): with {@code long} fields the integer division truncates
	 * to 0 for every term except the most frequent one; a floating-point field
	 * would preserve the ratio — confirm intended precision before changing the
	 * field type (the commit message says serialization is still pending).</p>
	 *
	 * @param max the highest term frequency in the article; must be &gt; 0
	 */
	public void normalizeTermFrequency(long max) {
		if (max > 0) {
			setNormalizedTermFrequency(getTermFrequency() / max);
		}
	}

	/** Increments the raw term frequency by one occurrence. */
	public void incrementTermFrequency() {
		setTermFrequency(getTermFrequency() + 1);
	}

	public long getInverseDocumentFrequency() {
		return inverseDocumentFrequency;
	}

	public void setInverseDocumentFrequency(long inverseDocumentFrequency) {
		this.inverseDocumentFrequency = inverseDocumentFrequency;
	}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment