Skip to content
Snippets Groups Projects
Commit d5d79b9e authored by Eike Cochu's avatar Eike Cochu
Browse files

added preprocessors

updated import option exception handling
parent a0f7bae4
Branches
No related tags found
No related merge requests found
Showing
with 313 additions and 80 deletions
......@@ -1024,61 +1024,61 @@
"auto_name": "",
"bh_regions":
[
"bh_tag",
"bh_tag_center",
"bh_tag_open",
"bh_tag_close",
"bh_tag_content",
"bh_double_quote",
"bh_double_quote_center",
"bh_double_quote_open",
"bh_double_quote_close",
"bh_double_quote_content",
"bh_curly",
"bh_curly_center",
"bh_curly_open",
"bh_curly_close",
"bh_curly_content",
"bh_single_quote",
"bh_single_quote_center",
"bh_single_quote_open",
"bh_single_quote_close",
"bh_single_quote_content",
"bh_regex",
"bh_regex_center",
"bh_regex_open",
"bh_regex_close",
"bh_regex_content",
"bh_c_define",
"bh_c_define_center",
"bh_c_define_open",
"bh_c_define_close",
"bh_c_define_content",
"bh_tag",
"bh_tag_center",
"bh_tag_open",
"bh_tag_close",
"bh_tag_content",
"bh_default",
"bh_default_center",
"bh_default_open",
"bh_default_close",
"bh_default_content",
"bh_single_quote",
"bh_single_quote_center",
"bh_single_quote_open",
"bh_single_quote_close",
"bh_single_quote_content",
"bh_unmatched",
"bh_unmatched_center",
"bh_unmatched_open",
"bh_unmatched_close",
"bh_unmatched_content",
"bh_round",
"bh_round_center",
"bh_round_open",
"bh_round_close",
"bh_round_content",
"bh_c_define",
"bh_c_define_center",
"bh_c_define_open",
"bh_c_define_close",
"bh_c_define_content",
"bh_double_quote",
"bh_double_quote_center",
"bh_double_quote_open",
"bh_double_quote_close",
"bh_double_quote_content",
"bh_angle",
"bh_angle_center",
"bh_angle_open",
"bh_angle_close",
"bh_angle_content",
"bh_round",
"bh_round_center",
"bh_round_open",
"bh_round_close",
"bh_round_content",
"bh_square",
"bh_square_center",
"bh_square_open",
"bh_square_close",
"bh_square_content"
"bh_square_content",
"bh_regex",
"bh_regex_center",
"bh_regex_open",
"bh_regex_close",
"bh_regex_content"
],
"default_dir": "/home/eike/Repositories/fu/ss15/ma/impl",
"incomplete_sync": null,
......
......@@ -15,6 +15,7 @@
<maven.compiler.target>1.7</maven.compiler.target>
<maven.compiler.source>1.7</maven.compiler.source>
<log4jVersion>2.4.1</log4jVersion>
<luceneVersion>5.4.0</luceneVersion>
</properties>
<dependencies>
......@@ -51,6 +52,18 @@
<version>3.5.2</version>
</dependency>
<!-- Lucene -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${luceneVersion}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>${luceneVersion}</version>
</dependency>
<!-- Logging -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
......
......@@ -24,6 +24,8 @@ public class ExecutionException extends Exception {
public String getMessage() {
if (exceptions == null) {
return super.getMessage();
} else if (exceptions.size() == 1) {
return exceptions.get(0).getMessage();
} else {
StringBuilder sb = new StringBuilder("multiple errors:");
for (Exception e : exceptions) {
......
package de.vipra.cmd.lda;
import de.vipra.cmd.model.Article;
/**
 * {@link LDAAnalyzer} placeholder. Judging by the class name it is presumably
 * meant to be backed by JGibbLDA (TODO confirm); no analysis is implemented
 * yet and {@link #analyze(Article)} always returns {@code null}.
 */
public class JGibbLDAAnalyzer implements LDAAnalyzer {
/**
 * Stub implementation that performs no analysis.
 *
 * @param article the article to analyze (currently ignored)
 * @return always {@code null}
 * @throws LDAAnalyzerException declared by the interface; never thrown by this stub
 */
@Override
public Object analyze(Article article) throws LDAAnalyzerException {
// TODO implement the actual analysis; callers currently receive null
return null;
}
}
package de.vipra.cmd.lda;
import de.vipra.cmd.model.Article;
/**
 * Contract for components that analyze a single {@link Article}.
 */
public interface LDAAnalyzer {

    /**
     * Analyzes the given article.
     *
     * @param article the article to analyze
     * @return the analysis result; this interface does not fix its concrete type
     * @throws LDAAnalyzerException if the analysis fails
     */
    Object analyze(Article article) throws LDAAnalyzerException;
}
package de.vipra.cmd.lda;
/**
 * Checked exception signalling that an LDA analysis step failed.
 */
public class LDAAnalyzerException extends Exception {

    private static final long serialVersionUID = 1L;

    /**
     * Wraps an underlying failure; the wrapped exception becomes the cause.
     *
     * @param e the exception that caused the analysis to fail
     */
    public LDAAnalyzerException(final Exception e) {
        super(e);
    }

    /**
     * Creates an exception that carries only a description.
     *
     * @param msg description of the failure
     */
    public LDAAnalyzerException(final String msg) {
        super(msg);
    }
}
package de.vipra.cmd.lda;
/**
 * Empty placeholder class; it declares no members yet.
 */
public class LDAWrapper {
}
package de.vipra.cmd.lda;
import de.vipra.cmd.model.Article;
/**
 * {@link LDAAnalyzer} placeholder. Judging by the class name it is presumably
 * meant to be backed by an "lda-c" style implementation (TODO confirm); no
 * analysis is implemented yet and {@link #analyze(Article)} always returns
 * {@code null}.
 */
public class LdacLDAAnalyzer implements LDAAnalyzer {
/**
 * Stub implementation that performs no analysis.
 *
 * @param article the article to analyze (currently ignored)
 * @return always {@code null}
 * @throws LDAAnalyzerException declared by the interface; never thrown by this stub
 */
@Override
public Object analyze(Article article) throws LDAAnalyzerException {
// TODO implement the actual analysis; callers currently receive null
return null;
}
}
......@@ -13,6 +13,8 @@ import de.vipra.cmd.model.Article;
import de.vipra.util.Config;
import de.vipra.util.ConfigException;
import de.vipra.util.Constants;
import de.vipra.util.ex.DatabaseException;
import de.vipra.util.ex.FilebaseException;
import de.vipra.util.service.DatabaseService;
import de.vipra.util.service.FilebaseService;
......@@ -26,6 +28,8 @@ public class DeleteCommand implements Command {
private DatabaseService<Article> dbArticles;
private FilebaseService<Article> fbArticles;
DeleteCommand() {}
public DeleteCommand(String[] strings) {
addIds(strings);
}
......@@ -40,10 +44,29 @@ public class DeleteCommand implements Command {
}
}
private void deleteEntry(String id) {
// 1. delete mongodb entry
// 2. delete file
void deleteEntry(String id) throws ExecutionException {
ArrayList<Exception> errors = new ArrayList<>();
try {
// 1. delete mongodb entry
dbArticles.deleteSingle(id);
} catch (DatabaseException e) {
errors.add(e);
}
try {
// 2. delete file
fbArticles.deleteSingle(id);
} catch (FilebaseException e) {
errors.add(e);
}
// 3. delete elasticsearch index entry
// TODO implement
if (errors.size() > 0) {
throw new ExecutionException(errors);
}
}
@Override
......
......@@ -5,9 +5,7 @@ import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
......@@ -16,14 +14,16 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.vipra.cmd.ExecutionException;
import de.vipra.cmd.lda.JGibbLDAAnalyzer;
import de.vipra.cmd.lda.LDAAnalyzer;
import de.vipra.cmd.model.Article;
import de.vipra.cmd.text.LucenePreprocessor;
import de.vipra.cmd.text.Preprocessor;
import de.vipra.util.Config;
import de.vipra.util.ConfigException;
import de.vipra.util.Constants;
import de.vipra.util.FileUtils;
import de.vipra.util.StringUtils;
import de.vipra.util.ex.DatabaseException;
import de.vipra.util.ex.FilebaseException;
import de.vipra.util.service.DatabaseService;
import de.vipra.util.service.FilebaseService;
......@@ -38,6 +38,8 @@ public class ImportCommand implements Command {
private DatabaseService<Article> dbArticles;
private FilebaseService<Article> fbArticles;
ImportCommand() {}
public ImportCommand(String[] paths) throws ExecutionException {
addPaths(paths);
}
......@@ -92,6 +94,9 @@ public class ImportCommand implements Command {
for (Object object : array) {
try {
importArticle((JSONObject) object);
} catch (ImportException e) {
revertImport(e.getId());
errors.add(e);
} catch (Exception e) {
errors.add(e);
}
......@@ -101,41 +106,45 @@ public class ImportCommand implements Command {
}
}
private String removeStopWords(String text) throws IOException {
List<String> stopwordsList = FileUtils.readFile(FileUtils.getFile(Constants.STOPWORDS_FILE));
Set<String> stopwords = new HashSet<>(stopwordsList);
String[] words = text.split("\\s+");
StringBuilder sb = new StringBuilder();
for (String word : words) {
if (stopwords.contains(word)) {
continue;
}
sb.append(word).append(" ");
}
return sb.toString().trim();
}
private String preprocessText(String text) throws IOException {
text = text.toLowerCase();
text = removeStopWords(text);
text = text.replace("[^a-zA-Z0-9 ]", "");
return text;
}
private void importArticle(JSONObject obj) throws FilebaseException, DatabaseException, IOException {
void importArticle(JSONObject obj) throws DatabaseException, ImportException {
out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\"");
Article article = new Article();
article.fromJSON(obj);
String originalText = article.getText();
// add article to mongodb
// 1. add article to mongodb
// this generates a unique object id
article = dbArticles.createSingle(article);
// add article to filebase
article.setText(preprocessText(article.getText()));
article = fbArticles.createSingle(article);
try {
// 2. preprocess text
// process text before topic modeling
Preprocessor preprocessor = new LucenePreprocessor();
String processedText = preprocessor.preprocess(originalText);
// 3. add article to filebase
// topic modeling works on files
article.setText(processedText);
fbArticles.createSingle(article);
// 4. topic modeling
// extract topics from processed text
LDAAnalyzer analyzer = new JGibbLDAAnalyzer();
Object what = analyzer.analyze(article);
// TODO implement
// 5. index article via elasticsearch
// fulltext index, include topics
} catch (Exception e) {
throw new ImportException(e, article.getId());
}
}
// 3. index article via elasticsearch, include topics
// 4. topic modeling
/**
 * Rolls back a partially imported article by removing whatever was already
 * stored for it in the database and the filebase.
 * <p>
 * The deletion goes through this command's own services. A freshly created
 * {@code DeleteCommand} cannot be used for this: its no-arg constructor
 * leaves its service fields unset, so calling {@code deleteEntry} on it
 * fails with a {@link NullPointerException} instead of deleting anything.
 * <p>
 * Both deletions are always attempted; a failure of the first does not
 * prevent the second.
 *
 * @param id id of the article to remove; {@code null} means nothing was
 *           created yet and the call is a no-op
 * @throws ExecutionException if at least one of the deletions failed; it
 *                            carries every collected error
 */
private void revertImport(String id) throws ExecutionException {
    if (id == null) {
        return;
    }

    ArrayList<Exception> errors = new ArrayList<>();

    try {
        // 1. remove the database entry
        dbArticles.deleteSingle(id);
    } catch (DatabaseException e) {
        errors.add(e);
    }

    try {
        // 2. remove the file
        fbArticles.deleteSingle(id);
    } catch (FilebaseException e) {
        errors.add(e);
    }

    if (!errors.isEmpty()) {
        throw new ExecutionException(errors);
    }
}
@Override
......
package de.vipra.cmd.option;
/**
 * Checked exception raised when importing an article fails. It remembers the
 * id of the affected article so that callers can act on that article.
 */
public class ImportException extends Exception {

    private static final long serialVersionUID = 1L;

    /** Id of the article whose import failed; stored exactly as passed in. */
    private final String id;

    /**
     * Wraps an underlying failure; the wrapped exception becomes the cause.
     *
     * @param e  the exception that caused the import to fail
     * @param id id of the affected article
     */
    public ImportException(final Exception e, final String id) {
        super(e);
        this.id = id;
    }

    /**
     * Creates an exception that carries only a description.
     *
     * @param msg description of the failure
     * @param id  id of the affected article
     */
    public ImportException(final String msg, final String id) {
        super(msg);
        this.id = id;
    }

    /**
     * @return the article id given at construction time
     */
    public String getId() {
        return this.id;
    }
}
package de.vipra.cmd.text;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Dependency-free {@link Preprocessor}: lower-cases the input, drops English
 * stop words and finally strips every character that is not an ASCII letter,
 * an ASCII digit or a space.
 */
public class CustomPreprocessor implements Preprocessor {

    /**
     * Lower-case English stop words. Entries may contain apostrophes
     * (e.g. {@code "don't"}), which is why stop words are removed before
     * punctuation is stripped.
     */
    public static final HashSet<String> STOPWORDS = new HashSet<>(Arrays.asList(new String[] { "a", "about", "above",
            "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because",
            "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't",
            "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from",
            "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's",
            "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll",
            "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more",
            "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other",
            // "ours" and "ourselves" used to be fused into the single, never-matching entry "ours ourselves"
            "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll",
            "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
            "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're",
            "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we",
            "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where",
            "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't",
            "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" }));

    /**
     * Removes every whitespace-separated word that is contained in
     * {@link #STOPWORDS}. The remaining words are joined by single spaces.
     *
     * @param text lower-cased text
     * @return the text without stop words, trimmed
     */
    private String removeStopWords(String text) {
        String[] words = text.split("\\s+");
        StringBuilder sb = new StringBuilder();
        for (String word : words) {
            if (STOPWORDS.contains(word)) {
                continue;
            }
            sb.append(word).append(" ");
        }
        return sb.toString().trim();
    }

    /**
     * Normalizes the given text.
     *
     * @param input the raw text; must not be {@code null}
     * @return the lower-cased text without stop words, restricted to ASCII
     *         letters, digits and spaces
     */
    @Override
    public String preprocess(String input) {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. the Turkish dotless i), so stop-word matching stays stable.
        input = input.toLowerCase(java.util.Locale.ROOT);
        input = removeStopWords(input);
        // replaceAll interprets the argument as a regular expression;
        // String.replace would only look for the literal text "[^a-zA-Z0-9 ]".
        input = input.replaceAll("[^a-zA-Z0-9 ]", "");
        return input;
    }
}
package de.vipra.cmd.text;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import de.vipra.util.StringUtils;
/**
 * {@link Preprocessor} built on Lucene: the input is tokenized by a
 * {@link StandardAnalyzer}, each token is Porter-stemmed, trimmed and stripped
 * of every character that is not an ASCII letter or digit. The surviving
 * tokens are joined with {@link StringUtils#join(Iterable)}.
 */
public class LucenePreprocessor implements Preprocessor {

    /** Matches every character that is not an ASCII letter or digit. */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-zA-Z0-9]");

    /**
     * Runs the analysis chain over the input.
     *
     * @param input the raw text; must not be {@code null}
     * @return the processed tokens, joined into a single string
     * @throws PreprocessorException if reading from the token stream fails
     */
    @Override
    public String preprocess(String input) throws PreprocessorException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = null;
        try {
            // Assemble the complete filter chain first ...
            stream = analyzer.tokenStream(null, new StringReader(input));
            stream = new PorterStemFilter(stream);
            stream = new TrimFilter(stream);
            stream = new PatternReplaceFilter(stream, NON_ALPHANUMERIC, "", true);

            // ... then follow the TokenStream consumer workflow on the outermost
            // stream: obtain attributes, reset(), incrementToken()*, end(), close().
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();

            ArrayList<String> result = new ArrayList<>();
            while (stream.incrementToken()) {
                String token = term.toString();
                // a token made up solely of stripped characters ends up empty;
                // keeping it would only add stray separators to the output
                if (!token.isEmpty()) {
                    result.add(token);
                }
            }
            stream.end();

            return StringUtils.join(result);
        } catch (IOException e) {
            throw new PreprocessorException(e);
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // best effort: the result (or the primary error) is already
                    // determined, a failing close must not replace it
                }
            }
            analyzer.close();
        }
    }
}
package de.vipra.cmd.text;
/**
 * Contract for text preprocessing steps that turn one string into another.
 */
public interface Preprocessor {
/**
 * Preprocesses the given text.
 *
 * @param input the text to process
 * @return the processed text
 * @throws PreprocessorException if processing fails
 */
String preprocess(String input) throws PreprocessorException;
}
package de.vipra.cmd.text;
/**
 * Checked exception signalling that a text preprocessing step failed.
 */
public class PreprocessorException extends Exception {

    private static final long serialVersionUID = 1L;

    /**
     * Wraps an underlying failure; the wrapped exception becomes the cause.
     *
     * @param e the exception that caused preprocessing to fail
     */
    public PreprocessorException(final Exception e) {
        super(e);
    }

    /**
     * Creates an exception that carries only a description.
     *
     * @param msg description of the failure
     */
    public PreprocessorException(final String msg) {
        super(msg);
    }
}
package de.vipra.util;
import java.util.Iterator;
public class StringUtils {
public static String ellipsize(String input, int maxLength) {
......@@ -11,13 +13,15 @@ public class StringUtils {
}
public static String join(Iterable<String> it) {
StringBuilder sb = new StringBuilder();
String sep = "";
for (String s : it) {
sb.append(sep).append(s);
sep = " ";
Iterator<String> iter = it.iterator();
if (iter.hasNext()) {
StringBuilder sb = new StringBuilder(iter.next());
while (iter.hasNext()) {
sb.append(" ").append(iter.next());
}
return sb.toString();
}
return sb.toString();
return "";
}
}
......@@ -77,9 +77,13 @@ public class DatabaseService<T extends Model> implements Service<T, DatabaseExce
}
@Override
public long deleteSingle(String id) {
DeleteResult result = collection.deleteOne(Filters.eq("_id", objectId(id)));
return result.getDeletedCount();
public long deleteSingle(String id) throws DatabaseException {
try {
DeleteResult result = collection.deleteOne(Filters.eq("_id", objectId(id)));
return result.getDeletedCount();
} catch (Exception e) {
throw new DatabaseException("could not delete database entry: " + e.getMessage());
}
}
@Override
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment