diff --git a/ma-impl.sublime-workspace b/ma-impl.sublime-workspace index 6520e543d140111a3333797697328921b5ae677b..41ee82891d2c16a876e09eb175a776a51090d10e 100644 --- a/ma-impl.sublime-workspace +++ b/ma-impl.sublime-workspace @@ -1024,61 +1024,61 @@ "auto_name": "", "bh_regions": [ - "bh_tag", - "bh_tag_center", - "bh_tag_open", - "bh_tag_close", - "bh_tag_content", - "bh_double_quote", - "bh_double_quote_center", - "bh_double_quote_open", - "bh_double_quote_close", - "bh_double_quote_content", "bh_curly", "bh_curly_center", "bh_curly_open", "bh_curly_close", "bh_curly_content", - "bh_single_quote", - "bh_single_quote_center", - "bh_single_quote_open", - "bh_single_quote_close", - "bh_single_quote_content", - "bh_regex", - "bh_regex_center", - "bh_regex_open", - "bh_regex_close", - "bh_regex_content", - "bh_c_define", - "bh_c_define_center", - "bh_c_define_open", - "bh_c_define_close", - "bh_c_define_content", + "bh_tag", + "bh_tag_center", + "bh_tag_open", + "bh_tag_close", + "bh_tag_content", "bh_default", "bh_default_center", "bh_default_open", "bh_default_close", "bh_default_content", + "bh_single_quote", + "bh_single_quote_center", + "bh_single_quote_open", + "bh_single_quote_close", + "bh_single_quote_content", "bh_unmatched", "bh_unmatched_center", "bh_unmatched_open", "bh_unmatched_close", "bh_unmatched_content", - "bh_round", - "bh_round_center", - "bh_round_open", - "bh_round_close", - "bh_round_content", + "bh_c_define", + "bh_c_define_center", + "bh_c_define_open", + "bh_c_define_close", + "bh_c_define_content", + "bh_double_quote", + "bh_double_quote_center", + "bh_double_quote_open", + "bh_double_quote_close", + "bh_double_quote_content", "bh_angle", "bh_angle_center", "bh_angle_open", "bh_angle_close", "bh_angle_content", + "bh_round", + "bh_round_center", + "bh_round_open", + "bh_round_close", + "bh_round_content", "bh_square", "bh_square_center", "bh_square_open", "bh_square_close", - "bh_square_content" + "bh_square_content", + "bh_regex", + "bh_regex_center", + "bh_regex_open", + "bh_regex_close", + "bh_regex_content" ], "default_dir": "/home/eike/Repositories/fu/ss15/ma/impl", "incomplete_sync": null, diff --git a/vipra-cmd/pom.xml b/vipra-cmd/pom.xml index 870fd2ac6fa8372d5801639e2789eccac8897069..c237d34b268573ef5adaa509ab6edbe611fe5e40 100644 --- a/vipra-cmd/pom.xml +++ b/vipra-cmd/pom.xml @@ -15,6 +15,7 @@ <maven.compiler.target>1.7</maven.compiler.target> <maven.compiler.source>1.7</maven.compiler.source> <log4jVersion>2.4.1</log4jVersion> + <luceneVersion>5.4.0</luceneVersion> </properties> <dependencies> @@ -51,6 +52,18 @@ <version>3.5.2</version> </dependency> + <!-- Lucene --> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + <version>${luceneVersion}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + <version>${luceneVersion}</version> + </dependency> + <!-- Logging --> <dependency> <groupId>org.apache.logging.log4j</groupId> diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/ExecutionException.java b/vipra-cmd/src/main/java/de/vipra/cmd/ExecutionException.java index b7df9a97e99674babfda4b8dac9f18bcd82641ec..6ea2de6b50588b2b370b39e056ebcad91ff4bae0 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/ExecutionException.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/ExecutionException.java @@ -24,6 +24,8 @@ public class ExecutionException extends Exception { public String getMessage() { if (exceptions == null) { return super.getMessage(); + } else if (exceptions.size() == 1) { + return exceptions.get(0).getMessage(); } else { StringBuilder sb = new StringBuilder("multiple errors:"); for (Exception e : exceptions) { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java new file mode 100644 index 0000000000000000000000000000000000000000..f5f9c4ef5bf4d4df71a4361a8ba4d99f10cac7e3 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java @@ -0,0 +1,13 @@ +package de.vipra.cmd.lda; + +import de.vipra.cmd.model.Article; + +public class JGibbLDAAnalyzer implements LDAAnalyzer { + + @Override + public Object analyze(Article article) throws LDAAnalyzerException { + // TODO Auto-generated method stub + return null; + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java new file mode 100644 index 0000000000000000000000000000000000000000..0d2f65f1322fc472bdc01ff105d6cb7a790586f2 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java @@ -0,0 +1,9 @@ +package de.vipra.cmd.lda; + +import de.vipra.cmd.model.Article; + +public interface LDAAnalyzer { + + public Object analyze(Article article) throws LDAAnalyzerException; + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzerException.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzerException.java new file mode 100644 index 0000000000000000000000000000000000000000..d80081d88d7c21c8b5331139dff94cefef3bc80b --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzerException.java @@ -0,0 +1,15 @@ +package de.vipra.cmd.lda; + +public class LDAAnalyzerException extends Exception { + + private static final long serialVersionUID = 1L; + + public LDAAnalyzerException(String msg) { + super(msg); + } + + public LDAAnalyzerException(Exception e) { + super(e); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAWrapper.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAWrapper.java deleted file mode 100644 index da37e231b3437ea126aaa41e9382819456f05c67..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAWrapper.java +++ /dev/null @@ -1,5 +0,0 @@ -package de.vipra.cmd.lda; - -public class LDAWrapper { - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LdacLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LdacLDAAnalyzer.java new file mode 100644 index 0000000000000000000000000000000000000000..e5da35c8f3ca1779e956cb7cf41470bcacb73388 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LdacLDAAnalyzer.java @@ -0,0 +1,13 @@ +package de.vipra.cmd.lda; + +import de.vipra.cmd.model.Article; + +public class LdacLDAAnalyzer implements LDAAnalyzer { + + @Override + public Object analyze(Article article) throws LDAAnalyzerException { + // TODO Auto-generated method stub + return null; + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java index 350d543c7afd7c96acbf4f79ca1364115d8b94c1..acebb74215efae21c30b0cad56c9342c38c054ff 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java @@ -13,6 +13,8 @@ import de.vipra.cmd.model.Article; import de.vipra.util.Config; import de.vipra.util.ConfigException; import de.vipra.util.Constants; +import de.vipra.util.ex.DatabaseException; +import de.vipra.util.ex.FilebaseException; import de.vipra.util.service.DatabaseService; import de.vipra.util.service.FilebaseService; @@ -26,6 +28,8 @@ public class DeleteCommand implements Command { private DatabaseService<Article> dbArticles; private FilebaseService<Article> fbArticles; + DeleteCommand() {} + public DeleteCommand(String[] strings) { addIds(strings); } @@ -40,10 +44,29 @@ public class DeleteCommand implements Command { } } - private void deleteEntry(String id) { - // 1. delete mongodb entry - // 2. delete file + void deleteEntry(String id) throws ExecutionException { + ArrayList<Exception> errors = new ArrayList<>(); + + try { + // 1. delete mongodb entry + dbArticles.deleteSingle(id); + } catch (DatabaseException e) { + errors.add(e); + } + + try { + // 2. delete file + fbArticles.deleteSingle(id); + } catch (FilebaseException e) { + errors.add(e); + } + // 3. delete elasticsearch index entry + // TODO implement + + if (errors.size() > 0) { + throw new ExecutionException(errors); + } } @Override diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 82b713824b48e868f42690b2a1948300063115e5..53dee5cea56505c00f9a7db944b1245fe768e891 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -5,9 +5,7 @@ import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import java.util.Set; import org.json.simple.JSONArray; import org.json.simple.JSONObject; @@ -16,14 +14,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import de.vipra.cmd.ExecutionException; +import de.vipra.cmd.lda.JGibbLDAAnalyzer; +import de.vipra.cmd.lda.LDAAnalyzer; import de.vipra.cmd.model.Article; +import de.vipra.cmd.text.LucenePreprocessor; +import de.vipra.cmd.text.Preprocessor; import de.vipra.util.Config; import de.vipra.util.ConfigException; import de.vipra.util.Constants; -import de.vipra.util.FileUtils; import de.vipra.util.StringUtils; import de.vipra.util.ex.DatabaseException; -import de.vipra.util.ex.FilebaseException; import de.vipra.util.service.DatabaseService; import de.vipra.util.service.FilebaseService; @@ -38,6 +38,8 @@ public class ImportCommand implements Command { private DatabaseService<Article> dbArticles; private FilebaseService<Article> fbArticles; + ImportCommand() {} + public ImportCommand(String[] paths) throws ExecutionException { addPaths(paths); } @@ -92,6 +94,9 @@ public class ImportCommand implements Command { for (Object object : array) { try { importArticle((JSONObject) object); + } catch (ImportException e) { + revertImport(e.getId()); + errors.add(e); } catch (Exception e) { errors.add(e); } @@ -101,41 +106,45 @@ public class ImportCommand implements Command { } } - private String removeStopWords(String text) throws IOException { - List<String> stopwordsList = FileUtils.readFile(FileUtils.getFile(Constants.STOPWORDS_FILE)); - Set<String> stopwords = new HashSet<>(stopwordsList); - String[] words = text.split("\\s+"); - StringBuilder sb = new StringBuilder(); - for (String word : words) { - if (stopwords.contains(word)) { - continue; - } - sb.append(word).append(" "); - } - return sb.toString().trim(); - } - - private String preprocessText(String text) throws IOException { - text = text.toLowerCase(); - text = removeStopWords(text); - text = text.replace("[^a-zA-Z0-9 ]", ""); - return text; - } - - private void importArticle(JSONObject obj) throws FilebaseException, DatabaseException, IOException { + void importArticle(JSONObject obj) throws DatabaseException, ImportException { out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\""); Article article = new Article(); article.fromJSON(obj); + String originalText = article.getText(); - // add article to mongodb + // 1. add article to mongodb + // this generates a unique object id article = dbArticles.createSingle(article); - // add article to filebase - article.setText(preprocessText(article.getText())); - article = fbArticles.createSingle(article); + try { + // 2. preprocess text + // process text before topic modeling + Preprocessor preprocessor = new LucenePreprocessor(); + String processedText = preprocessor.preprocess(originalText); + + // 3. add article to filebase + // topic modeling works on files + article.setText(processedText); + fbArticles.createSingle(article); + + // 4. topic modeling + // extract topics from processed text + LDAAnalyzer analyzer = new JGibbLDAAnalyzer(); + Object what = analyzer.analyze(article); + // TODO implement + + // 5. index article via elasticsearch + // fulltext index, include topics + } catch (Exception e) { + throw new ImportException(e, article.getId()); + } + } - // 3. index article via elasticsearch, include topics - // 4. topic modeling + private void revertImport(String id) throws ExecutionException { + if (id != null) { + DeleteCommand cmd = new DeleteCommand(); + cmd.deleteEntry(id); + } } @Override diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java new file mode 100644 index 0000000000000000000000000000000000000000..db2875e4290aba18bba62ac5b32005c167aeed55 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportException.java @@ -0,0 +1,23 @@ +package de.vipra.cmd.option; + +public class ImportException extends Exception { + + private static final long serialVersionUID = 1L; + + private final String id; + + public ImportException(String msg, String id) { + super(msg); + this.id = id; + } + + public ImportException(Exception e, String id) { + super(e); + this.id = id; + } + + public String getId() { + return id; + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomPreprocessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomPreprocessor.java new file mode 100644 index 0000000000000000000000000000000000000000..ba7136286844a24978971087166c09ce9b89f4e0 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomPreprocessor.java @@ -0,0 +1,44 @@ +package de.vipra.cmd.text; + +import java.util.Arrays; +import java.util.HashSet; + +public class CustomPreprocessor implements Preprocessor { + + public static final HashSet<String> STOPWORDS = new HashSet<>(Arrays.asList(new String[] { "a", "about", "above", + "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", + "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", + "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", + "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", + "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", + "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", + "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", + "ought", "our", "ours ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", + "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", + "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", + "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", + "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", + "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", + "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" })); + + private String removeStopWords(String text) { + String[] words = text.split("\\s+"); + StringBuilder sb = new StringBuilder(); + for (String word : words) { + if (STOPWORDS.contains(word)) { + continue; + } + sb.append(word).append(" "); + } + return sb.toString().trim(); + } + + @Override + public String preprocess(String input) { + input = input.toLowerCase(); + input = removeStopWords(input); + input = input.replace("[^a-zA-Z0-9 ]", ""); + return input; + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/LucenePreprocessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/LucenePreprocessor.java new file mode 100644 index 0000000000000000000000000000000000000000..446e17de2cf73435c09cb1180a1240391ff13bf0 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/LucenePreprocessor.java @@ -0,0 +1,44 @@ +package de.vipra.cmd.text; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.miscellaneous.TrimFilter; +import org.apache.lucene.analysis.pattern.PatternReplaceFilter; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +import de.vipra.util.StringUtils; + +public class LucenePreprocessor implements Preprocessor { + + @Override + public String preprocess(String input) throws PreprocessorException { + Analyzer analyzer = new StandardAnalyzer(); + TokenStream stream = analyzer.tokenStream(null, new StringReader(input)); + try { + stream.reset(); + stream = new PorterStemFilter(stream); + stream = new TrimFilter(stream); + stream = new PatternReplaceFilter(stream, Pattern.compile("[^a-zA-Z0-9]"), "", true); + ArrayList<String> result = new ArrayList<>(); + while (stream.incrementToken()) { + result.add(stream.getAttribute(CharTermAttribute.class).toString()); + } + return StringUtils.join(result); + } catch (IOException e) { + throw new PreprocessorException(e); + } finally { + try { + stream.close(); + } catch (IOException e) {} + analyzer.close(); + } + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/Preprocessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/Preprocessor.java new file mode 100644 index 0000000000000000000000000000000000000000..75dcb73713e4f10fa557ad22011d2179317ae9b0 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/Preprocessor.java @@ -0,0 +1,7 @@ +package de.vipra.cmd.text; + +public interface Preprocessor { + + String preprocess(String input) throws PreprocessorException; + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/PreprocessorException.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/PreprocessorException.java new file mode 100644 index 0000000000000000000000000000000000000000..d8c62b6648142a6f82fcf4da25bf2f3fe4140252 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/PreprocessorException.java @@ -0,0 +1,15 @@ +package de.vipra.cmd.text; + +public class PreprocessorException extends Exception { + + private static final long serialVersionUID = 1L; + + public PreprocessorException(String msg) { + super(msg); + } + + public PreprocessorException(Exception e) { + super(e); + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/StringUtils.java b/vipra-util/src/main/java/de/vipra/util/StringUtils.java index 3070548497f20a558d0ba0c039d6b1626075b92d..34a40c14fabcdb7d566c813a9c47ebd8f37aef58 100644 --- a/vipra-util/src/main/java/de/vipra/util/StringUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/StringUtils.java @@ -1,5 +1,7 @@ package de.vipra.util; +import java.util.Iterator; + public class StringUtils { public static String ellipsize(String input, int maxLength) { @@ -11,13 +13,15 @@ public class StringUtils { } public static String join(Iterable<String> it) { - StringBuilder sb = new StringBuilder(); - String sep = ""; - for (String s : it) { - sb.append(sep).append(s); - sep = " "; + Iterator<String> iter = it.iterator(); + if (iter.hasNext()) { + StringBuilder sb = new StringBuilder(iter.next()); + while (iter.hasNext()) { + sb.append(" ").append(iter.next()); + } + return sb.toString(); } - return sb.toString(); + return ""; } } diff --git a/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java b/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java index 9b774258c876da7dd1bd55b6d8be771c89baac58..51d979bb39c18cdd221302b00025c9b4b0ed228b 100644 --- a/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java +++ b/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java @@ -77,9 +77,13 @@ public class DatabaseService<T extends Model> implements Service<T, DatabaseExce } @Override - public long deleteSingle(String id) { - DeleteResult result = collection.deleteOne(Filters.eq("_id", objectId(id))); - return result.getDeletedCount(); + public long deleteSingle(String id) throws DatabaseException { + try { + DeleteResult result = collection.deleteOne(Filters.eq("_id", objectId(id))); + return result.getDeletedCount(); + } catch (Exception e) { + throw new DatabaseException("could not delete database entry: " + e.getMessage()); + } } @Override