diff --git a/ma-impl.sublime-workspace b/ma-impl.sublime-workspace index 5c7ab3970c0c00b24f4e3162b9331b1591fb1b92..2fe3e4206bffc63a7a6bd2237de0613e53f32730 100644 --- a/ma-impl.sublime-workspace +++ b/ma-impl.sublime-workspace @@ -271,6 +271,15 @@ }, "buffers": [ + { + "contents": "1. import all new articles:\n * generate article statistics\n * into database\n * into filebase, using filebase adapter of selected tm library\n2. recreate topic modeling, using selected tm library\n * needs no articles in memory, works completely on files\n3. insert new topic model into database\n * how to interpret tm result?\n * how to relate tm result to articles?\n4. index new articles\n * needs title, processed text and topics\n\nfilebase writes into single file according to tm library\nneeds index where articles are stored in the file\n\nall new articles are held in memory?\n\noriginal text is not needed except for stats (original text length) and for db for ui browsing", + "settings": + { + "buffer_size": 675, + "line_ending": "Unix", + "name": "1. import all new articles:" + } + } ], "build_system": "", "build_system_choices": @@ -892,8 +901,97 @@ "groups": [ { + "selected": 0, "sheets": [ + { + "buffer": 0, + "semi_transient": false, + "settings": + { + "buffer_size": 675, + "regions": + { + }, + "selection": + [ + [ + 579, + 579 + ] + ], + "settings": + { + "BracketHighlighterBusy": false, + "auto_name": "1. import all new articles:", + "bh_regions": + [ + "bh_default", + "bh_default_center", + "bh_default_open", + "bh_default_close", + "bh_default_content", + "bh_square", + "bh_square_center", + "bh_square_open", + "bh_square_close", + "bh_square_content", + "bh_round", + "bh_round_center", + "bh_round_open", + "bh_round_close", + "bh_round_content", + "bh_c_define", + "bh_c_define_center", + "bh_c_define_open", + "bh_c_define_close", + "bh_c_define_content", + "bh_single_quote", + "bh_single_quote_center", + "bh_single_quote_open", + "bh_single_quote_close", + "bh_single_quote_content", + "bh_double_quote", + "bh_double_quote_center", + "bh_double_quote_open", + "bh_double_quote_close", + "bh_double_quote_content", + "bh_angle", + "bh_angle_center", + "bh_angle_open", + "bh_angle_close", + "bh_angle_content", + "bh_tag", + "bh_tag_center", + "bh_tag_open", + "bh_tag_close", + "bh_tag_content", + "bh_regex", + "bh_regex_center", + "bh_regex_open", + "bh_regex_close", + "bh_regex_content", + "bh_unmatched", + "bh_unmatched_center", + "bh_unmatched_open", + "bh_unmatched_close", + "bh_unmatched_content", + "bh_curly", + "bh_curly_center", + "bh_curly_open", + "bh_curly_close", + "bh_curly_content" + ], + "incomplete_sync": null, + "syntax": "Packages/Text/Plain text.tmLanguage" + }, + "translation.x": 0.0, + "translation.y": 0.0, + "zoom_level": 1.0 + }, + "stack_index": 0, + "type": "text" + } ] } ], diff --git a/vipra-cmd/.settings/org.eclipse.jdt.core.prefs b/vipra-cmd/.settings/org.eclipse.jdt.core.prefs index 78a9b4501f5ec8e63422f8199df3009d010cfbda..8995f4d00e9a6841edfa63849de383ea0fefa088 100644 --- a/vipra-cmd/.settings/org.eclipse.jdt.core.prefs +++ b/vipra-cmd/.settings/org.eclipse.jdt.core.prefs @@ -14,7 +14,7 @@ org.eclipse.jdt.core.compiler.source=1.7 org.eclipse.jdt.core.formatter.align_type_members_on_columns=false org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression=16 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation=0 -org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant=48 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call=16 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation=16 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression=16 @@ -22,7 +22,7 @@ org.eclipse.jdt.core.formatter.alignment_for_assignment=0 org.eclipse.jdt.core.formatter.alignment_for_binary_expression=16 org.eclipse.jdt.core.formatter.alignment_for_compact_if=16 org.eclipse.jdt.core.formatter.alignment_for_conditional_expression=80 -org.eclipse.jdt.core.formatter.alignment_for_enum_constants=0 +org.eclipse.jdt.core.formatter.alignment_for_enum_constants=49 org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer=16 org.eclipse.jdt.core.formatter.alignment_for_method_declaration=0 org.eclipse.jdt.core.formatter.alignment_for_multiple_fields=16 @@ -31,7 +31,7 @@ org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_resources_in_try=80 org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation=16 org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration=16 -org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration=48 org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration=16 diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java index 899ac92aa72c3b7fbfe6c17125c3d8a6b2f7f4b9..46197f2bbf427e671873fbdd7be25add4614b470 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java @@ -12,10 +12,13 @@ import org.slf4j.LoggerFactory; import de.vipra.cmd.option.Command; import de.vipra.cmd.option.DeleteCommand; import de.vipra.cmd.option.ImportCommand; +import de.vipra.util.StringUtils; +import de.vipra.util.Timer; public class Main { public static final Logger log = LoggerFactory.getLogger(Main.class); + public static final Logger out = LoggerFactory.getLogger("shellout"); public static void main(String[] args) { CommandLineParser parser = new DefaultParser(); @@ -49,7 +52,11 @@ public class Main { } if (c != null) { + Timer t = new Timer(); + t.start(); c.run(); + long dur = t.stop(); + out.info("done in " + StringUtils.timeString(dur)); } else { options.printHelp(cmd); } diff --git a/vipra-util/src/main/java/de/vipra/util/ex/FilebaseException.java b/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java similarity index 89% rename from vipra-util/src/main/java/de/vipra/util/ex/FilebaseException.java rename to vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java index 99dd450e7b643a0e6bc19f12486b04a567dc98a2..ad5f6ef7b1ce8542117b2027033d5a8f43291bfa 100644 --- a/vipra-util/src/main/java/de/vipra/util/ex/FilebaseException.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java @@ -1,4 +1,4 @@ -package de.vipra.util.ex; +package de.vipra.cmd.ex; public class FilebaseException extends Exception { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/ex/ImportException.java b/vipra-cmd/src/main/java/de/vipra/cmd/ex/ImportException.java new file mode 100644 index 0000000000000000000000000000000000000000..d7f6da337f6dd0752ba4b094862033e23db22647 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/ex/ImportException.java @@ -0,0 +1,23 @@ +package de.vipra.cmd.ex; + +public class ImportException extends Exception { + + private static final long serialVersionUID = 1L; + + private final String id; + + public ImportException(String msg, String id) { + super(msg); + this.id = id; + } + + public ImportException(Exception e, String id) { + super(e); + this.id = id; + } + + public String getId() { + return id; + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzerException.java b/vipra-cmd/src/main/java/de/vipra/cmd/ex/LDAAnalyzerException.java similarity index 89% rename from vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzerException.java rename to vipra-cmd/src/main/java/de/vipra/cmd/ex/LDAAnalyzerException.java index d80081d88d7c21c8b5331139dff94cefef3bc80b..bf55ee835d9827c5c78d77a099b0f0aedae22078 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzerException.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/ex/LDAAnalyzerException.java @@ -1,4 +1,4 @@ -package de.vipra.cmd.lda; +package de.vipra.cmd.ex; public class LDAAnalyzerException extends Exception { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/PreprocessorException.java b/vipra-cmd/src/main/java/de/vipra/cmd/ex/PreprocessorException.java similarity index 89% rename from vipra-cmd/src/main/java/de/vipra/cmd/text/PreprocessorException.java rename to vipra-cmd/src/main/java/de/vipra/cmd/ex/PreprocessorException.java index d8c62b6648142a6f82fcf4da25bf2f3fe4140252..1b4f6ade0e0a299e38c65220ab07ec95201ba605 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/PreprocessorException.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/ex/PreprocessorException.java @@ -1,4 +1,4 @@ -package de.vipra.cmd.text; +package de.vipra.cmd.ex; public class PreprocessorException extends Exception { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java new file mode 100644 index 0000000000000000000000000000000000000000..96714d44f700c542d29e3734f1aea11e948976c1 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java @@ -0,0 +1,55 @@ +package de.vipra.cmd.file; + +import java.io.File; +import java.io.IOException; + +import de.vipra.cmd.ex.FilebaseException; +import de.vipra.cmd.model.Article; +import de.vipra.util.Config; +import de.vipra.util.Config.Key; +import de.vipra.util.ex.ConfigException; + +public abstract class Filebase { + + private final File dataDir; + private final FilebaseIndex index; + + public Filebase(File dataDir) throws FilebaseException { + this.dataDir = dataDir; + try { + this.index = new FilebaseIndex(new File(dataDir, "asd")); + } catch (IOException e) { + throw new FilebaseException("could not read index: " + e.getMessage()); + } + } + + public File getDataDir() { + return dataDir; + } + + public void writeIndex() throws IOException { + index.write(); + } + + public void remove(Article article) throws FilebaseException { + remove(article.getId()); + } + + public abstract void add(Article article) throws FilebaseException; + + public abstract void remove(String id) throws FilebaseException; + + public abstract void write() throws IOException; + + public static Filebase getFilebase(Config config) throws FilebaseException, ConfigException { + File dataDir = config.getDataDirectory(); + switch (config.getString(Key.ANALYZER).toLowerCase()) { + case "ldac": + return new LdacFilebase(dataDir); + case "jgibb": + default: + return new JGibbFilebase(dataDir); + } + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java new file mode 100644 index 0000000000000000000000000000000000000000..662f43df6562def721eee32c0afa1d1a0ffc2f73 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java @@ -0,0 +1,46 @@ +package de.vipra.cmd.file; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import de.vipra.util.Constants; +import de.vipra.util.FileUtils; + +public class FilebaseIndex { + + private final File file; + private final List<String> index; + + public FilebaseIndex(File file) throws IOException { + this.file = file; + if (file.exists()) { + index = new ArrayList<>(FileUtils.readFile(file)); + } else { + index = new ArrayList<>(); + } + } + + public void write() throws IOException { + FileUtils.writeLines(file, Constants.FB_ENCODING.name(), index, null, false); + } + + public int add(String id) { + int i = indexOf(id); + if (i == -1) { + index.add(id); + i = index.size() - 1; + } + return i; + } + + public int indexOf(String id) { + return index.indexOf(id); + } + + public boolean remove(String id) { + return index.remove(id); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java new file mode 100644 index 0000000000000000000000000000000000000000..1dd885ad808138387fbeeceda40b7b1fe925e187 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java @@ -0,0 +1,33 @@ +package de.vipra.cmd.file; + +import java.io.File; +import java.io.IOException; + +import de.vipra.cmd.ex.FilebaseException; +import de.vipra.cmd.model.Article; + +public class JGibbFilebase extends Filebase { + + public JGibbFilebase(File dataDir) throws FilebaseException { + super(dataDir); + // TODO Auto-generated constructor stub + } + + @Override + public void add(Article article) { + // TODO Auto-generated method stub + + } + + @Override + public void remove(String id) { + // TODO Auto-generated method stub + + } + + @Override + public void write() throws IOException { + writeIndex(); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java new file mode 100644 index 0000000000000000000000000000000000000000..d6c4bbb9221b32c905520fc42056d7ec96b2ce76 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java @@ -0,0 +1,33 @@ +package de.vipra.cmd.file; + +import java.io.File; +import java.io.IOException; + +import de.vipra.cmd.ex.FilebaseException; +import de.vipra.cmd.model.Article; + +public class LdacFilebase extends Filebase { + + public LdacFilebase(File dataDir) throws FilebaseException { + super(dataDir); + // TODO Auto-generated constructor stub + } + + @Override + public void add(Article article) { + // TODO Auto-generated method stub + + } + + @Override + public void remove(String id) { + // TODO Auto-generated method stub + + } + + @Override + public void write() throws IOException { + writeIndex(); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java index f5f9c4ef5bf4d4df71a4361a8ba4d99f10cac7e3..76f29e7801b2f7abdb622d44695472be9024e684 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java @@ -1,13 +1,17 @@ package de.vipra.cmd.lda; -import de.vipra.cmd.model.Article; +import de.vipra.cmd.ex.LDAAnalyzerException; -public class JGibbLDAAnalyzer implements LDAAnalyzer { +public class JGibbLDAAnalyzer extends LDAAnalyzer { @Override - public Object analyze(Article article) throws LDAAnalyzerException { + public String getName() { + return "JGibb Analyzer"; + } + + @Override + public void analyze() throws LDAAnalyzerException { // TODO Auto-generated method stub - return null; } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java index 0d2f65f1322fc472bdc01ff105d6cb7a790586f2..92885e5b4bca829a320ab6cdfb7c4751da20123b 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java @@ -1,9 +1,23 @@ package de.vipra.cmd.lda; -import de.vipra.cmd.model.Article; +import de.vipra.cmd.ex.LDAAnalyzerException; +import de.vipra.util.Config; +import de.vipra.util.Config.Key; -public interface LDAAnalyzer { +public abstract class LDAAnalyzer { + + public abstract String getName(); - public Object analyze(Article article) throws LDAAnalyzerException; + public abstract void analyze() throws LDAAnalyzerException; + + public static LDAAnalyzer getAnalyzer(Config config) { + switch (config.getString(Key.ANALYZER).toLowerCase()) { + case "ldac": + return new LdacLDAAnalyzer(); + case "jgibb": + default: + return new JGibbLDAAnalyzer(); + } + } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LdacLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LdacLDAAnalyzer.java index e5da35c8f3ca1779e956cb7cf41470bcacb73388..c8431e44d602c452b4d2db8dd798cc18789b3e54 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LdacLDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LdacLDAAnalyzer.java @@ -1,13 +1,17 @@ package de.vipra.cmd.lda; -import de.vipra.cmd.model.Article; +import de.vipra.cmd.ex.LDAAnalyzerException; -public class LdacLDAAnalyzer implements LDAAnalyzer { +public class LdacLDAAnalyzer extends LDAAnalyzer { @Override - public Object analyze(Article article) throws LDAAnalyzerException { + public String getName() { + return "lda-c Analyzer"; + } + + @Override + public void analyze() throws LDAAnalyzerException { // TODO Auto-generated method stub - return null; } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java index acebb74215efae21c30b0cad56c9342c38c054ff..aa8c940024d69cb3f34fd5c5118763f486cc590a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java @@ -9,14 +9,14 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import de.vipra.cmd.ExecutionException; +import de.vipra.cmd.ex.FilebaseException; +import de.vipra.cmd.file.Filebase; import de.vipra.cmd.model.Article; import de.vipra.util.Config; -import de.vipra.util.ConfigException; import de.vipra.util.Constants; +import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; -import de.vipra.util.ex.FilebaseException; import de.vipra.util.service.DatabaseService; -import de.vipra.util.service.FilebaseService; public class DeleteCommand implements Command { @@ -26,7 +26,7 @@ public class DeleteCommand implements Command { private ArrayList<String> ids = new ArrayList<>(); private Config config; private DatabaseService<Article> dbArticles; - private FilebaseService<Article> fbArticles; + private Filebase filebase; DeleteCommand() {} @@ -56,7 +56,7 @@ public class DeleteCommand implements Command { try { // 2. delete file - fbArticles.deleteSingle(id); + filebase.remove(id); } catch (FilebaseException e) { errors.add(e); } @@ -72,10 +72,10 @@ public class DeleteCommand implements Command { @Override public void run() throws ExecutionException { try { - config = new Config(); - dbArticles = config.getDatabaseService(Constants.Collection.ARTICLES, Article.class); - fbArticles = config.getFilebaseService(Article.class); - } catch (IOException | ConfigException e) { + config = Config.getConfig(); + dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); + filebase = Filebase.getFilebase(config); + } catch (IOException | FilebaseException | ConfigException e) { throw new ExecutionException(e); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index d31a756329b1cf8be736e8e5cc4d1290d9f14234..5a6559ba7b962dffe98daf1d97b8da876cfed938 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -1,6 +1,7 @@ package de.vipra.cmd.option; import java.io.File; +import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; @@ -10,45 +11,25 @@ import java.util.List; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import de.vipra.cmd.ExecutionException; +import de.vipra.cmd.ex.ImportException; +import de.vipra.cmd.file.Filebase; +import de.vipra.cmd.lda.LDAAnalyzer; import de.vipra.cmd.model.Article; -import de.vipra.cmd.text.LucenePreprocessor; import de.vipra.cmd.text.Preprocessor; import de.vipra.util.Config; -import de.vipra.util.ConfigException; import de.vipra.util.Constants; import de.vipra.util.StringUtils; +import de.vipra.util.Timer; import de.vipra.util.model.ArticleStats; import de.vipra.util.service.DatabaseService; -import de.vipra.util.service.FilebaseService; public class ImportCommand implements Command { - public class ImportException extends Exception { - - private static final long serialVersionUID = 1L; - - private final String id; - - public ImportException(String msg, String id) { - super(msg); - this.id = id; - } - - public ImportException(Exception e, String id) { - super(e); - this.id = id; - } - - public String getId() { - return id; - } - - } - public static final Logger log = LoggerFactory.getLogger(ImportCommand.class); public static final Logger out = LoggerFactory.getLogger("shellout"); @@ -56,7 +37,9 @@ public class ImportCommand implements Command { private JSONParser parser = new JSONParser(); private Config config; private DatabaseService<Article> dbArticles; - private FilebaseService<Article> fbArticles; + private Filebase filebase; + private Preprocessor preprocessor; + private LDAAnalyzer analyzer; ImportCommand() {} @@ -95,91 +78,101 @@ public class ImportCommand implements Command { } } - private void importFile(File file) throws Exception { - Object data = parser.parse(new FileReader(file)); - - try { - importArticles((JSONArray) data); - } catch (ClassCastException e) { - try { - importArticle((JSONObject) data); - } catch (ClassCastException e2) { - throw new ExecutionException("invalid json file format: " + file.getAbsolutePath()); - } - } - } - - private void importArticles(JSONArray array) throws ExecutionException { - List<Exception> errors = new ArrayList<>(); - for (Object object : array) { - try { - importArticle((JSONObject) object); - } catch (ImportException e) { - revertImport(e.getId()); - errors.add(e); - } catch (Exception e) { - errors.add(e); - } - } - if (errors.size() > 0) { - throw new ExecutionException(errors); - } - } - - void importArticle(JSONObject obj) throws ImportException { + /** + * import a single article into the database and filebase + * + * @param obj + * @return + * @throws ImportException + */ + Article importArticle(JSONObject obj) throws ImportException { out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\""); Article article = new Article(); article.fromJSON(obj); try { - // 1. preprocess text - // process text before topic modeling - Preprocessor preprocessor = new LucenePreprocessor(); - String processedText = preprocessor.preprocess(article.getText()); + // preprocess text and generate text statistics + String preprocessedText = preprocessor.preprocess(article.getText()); + ArticleStats articleStats = ArticleStats.generateFromText(preprocessedText); - // 2. generate word statistics - article.setStats(ArticleStats.generateFromText(processedText)); - - // 3. add article to mongodb - // this generates a unique object id + // add article to mongodb + article.setStats(articleStats); article = dbArticles.createSingle(article); - // 4. add article to filebase - // topic modeling works on files - article.setText(processedText); - fbArticles.createSingle(article); + // add article to filebase + article.setText(preprocessedText); + filebase.add(article); + + return article; } catch (Exception e) { throw new ImportException(e, article.getId()); } } - private void revertImport(String id) throws ExecutionException { - if (id != null) { - DeleteCommand cmd = new DeleteCommand(); - cmd.deleteEntry(id); + /** + * Imports a file into the database and the filebase + * + * @param file + * @throws ParseException + * @throws IOException + * @throws FileNotFoundException + * @throws ImportException + * @throws Exception + */ + private List<Article> importFile(File file) + throws FileNotFoundException, IOException, ParseException, ImportException { + Object data = parser.parse(new FileReader(file)); + + List<Article> articles = new ArrayList<Article>(); + + if (data instanceof JSONArray) { + for (Object object : (JSONArray) data) { + articles.add(importArticle((JSONObject) object)); + } + } else if (data instanceof JSONObject) { + articles.add(importArticle((JSONObject) data)); } + + return articles; } @Override public void run() throws ExecutionException { try { - config = new Config(); - dbArticles = config.getDatabaseService(Constants.Collection.ARTICLES, Article.class); - fbArticles = config.getFilebaseService(Article.class); - } catch (IOException | ConfigException e) { - throw new ExecutionException(e); - } - - List<Exception> ex = new ArrayList<>(); - for (File file : files) { - try { - importFile(file); - } catch (Exception e) { - ex.add(e); + config = Config.getConfig(); + dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); + filebase = Filebase.getFilebase(config); + preprocessor = Preprocessor.getPreprocessor(config); + analyzer = LDAAnalyzer.getAnalyzer(config); + + out.info("using data directory: " + filebase.getDataDir().getAbsolutePath()); + out.info("using preprocessor: " + preprocessor.getName()); + out.info("using analyzer: " + analyzer.getName()); + + Timer timer = new Timer(); + timer.start(); + + // import files into database and filebase + List<Article> articles = new ArrayList<>(); + for (File file : files) { + if (file.isFile() && file.exists()) + articles.addAll(importFile(file)); } - } - if (ex.size() > 0) { - throw new ExecutionException(ex); + long durImport = timer.lap(); + + // do topic modeling + analyzer.analyze(); + long durAnalyze = timer.lap(); + + // write file index + filebase.writeIndex(); + long durIndex = timer.lap(); + + out.info("imported " + articles.size() + " " + (articles.size() == 1 ? "article" : "articles")); + out.info("import: " + StringUtils.timeString(durImport) + ", analyze: " + StringUtils.timeString(durAnalyze) + + ", reindex: " + StringUtils.timeString(durIndex)); + } catch (Exception e) { + throw new ExecutionException(e); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomPreprocessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomPreprocessor.java index ba7136286844a24978971087166c09ce9b89f4e0..6341c0a8d5ae1ce0fa4a5069cb03a4f884b1a8d3 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomPreprocessor.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomPreprocessor.java @@ -1,31 +1,22 @@ package de.vipra.cmd.text; -import java.util.Arrays; import java.util.HashSet; +import java.util.List; +import java.util.Set; -public class CustomPreprocessor implements Preprocessor { - - public static final HashSet<String> STOPWORDS = new HashSet<>(Arrays.asList(new String[] { "a", "about", "above", - "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", - "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", - "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", - "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", - "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", - "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", - "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", - "ought", "our", "ours ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", - "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", - "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", - "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", - "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", - "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", - "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" })); +public class CustomPreprocessor extends Preprocessor { + + private final Set<String> stopWords; + + public CustomPreprocessor(List<String> stopWordsList) { + this.stopWords = new HashSet<>(stopWordsList); + } private String removeStopWords(String text) { String[] words = text.split("\\s+"); StringBuilder sb = new StringBuilder(); for (String word : words) { - if (STOPWORDS.contains(word)) { + if (stopWords.contains(word)) { continue; } sb.append(word).append(" "); @@ -33,6 +24,11 @@ public class CustomPreprocessor implements Preprocessor { return sb.toString().trim(); } + @Override + public String getName() { + return "Custom Preprocessor"; + } + @Override public String preprocess(String input) { input = input.toLowerCase(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/LucenePreprocessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/LucenePreprocessor.java index 446e17de2cf73435c09cb1180a1240391ff13bf0..13935c7aaaad9eaded11af577815baee04cf617b 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/LucenePreprocessor.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/LucenePreprocessor.java @@ -3,6 +3,7 @@ package de.vipra.cmd.text; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; +import java.util.List; import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer; @@ -12,14 +13,27 @@ import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.lucene.analysis.pattern.PatternReplaceFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.CharArraySet; +import de.vipra.cmd.ex.PreprocessorException; import de.vipra.util.StringUtils; -public class LucenePreprocessor implements Preprocessor { +public class LucenePreprocessor extends Preprocessor { + + private final CharArraySet stopWords; + + public LucenePreprocessor(List<String> stopWords) { + this.stopWords = new CharArraySet(stopWords, false); + } + + @Override + public String getName() { + return "Lucene Preprocessor"; + } @Override public String preprocess(String input) throws PreprocessorException { - Analyzer analyzer = new StandardAnalyzer(); + Analyzer analyzer = new StandardAnalyzer(stopWords); TokenStream stream = analyzer.tokenStream(null, new StringReader(input)); try { stream.reset(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/Preprocessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/Preprocessor.java index 75dcb73713e4f10fa557ad22011d2179317ae9b0..f130cddb5b54c7e0263809915ffc62c340ff24eb 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/Preprocessor.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/Preprocessor.java @@ -1,7 +1,32 @@ package de.vipra.cmd.text; -public interface Preprocessor { +import java.util.Arrays; +import java.util.List; - String preprocess(String input) throws PreprocessorException; +import de.vipra.cmd.ex.PreprocessorException; +import de.vipra.util.Config; +import de.vipra.util.Constants; +import de.vipra.util.Config.Key; + +public abstract class Preprocessor { + + public abstract String getName(); + + public abstract String preprocess(String input) throws PreprocessorException; + + public static Preprocessor getPreprocessor(Config config) { + List<String> stopWords = Arrays.asList(config.getString(Key.STOPWORDS).toLowerCase().split(",")); + if (stopWords.size() == 0) { + stopWords = Constants.STOPWORDS; + } + + switch (config.getString(Key.PREPROCESSOR)) { + case "custom": + return new CustomPreprocessor(stopWords); + case "lucene": + default: + return new LucenePreprocessor(stopWords); + } + } } diff --git a/vipra-cmd/src/main/resources/stopwords.txt b/vipra-cmd/src/main/resources/stopwords.txt deleted file mode 100644 index 1e35caf48842d63a8139485c99a9eedfbd1fb822..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/resources/stopwords.txt +++ /dev/null @@ -1,173 +0,0 @@ -a -about -above -after -again -against -all -am -an -and -any -are -aren't -as -at -be -because -been -before -being -below -between -both -but -by -can't -cannot -could -couldn't -did -didn't -do -does -doesn't -doing -don't -down -during -each -few -for -from -further -had -hadn't -has -hasn't -have -haven't -having -he -he'd -he'll -he's -her -here -here's -hers -herself -him -himself -his -how -how's -i -i'd -i'll -i'm -i've -if -in -into -is -isn't -it -it's -its -itself -let's -me -more -most -mustn't -my -myself -no -nor -not -of -off -on -once -only -or -other -ought -our -ours ourselves -out -over -own -same -shan't -she -she'd -she'll -she's -should -shouldn't -so -some -such -than -that -that's -the -their -theirs -them -themselves -then -there -there's -these -they -they'd -they'll -they're -they've -this -those -through -to -too -under -until -up -very -was -wasn't -we -we'd -we'll -we're -we've -were -weren't -what -what's -when -when's -where -where's -which -while -who -who's -whom -why -why's -with -won't -would -wouldn't -you -you'd -you'll -you're -you've -your -yours -yourself -yourselves \ No newline at end of file diff --git a/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java b/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java index 2fbab02b887383c9b601797d2ba780e3384d846a..d0c852b3d483fd1801f75daceade92fcf0df5e76 100644 --- a/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java +++ b/vipra-rest/src/main/java/de/vipra/rest/resource/ArticleResource.java @@ -25,8 +25,8 @@ import de.vipra.rest.model.Article; import de.vipra.rest.model.ResponseWrapper; import de.vipra.rest.service.ArticleService; import de.vipra.util.Config; -import de.vipra.util.ConfigException; import de.vipra.util.Mongo; +import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; @Path("articles") @@ -38,7 +38,7 @@ public class ArticleResource { final ArticleService service; public ArticleResource(@Context ServletContext servletContext) throws ConfigException, IOException { - Config config = new Config(); + Config config = Config.getConfig(); Mongo mongo = Mongo.getInstance(config); service = new ArticleService(mongo); } diff --git a/vipra-util/.settings/org.eclipse.jdt.core.prefs b/vipra-util/.settings/org.eclipse.jdt.core.prefs index 0e1f9aa3cc628583e32a023c1e33ee9a6003f133..dad9ba744c62155e74fd7c03364f031f47fe4d70 100644 --- a/vipra-util/.settings/org.eclipse.jdt.core.prefs +++ b/vipra-util/.settings/org.eclipse.jdt.core.prefs @@ -13,7 +13,7 @@ org.eclipse.jdt.core.compiler.source=1.7 org.eclipse.jdt.core.formatter.align_type_members_on_columns=false org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression=16 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation=0 -org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant=48 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call=16 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation=16 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression=16 @@ -21,7 +21,7 @@ org.eclipse.jdt.core.formatter.alignment_for_assignment=0 org.eclipse.jdt.core.formatter.alignment_for_binary_expression=16 org.eclipse.jdt.core.formatter.alignment_for_compact_if=16 org.eclipse.jdt.core.formatter.alignment_for_conditional_expression=80 -org.eclipse.jdt.core.formatter.alignment_for_enum_constants=0 +org.eclipse.jdt.core.formatter.alignment_for_enum_constants=49 org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer=16 org.eclipse.jdt.core.formatter.alignment_for_method_declaration=0 org.eclipse.jdt.core.formatter.alignment_for_multiple_fields=16 @@ -30,7 +30,7 @@ org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_resources_in_try=80 org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation=16 org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration=16 -org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration=48 org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration=16 diff --git a/vipra-util/pom.xml b/vipra-util/pom.xml index f0bfa6d77d5be229a604bed02466d57b87f509af..e78ecfaf76923224b74a4f0d1621bb5f9845de43 100644 --- a/vipra-util/pom.xml +++ b/vipra-util/pom.xml @@ -19,7 +19,7 @@ <artifactId>commons-io</artifactId> <version>2.4</version> </dependency> - + <!-- Logging --> <dependency> <groupId>org.apache.logging.log4j</groupId> diff --git a/vipra-util/src/main/java/de/vipra/util/Config.java b/vipra-util/src/main/java/de/vipra/util/Config.java index a2032a4c9c93f7cd35645c22b32af7f6a476a6c6..6a9a47fee15b121a1253566c396c3b4f30490aba 100644 --- a/vipra-util/src/main/java/de/vipra/util/Config.java +++ b/vipra-util/src/main/java/de/vipra/util/Config.java @@ -8,24 +8,37 @@ import java.util.Properties; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import de.vipra.util.model.Model; -import de.vipra.util.service.DatabaseService; -import de.vipra.util.service.FilebaseService; +import de.vipra.util.ex.ConfigException; public class Config { + public static enum Key { + DBHOST("db.host", Constants.DEFAULT_HOST), + DBPORT("db.port", Constants.DEFAULT_PORT), + DBNAME("db.name", Constants.DEFAULT_DB), + DATADIR("fs.datadir", null), + PREPROCESSOR("an.preprocessor", Constants.DEFAULT_PREPROCESSOR.name), + ANALYZER("an.analyzer", Constants.DEFAULT_ANALYZER.name), + STOPWORDS("an.stopwords", ""); + + private final String name; + private final Object defVal; + + Key(String name, Object defVal) { + this.name = name; + this.defVal = defVal; + } + } + public static final Logger log = LoggerFactory.getLogger(Config.class); + private static Config config; private final Properties props = new Properties(); - public Config() throws IOException, ConfigException { + private Config() throws IOException, ConfigException { load(FileUtils.getResource(Constants.CONFIG_FILE)); } - public Config(InputStream is) throws IOException, ConfigException { - load(is); - } - private void load(InputStream is) throws ConfigException, IOException { if (is == null) { log.error("config file input stream is null"); @@ -35,44 +48,46 @@ public class Config { } } - public String getString(String key) { - return getString(key, null); - } - - public String getString(String key, String defaultValue) { - return props.getProperty(key, defaultValue); + public String getString(Key key) { + return props.getProperty(key.name, (String) key.defVal); } - public Integer getInt(String key) { - return getInt(key, null); - } - - public Integer getInt(String key, Integer defaultValue) { + public Integer getInt(Key key) { try { - return Integer.parseInt(props.getProperty(key)); + return Integer.parseInt(props.getProperty(key.name)); } catch (NumberFormatException e) { - return defaultValue; + return (Integer) key.defVal; } } public File getDataDirectory() throws ConfigException { - File dataDir = new File(getString("fb.path")); + String path = getString(Key.DATADIR); + File dataDir; + if (path != null) { + dataDir = new File(path); + } else { + dataDir = getGenericDataDir(); + } + if (!dataDir.exists()) { if (!dataDir.mkdirs()) { throw new ConfigException("could not create data directory: " + dataDir.getAbsolutePath()); } } + return dataDir; } - public <T extends Model> DatabaseService<T> getDatabaseService(Constants.Collection collection, Class<T> clazz) - throws ConfigException { - Mongo mongo = Mongo.getInstance(this); - return new DatabaseService<T>(mongo, collection, clazz); + public static File getGenericDataDir() { + File base = PathUtils.appDataDir(); + return new File(base, Constants.FB_DIR); } - public <T extends Model> FilebaseService<T> getFilebaseService(Class<T> clazz) throws ConfigException { - return new FilebaseService<T>(getDataDirectory(), clazz); + public static Config getConfig() throws IOException, ConfigException { + if (config == null) { + config = new Config(); + } + return config; } } diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 0655844e35d857cf2a5ca6d280dae7956253cb50..c135c812feff62740b8b182b5cc282a7924d6462 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -1,7 +1,9 @@ package de.vipra.util; +import java.util.List; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; +import java.util.Arrays; public class Constants { @@ -9,14 +11,20 @@ public class Constants { public static final Charset FB_ENCODING = StandardCharsets.UTF_8; public static final String CONFIG_FILE = "config.properties"; - public static final String STOPWORDS_FILE = "stopwords.txt"; public static final String DEFAULT_HOST = "localhost"; public static final int DEFAULT_PORT = 27017; public static final String DEFAULT_DB = "test"; - + public static final String DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'"; + public static final Preprocessor DEFAULT_PREPROCESSOR = Preprocessor.LUCENE; + public static final Analyzer DEFAULT_ANALYZER = Analyzer.JGIBB; + + public static final List<String> STOPWORDS = Arrays.asList("a", "an", "and", "are", "as", "at", "be", "but", "by", + "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", + "then", "there", "these", "they", "this", "to", "was", "will", "with"); + public static enum Collection { ARTICLES("articles"); @@ -27,4 +35,26 @@ public class Constants { } } + public static enum Preprocessor { + CUSTOM("custom"), + LUCENE("lucene"); + + public final String name; + + private Preprocessor(String name) { + this.name = name; + } + } + + public static enum Analyzer { + LDAC("ldac"), + JGIBB("jgibb"); + + public final String name; + + private Analyzer(String name) { + this.name = name; + } + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/Mongo.java b/vipra-util/src/main/java/de/vipra/util/Mongo.java index 2973d5f5d2ccfc84ba21c0bfef0b80cf7a020be5..943f4383634dd01815161f64761f94d877b1e062 100644 --- a/vipra-util/src/main/java/de/vipra/util/Mongo.java +++ b/vipra-util/src/main/java/de/vipra/util/Mongo.java @@ -6,6 +6,9 @@ import org.slf4j.LoggerFactory; import com.mongodb.MongoClient; import com.mongodb.client.MongoDatabase; +import de.vipra.util.Config.Key; +import de.vipra.util.ex.ConfigException; + public class Mongo { public static final Logger log = LoggerFactory.getLogger(Mongo.class); @@ -16,9 +19,9 @@ public class Mongo { private final MongoDatabase database; private Mongo(Config config) throws ConfigException { - String host = config.getString("db.host", Constants.DEFAULT_HOST); - Integer port = config.getInt("db.port", Constants.DEFAULT_PORT); - String databaseName = config.getString("db.name", Constants.DEFAULT_DB); + String host = config.getString(Key.DBHOST); + Integer port = config.getInt(Key.DBPORT); + String databaseName = config.getString(Key.DBNAME); if (host == null || port == null || databaseName == null) { log.error("host/port/dbname missing in configuration"); diff --git a/vipra-util/src/main/java/de/vipra/util/PathUtils.java b/vipra-util/src/main/java/de/vipra/util/PathUtils.java index b5979fe2fa0c4329938f39808978af2aa7705f84..9b94dffbd43f8e4c3ea2466b53fb63e050750341 100644 --- a/vipra-util/src/main/java/de/vipra/util/PathUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/PathUtils.java @@ -4,17 +4,17 @@ import java.io.File; public class PathUtils { - public static File userConfigDir() { + public static File appDataDir() { String os = System.getProperty("os.name").toUpperCase(); File base = null; if (os.contains("WIN")) { base = new File(System.getProperty("APPDATA")); } else if (os.contains("MAC")) { - base = new File(System.getProperty("user.home") + File.pathSeparator + "Library" + File.pathSeparator + base = new File(System.getProperty("user.home") + File.separator + "Library" + File.separator + "ApplicationSupport"); - } else if (os.contains("NIX")) { + } else { base = new File( - System.getProperty("user.home") + File.pathSeparator + ".local" + File.pathSeparator + "share"); + System.getProperty("user.home") + File.separator + ".local" + File.separator + "share"); } return base; } diff --git a/vipra-util/src/main/java/de/vipra/util/StringUtils.java b/vipra-util/src/main/java/de/vipra/util/StringUtils.java index 34a40c14fabcdb7d566c813a9c47ebd8f37aef58..15f96f9f026099669bb724194f4fc88475c008cf 100644 --- a/vipra-util/src/main/java/de/vipra/util/StringUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/StringUtils.java @@ -1,6 +1,9 @@ package de.vipra.util; +import java.util.ArrayList; import java.util.Iterator; +import java.util.List; +import java.util.concurrent.TimeUnit; public class StringUtils { @@ -12,16 +15,59 @@ public class StringUtils { return input.substring(0, maxLength - ellip.length()).concat(ellip); } - public static String join(Iterable<String> it) { + public static String join(Iterable<String> it, String separator) { Iterator<String> iter = it.iterator(); if (iter.hasNext()) { StringBuilder sb = new StringBuilder(iter.next()); while (iter.hasNext()) { - sb.append(" ").append(iter.next()); + sb.append(separator).append(iter.next()); } return sb.toString(); } return ""; } + public static String join(Iterable<String> it) { + return join(it, " "); + } + + public static String timeString(long nanos) { + List<String> parts = new ArrayList<String>(6); + + long days = TimeUnit.NANOSECONDS.toDays(nanos); + if (days > 0) { + parts.add(days + "d"); + nanos -= TimeUnit.DAYS.toNanos(days); + } + + long hours = TimeUnit.NANOSECONDS.toHours(nanos); + if (hours > 0) { + parts.add(hours + "h"); + nanos -= TimeUnit.HOURS.toNanos(hours); + } + + long minutes = TimeUnit.NANOSECONDS.toMinutes(nanos); + if (minutes > 0) { + parts.add(minutes + "m"); + nanos -= TimeUnit.MINUTES.toNanos(minutes); + } + + long seconds = TimeUnit.NANOSECONDS.toSeconds(nanos); + if (seconds > 0) { + parts.add(seconds + "s"); + nanos -= TimeUnit.SECONDS.toNanos(seconds); + } + + long millis = TimeUnit.NANOSECONDS.toMillis(nanos); + if (millis > 0) { + parts.add(millis + "ms"); + } + + if (parts.size() == 0) { + parts.add("0ms"); + } + + return StringUtils.join(parts); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/Timer.java b/vipra-util/src/main/java/de/vipra/util/Timer.java new file mode 100644 index 0000000000000000000000000000000000000000..9ca70c5188281fe10cfd4ac7b05eced4b4fcdbb4 --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/Timer.java @@ -0,0 +1,22 @@ +package de.vipra.util; + +public class Timer { + + private long start; + + public long start() { + start = System.nanoTime(); + return start; + } + + public long stop() { + return System.nanoTime() - start; + } + + public long lap() { + long lap = System.nanoTime() - start; + start = System.nanoTime(); + return lap; + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/ConfigException.java b/vipra-util/src/main/java/de/vipra/util/ex/ConfigException.java similarity index 86% rename from vipra-util/src/main/java/de/vipra/util/ConfigException.java rename to vipra-util/src/main/java/de/vipra/util/ex/ConfigException.java index d6404572e5f34f137c3e095ca1bb2e71d416ad9d..02afde9221b2c4cf5f18f73410d9d12f589df169 100644 --- a/vipra-util/src/main/java/de/vipra/util/ConfigException.java +++ b/vipra-util/src/main/java/de/vipra/util/ex/ConfigException.java @@ -1,4 +1,4 @@ -package de.vipra.util; +package de.vipra.util.ex; public class ConfigException extends Exception { diff --git a/vipra-util/src/main/java/de/vipra/util/model/Model.java b/vipra-util/src/main/java/de/vipra/util/model/Model.java index db4f5ec17c2de516d7a1ec83858a052be620ffe2..4872539814fede770d79d9e6fb1b31c756264f2a 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Model.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Model.java @@ -37,8 +37,7 @@ public abstract class Model implements BsonDocument { } public void writeToFile(File file) throws IOException { - String data = toFileString(); - FileUtils.writeStringToFile(file, data, Constants.FB_ENCODING, false); + FileUtils.writeStringToFile(file, toFileString(), Constants.FB_ENCODING, false); } public abstract String getType(); diff --git a/vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java b/vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java index c97fa1e07e12a5d4f293ff1e5f7f564432d68986..f9cd70dba8ecad0306354d1b0af85b3aad082658 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TermFrequency.java @@ -49,17 +49,17 @@ public class TermFrequency implements BsonDocument { @Override public Document toDocument() { Document document = new Document(); - document.put("tf", getTermFrequency()); - document.put("ntf", getNormalizedTermFrequency()); - document.put("idf", getInverseDocumentFrequency()); + document.put("termFrequency", getTermFrequency()); + document.put("normalizedTermFrequency", getNormalizedTermFrequency()); + document.put("inverseDocumentFrequency", getInverseDocumentFrequency()); return document; } @Override public void fromDocument(Document document) { - setTermFrequency(document.getLong("tf")); - setNormalizedTermFrequency(document.getDouble("ntf")); - setInverseDocumentFrequency(document.getDouble("idf")); + setTermFrequency(document.getLong("termFrequency")); + setNormalizedTermFrequency(document.getDouble("normalizedTermFrequency")); + setInverseDocumentFrequency(document.getDouble("inverseDocumentFrequency")); } } diff --git a/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java b/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java index 51d979bb39c18cdd221302b00025c9b4b0ed228b..c930da6cac33b6ff9243301ca0844c6aa20e0731 100644 --- a/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java +++ b/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java @@ -14,8 +14,10 @@ import com.mongodb.client.model.Filters; import com.mongodb.client.result.DeleteResult; import com.mongodb.client.result.UpdateResult; +import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.Mongo; +import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.Model; @@ -100,4 +102,10 @@ public class DatabaseService<T extends Model> implements Service<T, DatabaseExce return result.getModifiedCount(); } + public static <T extends Model> DatabaseService<T> getDatabaseService(Config config, + Constants.Collection collection, Class<T> clazz) throws ConfigException { + Mongo mongo = Mongo.getInstance(config); + return new DatabaseService<T>(mongo, collection, clazz); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/service/FilebaseService.java b/vipra-util/src/main/java/de/vipra/util/service/FilebaseService.java deleted file mode 100644 index 49c8ec34e1dc8a357a3917a58303f97f540442d5..0000000000000000000000000000000000000000 --- a/vipra-util/src/main/java/de/vipra/util/service/FilebaseService.java +++ /dev/null @@ -1,92 +0,0 @@ -package de.vipra.util.service; - -import java.io.File; -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import de.vipra.util.ex.FilebaseException; -import de.vipra.util.model.Model; - -public class FilebaseService<T extends Model> implements Service<T, FilebaseException> { - - public static final Logger log = LoggerFactory.getLogger(FilebaseService.class); - - private final File directory; - private final Class<T> clazz; - - public FilebaseService(File directory, Class<T> clazz) { - this.directory = directory; - this.clazz = clazz; - } - - private T newT(File file) { - try { - T t = clazz.newInstance(); - t.fromFile(file); - return t; - } catch (InstantiationException | IllegalAccessException | IllegalArgumentException | SecurityException - | IOException e) { - log.error(e.getMessage()); - return null; - } - } - - public File getFile(String id) { - return new File(directory, id); - } - - @Override - public T getSingle(String id) { - File file = getFile(id); - return newT(file); - } - - @Override - public T createSingle(T t) throws FilebaseException { - if (t.getId() != null) { - File file = getFile(t.getId()); - if (file.exists()) { - if (!file.delete()) { - log.error("could not delete file for recreation: " + file.getAbsolutePath()); - } - } - try { - t.writeToFile(file); - log.info("file created: " + file.getAbsolutePath()); - } catch (IOException e) { - throw new FilebaseException(e); - } - } - return t; - } - - @Override - public long deleteSingle(String id) throws FilebaseException { - File file = getFile(id); - if (file.exists()) { - if (file.delete()) { - return 1; - } else { - throw new FilebaseException("could not delete file: " + file.getAbsolutePath()); - } - } - return 0; - } - - @Override - public long updateSingle(T t) throws FilebaseException { - File file = getFile(t.getId()); - if (file.exists()) { - try { - t.writeToFile(file); - return 1; - } catch (Exception e) { - throw new FilebaseException(e); - } - } - return 0; - } - -} diff --git a/vipra-util/src/main/java/de/vipra/util/service/Service.java b/vipra-util/src/main/java/de/vipra/util/service/Service.java index 824a6a97de4b86e335b9c8b7255d7a8de5fbcfe5..df6d29768d1e52a8dac3001a8940815455c875b4 100644 --- a/vipra-util/src/main/java/de/vipra/util/service/Service.java +++ b/vipra-util/src/main/java/de/vipra/util/service/Service.java @@ -11,5 +11,5 @@ public interface Service<T extends Model, E extends Exception> { long deleteSingle(String id) throws E; long updateSingle(T t) throws E; - + }