diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java index 4556d2c1817a7d9e5c3e8215eb01b8d7a79e432f..320290eccea12b7ab706b9712fc0ca8587be5be6 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java @@ -14,12 +14,14 @@ import de.vipra.util.ex.ConfigException; public abstract class Filebase implements Closeable { private final File dataDir; + private final File dataFile; private final FilebaseIndex index; private final FilebaseVocabulary vocab; - public Filebase(File dataDir) throws FilebaseException { + public Filebase(File dataDir, String fileName) throws FilebaseException { this.dataDir = dataDir; try { + this.dataFile = new File(dataDir, fileName); this.index = new FilebaseIndex(new File(dataDir, Constants.INDEX_FILE)); this.vocab = new FilebaseVocabulary(new File(dataDir, Constants.VOCAB_FILE)); } catch (IOException e) { @@ -31,6 +33,18 @@ public abstract class Filebase implements Closeable { return dataDir; } + public File getDataFile() { + return dataFile; + } + + public FilebaseIndex getIndex() { + return index; + } + + public FilebaseVocabulary getVocab() { + return vocab; + } + public void remove(Article article) throws FilebaseException { remove(article.getId()); } @@ -40,7 +54,6 @@ public abstract class Filebase implements Closeable { write(); index.close(); vocab.close(); - } public abstract void add(Article article) throws FilebaseException; diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java index 759850be3eb98a59e01423bcfe115222f5dd4298..227fa33c634ce659ba946ecabe4e6e097d1a3ef2 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java @@ -32,4 +32,21 @@ public class FilebaseVocabulary implements Closeable { write(); } + public void addVocabulary(String text) { + addVocabulary(text.split("\\s+")); + } + + public void addVocabulary(String[] text) { + for (String word : text) { + // TODO fix this + if (!vocables.contains(word)) { + vocables.add(word); + } + } + } + + public int index(String word) { + return vocables.indexOf(word); + } + } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java index 4332a82f6f3df5d9a2b77de779bdf50e870be055..38a2d8ca4b6c222134e24ca240104a2500543e7f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java @@ -1,34 +1,56 @@ package de.vipra.cmd.file; +import java.io.BufferedOutputStream; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import de.vipra.cmd.ex.FilebaseException; import de.vipra.cmd.model.Article; +import de.vipra.util.Constants; +import de.vipra.util.FileUtils; +import de.vipra.util.ex.NotImplementedException; public class JGibbFilebase extends Filebase { + private final File dataFile; + private final FilebaseIndex index; + private final FilebaseVocabulary vocab; + private final List<Article> articles; + public JGibbFilebase(File dataDir) throws FilebaseException { - super(dataDir); - // TODO Auto-generated constructor stub + super(dataDir, "jgibb"); + this.dataFile = getDataFile(); + this.index = getIndex(); + this.vocab = getVocab(); + this.articles = new ArrayList<>(); } @Override public void add(Article article) { - // TODO Auto-generated method stub - + String[] words = article.getProcessedText().getText().split("\\s+"); + vocab.addVocabulary(words); + index.add(article.getId()); + articles.add(article); } @Override public void remove(String id) { - // TODO Auto-generated method stub - + throw new NotImplementedException(); } @Override public void write() throws IOException { - // TODO Auto-generated method stub - + int lineCount = FileUtils.countLines(dataFile) + articles.size(); + + BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(dataFile, true)); + for (Article a : articles) { + bw.write(a.getProcessedText().getText().getBytes(Constants.FB_ENCODING)); + bw.write(System.lineSeparator().getBytes(Constants.FB_ENCODING)); + } + bw.close(); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java index 5699752c7efd26e69ff51e10bb4a5f8f6c8d0a6d..67b210a9c655ed1c3dcc5bef8d59863e576a901f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java @@ -5,11 +5,12 @@ import java.io.IOException; import de.vipra.cmd.ex.FilebaseException; import de.vipra.cmd.model.Article; +import de.vipra.util.ex.NotImplementedException; public class LdacFilebase extends Filebase { public LdacFilebase(File dataDir) throws FilebaseException { - super(dataDir); + super(dataDir, "ldac"); // TODO Auto-generated constructor stub } @@ -21,8 +22,7 @@ public class LdacFilebase extends Filebase { @Override public void remove(String id) { - // TODO Auto-generated method stub - + throw new NotImplementedException(); } @Override diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java index 92885e5b4bca829a320ab6cdfb7c4751da20123b..6cc4e62dd1892d42e24eb7aaf2214dff80e38c8a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java @@ -2,19 +2,21 @@ package de.vipra.cmd.lda; import de.vipra.cmd.ex.LDAAnalyzerException; import de.vipra.util.Config; +import de.vipra.util.Constants; import de.vipra.util.Config.Key; public abstract class LDAAnalyzer { - + public abstract String getName(); public abstract void analyze() throws LDAAnalyzerException; public static LDAAnalyzer getAnalyzer(Config config) { - switch (config.getString(Key.ANALYZER).toLowerCase()) { - case "ldac": + switch (Constants.Analyzer.fromString(config.getString(Key.ANALYZER))) { + case LDAC: return new LdacLDAAnalyzer(); - case "jgibb": + case JGIBB: + case DEFAULT: default: return new JGibbLDAAnalyzer(); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/model/Article.java b/vipra-cmd/src/main/java/de/vipra/cmd/model/Article.java index a29161db1d48d79108ae468ade2823b424aeed17..7edcca04e1c916a2c207c52b84957cf38546af64 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/model/Article.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/model/Article.java @@ -2,8 +2,20 @@ package de.vipra.cmd.model; import org.json.simple.JSONObject; +import de.vipra.cmd.text.ProcessedText; + public class Article extends de.vipra.util.model.Article { + private ProcessedText processedText; + + public ProcessedText getProcessedText() { + return processedText; + } + + public void setProcessedText(ProcessedText processedText) { + this.processedText = processedText; + } + public void fromJSON(JSONObject obj) { if (obj.containsKey("title")) setTitle(obj.get("title").toString()); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 6396b63b5414604878e024c09123ea328ad85923..a9bc04dc50f46b19d25972c1ba5601aa2452fc74 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -20,7 +20,8 @@ import de.vipra.cmd.ex.ImportException; import de.vipra.cmd.file.Filebase; import de.vipra.cmd.lda.LDAAnalyzer; import de.vipra.cmd.model.Article; -import de.vipra.cmd.text.Preprocessor; +import de.vipra.cmd.text.Processor; +import de.vipra.cmd.text.ProcessedText; import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.StringUtils; @@ -38,7 +39,7 @@ public class ImportCommand implements Command { private Config config; private DatabaseService<Article> dbArticles; private Filebase filebase; - private Preprocessor preprocessor; + private Processor preprocessor; private LDAAnalyzer analyzer; ImportCommand() {} @@ -70,7 +71,7 @@ public class ImportCommand implements Command { File[] files = file.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { - return dir.isFile(); + return dir.isFile() && dir.exists(); } }); @@ -92,15 +93,15 @@ public class ImportCommand implements Command { try { // preprocess text and generate text statistics - String preprocessedText = preprocessor.preprocess(article.getText()); - ArticleStats articleStats = ArticleStats.generateFromText(preprocessedText); + ProcessedText processedText = preprocessor.preprocess(article.getText()); + ArticleStats articleStats = ArticleStats.generateFromText(processedText.getText()); // add article to mongodb + article.setProcessedText(processedText); article.setStats(articleStats); article = dbArticles.createSingle(article); // add article to filebase - article.setText(preprocessedText); filebase.add(article); return article; @@ -142,7 +143,7 @@ public class ImportCommand implements Command { config = Config.getConfig(); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); filebase = Filebase.getFilebase(config); - preprocessor = Preprocessor.getPreprocessor(config); + preprocessor = Processor.getPreprocessor(config); analyzer = LDAAnalyzer.getAnalyzer(config); out.info("using data directory: " + filebase.getDataDir().getAbsolutePath()); @@ -155,8 +156,7 @@ public class ImportCommand implements Command { // import files into database and filebase List<Article> articles = new ArrayList<>(); for (File file : files) { - if (file.isFile() && file.exists()) - articles.addAll(importFile(file)); + articles.addAll(importFile(file)); } long durImport = timer.lap(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomPreprocessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomProcessor.java similarity index 64% rename from vipra-cmd/src/main/java/de/vipra/cmd/text/CustomPreprocessor.java rename to vipra-cmd/src/main/java/de/vipra/cmd/text/CustomProcessor.java index 6341c0a8d5ae1ce0fa4a5069cb03a4f884b1a8d3..eb299fa3f2e610e74151978b15bb365e3f07924c 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomPreprocessor.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/CustomProcessor.java @@ -4,11 +4,14 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -public class CustomPreprocessor extends Preprocessor { +import de.vipra.util.Constants; + +public class CustomProcessor extends Processor { private final Set<String> stopWords; - public CustomPreprocessor(List<String> stopWordsList) { + public CustomProcessor(List<String> stopWordsList) { + super("Custom Processor"); this.stopWords = new HashSet<>(stopWordsList); } @@ -25,16 +28,11 @@ public class CustomPreprocessor extends Preprocessor { } @Override - public String getName() { - return "Custom Preprocessor"; - } - - @Override - public String preprocess(String input) { + public ProcessedText preprocess(String input) { input = input.toLowerCase(); input = removeStopWords(input); - input = input.replace("[^a-zA-Z0-9 ]", ""); - return input; + input = input.replace(Constants.CHARS_DISALLOWED, ""); + return new ProcessedText(input); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/LucenePreprocessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/LuceneProcessor.java similarity index 77% rename from vipra-cmd/src/main/java/de/vipra/cmd/text/LucenePreprocessor.java rename to vipra-cmd/src/main/java/de/vipra/cmd/text/LuceneProcessor.java index 13935c7aaaad9eaded11af577815baee04cf617b..569704869f2bff14fdbcbde53e051905182f130d 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/LucenePreprocessor.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/LuceneProcessor.java @@ -16,35 +16,32 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharArraySet; import de.vipra.cmd.ex.PreprocessorException; +import de.vipra.util.Constants; import de.vipra.util.StringUtils; -public class LucenePreprocessor extends Preprocessor { +public class LuceneProcessor extends Processor { private final CharArraySet stopWords; - public LucenePreprocessor(List<String> stopWords) { + public LuceneProcessor(List<String> stopWords) { + super("Lucene Processor"); this.stopWords = new CharArraySet(stopWords, false); } @Override - public String getName() { - return "Lucene Preprocessor"; - } - - @Override - public String preprocess(String input) throws PreprocessorException { + public ProcessedText preprocess(String input) throws PreprocessorException { Analyzer analyzer = new StandardAnalyzer(stopWords); TokenStream stream = analyzer.tokenStream(null, new StringReader(input)); try { stream.reset(); stream = new PorterStemFilter(stream); stream = new TrimFilter(stream); - stream = new PatternReplaceFilter(stream, Pattern.compile("[^a-zA-Z0-9]"), "", true); + stream = new PatternReplaceFilter(stream, Pattern.compile(Constants.CHARS_DISALLOWED), "", true); ArrayList<String> result = new ArrayList<>(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } - return StringUtils.join(result); + return new ProcessedText(StringUtils.join(result)); } catch (IOException e) { throw new PreprocessorException(e); } finally { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/Preprocessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/Preprocessor.java deleted file mode 100644 index 3de9ceb5dd3b61248177bf7cc6ea606bc41aa1f6..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/Preprocessor.java +++ /dev/null @@ -1,33 +0,0 @@ -package de.vipra.cmd.text; - -import java.util.Arrays; -import java.util.List; - -import de.vipra.cmd.ex.PreprocessorException; -import de.vipra.util.Config; -import de.vipra.util.Constants; -import de.vipra.util.Config.Key; - -public abstract class Preprocessor { - - public abstract String getName(); - - public abstract String preprocess(String input) throws PreprocessorException; - - public static Preprocessor getPreprocessor(Config config) { - List<String> stopWords = Arrays.asList(config.getString(Key.STOPWORDS).toLowerCase().split(",")); - if (stopWords.size() == 0) { - stopWords = Constants.STOPWORDS; - } - - switch (Constants.Preprocessor.fromString(config.getString(Key.PREPROCESSOR))) { - case CUSTOM: - return new CustomPreprocessor(stopWords); - case LUCENE: - case DEFAULT: - default: - return new LucenePreprocessor(stopWords); - } - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java new file mode 100644 index 0000000000000000000000000000000000000000..10a4d01dc114da8b37494707526b7ceceddeb382 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java @@ -0,0 +1,15 @@ +package de.vipra.cmd.text; + +public final class ProcessedText { + + private final String text; + + public ProcessedText(String text) { + this.text = text; + } + + public String getText() { + return text; + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java new file mode 100644 index 0000000000000000000000000000000000000000..a9c4ed1380b19ab3fb7ebd148725f38b6d48961c --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java @@ -0,0 +1,41 @@ +package de.vipra.cmd.text; + +import java.util.Arrays; +import java.util.List; + +import de.vipra.cmd.ex.PreprocessorException; +import de.vipra.util.Config; +import de.vipra.util.Constants; +import de.vipra.util.Config.Key; + +public abstract class Processor { + + private final String name; + + public Processor(String name) { + this.name = name; + } + + public String getName() { + return name; + } + + public abstract ProcessedText preprocess(String input) throws PreprocessorException; + + public static Processor getPreprocessor(Config config) { + List<String> stopWords = Arrays.asList(config.getString(Key.STOPWORDS).toLowerCase().split(",")); + if (stopWords.size() == 0) { + stopWords = Constants.STOPWORDS; + } + + switch (Constants.Processor.fromString(config.getString(Key.PREPROCESSOR))) { + case CUSTOM: + return new CustomProcessor(stopWords); + case LUCENE: + case DEFAULT: + default: + return new LuceneProcessor(stopWords); + } + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index ea2c0db084820a5c1d8287f211b3eac18f45af57..f88b43093558b650b05c1e7c8b6b34a6ba142462 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -20,7 +20,9 @@ public class Constants { public static final String DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'"; - public static final Preprocessor DEFAULT_PREPROCESSOR = Preprocessor.LUCENE; + public static final String CHARS_DISALLOWED = "[^a-zA-Z0-9]"; + + public static final Processor DEFAULT_PREPROCESSOR = Processor.LUCENE; public static final Analyzer DEFAULT_ANALYZER = Analyzer.JGIBB; public static final List<String> STOPWORDS = Arrays.asList("a", "an", "and", "are", "as", "at", "be", "but", "by", @@ -37,24 +39,24 @@ public class Constants { } } - public static enum Preprocessor { + public static enum Processor { CUSTOM("custom"), LUCENE("lucene"), DEFAULT(LUCENE); public final String name; - private Preprocessor(String name) { + private Processor(String name) { this.name = name; } - private Preprocessor(Preprocessor def) { + private Processor(Processor def) { this.name = def.name; } - public static Preprocessor fromString(String text) { + public static Processor fromString(String text) { if (text != null) { - for (Preprocessor b : Preprocessor.values()) { + for (Processor b : Processor.values()) { if (text.equalsIgnoreCase(b.name)) { return b; } diff --git a/vipra-util/src/main/java/de/vipra/util/FileUtils.java b/vipra-util/src/main/java/de/vipra/util/FileUtils.java index 42151f8d752026df2da9d7eee8e9e1288209a9c4..a548f9c4895d1c6bd10b8946e7984e1c597a59b5 100644 --- a/vipra-util/src/main/java/de/vipra/util/FileUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/FileUtils.java @@ -1,6 +1,8 @@ package de.vipra.util; +import java.io.BufferedInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; @@ -21,4 +23,25 @@ public class FileUtils extends org.apache.commons.io.FileUtils { return is; } + public static int countLines(File file) throws IOException { + InputStream is = new BufferedInputStream(new FileInputStream(file)); + try { + byte[] c = new byte[1024]; + int count = 0; + int readChars = 0; + boolean empty = true; + while ((readChars = is.read(c)) != -1) { + empty = false; + for (int i = 0; i < readChars; ++i) { + if (c[i] == '\n') { + ++count; + } + } + } + return (count == 0 && !empty) ? 1 : count; + } finally { + is.close(); + } + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/PathUtils.java b/vipra-util/src/main/java/de/vipra/util/PathUtils.java index 9b94dffbd43f8e4c3ea2466b53fb63e050750341..dd8a00b63a5f6e035308fada36dcda7e6aa52e6b 100644 --- a/vipra-util/src/main/java/de/vipra/util/PathUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/PathUtils.java @@ -13,8 +13,7 @@ public class PathUtils { base = new File(System.getProperty("user.home") + File.separator + "Library" + File.separator + "ApplicationSupport"); } else { - base = new File( - System.getProperty("user.home") + File.separator + ".local" + File.separator + "share"); + base = new File(System.getProperty("user.home") + File.separator + ".local" + File.separator + "share"); } return base; } diff --git a/vipra-util/src/main/java/de/vipra/util/ex/NotImplementedException.java b/vipra-util/src/main/java/de/vipra/util/ex/NotImplementedException.java new file mode 100644 index 0000000000000000000000000000000000000000..4c77585d4cb0930541227c6a438cbb2384dcfaf9 --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/ex/NotImplementedException.java @@ -0,0 +1,7 @@ +package de.vipra.util.ex; + +public class NotImplementedException extends RuntimeException { + + private static final long serialVersionUID = 1L; + +} diff --git a/vipra-util/src/main/java/de/vipra/util/service/Service.java b/vipra-util/src/main/java/de/vipra/util/service/Service.java index df6d29768d1e52a8dac3001a8940815455c875b4..824a6a97de4b86e335b9c8b7255d7a8de5fbcfe5 100644 --- a/vipra-util/src/main/java/de/vipra/util/service/Service.java +++ b/vipra-util/src/main/java/de/vipra/util/service/Service.java @@ -11,5 +11,5 @@ public interface Service<T extends Model, E extends Exception> { long deleteSingle(String id) throws E; long updateSingle(T t) throws E; - + }