Skip to content
Snippets Groups Projects
Commit d80a34fa authored by Eike Cochu
Browse files

updated Filebase implementation

renamed Preprocessor -> Processor
parent aac3b9e6
Branches
No related tags found
No related merge requests found
Showing
with 203 additions and 55 deletions
...@@ -14,12 +14,14 @@ import de.vipra.util.ex.ConfigException; ...@@ -14,12 +14,14 @@ import de.vipra.util.ex.ConfigException;
public abstract class Filebase implements Closeable { public abstract class Filebase implements Closeable {
private final File dataDir; private final File dataDir;
private final File dataFile;
private final FilebaseIndex index; private final FilebaseIndex index;
private final FilebaseVocabulary vocab; private final FilebaseVocabulary vocab;
public Filebase(File dataDir) throws FilebaseException { public Filebase(File dataDir, String fileName) throws FilebaseException {
this.dataDir = dataDir; this.dataDir = dataDir;
try { try {
this.dataFile = new File(dataDir, fileName);
this.index = new FilebaseIndex(new File(dataDir, Constants.INDEX_FILE)); this.index = new FilebaseIndex(new File(dataDir, Constants.INDEX_FILE));
this.vocab = new FilebaseVocabulary(new File(dataDir, Constants.VOCAB_FILE)); this.vocab = new FilebaseVocabulary(new File(dataDir, Constants.VOCAB_FILE));
} catch (IOException e) { } catch (IOException e) {
...@@ -31,6 +33,18 @@ public abstract class Filebase implements Closeable { ...@@ -31,6 +33,18 @@ public abstract class Filebase implements Closeable {
return dataDir; return dataDir;
} }
/**
 * Returns the data file of this filebase.
 *
 * @return the file held in the {@code dataFile} field
 */
public File getDataFile() {
	return this.dataFile;
}
/**
 * Returns the index of this filebase.
 *
 * @return the instance held in the {@code index} field
 */
public FilebaseIndex getIndex() {
	return this.index;
}
/**
 * Returns the vocabulary of this filebase.
 *
 * @return the instance held in the {@code vocab} field
 */
public FilebaseVocabulary getVocab() {
	return this.vocab;
}
public void remove(Article article) throws FilebaseException { public void remove(Article article) throws FilebaseException {
remove(article.getId()); remove(article.getId());
} }
...@@ -40,7 +54,6 @@ public abstract class Filebase implements Closeable { ...@@ -40,7 +54,6 @@ public abstract class Filebase implements Closeable {
write(); write();
index.close(); index.close();
vocab.close(); vocab.close();
} }
public abstract void add(Article article) throws FilebaseException; public abstract void add(Article article) throws FilebaseException;
......
...@@ -32,4 +32,21 @@ public class FilebaseVocabulary implements Closeable { ...@@ -32,4 +32,21 @@ public class FilebaseVocabulary implements Closeable {
write(); write();
} }
/**
 * Splits the text on runs of whitespace and adds the resulting words to the
 * vocabulary.
 * <p>
 * The text is trimmed first: splitting a string that is empty or starts with
 * whitespace would otherwise produce an empty first token, which would then
 * be registered as a word. Blank input adds nothing.
 *
 * @param text
 *            whitespace separated words, must not be {@code null}
 */
public void addVocabulary(String text) {
	String trimmed = text.trim();
	if (trimmed.isEmpty()) {
		// nothing but whitespace: there is no word to add
		return;
	}
	addVocabulary(trimmed.split("\\s+"));
}
/**
 * Adds each word of the array to the vocabulary unless the vocabulary
 * already contains it.
 *
 * @param text
 *            the words to add, must not be {@code null}
 */
public void addVocabulary(String[] text) {
	for (int i = 0; i < text.length; i++) {
		final String candidate = text[i];
		// TODO fix this
		if (vocables.contains(candidate)) {
			// already known, keep the existing entry
			continue;
		}
		vocables.add(candidate);
	}
}
/**
 * Looks up the position of a word in the vocabulary.
 *
 * @param word
 *            the word to look up
 * @return whatever {@code vocables.indexOf(word)} reports for the word
 */
public int index(String word) {
	final int position = vocables.indexOf(word);
	return position;
}
} }
package de.vipra.cmd.file; package de.vipra.cmd.file;
import java.io.BufferedOutputStream;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import de.vipra.cmd.ex.FilebaseException; import de.vipra.cmd.ex.FilebaseException;
import de.vipra.cmd.model.Article; import de.vipra.cmd.model.Article;
import de.vipra.util.Constants;
import de.vipra.util.FileUtils;
import de.vipra.util.ex.NotImplementedException;
public class JGibbFilebase extends Filebase { public class JGibbFilebase extends Filebase {
private final File dataFile;
private final FilebaseIndex index;
private final FilebaseVocabulary vocab;
private final List<Article> articles;
public JGibbFilebase(File dataDir) throws FilebaseException { public JGibbFilebase(File dataDir) throws FilebaseException {
super(dataDir); super(dataDir, "jgibb");
// TODO Auto-generated constructor stub this.dataFile = getDataFile();
this.index = getIndex();
this.vocab = getVocab();
this.articles = new ArrayList<>();
} }
@Override @Override
public void add(Article article) { public void add(Article article) {
// TODO Auto-generated method stub String[] words = article.getProcessedText().getText().split("\\s+");
vocab.addVocabulary(words);
index.add(article.getId());
articles.add(article);
} }
@Override @Override
public void remove(String id) { public void remove(String id) {
// TODO Auto-generated method stub throw new NotImplementedException();
} }
@Override @Override
public void write() throws IOException { public void write() throws IOException {
// TODO Auto-generated method stub int lineCount = FileUtils.countLines(dataFile) + articles.size();
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(dataFile, true));
for (Article a : articles) {
bw.write(a.getProcessedText().getText().getBytes(Constants.FB_ENCODING));
bw.write(System.lineSeparator().getBytes(Constants.FB_ENCODING));
}
bw.close();
} }
} }
...@@ -5,11 +5,12 @@ import java.io.IOException; ...@@ -5,11 +5,12 @@ import java.io.IOException;
import de.vipra.cmd.ex.FilebaseException; import de.vipra.cmd.ex.FilebaseException;
import de.vipra.cmd.model.Article; import de.vipra.cmd.model.Article;
import de.vipra.util.ex.NotImplementedException;
public class LdacFilebase extends Filebase { public class LdacFilebase extends Filebase {
public LdacFilebase(File dataDir) throws FilebaseException { public LdacFilebase(File dataDir) throws FilebaseException {
super(dataDir); super(dataDir, "ldac");
// TODO Auto-generated constructor stub // TODO Auto-generated constructor stub
} }
...@@ -21,8 +22,7 @@ public class LdacFilebase extends Filebase { ...@@ -21,8 +22,7 @@ public class LdacFilebase extends Filebase {
@Override @Override
public void remove(String id) { public void remove(String id) {
// TODO Auto-generated method stub throw new NotImplementedException();
} }
@Override @Override
......
...@@ -2,19 +2,21 @@ package de.vipra.cmd.lda; ...@@ -2,19 +2,21 @@ package de.vipra.cmd.lda;
import de.vipra.cmd.ex.LDAAnalyzerException; import de.vipra.cmd.ex.LDAAnalyzerException;
import de.vipra.util.Config; import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.Config.Key; import de.vipra.util.Config.Key;
public abstract class LDAAnalyzer { public abstract class LDAAnalyzer {
public abstract String getName(); public abstract String getName();
public abstract void analyze() throws LDAAnalyzerException; public abstract void analyze() throws LDAAnalyzerException;
public static LDAAnalyzer getAnalyzer(Config config) { public static LDAAnalyzer getAnalyzer(Config config) {
switch (config.getString(Key.ANALYZER).toLowerCase()) { switch (Constants.Analyzer.fromString(config.getString(Key.ANALYZER))) {
case "ldac": case LDAC:
return new LdacLDAAnalyzer(); return new LdacLDAAnalyzer();
case "jgibb": case JGIBB:
case DEFAULT:
default: default:
return new JGibbLDAAnalyzer(); return new JGibbLDAAnalyzer();
} }
......
...@@ -2,8 +2,20 @@ package de.vipra.cmd.model; ...@@ -2,8 +2,20 @@ package de.vipra.cmd.model;
import org.json.simple.JSONObject; import org.json.simple.JSONObject;
import de.vipra.cmd.text.ProcessedText;
public class Article extends de.vipra.util.model.Article { public class Article extends de.vipra.util.model.Article {
private ProcessedText processedText;
/**
 * Returns the processed text attached to this article.
 *
 * @return the value of the {@code processedText} field, {@code null} if it
 *         has never been set
 */
public ProcessedText getProcessedText() {
	return this.processedText;
}
/**
 * Attaches a processed text to this article, replacing any previous one.
 *
 * @param processedText
 *            the processed text to store
 */
public void setProcessedText(final ProcessedText processedText) {
	this.processedText = processedText;
}
public void fromJSON(JSONObject obj) { public void fromJSON(JSONObject obj) {
if (obj.containsKey("title")) if (obj.containsKey("title"))
setTitle(obj.get("title").toString()); setTitle(obj.get("title").toString());
......
...@@ -20,7 +20,8 @@ import de.vipra.cmd.ex.ImportException; ...@@ -20,7 +20,8 @@ import de.vipra.cmd.ex.ImportException;
import de.vipra.cmd.file.Filebase; import de.vipra.cmd.file.Filebase;
import de.vipra.cmd.lda.LDAAnalyzer; import de.vipra.cmd.lda.LDAAnalyzer;
import de.vipra.cmd.model.Article; import de.vipra.cmd.model.Article;
import de.vipra.cmd.text.Preprocessor; import de.vipra.cmd.text.Processor;
import de.vipra.cmd.text.ProcessedText;
import de.vipra.util.Config; import de.vipra.util.Config;
import de.vipra.util.Constants; import de.vipra.util.Constants;
import de.vipra.util.StringUtils; import de.vipra.util.StringUtils;
...@@ -38,7 +39,7 @@ public class ImportCommand implements Command { ...@@ -38,7 +39,7 @@ public class ImportCommand implements Command {
private Config config; private Config config;
private DatabaseService<Article> dbArticles; private DatabaseService<Article> dbArticles;
private Filebase filebase; private Filebase filebase;
private Preprocessor preprocessor; private Processor preprocessor;
private LDAAnalyzer analyzer; private LDAAnalyzer analyzer;
ImportCommand() {} ImportCommand() {}
...@@ -70,7 +71,7 @@ public class ImportCommand implements Command { ...@@ -70,7 +71,7 @@ public class ImportCommand implements Command {
File[] files = file.listFiles(new FilenameFilter() { File[] files = file.listFiles(new FilenameFilter() {
@Override @Override
public boolean accept(File dir, String name) { public boolean accept(File dir, String name) {
return dir.isFile(); return dir.isFile() && dir.exists();
} }
}); });
...@@ -92,15 +93,15 @@ public class ImportCommand implements Command { ...@@ -92,15 +93,15 @@ public class ImportCommand implements Command {
try { try {
// preprocess text and generate text statistics // preprocess text and generate text statistics
String preprocessedText = preprocessor.preprocess(article.getText()); ProcessedText processedText = preprocessor.preprocess(article.getText());
ArticleStats articleStats = ArticleStats.generateFromText(preprocessedText); ArticleStats articleStats = ArticleStats.generateFromText(processedText.getText());
// add article to mongodb // add article to mongodb
article.setProcessedText(processedText);
article.setStats(articleStats); article.setStats(articleStats);
article = dbArticles.createSingle(article); article = dbArticles.createSingle(article);
// add article to filebase // add article to filebase
article.setText(preprocessedText);
filebase.add(article); filebase.add(article);
return article; return article;
...@@ -142,7 +143,7 @@ public class ImportCommand implements Command { ...@@ -142,7 +143,7 @@ public class ImportCommand implements Command {
config = Config.getConfig(); config = Config.getConfig();
dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class);
filebase = Filebase.getFilebase(config); filebase = Filebase.getFilebase(config);
preprocessor = Preprocessor.getPreprocessor(config); preprocessor = Processor.getPreprocessor(config);
analyzer = LDAAnalyzer.getAnalyzer(config); analyzer = LDAAnalyzer.getAnalyzer(config);
out.info("using data directory: " + filebase.getDataDir().getAbsolutePath()); out.info("using data directory: " + filebase.getDataDir().getAbsolutePath());
...@@ -155,8 +156,7 @@ public class ImportCommand implements Command { ...@@ -155,8 +156,7 @@ public class ImportCommand implements Command {
// import files into database and filebase // import files into database and filebase
List<Article> articles = new ArrayList<>(); List<Article> articles = new ArrayList<>();
for (File file : files) { for (File file : files) {
if (file.isFile() && file.exists()) articles.addAll(importFile(file));
articles.addAll(importFile(file));
} }
long durImport = timer.lap(); long durImport = timer.lap();
......
...@@ -4,11 +4,14 @@ import java.util.HashSet; ...@@ -4,11 +4,14 @@ import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
public class CustomPreprocessor extends Preprocessor { import de.vipra.util.Constants;
public class CustomProcessor extends Processor {
private final Set<String> stopWords; private final Set<String> stopWords;
public CustomPreprocessor(List<String> stopWordsList) { public CustomProcessor(List<String> stopWordsList) {
super("Custom Processor");
this.stopWords = new HashSet<>(stopWordsList); this.stopWords = new HashSet<>(stopWordsList);
} }
...@@ -25,16 +28,11 @@ public class CustomPreprocessor extends Preprocessor { ...@@ -25,16 +28,11 @@ public class CustomPreprocessor extends Preprocessor {
} }
@Override @Override
public String getName() { public ProcessedText preprocess(String input) {
return "Custom Preprocessor";
}
@Override
public String preprocess(String input) {
input = input.toLowerCase(); input = input.toLowerCase();
input = removeStopWords(input); input = removeStopWords(input);
input = input.replace("[^a-zA-Z0-9 ]", ""); input = input.replace(Constants.CHARS_DISALLOWED, "");
return input; return new ProcessedText(input);
} }
} }
...@@ -16,35 +16,32 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; ...@@ -16,35 +16,32 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import de.vipra.cmd.ex.PreprocessorException; import de.vipra.cmd.ex.PreprocessorException;
import de.vipra.util.Constants;
import de.vipra.util.StringUtils; import de.vipra.util.StringUtils;
public class LucenePreprocessor extends Preprocessor { public class LuceneProcessor extends Processor {
private final CharArraySet stopWords; private final CharArraySet stopWords;
public LucenePreprocessor(List<String> stopWords) { public LuceneProcessor(List<String> stopWords) {
super("Lucene Processor");
this.stopWords = new CharArraySet(stopWords, false); this.stopWords = new CharArraySet(stopWords, false);
} }
@Override @Override
public String getName() { public ProcessedText preprocess(String input) throws PreprocessorException {
return "Lucene Preprocessor";
}
@Override
public String preprocess(String input) throws PreprocessorException {
Analyzer analyzer = new StandardAnalyzer(stopWords); Analyzer analyzer = new StandardAnalyzer(stopWords);
TokenStream stream = analyzer.tokenStream(null, new StringReader(input)); TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
try { try {
stream.reset(); stream.reset();
stream = new PorterStemFilter(stream); stream = new PorterStemFilter(stream);
stream = new TrimFilter(stream); stream = new TrimFilter(stream);
stream = new PatternReplaceFilter(stream, Pattern.compile("[^a-zA-Z0-9]"), "", true); stream = new PatternReplaceFilter(stream, Pattern.compile(Constants.CHARS_DISALLOWED), "", true);
ArrayList<String> result = new ArrayList<>(); ArrayList<String> result = new ArrayList<>();
while (stream.incrementToken()) { while (stream.incrementToken()) {
result.add(stream.getAttribute(CharTermAttribute.class).toString()); result.add(stream.getAttribute(CharTermAttribute.class).toString());
} }
return StringUtils.join(result); return new ProcessedText(StringUtils.join(result));
} catch (IOException e) { } catch (IOException e) {
throw new PreprocessorException(e); throw new PreprocessorException(e);
} finally { } finally {
......
package de.vipra.cmd.text;
/**
 * Immutable wrapper around a single text string.
 */
public final class ProcessedText {

	/** The wrapped text, stored exactly as handed to the constructor. */
	private final String text;

	/**
	 * Creates a wrapper for the given text.
	 *
	 * @param text
	 *            the text to wrap; stored as is, without validation
	 */
	public ProcessedText(String text) {
		this.text = text;
	}

	/**
	 * Returns the wrapped text.
	 *
	 * @return the text passed to the constructor
	 */
	public String getText() {
		return this.text;
	}

}
...@@ -8,25 +8,33 @@ import de.vipra.util.Config; ...@@ -8,25 +8,33 @@ import de.vipra.util.Config;
import de.vipra.util.Constants; import de.vipra.util.Constants;
import de.vipra.util.Config.Key; import de.vipra.util.Config.Key;
public abstract class Preprocessor { public abstract class Processor {
public abstract String getName(); private final String name;
public abstract String preprocess(String input) throws PreprocessorException; public Processor(String name) {
this.name = name;
}
public String getName() {
return name;
}
public abstract ProcessedText preprocess(String input) throws PreprocessorException;
public static Preprocessor getPreprocessor(Config config) { public static Processor getPreprocessor(Config config) {
List<String> stopWords = Arrays.asList(config.getString(Key.STOPWORDS).toLowerCase().split(",")); List<String> stopWords = Arrays.asList(config.getString(Key.STOPWORDS).toLowerCase().split(","));
if (stopWords.size() == 0) { if (stopWords.size() == 0) {
stopWords = Constants.STOPWORDS; stopWords = Constants.STOPWORDS;
} }
switch (Constants.Preprocessor.fromString(config.getString(Key.PREPROCESSOR))) { switch (Constants.Processor.fromString(config.getString(Key.PREPROCESSOR))) {
case CUSTOM: case CUSTOM:
return new CustomPreprocessor(stopWords); return new CustomProcessor(stopWords);
case LUCENE: case LUCENE:
case DEFAULT: case DEFAULT:
default: default:
return new LucenePreprocessor(stopWords); return new LuceneProcessor(stopWords);
} }
} }
......
...@@ -20,7 +20,9 @@ public class Constants { ...@@ -20,7 +20,9 @@ public class Constants {
public static final String DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'"; public static final String DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";
public static final Preprocessor DEFAULT_PREPROCESSOR = Preprocessor.LUCENE; public static final String CHARS_DISALLOWED = "[^a-zA-Z0-9]";
public static final Processor DEFAULT_PREPROCESSOR = Processor.LUCENE;
public static final Analyzer DEFAULT_ANALYZER = Analyzer.JGIBB; public static final Analyzer DEFAULT_ANALYZER = Analyzer.JGIBB;
public static final List<String> STOPWORDS = Arrays.asList("a", "an", "and", "are", "as", "at", "be", "but", "by", public static final List<String> STOPWORDS = Arrays.asList("a", "an", "and", "are", "as", "at", "be", "but", "by",
...@@ -37,24 +39,24 @@ public class Constants { ...@@ -37,24 +39,24 @@ public class Constants {
} }
} }
public static enum Preprocessor { public static enum Processor {
CUSTOM("custom"), CUSTOM("custom"),
LUCENE("lucene"), LUCENE("lucene"),
DEFAULT(LUCENE); DEFAULT(LUCENE);
public final String name; public final String name;
private Preprocessor(String name) { private Processor(String name) {
this.name = name; this.name = name;
} }
private Preprocessor(Preprocessor def) { private Processor(Processor def) {
this.name = def.name; this.name = def.name;
} }
public static Preprocessor fromString(String text) { public static Processor fromString(String text) {
if (text != null) { if (text != null) {
for (Preprocessor b : Preprocessor.values()) { for (Processor b : Processor.values()) {
if (text.equalsIgnoreCase(b.name)) { if (text.equalsIgnoreCase(b.name)) {
return b; return b;
} }
......
package de.vipra.util; package de.vipra.util;
import java.io.BufferedInputStream;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.file.Files; import java.nio.file.Files;
...@@ -21,4 +23,25 @@ public class FileUtils extends org.apache.commons.io.FileUtils { ...@@ -21,4 +23,25 @@ public class FileUtils extends org.apache.commons.io.FileUtils {
return is; return is;
} }
/**
 * Counts the lines of a file by scanning its bytes for line feeds.
 * <p>
 * A final line that is not terminated by a line feed is counted as well, so
 * {@code "a\nb"} and {@code "a\nb\n"} both yield 2. An empty file yields 0.
 *
 * @param file
 *            the file to scan
 * @return the number of lines in the file
 * @throws IOException
 *             if the file cannot be opened or read
 */
public static int countLines(File file) throws IOException {
	// try-with-resources closes the stream on every exit path
	try (InputStream is = new BufferedInputStream(new FileInputStream(file))) {
		byte[] buffer = new byte[1024];
		int count = 0;
		int read;
		// start as if a line had just ended, so that an empty file yields 0
		boolean endsWithNewline = true;
		while ((read = is.read(buffer)) != -1) {
			for (int i = 0; i < read; ++i) {
				if (buffer[i] == '\n') {
					++count;
				}
			}
			if (read > 0) {
				endsWithNewline = buffer[read - 1] == '\n';
			}
		}
		// an unterminated last line has no line feed of its own to be counted by
		return endsWithNewline ? count : count + 1;
	}
}
} }
...@@ -13,8 +13,7 @@ public class PathUtils { ...@@ -13,8 +13,7 @@ public class PathUtils {
base = new File(System.getProperty("user.home") + File.separator + "Library" + File.separator base = new File(System.getProperty("user.home") + File.separator + "Library" + File.separator
+ "ApplicationSupport"); + "ApplicationSupport");
} else { } else {
base = new File( base = new File(System.getProperty("user.home") + File.separator + ".local" + File.separator + "share");
System.getProperty("user.home") + File.separator + ".local" + File.separator + "share");
} }
return base; return base;
} }
......
package de.vipra.util.ex;
/**
 * Unchecked exception for operations that have no implementation.
 */
public class NotImplementedException extends RuntimeException {

	private static final long serialVersionUID = 1L;

	/**
	 * Creates the exception without a detail message.
	 */
	public NotImplementedException() {
		super();
	}

	/**
	 * Creates the exception with a detail message, e.g. the name of the
	 * operation that is missing.
	 *
	 * @param message
	 *            the detail message, available through {@link #getMessage()}
	 */
	public NotImplementedException(String message) {
		super(message);
	}

}
...@@ -11,5 +11,5 @@ public interface Service<T extends Model, E extends Exception> { ...@@ -11,5 +11,5 @@ public interface Service<T extends Model, E extends Exception> {
long deleteSingle(String id) throws E; long deleteSingle(String id) throws E;
long updateSingle(T t) throws E; long updateSingle(T t) throws E;
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment