Skip to content
Snippets Groups Projects
Commit d80a34fa authored by Eike Cochu's avatar Eike Cochu
Browse files

updated Filebase implementation

renamed Preprocessor -> Processor
parent aac3b9e6
Branches
No related tags found
No related merge requests found
Showing
with 203 additions and 55 deletions
......@@ -14,12 +14,14 @@ import de.vipra.util.ex.ConfigException;
public abstract class Filebase implements Closeable {
private final File dataDir;
private final File dataFile;
private final FilebaseIndex index;
private final FilebaseVocabulary vocab;
public Filebase(File dataDir) throws FilebaseException {
public Filebase(File dataDir, String fileName) throws FilebaseException {
this.dataDir = dataDir;
try {
this.dataFile = new File(dataDir, fileName);
this.index = new FilebaseIndex(new File(dataDir, Constants.INDEX_FILE));
this.vocab = new FilebaseVocabulary(new File(dataDir, Constants.VOCAB_FILE));
} catch (IOException e) {
......@@ -31,6 +33,18 @@ public abstract class Filebase implements Closeable {
return dataDir;
}
/**
 * Returns the data file of this filebase, i.e. the file inside the data
 * directory whose name was passed to the constructor.
 *
 * @return the data file
 */
public File getDataFile() {
return dataFile;
}
/**
 * Returns the index that was opened on {@code Constants.INDEX_FILE} inside
 * the data directory when this filebase was constructed.
 *
 * @return the filebase index
 */
public FilebaseIndex getIndex() {
return index;
}
/**
 * Returns the vocabulary that was opened on {@code Constants.VOCAB_FILE}
 * inside the data directory when this filebase was constructed.
 *
 * @return the filebase vocabulary
 */
public FilebaseVocabulary getVocab() {
return vocab;
}
/**
 * Removes the given article from this filebase by delegating to the
 * id-based {@code remove} with the article's id.
 *
 * @param article the article to remove
 * @throws FilebaseException declared for the removal; see the id-based overload
 */
public void remove(Article article) throws FilebaseException {
remove(article.getId());
}
......@@ -40,7 +54,6 @@ public abstract class Filebase implements Closeable {
write();
index.close();
vocab.close();
}
public abstract void add(Article article) throws FilebaseException;
......
......@@ -32,4 +32,21 @@ public class FilebaseVocabulary implements Closeable {
write();
}
/**
 * Splits the given text on runs of whitespace and adds the resulting words
 * to the vocabulary.
 *
 * @param text whitespace-separated words
 */
public void addVocabulary(String text) {
    final String[] words = text.split("\\s+");
    addVocabulary(words);
}
/**
 * Adds every word that is not yet part of the vocabulary, in array order.
 * Words that are already present are skipped.
 *
 * @param text the words to add
 */
public void addVocabulary(String[] text) {
    for (int i = 0; i < text.length; i++) {
        final String candidate = text[i];
        // TODO fix this - NOTE(review): the cost of this membership test
        // depends on the collection type behind `vocables`; confirm.
        if (vocables.contains(candidate)) {
            continue;
        }
        vocables.add(candidate);
    }
}
/**
 * Looks up the position of a word in the vocabulary.
 *
 * @param word the word to look up
 * @return the result of {@code vocables.indexOf(word)}; presumably -1 when
 *         the word is unknown, if {@code vocables} is a {@code java.util.List}
 *         (TODO confirm against the field declaration)
 */
public int index(String word) {
return vocables.indexOf(word);
}
}
package de.vipra.cmd.file;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import de.vipra.cmd.ex.FilebaseException;
import de.vipra.cmd.model.Article;
import de.vipra.util.Constants;
import de.vipra.util.FileUtils;
import de.vipra.util.ex.NotImplementedException;
public class JGibbFilebase extends Filebase {
private final File dataFile;
private final FilebaseIndex index;
private final FilebaseVocabulary vocab;
private final List<Article> articles;
public JGibbFilebase(File dataDir) throws FilebaseException {
super(dataDir);
// TODO Auto-generated constructor stub
super(dataDir, "jgibb");
this.dataFile = getDataFile();
this.index = getIndex();
this.vocab = getVocab();
this.articles = new ArrayList<>();
}
/**
 * Registers an article with this filebase: its processed words go into the
 * vocabulary, its id into the index, and the article itself is buffered
 * until the next {@code write()}.
 *
 * @param article the article to add; its processed text must be set
 */
@Override
public void add(Article article) {
    final String processed = article.getProcessedText().getText();
    vocab.addVocabulary(processed.split("\\s+"));
    index.add(article.getId());
    articles.add(article);
}
/**
 * Removal by id is not supported by this filebase yet.
 *
 * @param id the id of the article to remove (unused)
 * @throws NotImplementedException always
 */
@Override
public void remove(String id) {
// not implemented yet: always fails
throw new NotImplementedException();
}
/**
 * Appends the processed text of every buffered article to the data file,
 * one article per line, and then empties the buffer.
 *
 * @throws IOException if the data file cannot be opened or written
 */
@Override
public void write() throws IOException {
    // The existing lines are deliberately not counted beforehand: the value
    // was never used, and reading the file fails while it does not exist yet.
    try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(dataFile, true))) {
        final byte[] lineBreak = System.lineSeparator().getBytes(Constants.FB_ENCODING);
        for (Article a : articles) {
            out.write(a.getProcessedText().getText().getBytes(Constants.FB_ENCODING));
            out.write(lineBreak);
        }
    }
    // The file is opened in append mode, so anything left in the buffer
    // would be written a second time by the next call.
    articles.clear();
}
}
......@@ -5,11 +5,12 @@ import java.io.IOException;
import de.vipra.cmd.ex.FilebaseException;
import de.vipra.cmd.model.Article;
import de.vipra.util.ex.NotImplementedException;
public class LdacFilebase extends Filebase {
public LdacFilebase(File dataDir) throws FilebaseException {
super(dataDir);
super(dataDir, "ldac");
// TODO Auto-generated constructor stub
}
......@@ -21,8 +22,7 @@ public class LdacFilebase extends Filebase {
/**
 * Removal by id is not supported by this filebase yet.
 *
 * @param id the id of the article to remove (unused)
 * @throws NotImplementedException always
 */
@Override
public void remove(String id) {
// not implemented yet: always fails
throw new NotImplementedException();
}
@Override
......
......@@ -2,19 +2,21 @@ package de.vipra.cmd.lda;
import de.vipra.cmd.ex.LDAAnalyzerException;
import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.Config.Key;
public abstract class LDAAnalyzer {
public abstract String getName();
public abstract void analyze() throws LDAAnalyzerException;
public static LDAAnalyzer getAnalyzer(Config config) {
switch (config.getString(Key.ANALYZER).toLowerCase()) {
case "ldac":
switch (Constants.Analyzer.fromString(config.getString(Key.ANALYZER))) {
case LDAC:
return new LdacLDAAnalyzer();
case "jgibb":
case JGIBB:
case DEFAULT:
default:
return new JGibbLDAAnalyzer();
}
......
......@@ -2,8 +2,20 @@ package de.vipra.cmd.model;
import org.json.simple.JSONObject;
import de.vipra.cmd.text.ProcessedText;
public class Article extends de.vipra.util.model.Article {
private ProcessedText processedText;
/**
 * Returns the processed text attached to this article.
 *
 * @return the processed text, or {@code null} if none has been set
 */
public ProcessedText getProcessedText() {
return processedText;
}
/**
 * Attaches the processed text to this article.
 *
 * @param processedText the processed text to store
 */
public void setProcessedText(ProcessedText processedText) {
this.processedText = processedText;
}
public void fromJSON(JSONObject obj) {
if (obj.containsKey("title"))
setTitle(obj.get("title").toString());
......
......@@ -20,7 +20,8 @@ import de.vipra.cmd.ex.ImportException;
import de.vipra.cmd.file.Filebase;
import de.vipra.cmd.lda.LDAAnalyzer;
import de.vipra.cmd.model.Article;
import de.vipra.cmd.text.Preprocessor;
import de.vipra.cmd.text.Processor;
import de.vipra.cmd.text.ProcessedText;
import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.StringUtils;
......@@ -38,7 +39,7 @@ public class ImportCommand implements Command {
private Config config;
private DatabaseService<Article> dbArticles;
private Filebase filebase;
private Preprocessor preprocessor;
private Processor preprocessor;
private LDAAnalyzer analyzer;
ImportCommand() {}
......@@ -70,7 +71,7 @@ public class ImportCommand implements Command {
File[] files = file.listFiles(new FilenameFilter() {
/**
 * Accepts only directory entries that are existing regular files.
 *
 * @param dir  the directory that is being listed
 * @param name the name of the entry inside {@code dir}
 * @return {@code true} if the entry is a regular file
 */
@Override
public boolean accept(File dir, String name) {
    // `dir` is the listed directory itself, never the entry, so the entry
    // has to be resolved against it. isFile() is already false for paths
    // that do not exist, which makes a separate exists() check redundant.
    return new File(dir, name).isFile();
}
});
......@@ -92,15 +93,15 @@ public class ImportCommand implements Command {
try {
// preprocess text and generate text statistics
String preprocessedText = preprocessor.preprocess(article.getText());
ArticleStats articleStats = ArticleStats.generateFromText(preprocessedText);
ProcessedText processedText = preprocessor.preprocess(article.getText());
ArticleStats articleStats = ArticleStats.generateFromText(processedText.getText());
// add article to mongodb
article.setProcessedText(processedText);
article.setStats(articleStats);
article = dbArticles.createSingle(article);
// add article to filebase
article.setText(preprocessedText);
filebase.add(article);
return article;
......@@ -142,7 +143,7 @@ public class ImportCommand implements Command {
config = Config.getConfig();
dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class);
filebase = Filebase.getFilebase(config);
preprocessor = Preprocessor.getPreprocessor(config);
preprocessor = Processor.getPreprocessor(config);
analyzer = LDAAnalyzer.getAnalyzer(config);
out.info("using data directory: " + filebase.getDataDir().getAbsolutePath());
......@@ -155,8 +156,7 @@ public class ImportCommand implements Command {
// import files into database and filebase
List<Article> articles = new ArrayList<>();
for (File file : files) {
if (file.isFile() && file.exists())
articles.addAll(importFile(file));
articles.addAll(importFile(file));
}
long durImport = timer.lap();
......
......@@ -4,11 +4,14 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class CustomPreprocessor extends Preprocessor {
import de.vipra.util.Constants;
public class CustomProcessor extends Processor {
private final Set<String> stopWords;
public CustomPreprocessor(List<String> stopWordsList) {
public CustomProcessor(List<String> stopWordsList) {
super("Custom Processor");
this.stopWords = new HashSet<>(stopWordsList);
}
......@@ -25,16 +28,11 @@ public class CustomPreprocessor extends Preprocessor {
}
@Override
public String getName() {
return "Custom Preprocessor";
}
@Override
public String preprocess(String input) {
public ProcessedText preprocess(String input) {
input = input.toLowerCase();
input = removeStopWords(input);
input = input.replace("[^a-zA-Z0-9 ]", "");
return input;
input = input.replace(Constants.CHARS_DISALLOWED, "");
return new ProcessedText(input);
}
}
......@@ -16,35 +16,32 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import de.vipra.cmd.ex.PreprocessorException;
import de.vipra.util.Constants;
import de.vipra.util.StringUtils;
public class LucenePreprocessor extends Preprocessor {
public class LuceneProcessor extends Processor {
private final CharArraySet stopWords;
public LucenePreprocessor(List<String> stopWords) {
public LuceneProcessor(List<String> stopWords) {
super("Lucene Processor");
this.stopWords = new CharArraySet(stopWords, false);
}
@Override
public String getName() {
return "Lucene Preprocessor";
}
@Override
public String preprocess(String input) throws PreprocessorException {
public ProcessedText preprocess(String input) throws PreprocessorException {
Analyzer analyzer = new StandardAnalyzer(stopWords);
TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
try {
stream.reset();
stream = new PorterStemFilter(stream);
stream = new TrimFilter(stream);
stream = new PatternReplaceFilter(stream, Pattern.compile("[^a-zA-Z0-9]"), "", true);
stream = new PatternReplaceFilter(stream, Pattern.compile(Constants.CHARS_DISALLOWED), "", true);
ArrayList<String> result = new ArrayList<>();
while (stream.incrementToken()) {
result.add(stream.getAttribute(CharTermAttribute.class).toString());
}
return StringUtils.join(result);
return new ProcessedText(StringUtils.join(result));
} catch (IOException e) {
throw new PreprocessorException(e);
} finally {
......
package de.vipra.cmd.text;
public final class ProcessedText {
private final String text;
public ProcessedText(String text) {
this.text = text;
}
public String getText() {
return text;
}
}
......@@ -8,25 +8,33 @@ import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.Config.Key;
public abstract class Preprocessor {
public abstract class Processor {
public abstract String getName();
private final String name;
public abstract String preprocess(String input) throws PreprocessorException;
public Processor(String name) {
this.name = name;
}
public String getName() {
return name;
}
public abstract ProcessedText preprocess(String input) throws PreprocessorException;
public static Preprocessor getPreprocessor(Config config) {
public static Processor getPreprocessor(Config config) {
List<String> stopWords = Arrays.asList(config.getString(Key.STOPWORDS).toLowerCase().split(","));
if (stopWords.size() == 0) {
stopWords = Constants.STOPWORDS;
}
switch (Constants.Preprocessor.fromString(config.getString(Key.PREPROCESSOR))) {
switch (Constants.Processor.fromString(config.getString(Key.PREPROCESSOR))) {
case CUSTOM:
return new CustomPreprocessor(stopWords);
return new CustomProcessor(stopWords);
case LUCENE:
case DEFAULT:
default:
return new LucenePreprocessor(stopWords);
return new LuceneProcessor(stopWords);
}
}
......
......@@ -20,7 +20,9 @@ public class Constants {
public static final String DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";
public static final Preprocessor DEFAULT_PREPROCESSOR = Preprocessor.LUCENE;
public static final String CHARS_DISALLOWED = "[^a-zA-Z0-9]";
public static final Processor DEFAULT_PREPROCESSOR = Processor.LUCENE;
public static final Analyzer DEFAULT_ANALYZER = Analyzer.JGIBB;
public static final List<String> STOPWORDS = Arrays.asList("a", "an", "and", "are", "as", "at", "be", "but", "by",
......@@ -37,24 +39,24 @@ public class Constants {
}
}
public static enum Preprocessor {
public static enum Processor {
CUSTOM("custom"),
LUCENE("lucene"),
DEFAULT(LUCENE);
public final String name;
private Preprocessor(String name) {
private Processor(String name) {
this.name = name;
}
private Preprocessor(Preprocessor def) {
private Processor(Processor def) {
this.name = def.name;
}
public static Preprocessor fromString(String text) {
public static Processor fromString(String text) {
if (text != null) {
for (Preprocessor b : Preprocessor.values()) {
for (Processor b : Processor.values()) {
if (text.equalsIgnoreCase(b.name)) {
return b;
}
......
package de.vipra.util;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
......@@ -21,4 +23,25 @@ public class FileUtils extends org.apache.commons.io.FileUtils {
return is;
}
/**
 * Counts the lines of a file by scanning it for line feed bytes.
 *
 * A final line that is not terminated by a line feed is counted as well,
 * so {@code "a\nb"} and {@code "a\nb\n"} both have two lines. An empty
 * file has zero lines.
 *
 * @param file the file to scan
 * @return the number of lines in the file
 * @throws IOException if the file cannot be opened or read
 */
public static int countLines(File file) throws IOException {
    try (InputStream is = new BufferedInputStream(new FileInputStream(file))) {
        final byte[] buffer = new byte[1024];
        int count = 0;
        int read;
        // Starts as true so that an empty file yields zero lines.
        boolean endsWithLineFeed = true;
        while ((read = is.read(buffer)) != -1) {
            for (int i = 0; i < read; ++i) {
                if (buffer[i] == '\n') {
                    ++count;
                }
            }
            if (read > 0) {
                endsWithLineFeed = buffer[read - 1] == '\n';
            }
        }
        // An unterminated trailing line has no line feed of its own.
        return endsWithLineFeed ? count : count + 1;
    }
}
}
......@@ -13,8 +13,7 @@ public class PathUtils {
base = new File(System.getProperty("user.home") + File.separator + "Library" + File.separator
+ "ApplicationSupport");
} else {
base = new File(
System.getProperty("user.home") + File.separator + ".local" + File.separator + "share");
base = new File(System.getProperty("user.home") + File.separator + ".local" + File.separator + "share");
}
return base;
}
......
package de.vipra.util.ex;
/**
 * Unchecked exception thrown by operations that have not been implemented
 * yet.
 */
public class NotImplementedException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * Creates the exception without a detail message.
     */
    public NotImplementedException() {
        super();
    }

    /**
     * Creates the exception with a detail message.
     *
     * @param message description of what is not implemented
     */
    public NotImplementedException(String message) {
        super(message);
    }

    /**
     * Creates the exception with a detail message and a cause.
     *
     * @param message description of what is not implemented
     * @param cause   the underlying cause
     */
    public NotImplementedException(String message, Throwable cause) {
        super(message, cause);
    }
}
......@@ -11,5 +11,5 @@ public interface Service<T extends Model, E extends Exception> {
long deleteSingle(String id) throws E;
long updateSingle(T t) throws E;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment