diff --git a/TODO b/TODO index 1da64172f79be512a41c8384cd4547425232c3cf..286b0293a7604dcf57254de1b32e2cbe829a7a90 100644 --- a/TODO +++ b/TODO @@ -2,6 +2,9 @@ cmd ☐ implement delete operation ☐ implement filebase remove ☐ implement elasticsearch indexing + ☐ allow other document input formats + ☐ do not read whole file into memory + ☐ on save topics: retain topic names? rest ☐ implement etag caching \ No newline at end of file diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java index 02dcda7b2cc00e2b780f5f956086a81324729cc8..2ea001f7a6e7ffeee8c2f3f5ee8a1d27d0496f87 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java @@ -66,8 +66,6 @@ public abstract class Filebase implements Closeable { public static Filebase getFilebase(Config config) throws FilebaseException, ConfigException { File dataDir = config.getDataDirectory(); switch (Constants.Analyzer.fromString(config.getString(Key.ANALYZER))) { - case LDAC: - return new LdacFilebase(dataDir); case JGIBB: case DEFAULT: default: diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java index 516a5caa17a60095b98b8e5b815c4256d470846a..dae34993e28849e996a328c7ba7e30856098d842 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java @@ -4,12 +4,13 @@ import java.io.Closeable; import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import de.vipra.util.Constants; import de.vipra.util.FileUtils; -public class FilebaseIndex implements Closeable { +public class FilebaseIndex implements Closeable, Iterable<String> { private final File file; private final List<String> index; @@ -23,7 +24,7 @@ public class FilebaseIndex implements Closeable { } } - private void write() throws IOException { + public void write() throws IOException { FileUtils.writeLines(file, Constants.FB_ENCODING.name(), index, null, false); } @@ -40,6 +41,10 @@ public class FilebaseIndex implements Closeable { return index.indexOf(id); } + public String get(int i) { + return index.get(i); + } + public boolean remove(String id) { return index.remove(id); } @@ -49,4 +54,9 @@ public class FilebaseIndex implements Closeable { write(); } + @Override + public Iterator<String> iterator() { + return index.iterator(); + } + } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java index 227fa33c634ce659ba946ecabe4e6e097d1a3ef2..a910d941e47c895f88925ff1160fecb8e0de69dd 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java @@ -4,12 +4,13 @@ import java.io.Closeable; import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import de.vipra.util.Constants; import de.vipra.util.FileUtils; -public class FilebaseVocabulary implements Closeable { +public class FilebaseVocabulary implements Closeable, Iterable<String> { private File file; private List<String> vocables; @@ -23,7 +24,7 @@ public class FilebaseVocabulary implements Closeable { } } - private void write() throws IOException { + public void write() throws IOException { FileUtils.writeLines(file, Constants.FB_ENCODING.name(), vocables, null, false); } @@ -49,4 +50,9 @@ public class FilebaseVocabulary implements Closeable { return vocables.indexOf(word); } + @Override + public Iterator<String> iterator() { + return vocables.iterator(); + } + } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java index b92fcf9a1d1ada9daf0251a7b50f75b36bd7441e..1195e0990a3d417c34af4841f02b17d2ad955335 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java @@ -17,6 +17,8 @@ public class JGibbFilebase extends Filebase { private final FilebaseVocabulary vocab; private final List<Article> articles; + private final int bufferMaxSize = 100; + public JGibbFilebase(File dataDir) throws FilebaseException { super(dataDir, "jgibb"); this.modelFile = getModelFile(); @@ -26,11 +28,19 @@ public class JGibbFilebase extends Filebase { } @Override - public void add(Article article) { + public void add(Article article) throws FilebaseException { String[] words = article.getProcessedText().getText().split("\\s+"); vocab.addVocabulary(words); index.add(article.getId()); articles.add(article); + + if (articles.size() >= bufferMaxSize) { + try { + write(); + } catch (IOException e) { + throw new FilebaseException(e); + } + } } @Override @@ -40,20 +50,23 @@ public class JGibbFilebase extends Filebase { @Override public void write() throws IOException { - boolean linesep = modelFile.exists(); - RandomAccessFile raf = new RandomAccessFile(modelFile, "rw"); - - // write articles - raf.seek(raf.length()); - for (Article a : articles) { - if (linesep) - raf.writeBytes(System.lineSeparator()); - else - linesep = true; - raf.writeBytes(a.getProcessedText().getText()); - } + if (!articles.isEmpty()) { + boolean linesep = modelFile.exists(); + RandomAccessFile raf = new RandomAccessFile(modelFile, "rw"); + + // write articles + raf.seek(raf.length()); + for (Article a : articles) { + if (linesep) + raf.writeBytes(System.lineSeparator()); + else + linesep = true; + raf.writeBytes(a.getProcessedText().getText()); + } - raf.close(); + raf.close(); + articles.clear(); + } } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java index 67b210a9c655ed1c3dcc5bef8d59863e576a901f..2360d4af76b0aa4a6a1032ae5da2486e465c8e55 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/LdacFilebase.java @@ -15,13 +15,13 @@ public class LdacFilebase extends Filebase { } @Override - public void add(Article article) { + public void add(Article article) throws FilebaseException { // TODO Auto-generated method stub } @Override - public void remove(String id) { + public void remove(String id) throws FilebaseException { throw new NotImplementedException(); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java index 41e21180d4f7d20fb71f4734a0dcaf809aec96eb..ffebc4d5b5f1ee5055d47d679772e9c0189acea7 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java @@ -1,13 +1,25 @@ package de.vipra.cmd.lda; +import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import de.vipra.cmd.ex.LDAAnalyzerException; +import de.vipra.cmd.model.Article; import de.vipra.util.Config; +import de.vipra.util.FileUtils; import de.vipra.util.ex.ConfigException; +import de.vipra.util.ex.DatabaseException; +import de.vipra.util.model.Topic; +import de.vipra.util.model.TopicWord; +import de.vipra.util.service.DatabaseService; import jgibblda.Estimator; import jgibblda.Inferencer; import jgibblda.LDACmdOption; @@ -38,7 +50,7 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { modelDir = new File(dataDir, "jgibb"); options.dir = modelDir.getAbsolutePath(); - options.estc = new File(modelDir, "model-final.tassign").exists(); + options.estc = new File(modelDir, "jgibb.tassign").exists(); options.est = !options.estc; modelFile = new File(modelDir, "jgibb"); @@ -53,7 +65,6 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { throw new LDAAnalyzerException("model file does not exist: " + modelFile.getAbsolutePath()); } estimate(); - // inference(); } private void estimate() { @@ -62,10 +73,53 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { estimator.estimate(); } + @SuppressWarnings("unused") private void inference() { Inferencer inferencer = new Inferencer(); inferencer.init(options); Model newModel = inferencer.inference(); } + private List<Topic> readTopics() throws IOException { + File twords = new File(modelDir, "jgibb.twords"); + List<String> lines = FileUtils.readFile(twords); + List<Topic> topics = new ArrayList<>(); + List<TopicWord> topicWords = null; + for (String line : lines) { + if (line.startsWith("\t")) { + String[] parts = line.trim().split("\\s+"); + topicWords.add(new TopicWord(parts[0], Double.parseDouble(parts[1]))); + } else { + if (topicWords != null) + topics.add(new Topic(topicWords)); + topicWords = new ArrayList<>(); + } + } + return topics; + } + + @Override + public void save(DatabaseService<Article> dbArticles, DatabaseService<Topic> dbTopics) throws LDAAnalyzerException { + try { + List<Topic> topics = readTopics(); + + // recreate topics in database + dbTopics.drop(); + for (Topic topic : topics) { + dbTopics.createSingle(topic); + } + + // read document topics + BufferedReader reader = new BufferedReader( + new InputStreamReader(new FileInputStream(new File(modelDir, "jgibb.tassign")))); + String line; + while ((line = reader.readLine()) != null) { + String[] parts = line.trim().split("\\s+"); + + } + } catch (IOException | DatabaseException e) { + throw new LDAAnalyzerException(e); + } + } + } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java index b7285a722466489bfca80dd5a61111c7634540c4..c352eb8958bacab4c5a665249a5f8df451482016 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java @@ -1,8 +1,11 @@ package de.vipra.cmd.lda; import de.vipra.cmd.ex.LDAAnalyzerException; +import de.vipra.cmd.model.Article; import de.vipra.util.Config; import de.vipra.util.Constants; +import de.vipra.util.model.Topic; +import de.vipra.util.service.DatabaseService; import de.vipra.util.Config.Key; public abstract class LDAAnalyzer { @@ -21,12 +24,12 @@ public abstract class LDAAnalyzer { public abstract void analyze() throws LDAAnalyzerException; + public abstract void save(DatabaseService<Article> dbArticles, DatabaseService<Topic> dbTopics) + throws LDAAnalyzerException; + public static LDAAnalyzer getAnalyzer(Config config) throws LDAAnalyzerException { LDAAnalyzer analyzer = null; switch (Constants.Analyzer.fromString(config.getString(Key.ANALYZER))) { - case LDAC: - analyzer = new LdacLDAAnalyzer(); - break; case JGIBB: case DEFAULT: default: diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LdacLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LdacLDAAnalyzer.java deleted file mode 100644 index 0b65ea364aeb314a59ea8f6b61bdd8d20627219b..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LdacLDAAnalyzer.java +++ /dev/null @@ -1,23 +0,0 @@ -package de.vipra.cmd.lda; - -import de.vipra.cmd.ex.LDAAnalyzerException; -import de.vipra.util.Config; - -public class LdacLDAAnalyzer extends LDAAnalyzer { - - protected LdacLDAAnalyzer() { - super("lda-c Analyzer"); - } - - @Override - public void init(Config config) throws LDAAnalyzerException { - // TODO Auto-generated method stub - - } - - @Override - public void analyze() throws LDAAnalyzerException { - // TODO Auto-generated method stub - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index a61a8da38ef75ab6891988ffb0be214d12139bfa..5b6c822606761c09573dad4b683731a217319855 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -6,7 +6,6 @@ import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; import java.util.ArrayList; -import java.util.List; import org.json.simple.JSONArray; import org.json.simple.JSONObject; @@ -27,6 +26,7 @@ import de.vipra.util.Constants; import de.vipra.util.StringUtils; import de.vipra.util.Timer; import de.vipra.util.model.ArticleStats; +import de.vipra.util.model.Topic; import de.vipra.util.service.DatabaseService; public class ImportCommand implements Command { @@ -38,6 +38,7 @@ public class ImportCommand implements Command { private JSONParser parser = new JSONParser(); private Config config; private DatabaseService<Article> dbArticles; + private DatabaseService<Topic> dbTopics; private Filebase filebase; private Processor preprocessor; private LDAAnalyzer analyzer; @@ -82,7 +83,7 @@ public class ImportCommand implements Command { * @return * @throws ImportException */ - Article importArticle(JSONObject obj) throws ImportException { + void importArticle(JSONObject obj) throws ImportException { out.debug("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\""); Article article = new Article(); article.fromJSON(obj); @@ -99,8 +100,6 @@ public class ImportCommand implements Command { // add article to filebase filebase.add(article); - - return article; } catch (Exception e) { throw new ImportException(e, article.getId()); } @@ -116,21 +115,22 @@ public class ImportCommand implements Command { * @throws ImportException * @throws Exception */ - private List<Article> importFile(File file) - throws FileNotFoundException, IOException, ParseException, ImportException { + private long importFile(File file) throws FileNotFoundException, IOException, ParseException, ImportException { Object data = parser.parse(new FileReader(file)); - List<Article> articles = new ArrayList<Article>(); + long imported = 0; if (data instanceof JSONArray) { for (Object object : (JSONArray) data) { - articles.add(importArticle((JSONObject) object)); + importArticle((JSONObject) object); + imported++; } } else if (data instanceof JSONObject) { - articles.add(importArticle((JSONObject) data)); + importArticle((JSONObject) data); + imported++; } - return articles; + return imported; } @Override @@ -138,6 +138,7 @@ public class ImportCommand implements Command { try { config = Config.getConfig(); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); filebase = Filebase.getFilebase(config); preprocessor = Processor.getPreprocessor(config); analyzer = LDAAnalyzer.getAnalyzer(config); @@ -151,25 +152,28 @@ public class ImportCommand implements Command { // import files into database and filebase out.info("file import"); - List<Article> articles = new ArrayList<>(); + long imported = 0; for (File file : files) { - articles.addAll(importFile(file)); + imported += importFile(file); } long durImport = timer.lap(); // write filebase out.info("writing file index"); filebase.close(); - long durIndex = timer.lap(); + timer.lap(); // do topic modeling out.info("topic modeling"); analyzer.analyze(); long durAnalyze = timer.lap(); - out.info("imported " + articles.size() + " " + (articles.size() == 1 ? "article" : "articles")); - out.info("import: " + StringUtils.timeString(durImport) + ", analyze: " + StringUtils.timeString(durAnalyze) - + ", reindex: " + StringUtils.timeString(durIndex)); + out.info("saving topic models"); + analyzer.save(dbArticles, dbTopics); + + out.info("imported " + imported + " " + (imported == 1 ? "article" : "articles")); + out.info("import: " + StringUtils.timeString(durImport) + ", analyze: " + + StringUtils.timeString(durAnalyze)); } catch (Exception e) { throw new ExecutionException(e); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java index 6cf6800b095c97a2383897e0603f75d9ce096902..53d7a73872906ab12f136b09d4eff7d0b051d299 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java @@ -7,7 +7,7 @@ public class StatsCommand implements Command { @Override public void run() throws ExecutionException { // TODO Auto-generated method stub - + } } diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 34e8043a88f1b12981d7397706cf569b86a2f070..0594c27dcbe210369807aa1409b5240ea2600f85 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -30,7 +30,8 @@ public class Constants { "then", "there", "these", "they", "this", "to", "was", "will", "with"); public static enum Collection { - ARTICLES("articles"); + ARTICLES("articles"), + TOPICS("topics"); public final String name; @@ -67,7 +68,6 @@ public class Constants { } public static enum Analyzer { - LDAC("ldac"), JGIBB("jgibb"), DEFAULT(JGIBB); diff --git a/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java b/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java index 2b4cc36c6b3c7ca1381e5b2d3d93effbd25d743f..fca89724ff36297f31d18c1b7928d224d979d1c3 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java +++ b/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java @@ -47,6 +47,7 @@ public class ArticleStats implements BsonDocument { stats.setWordCount(words.length); Map<String, TermFrequency> uniqueWords = new HashMap<>(); long maxFrequency = 0; + // loop and count unique words // also remember maximum frequency for (String word : words) { @@ -60,10 +61,12 @@ public class ArticleStats implements BsonDocument { } uniqueWords.put(word, tf); } + // normalize frequencies for (Map.Entry<String, TermFrequency> entry : uniqueWords.entrySet()) { entry.getValue().normalizeTermFrequency(maxFrequency); } + stats.setUniqueWordCount(uniqueWords.size()); stats.setUniqueWords(uniqueWords); return stats; diff --git a/vipra-util/src/main/java/de/vipra/util/model/Topic.java b/vipra-util/src/main/java/de/vipra/util/model/Topic.java new file mode 100644 index 0000000000000000000000000000000000000000..168a4c944dd43314129e9dbe77013668e2517a7f --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/Topic.java @@ -0,0 +1,65 @@ +package de.vipra.util.model; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.bson.Document; + +public class Topic extends Model { + + private List<String> names; + private List<TopicWord> words; + + public Topic() {} + + public Topic(List<TopicWord> words) { + this.words = words; + } + + public List<String> getNames() { + return names; + } + + public void setNames(List<String> names) { + this.names = names; + } + + public List<TopicWord> getWords() { + return words; + } + + public void setWords(List<TopicWord> words) { + this.words = words; + } + + @Override + public String getType() { + return Topic.class.getSimpleName().toLowerCase(); + } + + @Override + public void fromDocument(Document document) { + // TODO Auto-generated method stub + + } + + @Override + public Document toDocument() { + // TODO Auto-generated method stub + return null; + } + + @Override + public void fromFile(File file) throws IOException { + // TODO Auto-generated method stub + + } + + @Override + public String toFileString() { + // TODO Auto-generated method stub + return null; + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java new file mode 100644 index 0000000000000000000000000000000000000000..22fe8ab0f99190ea044a446e174c8c5ef8138bdc --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java @@ -0,0 +1,31 @@ +package de.vipra.util.model; + +public class TopicWord { + + private String word; + private double likeliness; + + public TopicWord() {} + + public TopicWord(String word, double likeliness) { + this.word = word; + this.likeliness = likeliness; + } + + public String getWord() { + return word; + } + + public void setWord(String word) { + this.word = word; + } + + public double getLikeliness() { + return likeliness; + } + + public void setLikeliness(double likeliness) { + this.likeliness = likeliness; + } + +}