From 2e1beffc1b9410f662d53802cd94da2e28e8d5ab Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Thu, 11 Feb 2016 00:34:42 +0100 Subject: [PATCH] added dtm file model added dtm file model removed dynnmf filemodel and analyzer --- tasks.todo | 2 + .../src/main/java/de/vipra/cmd/Main.java | 4 +- .../de/vipra/cmd/ex/FilebaseException.java | 4 + .../java/de/vipra/cmd/file/DTMDateIndex.java | 110 ++++++++++++++++++ .../java/de/vipra/cmd/file/DTMFilebase.java | 95 +++++++++++++++ ...baseVocabulary.java => DTMVocabulary.java} | 41 ++++++- .../de/vipra/cmd/file/DynNMFFilebase.java | 66 ----------- .../main/java/de/vipra/cmd/file/Filebase.java | 32 +++-- .../java/de/vipra/cmd/file/FilebaseIndex.java | 2 +- .../java/de/vipra/cmd/file/JGibbFilebase.java | 25 ++-- .../main/java/de/vipra/cmd/lda/Analyzer.java | 4 +- .../{DynNMFAnalyzer.java => DTMAnalyzer.java} | 6 +- .../de/vipra/cmd/option/ImportCommand.java | 33 +++++- .../de/vipra/cmd/option/StatsCommand.java | 8 -- .../src/main/resources/config.properties | 2 +- .../java/de/vipra/util/AbstractCache.java | 4 +- .../src/main/java/de/vipra/util/Config.java | 13 +-- .../main/java/de/vipra/util/Constants.java | 42 +++++-- .../main/java/de/vipra/util/FileUtils.java | 48 +++++++- .../java/de/vipra/util/model/FileModel.java | 2 +- 20 files changed, 399 insertions(+), 144 deletions(-) create mode 100644 tasks.todo create mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java create mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java rename vipra-cmd/src/main/java/de/vipra/cmd/file/{FilebaseVocabulary.java => DTMVocabulary.java} (54%) delete mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/file/DynNMFFilebase.java rename vipra-cmd/src/main/java/de/vipra/cmd/lda/{DynNMFAnalyzer.java => DTMAnalyzer.java} (87%) diff --git a/tasks.todo b/tasks.todo new file mode 100644 index 00000000..6cda7ed7 --- /dev/null +++ b/tasks.todo @@ -0,0 +1,2 @@ + ☐ topic/word network + ☐ enable topic editing/labeling \ No newline at end of file diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java index eaa25a59..923605d7 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java @@ -6,11 +6,11 @@ import static de.vipra.cmd.CmdOptions.OPT_DEBUG; import static de.vipra.cmd.CmdOptions.OPT_DEFAULTS; import static de.vipra.cmd.CmdOptions.OPT_HELP; import static de.vipra.cmd.CmdOptions.OPT_IMPORT; +import static de.vipra.cmd.CmdOptions.OPT_MODELING; import static de.vipra.cmd.CmdOptions.OPT_SHELL; import static de.vipra.cmd.CmdOptions.OPT_SILENT; import static de.vipra.cmd.CmdOptions.OPT_STATS; import static de.vipra.cmd.CmdOptions.OPT_TEST; -import static de.vipra.cmd.CmdOptions.OPT_MODELING; import java.util.ArrayList; import java.util.List; @@ -37,9 +37,9 @@ import de.vipra.cmd.option.ClearCommand; import de.vipra.cmd.option.Command; import de.vipra.cmd.option.ConfigCommand; import de.vipra.cmd.option.ImportCommand; +import de.vipra.cmd.option.ModelingCommand; import de.vipra.cmd.option.StatsCommand; import de.vipra.cmd.option.TestCommand; -import de.vipra.cmd.option.ModelingCommand; public class Main { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java b/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java index ad5f6ef7..22df17d5 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java @@ -12,4 +12,8 @@ public class FilebaseException extends Exception { super(e); } + public FilebaseException(String msg, Exception e) { + super(msg, e); + } + } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java new file mode 100644 index 00000000..7951fdc7 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java @@ -0,0 +1,110 @@ +package de.vipra.cmd.file; + +import java.io.BufferedWriter; +import java.io.Closeable; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import de.vipra.util.Constants; +import de.vipra.util.Constants.WindowResolution; +import de.vipra.util.FileUtils; + +public class DTMDateIndex implements Closeable, Iterable<DTMDateIndex.DTMDateIndexEntry> { + + public static class DTMDateIndexEntry implements Comparable<DTMDateIndexEntry> { + public Date date; + public boolean exists; + public String line; + + public DTMDateIndexEntry(Date date, boolean exists, String line) { + this.date = date; + this.exists = exists; + this.line = line; + } + + @Override + public int compareTo(DTMDateIndexEntry o) { + if (o == null) + return 1; + if (date == null) + return -1; + return this.date.compareTo(o.date); + } + } + + private final File file; + private final WindowResolution windowResolution; + private final List<DTMDateIndexEntry> entries; + private final SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); + + public DTMDateIndex(File file, WindowResolution windowResolution) throws IOException, ParseException { + this.file = file; + this.windowResolution = windowResolution; + if (file.exists()) { + List<String> dates = FileUtils.readFile(file); + this.entries = new ArrayList<>(dates.size()); + for (String date : dates) { + this.entries.add(new DTMDateIndexEntry(df.parse(date), true, null)); + } + } else { + this.entries = new ArrayList<>(); + } + } + + public void add(Date date, String line) { + this.entries.add(new DTMDateIndexEntry(date, false, line)); + } + + @Override + public Iterator<DTMDateIndexEntry> iterator() { + Collections.sort(entries); + return entries.iterator(); + } + + @Override + public void close() throws IOException { + List<String> windows = new ArrayList<>(); + Map<String, Integer> windowSizes = new HashMap<>(); + + // write date index + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false))); + for (DTMDateIndexEntry entry : entries) { + writer.write(df.format(entry.date)); + writer.write(Constants.LINE_SEP); + + String window = windowResolution.fromDate(entry.date); + Integer count = windowSizes.get(window); + if (count == null) { + windowSizes.put(window, 1); + windows.add(window); + } else { + windowSizes.put(window, count + 1); + } + } + writer.close(); + + // write window index + File seqFile = new File(file.getParentFile(), "dtm-seq.dat"); + writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(seqFile, false))); + writer.write(Integer.toString(windows.size())); + writer.write(Constants.LINE_SEP); + Collections.sort(windows); + for (String window : windows) { + writer.write(Integer.toString(windowSizes.get(window))); + writer.write(Constants.LINE_SEP); + } + writer.close(); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java new file mode 100644 index 00000000..8c81bbfc --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java @@ -0,0 +1,95 @@ +package de.vipra.cmd.file; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.text.ParseException; +import java.util.Iterator; +import java.util.List; + +import de.vipra.cmd.ex.FilebaseException; +import de.vipra.cmd.file.DTMDateIndex.DTMDateIndexEntry; +import de.vipra.util.Config; +import de.vipra.util.Constants; +import de.vipra.util.FileUtils; +import de.vipra.util.ex.ConfigException; +import de.vipra.util.model.ArticleFull; + +public class DTMFilebase extends Filebase { + + private final DTMDateIndex index; + private final DTMVocabulary vocab; + private final File modelFile; + + public DTMFilebase(File dataDir) throws FilebaseException { + super(dataDir, "dtm"); + Config config; + try { + config = Config.getConfig(); + } catch (IOException | ConfigException e) { + throw new FilebaseException(e); + } + try { + this.index = new DTMDateIndex(getModelFile("dtm-dates.dat"), config.windowResolution); + } catch (IOException | ParseException e) { + throw new FilebaseException("could not read date index file", e); + } + try { + this.vocab = new DTMVocabulary(getModelFile("dtm-vocab.dat")); + } catch (IOException e) { + throw new FilebaseException("could not read vocabulary file", e); + } + this.modelFile = getModelFile("dtm-mult.dat"); + } + + @Override + public void write(List<ArticleFull> articles) throws IOException { + if (!articles.isEmpty()) { + // index new articles + for (ArticleFull article : articles) { + index.add(article.getDate(), vocab.indexText(article.getProcessedText())); + } + + // write temp file + File modelFileTmp = getModelFile("dtm-mult.dat.tmp"); + Iterator<String> lines = null; + if (modelFile.exists()) + lines = FileUtils.iterateFileLines(modelFile); + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(modelFileTmp))); + for (DTMDateIndexEntry e : index) { + if (e.exists) { + if (lines == null) { + writer.close(); + throw new IOException("index inconsistency: missing article file"); + } + writer.write(lines.next()); + } else { + e.exists = true; + writer.write(e.line); + } + writer.write(Constants.LINE_SEP); + } + writer.close(); + + // replace model file by temp file + if (modelFile.exists() && !modelFile.delete()) + throw new IOException("could not delete file " + modelFile.getAbsolutePath()); + if (!modelFileTmp.renameTo(modelFile)) + throw new IOException( + "could not rename tmp file " + modelFileTmp.getAbsolutePath() + " to " + modelFile.getName()); + + } + } + + @Override + public void close() throws IOException { + super.close(); + + // write vocabulary and windows + vocab.close(); + index.close(); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java similarity index 54% rename from vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java rename to vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java index 8f8b3b6f..99948366 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java @@ -4,22 +4,24 @@ import java.io.Closeable; import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import de.vipra.util.Constants; import de.vipra.util.FileUtils; -public class FilebaseVocabulary implements Closeable, Iterable<String> { +public class DTMVocabulary implements Closeable, Iterable<String> { private File file; private List<String> vocables; private Map<String, Integer> vocablesMap; - private int nextIndex = 0; + private int nextIndex = 1; - public FilebaseVocabulary(File file) throws IOException { + public DTMVocabulary(File file) throws IOException { this.file = file; if (file.exists()) { vocables = new ArrayList<>(FileUtils.readFile(file)); @@ -33,7 +35,7 @@ public class FilebaseVocabulary implements Closeable, Iterable<String> { } public void write() throws IOException { - FileUtils.writeLines(file, Constants.FB_ENCODING.name(), vocables, null, false); + FileUtils.writeLines(file, Constants.FILEBASE_ENCODING.name(), vocables, null, false); } public void addVocabulary(String text) { @@ -51,13 +53,42 @@ public class FilebaseVocabulary implements Closeable, Iterable<String> { public int index(String word) { Integer index = vocablesMap.get(word); - return index == null ? -1 : index; + if (index == null) { + index = nextIndex++; + vocablesMap.put(word, index); + vocables.add(word); + } + return index; } public int size() { return vocablesMap.size(); } + public String indexText(String in) { + // count unique words + List<String> wordList = Arrays.asList(in.split("\\s+")); + Map<String, Integer> wordMap = new HashMap<>(wordList.size()); + for (String word : wordList) { + Integer count = wordMap.get(word); + if (count == null) + wordMap.put(word, 1); + else + wordMap.put(word, count + 1); + } + + // assemble string + // <unique word count> <index1>:<count1> <index2>:<count2> ... + StringBuilder sb = new StringBuilder(); + sb.append(wordMap.size()); + for (Entry<String, Integer> e : wordMap.entrySet()) { + int index = index(e.getKey()); + sb.append(" ").append(index).append(":").append(e.getValue()); + } + + return sb.toString(); + } + @Override public void close() throws IOException { write(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DynNMFFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DynNMFFilebase.java deleted file mode 100644 index abb7a473..00000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DynNMFFilebase.java +++ /dev/null @@ -1,66 +0,0 @@ -package de.vipra.cmd.file; - -import java.io.File; -import java.io.IOException; -import java.util.Calendar; -import java.util.Date; -import java.util.GregorianCalendar; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import de.vipra.cmd.ex.FilebaseException; -import de.vipra.util.CalendarUtils; -import de.vipra.util.Constants.WindowResolution; -import de.vipra.util.FileUtils; -import de.vipra.util.model.ArticleFull; - -public class DynNMFFilebase extends Filebase { - - private final File modelDir; - private final WindowResolution windowResolution; - private final Map<String, File> dirMap; - - public DynNMFFilebase(File dataDir, WindowResolution windowResolution) throws FilebaseException { - super(dataDir, "dynlda"); - this.modelDir = super.getModelDir(); - this.windowResolution = windowResolution; - this.dirMap = new HashMap<>(); - } - - @Override - public void write(List<ArticleFull> articles) throws IOException { - if (!articles.isEmpty()) { - for (ArticleFull article : articles) { - File windowDir = getWindowDir(article.getDate()); - File articleFile = new File(windowDir, article.getId().toString()); - FileUtils.writeStringToFile(articleFile, article.getTitle() + "\n" + article.getProcessedText()); - } - } - } - - private File getWindowDir(Date date) { - Calendar c = new GregorianCalendar(); - c.setTime(date); - String dirName = "" + c.get(Calendar.YEAR); - switch (windowResolution) { - case QUARTERLY: - dirName += "-" + CalendarUtils.getQuarter(c); - break; - case MONTHLY: - dirName += "-" + c.get(Calendar.MONTH); - break; - case YEARLY: - default: - break; - } - File dir = dirMap.get(dirName); - if (dir == null) { - dir = new File(modelDir, dirName); - dir.mkdirs(); - dirMap.put(dirName, dir); - } - return dir; - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java index 58c0f596..df10b337 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java @@ -6,6 +6,9 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + import de.vipra.cmd.ex.FilebaseException; import de.vipra.util.Config; import de.vipra.util.Constants; @@ -14,14 +17,13 @@ import de.vipra.util.model.ArticleFull; public abstract class Filebase implements Closeable { + public static final Logger log = LogManager.getLogger(Filebase.class); + private final String modelName; private final File modelDir; private final FilebaseIndex index; - private final FilebaseVocabulary vocab; private final List<ArticleFull> articles; - private final int bufferMaxSize = 100; - public Filebase(File dataDir, String modelName) throws FilebaseException { this.modelName = modelName; this.modelDir = new File(dataDir, modelName); @@ -31,19 +33,20 @@ public abstract class Filebase implements Closeable { } } try { - this.index = new FilebaseIndex(new File(modelDir, Constants.INDEX_FILE)); - this.vocab = new FilebaseVocabulary(new File(modelDir, Constants.VOCAB_FILE)); + this.index = new FilebaseIndex(getModelFile("index")); } catch (IOException e) { throw new FilebaseException("could not read index: " + e.getMessage()); } - this.articles = new ArrayList<>(bufferMaxSize); + this.articles = new ArrayList<>(Constants.IMPORT_BUFFER_MAX); } public File getModelDir() { return modelDir; } - public File getModelFile() { + public File getModelFile(String fileName) { + if (fileName != null) + return new File(modelDir, fileName); return new File(modelDir, modelName); } @@ -51,10 +54,6 @@ public abstract class Filebase implements Closeable { return index; } - public FilebaseVocabulary getVocab() { - return vocab; - } - public List<ArticleFull> getArticles() { return articles; } @@ -63,18 +62,17 @@ public abstract class Filebase implements Closeable { public void close() throws IOException { write(articles); index.close(); - vocab.close(); } public void add(ArticleFull article) throws FilebaseException { - String[] words = article.getProcessedText().split("\\s+"); - vocab.addVocabulary(words); index.add(article.getId().toString()); articles.add(article); - if (articles.size() >= bufferMaxSize) { + if (articles.size() >= Constants.IMPORT_BUFFER_MAX) { try { + log.info("buffer filled, writing filebase"); write(articles); + articles.clear(); } catch (IOException e) { throw new FilebaseException(e); } @@ -86,8 +84,8 @@ public abstract class Filebase implements Closeable { public static Filebase getFilebase(Config config) throws FilebaseException, ConfigException { File dataDir = config.getDataDirectory(); switch (config.analyzer) { - case DYNNMF: - return new DynNMFFilebase(dataDir, config.windowResolution); + case DTM: + return new DTMFilebase(dataDir); case JGIBB: return new JGibbFilebase(dataDir); default: diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java index 9efba97a..28f7a47a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java @@ -25,7 +25,7 @@ public class FilebaseIndex implements Closeable, Iterable<String> { } public void write() throws IOException { - FileUtils.writeLines(file, Constants.FB_ENCODING.name(), index, null, false); + FileUtils.writeLines(file, Constants.FILEBASE_ENCODING.name(), index, null, false); } public int add(String id) { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java index 0883a698..95003d31 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java @@ -1,8 +1,10 @@ package de.vipra.cmd.file; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; -import java.io.RandomAccessFile; +import java.io.OutputStreamWriter; import java.util.List; import de.vipra.cmd.ex.FilebaseException; @@ -14,27 +16,16 @@ public class JGibbFilebase extends Filebase { public JGibbFilebase(File dataDir) throws FilebaseException { super(dataDir, "jgibb"); - this.modelFile = getModelFile(); + this.modelFile = getModelFile(null); } @Override public void write(List<ArticleFull> articles) throws IOException { if (!articles.isEmpty()) { - boolean linesep = modelFile.exists(); - RandomAccessFile raf = new RandomAccessFile(modelFile, "rw"); - - // write articles - raf.seek(raf.length()); - for (ArticleFull a : articles) { - if (linesep) - raf.writeBytes(System.lineSeparator()); - else - linesep = true; - raf.writeBytes(a.getProcessedText()); - } - - raf.close(); - articles.clear(); + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(modelFile))); + for (ArticleFull article : articles) + writer.write(article.getProcessedText()); + writer.close(); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java index 0e1c87c7..01cc8d15 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java @@ -49,8 +49,8 @@ public abstract class Analyzer { public static Analyzer getAnalyzer(Config config, WordMap wordMap) throws AnalyzerException { Analyzer analyzer = null; switch (config.analyzer) { - case DYNNMF: - analyzer = new DynNMFAnalyzer(); + case DTM: + analyzer = new DTMAnalyzer(); break; case JGIBB: analyzer = new JGibbAnalyzer(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DynNMFAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java similarity index 87% rename from vipra-cmd/src/main/java/de/vipra/cmd/lda/DynNMFAnalyzer.java rename to vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java index 8112c866..30607070 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DynNMFAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java @@ -9,10 +9,10 @@ import de.vipra.util.WordMap; import de.vipra.util.model.TopicFull; import de.vipra.util.model.TopicRef; -public class DynNMFAnalyzer extends Analyzer { +public class DTMAnalyzer extends Analyzer { - protected DynNMFAnalyzer() { - super("Dynamic NMF Analyzer"); + protected DTMAnalyzer() { + super("Dynamic Topic Model Analyzer"); } @Override diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index bb34530d..88a8e4bc 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -18,9 +18,11 @@ import de.vipra.cmd.file.Filebase; import de.vipra.cmd.text.ProcessedText; import de.vipra.cmd.text.Processor; import de.vipra.util.Config; +import de.vipra.util.Constants; import de.vipra.util.StringUtils; import de.vipra.util.Timer; import de.vipra.util.WordMap; +import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.Article; import de.vipra.util.model.ArticleFull; import de.vipra.util.model.ArticleStats; @@ -29,6 +31,28 @@ import de.vipra.util.service.DatabaseService; public class ImportCommand implements Command { + public static class ArticleBuffer { + + private DatabaseService<ArticleFull, ObjectId> dbArticles; + private List<ArticleFull> articles = new ArrayList<>(Constants.IMPORT_BUFFER_MAX); + + public ArticleBuffer(DatabaseService<ArticleFull, ObjectId> dbArticles) { + this.dbArticles = dbArticles; + } + + public void add(ArticleFull article) throws DatabaseException { + articles.add(article); + if (articles.size() >= Constants.IMPORT_BUFFER_MAX) { + save(); + } + } + + public void save() throws DatabaseException { + dbArticles.createMultiple(articles); + articles.clear(); + } + } + public static final Logger log = LogManager.getLogger(ImportCommand.class); public static final Logger out = LogManager.getLogger("shellout"); @@ -40,6 +64,7 @@ public class ImportCommand implements Command { private Filebase filebase; private Processor preprocessor; private WordMap wordMap; + private ArticleBuffer articleBuffer; /** * Import command to import articles into the database, do topic modeling @@ -99,7 +124,7 @@ public class ImportCommand implements Command { // add article to mongodb article.setProcessedText(processedText.getText()); article.setStats(articleStats); - article = dbArticles.createSingle(article); + articleBuffer.add(article); // add words if (config.saveAllWords) { @@ -145,6 +170,7 @@ public class ImportCommand implements Command { private ArticleFull articleFromJSON(JSONObject obj) { ArticleFull article = new ArticleFull(); + article.setId(new ObjectId()); if (obj.containsKey("title")) article.setTitle(obj.get("title").toString()); if (obj.containsKey("text")) @@ -164,6 +190,7 @@ public class ImportCommand implements Command { filebase = Filebase.getFilebase(config); preprocessor = Processor.getProcessor(config); wordMap = new WordMap(dbWords); + articleBuffer = new ArticleBuffer(dbArticles); out.info("using data directory: " + config.getDataDirectory().getAbsolutePath()); out.info("using preprocessor: " + preprocessor.getName()); @@ -176,6 +203,7 @@ public class ImportCommand implements Command { */ out.info("file import"); List<Article> importedArticles = importFiles(files); + articleBuffer.save(); timer.lap("import"); /* @@ -184,13 +212,12 @@ public class ImportCommand implements Command { out.info("writing file index"); filebase.close(); timer.lap("filebase write"); - + /* * save words */ out.info("saving words"); Set<Word> importedWords = wordMap.getNewWords(); - timer.lap("saving topic refs and indexing"); wordMap.create(); timer.lap("saving words"); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java index 8264c003..79fa23e8 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java @@ -1,14 +1,10 @@ package de.vipra.cmd.option; -import java.io.File; - import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.bson.types.ObjectId; -import de.vipra.cmd.file.Filebase; import de.vipra.util.Config; -import de.vipra.util.StringUtils; import de.vipra.util.model.Article; import de.vipra.util.model.Topic; import de.vipra.util.model.Word; @@ -20,14 +16,11 @@ public class StatsCommand implements Command { public static final Logger out = LogManager.getLogger("shellout"); private Config config; - private Filebase filebase; private DatabaseService<Article, ObjectId> dbArticles; private DatabaseService<Topic, ObjectId> dbTopics; private DatabaseService<Word, String> dbWords; private void stats() { - File modelFile = filebase.getModelFile(); - out.info("filebase size: " + StringUtils.humanReadableByteCount(modelFile.length(), true)); out.info("# of articles: " + dbArticles.count()); out.info("# of topics : " + dbTopics.count()); out.info("# of words : " + dbWords.count()); @@ -36,7 +29,6 @@ public class StatsCommand implements Command { @Override public void run() throws Exception { config = Config.getConfig(); - filebase = Filebase.getFilebase(config); dbArticles = DatabaseService.getDatabaseService(config, Article.class); dbTopics = DatabaseService.getDatabaseService(config, Topic.class); dbWords = DatabaseService.getDatabaseService(config, Word.class); diff --git a/vipra-cmd/src/main/resources/config.properties b/vipra-cmd/src/main/resources/config.properties index 0778073f..f7c0fa9e 100644 --- a/vipra-cmd/src/main/resources/config.properties +++ b/vipra-cmd/src/main/resources/config.properties @@ -2,5 +2,5 @@ db.host=localhost db.port=27017 db.name=test tm.processor=corenlp -tm.analyzer=jgibb +tm.analyzer=dtm tm.saveallwords=false \ No newline at end of file diff --git a/vipra-util/src/main/java/de/vipra/util/AbstractCache.java b/vipra-util/src/main/java/de/vipra/util/AbstractCache.java index f80100ac..8e72bda8 100644 --- a/vipra-util/src/main/java/de/vipra/util/AbstractCache.java +++ b/vipra-util/src/main/java/de/vipra/util/AbstractCache.java @@ -7,9 +7,9 @@ public interface AbstractCache<T, U> { void put(T t, U u); void remove(T t); - + boolean contains(T t); - + void clear(); } diff --git a/vipra-util/src/main/java/de/vipra/util/Config.java b/vipra-util/src/main/java/de/vipra/util/Config.java index b03c6999..94cfb3b7 100644 --- a/vipra-util/src/main/java/de/vipra/util/Config.java +++ b/vipra-util/src/main/java/de/vipra/util/Config.java @@ -31,13 +31,13 @@ public class Config { */ @ConfigKey("db.host") - public String databaseHost = Constants.DB_HOST; + public String databaseHost = Constants.DATABASE_HOST; @ConfigKey("db.port") - public int databasePort = Constants.DB_PORT; + public int databasePort = Constants.DATABASE_PORT; @ConfigKey("db.name") - public String databaseName = Constants.DB_NAME; + public String databaseName = Constants.DATABASE_NAME; @ConfigKey("tm.processor") public Processor processor = Constants.Processor.DEFAULT(); @@ -234,19 +234,18 @@ public class Config { } public String hash() { - String config = databaseHost + databasePort + databaseName + processor + analyzer + windowResolution - + saveAllWords; + String config = databaseHost + databasePort + databaseName + processor + analyzer + saveAllWords; return DigestUtils.md5(config); } public static File getGenericDataDir() { File base = PathUtils.appDataDir(); - return new File(base, Constants.FB_DIR); + return new File(base, Constants.FILEBASE_DIR); } public static File getGenericConfigDir() { File base = PathUtils.appConfigDir(); - return new File(base, Constants.FB_DIR); + return new File(base, Constants.FILEBASE_DIR); } public static Config getConfig() throws IOException, ConfigException { diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 034893c2..9b338d50 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -3,6 +3,9 @@ package de.vipra.util; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.Calendar; +import java.util.Date; +import java.util.GregorianCalendar; import java.util.List; public class Constants { @@ -11,24 +14,29 @@ public class Constants { * FILEBASE */ - public static final String FB_DIR = "vipra"; - public static final Charset FB_ENCODING = StandardCharsets.UTF_8; + public static final String FILEBASE_DIR = "vipra"; + public static final Charset FILEBASE_ENCODING = StandardCharsets.UTF_8; + public static final String LINE_SEP = System.lineSeparator(); + + /** + * Buffer used while importing files into the database and filebase in @ of + * articles. + */ + public static final int IMPORT_BUFFER_MAX = 1000; /* * FILES */ public static final String CONFIG_FILE = "config.properties"; - public static final String INDEX_FILE = "index"; - public static final String VOCAB_FILE = "vocab"; /* * DATABASE */ - public static final String DB_HOST = "localhost"; - public static final int DB_PORT = 27017; - public static final String DB_NAME = "test"; + public static final String DATABASE_HOST = "localhost"; + public static final int DATABASE_PORT = 27017; + public static final String DATABASE_NAME = "test"; /* * ELASTICSEARCH @@ -248,7 +256,7 @@ public class Constants { */ public static enum Analyzer { JGIBB("jgibb"), - DYNNMF("dynnmf"); + DTM("dtm"); public final String name; @@ -294,6 +302,24 @@ public class Constants { this.name = def.name; } + public String fromDate(Date date) { + Calendar c = new GregorianCalendar(); + c.setTime(date); + String str = c.get(Calendar.YEAR) + ""; + switch (this) { + case QUARTERLY: + str += "-" + CalendarUtils.getQuarter(c); + break; + case MONTHLY: + int month = c.get(Calendar.MONTH); + str += "-" + (month < 10 ? "0" : "") + month; + break; + default: + break; + } + return str; + } + public static WindowResolution DEFAULT() { return YEARLY; } diff --git a/vipra-util/src/main/java/de/vipra/util/FileUtils.java b/vipra-util/src/main/java/de/vipra/util/FileUtils.java index d9b6bbd2..10ab794c 100644 --- a/vipra-util/src/main/java/de/vipra/util/FileUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/FileUtils.java @@ -1,12 +1,16 @@ package de.vipra.util; import java.io.BufferedInputStream; +import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.file.Files; import java.nio.file.Paths; +import java.util.Iterator; import java.util.List; public class FileUtils extends org.apache.commons.io.FileUtils { @@ -19,7 +23,7 @@ public class FileUtils extends org.apache.commons.io.FileUtils { } public static List<String> readFile(File file) throws IOException { - return Files.readAllLines(Paths.get(file.getAbsolutePath()), Constants.FB_ENCODING); + return Files.readAllLines(Paths.get(file.getAbsolutePath()), Constants.FILEBASE_ENCODING); } public static InputStream getResource(String name) { @@ -59,4 +63,46 @@ public class FileUtils extends org.apache.commons.io.FileUtils { } } + public static Iterator<String> iterateFileLines(File file) throws FileNotFoundException { + return (new Iterator<String>() { + + private BufferedReader reader; + private Boolean next; + private String nextLine; + + @Override + public boolean hasNext() { + if (!next) + return false; + if (next == null) { + nextLine = null; + try { + nextLine = reader.readLine(); + } catch (IOException e1) { + e1.printStackTrace(); + } + next = nextLine != null; + if (!next) + try { + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + return next; + } + + @Override + public String next() { + return nextLine; + } + + public Iterator<String> init(File file) throws FileNotFoundException { + reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); + return this; + } + + }).init(file); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/model/FileModel.java b/vipra-util/src/main/java/de/vipra/util/model/FileModel.java index de0ade9e..a2e6c82f 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/FileModel.java +++ b/vipra-util/src/main/java/de/vipra/util/model/FileModel.java @@ -11,7 +11,7 @@ import de.vipra.util.Constants; public abstract class FileModel<IdType> implements Model<IdType> { public void writeToFile(File file) throws IOException { - FileUtils.writeStringToFile(file, toFileString(), Constants.FB_ENCODING, false); + FileUtils.writeStringToFile(file, toFileString(), Constants.FILEBASE_ENCODING, false); } public abstract void fromFile(File file) throws IOException; -- GitLab