From 2e1beffc1b9410f662d53802cd94da2e28e8d5ab Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Thu, 11 Feb 2016 00:34:42 +0100
Subject: [PATCH] added dtm file model

added dtm file model
removed dynnmf filemodel and analyzer
---
 tasks.todo                                    |   2 +
 .../src/main/java/de/vipra/cmd/Main.java      |   4 +-
 .../de/vipra/cmd/ex/FilebaseException.java    |   4 +
 .../java/de/vipra/cmd/file/DTMDateIndex.java  | 110 ++++++++++++++++++
 .../java/de/vipra/cmd/file/DTMFilebase.java   |  95 +++++++++++++++
 ...baseVocabulary.java => DTMVocabulary.java} |  41 ++++++-
 .../de/vipra/cmd/file/DynNMFFilebase.java     |  66 -----------
 .../main/java/de/vipra/cmd/file/Filebase.java |  32 +++--
 .../java/de/vipra/cmd/file/FilebaseIndex.java |   2 +-
 .../java/de/vipra/cmd/file/JGibbFilebase.java |  25 ++--
 .../main/java/de/vipra/cmd/lda/Analyzer.java  |   4 +-
 .../{DynNMFAnalyzer.java => DTMAnalyzer.java} |   6 +-
 .../de/vipra/cmd/option/ImportCommand.java    |  33 +++++-
 .../de/vipra/cmd/option/StatsCommand.java     |   8 --
 .../src/main/resources/config.properties      |   2 +-
 .../java/de/vipra/util/AbstractCache.java     |   4 +-
 .../src/main/java/de/vipra/util/Config.java   |  13 +--
 .../main/java/de/vipra/util/Constants.java    |  42 +++++--
 .../main/java/de/vipra/util/FileUtils.java    |  48 +++++++-
 .../java/de/vipra/util/model/FileModel.java   |   2 +-
 20 files changed, 399 insertions(+), 144 deletions(-)
 create mode 100644 tasks.todo
 create mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java
 create mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java
 rename vipra-cmd/src/main/java/de/vipra/cmd/file/{FilebaseVocabulary.java => DTMVocabulary.java} (54%)
 delete mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/file/DynNMFFilebase.java
 rename vipra-cmd/src/main/java/de/vipra/cmd/lda/{DynNMFAnalyzer.java => DTMAnalyzer.java} (87%)

diff --git a/tasks.todo b/tasks.todo
new file mode 100644
index 00000000..6cda7ed7
--- /dev/null
+++ b/tasks.todo
@@ -0,0 +1,2 @@
+ ☐ topic/word network
+ ☐ enable topic editing/labeling
\ No newline at end of file
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java
index eaa25a59..923605d7 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java
@@ -6,11 +6,11 @@ import static de.vipra.cmd.CmdOptions.OPT_DEBUG;
 import static de.vipra.cmd.CmdOptions.OPT_DEFAULTS;
 import static de.vipra.cmd.CmdOptions.OPT_HELP;
 import static de.vipra.cmd.CmdOptions.OPT_IMPORT;
+import static de.vipra.cmd.CmdOptions.OPT_MODELING;
 import static de.vipra.cmd.CmdOptions.OPT_SHELL;
 import static de.vipra.cmd.CmdOptions.OPT_SILENT;
 import static de.vipra.cmd.CmdOptions.OPT_STATS;
 import static de.vipra.cmd.CmdOptions.OPT_TEST;
-import static de.vipra.cmd.CmdOptions.OPT_MODELING;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -37,9 +37,9 @@ import de.vipra.cmd.option.ClearCommand;
 import de.vipra.cmd.option.Command;
 import de.vipra.cmd.option.ConfigCommand;
 import de.vipra.cmd.option.ImportCommand;
+import de.vipra.cmd.option.ModelingCommand;
 import de.vipra.cmd.option.StatsCommand;
 import de.vipra.cmd.option.TestCommand;
-import de.vipra.cmd.option.ModelingCommand;
 
 public class Main {
 
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java b/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java
index ad5f6ef7..22df17d5 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java
@@ -12,4 +12,8 @@ public class FilebaseException extends Exception {
 		super(e);
 	}
 
+	public FilebaseException(String msg, Exception e) { // msg: description, e: underlying cause
+		super(msg, e); // forwarded to the superclass so that the cause is kept
+	}
+
 }
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java
new file mode 100644
index 00000000..7951fdc7
--- /dev/null
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java
@@ -0,0 +1,159 @@
+package de.vipra.cmd.file;
+
+import java.io.BufferedWriter;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import de.vipra.util.Constants;
+import de.vipra.util.Constants.WindowResolution;
+import de.vipra.util.FileUtils;
+
+/**
+ * Chronological index of the articles stored in the DTM model files. Every
+ * line of the index file holds one article date; closing the index rewrites
+ * that file and derives the window index file (dtm-seq.dat) from it.
+ */
+public class DTMDateIndex implements Closeable, Iterable<DTMDateIndex.DTMDateIndexEntry> {
+
+	/**
+	 * Single article of the index.
+	 */
+	public static class DTMDateIndexEntry implements Comparable<DTMDateIndexEntry> {
+		/** Article date, the sort key of the index. */
+		public Date date;
+		/** true if read from the index file, false if added via add(). */
+		public boolean exists;
+		/** Line passed to add(); null for entries read from the index file. */
+		public String line;
+
+		public DTMDateIndexEntry(Date date, boolean exists, String line) {
+			this.date = date;
+			this.exists = exists;
+			this.line = line;
+		}
+
+		/**
+		 * Orders entries by date. A null entry or an entry without date sorts
+		 * before dated entries; two entries without date are equal.
+		 */
+		@Override
+		public int compareTo(DTMDateIndexEntry o) {
+			if (o == null)
+				return 1;
+			if (date == null)
+				return o.date == null ? 0 : -1;
+			if (o.date == null)
+				return 1;
+			return date.compareTo(o.date);
+		}
+	}
+
+	private final File file;
+	private final WindowResolution windowResolution;
+	private final List<DTMDateIndexEntry> entries;
+	private final SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT);
+
+	/**
+	 * Opens the index and loads the dates of an existing index file.
+	 *
+	 * @param file
+	 *            index file, one formatted date per line; may not exist yet
+	 * @param windowResolution
+	 *            maps dates to the windows counted in the window index file
+	 * @throws IOException
+	 *             if the index file cannot be read
+	 * @throws ParseException
+	 *             if a line is not a date in Constants.DATETIME_FORMAT
+	 */
+	public DTMDateIndex(File file, WindowResolution windowResolution) throws IOException, ParseException {
+		this.file = file;
+		this.windowResolution = windowResolution;
+		if (file.exists()) {
+			List<String> dates = FileUtils.readFile(file);
+			this.entries = new ArrayList<>(dates.size());
+			for (String date : dates) {
+				this.entries.add(new DTMDateIndexEntry(df.parse(date), true, null));
+			}
+		} else {
+			this.entries = new ArrayList<>();
+		}
+	}
+
+	/**
+	 * Registers a new article. The date must not be null because close()
+	 * formats the date of every entry.
+	 */
+	public void add(Date date, String line) {
+		this.entries.add(new DTMDateIndexEntry(date, false, line));
+	}
+
+	/**
+	 * Sorts the entries by date (in place) and iterates over them in that order.
+	 */
+	@Override
+	public Iterator<DTMDateIndexEntry> iterator() {
+		Collections.sort(entries);
+		return entries.iterator();
+	}
+
+	/**
+	 * Writes the date index file and, next to it, the window index file: the
+	 * number of windows followed by the number of articles of each window, in
+	 * ascending window order.
+	 */
+	@Override
+	public void close() throws IOException {
+		List<String> windows = new ArrayList<>();
+		Map<String, Integer> windowSizes = new HashMap<>();
+
+		// write date index; try-with-resources closes the file on failure too
+		try (BufferedWriter writer = openWriter(file)) {
+			for (DTMDateIndexEntry entry : entries) {
+				writer.write(df.format(entry.date));
+				writer.write(Constants.LINE_SEP);
+
+				String window = windowResolution.fromDate(entry.date);
+				Integer count = windowSizes.get(window);
+				if (count == null) {
+					windowSizes.put(window, 1);
+					windows.add(window);
+				} else {
+					windowSizes.put(window, count + 1);
+				}
+			}
+		}
+
+		// write window index
+		Collections.sort(windows);
+		File seqFile = new File(file.getParentFile(), "dtm-seq.dat");
+		try (BufferedWriter writer = openWriter(seqFile)) {
+			writer.write(Integer.toString(windows.size()));
+			writer.write(Constants.LINE_SEP);
+			for (String window : windows) {
+				writer.write(Integer.toString(windowSizes.get(window)));
+				writer.write(Constants.LINE_SEP);
+			}
+		}
+	}
+
+	/**
+	 * Opens a truncating writer that uses the encoding the index is read with.
+	 */
+	private static BufferedWriter openWriter(File target) throws IOException {
+		return new BufferedWriter(
+				new OutputStreamWriter(new FileOutputStream(target, false), Constants.FILEBASE_ENCODING));
+	}
+
+}
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java
new file mode 100644
index 00000000..8c81bbfc
--- /dev/null
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java
@@ -0,0 +1,138 @@
+package de.vipra.cmd.file;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.text.ParseException;
+import java.util.Iterator;
+import java.util.List;
+
+import de.vipra.cmd.ex.FilebaseException;
+import de.vipra.cmd.file.DTMDateIndex.DTMDateIndexEntry;
+import de.vipra.util.Config;
+import de.vipra.util.Constants;
+import de.vipra.util.FileUtils;
+import de.vipra.util.ex.ConfigException;
+import de.vipra.util.model.ArticleFull;
+
+/**
+ * Filebase of the dynamic topic model (DTM). Besides the generic index of
+ * Filebase it maintains a date index, a vocabulary and the document file
+ * dtm-mult.dat, which holds one indexed article per line in date index order.
+ */
+public class DTMFilebase extends Filebase {
+
+	private final DTMDateIndex index;
+	private final DTMVocabulary vocab;
+	private final File modelFile;
+
+	/**
+	 * Opens the DTM model files inside the "dtm" directory of the data
+	 * directory.
+	 *
+	 * @param dataDir
+	 *            data directory containing the model directory
+	 * @throws FilebaseException
+	 *             if the configuration, the date index or the vocabulary
+	 *             cannot be read
+	 */
+	public DTMFilebase(File dataDir) throws FilebaseException {
+		super(dataDir, "dtm");
+		Config config;
+		try {
+			config = Config.getConfig();
+		} catch (IOException | ConfigException e) {
+			throw new FilebaseException(e);
+		}
+		try {
+			this.index = new DTMDateIndex(getModelFile("dtm-dates.dat"), config.windowResolution);
+		} catch (IOException | ParseException e) {
+			throw new FilebaseException("could not read date index file", e);
+		}
+		try {
+			this.vocab = new DTMVocabulary(getModelFile("dtm-vocab.dat"));
+		} catch (IOException e) {
+			throw new FilebaseException("could not read vocabulary file", e);
+		}
+		this.modelFile = getModelFile("dtm-mult.dat");
+	}
+
+	/**
+	 * Merges the articles into the document file. The file is rewritten in the
+	 * order of the date index: lines of already stored articles are copied from
+	 * the current document file, new articles contribute their indexed text.
+	 *
+	 * @param articles
+	 *            new articles; nothing is written if the list is empty
+	 * @throws IOException
+	 *             if a file cannot be read, written or replaced, or if the
+	 *             document file holds fewer lines than the date index expects
+	 */
+	@Override
+	public void write(List<ArticleFull> articles) throws IOException {
+		if (articles.isEmpty())
+			return;
+
+		// index new articles
+		for (ArticleFull article : articles) {
+			index.add(article.getDate(), vocab.indexText(article.getProcessedText()));
+		}
+
+		// write temp file; both streams are closed even if copying fails
+		File modelFileTmp = getModelFile("dtm-mult.dat.tmp");
+		try (BufferedReader reader = modelFile.exists() ? openReader(modelFile) : null;
+				BufferedWriter writer = openWriter(modelFileTmp)) {
+			for (DTMDateIndexEntry e : index) {
+				if (e.exists) {
+					if (reader == null)
+						throw new IOException("index inconsistency: missing article file");
+					// stored articles are consumed line by line, in index order
+					String line = reader.readLine();
+					if (line == null)
+						throw new IOException("index inconsistency: article file is shorter than date index");
+					writer.write(line);
+				} else {
+					e.exists = true;
+					writer.write(e.line);
+				}
+				writer.write(Constants.LINE_SEP);
+			}
+		}
+
+		// replace model file by temp file
+		if (modelFile.exists() && !modelFile.delete())
+			throw new IOException("could not delete file " + modelFile.getAbsolutePath());
+		if (!modelFileTmp.renameTo(modelFile))
+			throw new IOException(
+					"could not rename tmp file " + modelFileTmp.getAbsolutePath() + " to " + modelFile.getName());
+	}
+
+	/**
+	 * Writes the remaining articles and the generic index via Filebase.close(),
+	 * then the vocabulary and the date and window index files.
+	 */
+	@Override
+	public void close() throws IOException {
+		super.close();
+
+		// write vocabulary and windows
+		vocab.close();
+		index.close();
+	}
+
+	/** Opens the file for reading in the filebase encoding. */
+	private static BufferedReader openReader(File source) throws IOException {
+		return new BufferedReader(new InputStreamReader(new FileInputStream(source), Constants.FILEBASE_ENCODING));
+	}
+
+	/** Opens the file for writing in the filebase encoding, truncating it. */
+	private static BufferedWriter openWriter(File target) throws IOException {
+		return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(target), Constants.FILEBASE_ENCODING));
+	}
+
+}
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java
similarity index 54%
rename from vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java
rename to vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java
index 8f8b3b6f..99948366 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java
@@ -4,22 +4,24 @@ import java.io.Closeable;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 
 import de.vipra.util.Constants;
 import de.vipra.util.FileUtils;
 
-public class FilebaseVocabulary implements Closeable, Iterable<String> {
+public class DTMVocabulary implements Closeable, Iterable<String> {
 
 	private File file;
 	private List<String> vocables;
 	private Map<String, Integer> vocablesMap;
-	private int nextIndex = 0;
+	private int nextIndex = 1;
 
-	public FilebaseVocabulary(File file) throws IOException {
+	public DTMVocabulary(File file) throws IOException {
 		this.file = file;
 		if (file.exists()) {
 			vocables = new ArrayList<>(FileUtils.readFile(file));
@@ -33,7 +35,7 @@ public class FilebaseVocabulary implements Closeable, Iterable<String> {
 	}
 
 	public void write() throws IOException {
-		FileUtils.writeLines(file, Constants.FB_ENCODING.name(), vocables, null, false);
+		FileUtils.writeLines(file, Constants.FILEBASE_ENCODING.name(), vocables, null, false);
 	}
 
 	public void addVocabulary(String text) {
@@ -51,13 +53,42 @@ public class FilebaseVocabulary implements Closeable, Iterable<String> {
 
 	public int index(String word) {
 		Integer index = vocablesMap.get(word);
-		return index == null ? -1 : index;
+		if (index == null) {
+			index = nextIndex++;
+			vocablesMap.put(word, index);
+			vocables.add(word);
+		}
+		return index;
 	}
 
 	public int size() {
 		return vocablesMap.size();
 	}
 
+	/**
+	 * Converts a text into one document line of the form
+	 * {@code <unique word count> <index1>:<count1> <index2>:<count2> ...}.
+	 * Unknown words are added to the vocabulary. Blank or null text yields "0".
+	 */
+	public String indexText(String in) {
+		// count unique words; split() returns an empty token for blank text or
+		// leading whitespace, which must not become a vocabulary entry
+		List<String> wordList = Arrays.asList((in == null ? "" : in).split("\\s+"));
+		Map<String, Integer> wordMap = new HashMap<>(wordList.size());
+		for (String word : wordList) {
+			if (word.isEmpty())
+				continue;
+			Integer count = wordMap.get(word);
+			wordMap.put(word, count == null ? 1 : count + 1);
+		}
+
+		StringBuilder sb = new StringBuilder();
+		sb.append(wordMap.size());
+		for (Entry<String, Integer> e : wordMap.entrySet())
+			sb.append(" ").append(index(e.getKey())).append(":").append(e.getValue());
+		return sb.toString();
+	}
+
 	@Override
 	public void close() throws IOException {
 		write();
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DynNMFFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DynNMFFilebase.java
deleted file mode 100644
index abb7a473..00000000
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DynNMFFilebase.java
+++ /dev/null
@@ -1,66 +0,0 @@
-package de.vipra.cmd.file;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Calendar;
-import java.util.Date;
-import java.util.GregorianCalendar;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import de.vipra.cmd.ex.FilebaseException;
-import de.vipra.util.CalendarUtils;
-import de.vipra.util.Constants.WindowResolution;
-import de.vipra.util.FileUtils;
-import de.vipra.util.model.ArticleFull;
-
-public class DynNMFFilebase extends Filebase {
-
-	private final File modelDir;
-	private final WindowResolution windowResolution;
-	private final Map<String, File> dirMap;
-
-	public DynNMFFilebase(File dataDir, WindowResolution windowResolution) throws FilebaseException {
-		super(dataDir, "dynlda");
-		this.modelDir = super.getModelDir();
-		this.windowResolution = windowResolution;
-		this.dirMap = new HashMap<>();
-	}
-
-	@Override
-	public void write(List<ArticleFull> articles) throws IOException {
-		if (!articles.isEmpty()) {
-			for (ArticleFull article : articles) {
-				File windowDir = getWindowDir(article.getDate());
-				File articleFile = new File(windowDir, article.getId().toString());
-				FileUtils.writeStringToFile(articleFile, article.getTitle() + "\n" + article.getProcessedText());
-			}
-		}
-	}
-
-	private File getWindowDir(Date date) {
-		Calendar c = new GregorianCalendar();
-		c.setTime(date);
-		String dirName = "" + c.get(Calendar.YEAR);
-		switch (windowResolution) {
-			case QUARTERLY:
-				dirName += "-" + CalendarUtils.getQuarter(c);
-				break;
-			case MONTHLY:
-				dirName += "-" + c.get(Calendar.MONTH);
-				break;
-			case YEARLY:
-			default:
-				break;
-		}
-		File dir = dirMap.get(dirName);
-		if (dir == null) {
-			dir = new File(modelDir, dirName);
-			dir.mkdirs();
-			dirMap.put(dirName, dir);
-		}
-		return dir;
-	}
-
-}
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java
index 58c0f596..df10b337 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java
@@ -6,6 +6,9 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
 import de.vipra.cmd.ex.FilebaseException;
 import de.vipra.util.Config;
 import de.vipra.util.Constants;
@@ -14,14 +17,13 @@ import de.vipra.util.model.ArticleFull;
 
 public abstract class Filebase implements Closeable {
 
+	public static final Logger log = LogManager.getLogger(Filebase.class);
+
 	private final String modelName;
 	private final File modelDir;
 	private final FilebaseIndex index;
-	private final FilebaseVocabulary vocab;
 	private final List<ArticleFull> articles;
 
-	private final int bufferMaxSize = 100;
-
 	public Filebase(File dataDir, String modelName) throws FilebaseException {
 		this.modelName = modelName;
 		this.modelDir = new File(dataDir, modelName);
@@ -31,19 +33,20 @@ public abstract class Filebase implements Closeable {
 			}
 		}
 		try {
-			this.index = new FilebaseIndex(new File(modelDir, Constants.INDEX_FILE));
-			this.vocab = new FilebaseVocabulary(new File(modelDir, Constants.VOCAB_FILE));
+			this.index = new FilebaseIndex(getModelFile("index"));
 		} catch (IOException e) {
 			throw new FilebaseException("could not read index: " + e.getMessage());
 		}
-		this.articles = new ArrayList<>(bufferMaxSize);
+		this.articles = new ArrayList<>(Constants.IMPORT_BUFFER_MAX);
 	}
 
 	public File getModelDir() {
 		return modelDir;
 	}
 
-	public File getModelFile() {
+	public File getModelFile(String fileName) {
+		if (fileName != null)
+			return new File(modelDir, fileName);
 		return new File(modelDir, modelName);
 	}
 
@@ -51,10 +54,6 @@ public abstract class Filebase implements Closeable {
 		return index;
 	}
 
-	public FilebaseVocabulary getVocab() {
-		return vocab;
-	}
-
 	public List<ArticleFull> getArticles() {
 		return articles;
 	}
@@ -63,18 +62,17 @@ public abstract class Filebase implements Closeable {
 	public void close() throws IOException {
 		write(articles);
 		index.close();
-		vocab.close();
 	}
 
 	public void add(ArticleFull article) throws FilebaseException {
-		String[] words = article.getProcessedText().split("\\s+");
-		vocab.addVocabulary(words);
 		index.add(article.getId().toString());
 		articles.add(article);
 
-		if (articles.size() >= bufferMaxSize) {
+		if (articles.size() >= Constants.IMPORT_BUFFER_MAX) {
 			try {
+				log.info("buffer filled, writing filebase");
 				write(articles);
+				articles.clear();
 			} catch (IOException e) {
 				throw new FilebaseException(e);
 			}
@@ -86,8 +84,8 @@ public abstract class Filebase implements Closeable {
 	public static Filebase getFilebase(Config config) throws FilebaseException, ConfigException {
 		File dataDir = config.getDataDirectory();
 		switch (config.analyzer) {
-			case DYNNMF:
-				return new DynNMFFilebase(dataDir, config.windowResolution);
+			case DTM:
+				return new DTMFilebase(dataDir);
 			case JGIBB:
 				return new JGibbFilebase(dataDir);
 			default:
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java
index 9efba97a..28f7a47a 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java
@@ -25,7 +25,7 @@ public class FilebaseIndex implements Closeable, Iterable<String> {
 	}
 
 	public void write() throws IOException {
-		FileUtils.writeLines(file, Constants.FB_ENCODING.name(), index, null, false);
+		FileUtils.writeLines(file, Constants.FILEBASE_ENCODING.name(), index, null, false);
 	}
 
 	public int add(String id) {
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java
index 0883a698..95003d31 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java
@@ -1,8 +1,10 @@
 package de.vipra.cmd.file;
 
+import java.io.BufferedWriter;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.RandomAccessFile;
+import java.io.OutputStreamWriter;
 import java.util.List;
 
 import de.vipra.cmd.ex.FilebaseException;
@@ -14,27 +16,21 @@ public class JGibbFilebase extends Filebase {
 
 	public JGibbFilebase(File dataDir) throws FilebaseException {
 		super(dataDir, "jgibb");
-		this.modelFile = getModelFile();
+		this.modelFile = getModelFile(null);
 	}
 
 	@Override
 	public void write(List<ArticleFull> articles) throws IOException {
 		if (!articles.isEmpty()) {
-			boolean linesep = modelFile.exists();
-			RandomAccessFile raf = new RandomAccessFile(modelFile, "rw");
-
-			// write articles
-			raf.seek(raf.length());
-			for (ArticleFull a : articles) {
-				if (linesep)
-					raf.writeBytes(System.lineSeparator());
-				else
-					linesep = true;
-				raf.writeBytes(a.getProcessedText());
-			}
-
-			raf.close();
-			articles.clear();
+			// Append one article per line. write() is called for every filled
+			// import buffer, so truncating the file would drop earlier articles.
+			try (BufferedWriter writer = new BufferedWriter(
+					new OutputStreamWriter(new FileOutputStream(modelFile, true)))) {
+				for (ArticleFull article : articles) {
+					writer.write(article.getProcessedText());
+					writer.newLine();
+				}
+			}
 		}
 	}
 
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java
index 0e1c87c7..01cc8d15 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java
@@ -49,8 +49,8 @@ public abstract class Analyzer {
 	public static Analyzer getAnalyzer(Config config, WordMap wordMap) throws AnalyzerException {
 		Analyzer analyzer = null;
 		switch (config.analyzer) {
-			case DYNNMF:
-				analyzer = new DynNMFAnalyzer();
+			case DTM:
+				analyzer = new DTMAnalyzer();
 				break;
 			case JGIBB:
 				analyzer = new JGibbAnalyzer();
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DynNMFAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
similarity index 87%
rename from vipra-cmd/src/main/java/de/vipra/cmd/lda/DynNMFAnalyzer.java
rename to vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
index 8112c866..30607070 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DynNMFAnalyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
@@ -9,10 +9,10 @@ import de.vipra.util.WordMap;
 import de.vipra.util.model.TopicFull;
 import de.vipra.util.model.TopicRef;
 
-public class DynNMFAnalyzer extends Analyzer {
+public class DTMAnalyzer extends Analyzer {
 
-	protected DynNMFAnalyzer() {
-		super("Dynamic NMF Analyzer");
+	protected DTMAnalyzer() {
+		super("Dynamic Topic Model Analyzer");
 	}
 
 	@Override
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java
index bb34530d..88a8e4bc 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java
@@ -18,9 +18,11 @@ import de.vipra.cmd.file.Filebase;
 import de.vipra.cmd.text.ProcessedText;
 import de.vipra.cmd.text.Processor;
 import de.vipra.util.Config;
+import de.vipra.util.Constants;
 import de.vipra.util.StringUtils;
 import de.vipra.util.Timer;
 import de.vipra.util.WordMap;
+import de.vipra.util.ex.DatabaseException;
 import de.vipra.util.model.Article;
 import de.vipra.util.model.ArticleFull;
 import de.vipra.util.model.ArticleStats;
@@ -29,6 +31,28 @@ import de.vipra.util.service.DatabaseService;
 
 public class ImportCommand implements Command {
 
+	/** Buffers imported articles and stores them in the database in batches. */
+	public static class ArticleBuffer {
+		private final DatabaseService<ArticleFull, ObjectId> dbArticles;
+		private final List<ArticleFull> articles = new ArrayList<>(Constants.IMPORT_BUFFER_MAX);
+		public ArticleBuffer(DatabaseService<ArticleFull, ObjectId> dbArticles) {
+			this.dbArticles = dbArticles;
+		}
+		/** Adds an article; saves once IMPORT_BUFFER_MAX articles are buffered. */
+		public void add(ArticleFull article) throws DatabaseException {
+			articles.add(article);
+			if (articles.size() >= Constants.IMPORT_BUFFER_MAX)
+				save();
+		}
+
+		/** Stores the buffered articles, if any, and empties the buffer. */
+		public void save() throws DatabaseException {
+			if (!articles.isEmpty())
+				dbArticles.createMultiple(articles);
+			articles.clear();
+		}
+	}
+
 	public static final Logger log = LogManager.getLogger(ImportCommand.class);
 	public static final Logger out = LogManager.getLogger("shellout");
 
@@ -40,6 +64,7 @@ public class ImportCommand implements Command {
 	private Filebase filebase;
 	private Processor preprocessor;
 	private WordMap wordMap;
+	private ArticleBuffer articleBuffer;
 
 	/**
 	 * Import command to import articles into the database, do topic modeling
@@ -99,7 +124,7 @@ public class ImportCommand implements Command {
 		// add article to mongodb
 		article.setProcessedText(processedText.getText());
 		article.setStats(articleStats);
-		article = dbArticles.createSingle(article);
+		articleBuffer.add(article);
 
 		// add words
 		if (config.saveAllWords) {
@@ -145,6 +170,7 @@ public class ImportCommand implements Command {
 
 	private ArticleFull articleFromJSON(JSONObject obj) {
 		ArticleFull article = new ArticleFull();
+		article.setId(new ObjectId());
 		if (obj.containsKey("title"))
 			article.setTitle(obj.get("title").toString());
 		if (obj.containsKey("text"))
@@ -164,6 +190,7 @@ public class ImportCommand implements Command {
 		filebase = Filebase.getFilebase(config);
 		preprocessor = Processor.getProcessor(config);
 		wordMap = new WordMap(dbWords);
+		articleBuffer = new ArticleBuffer(dbArticles);
 
 		out.info("using data directory: " + config.getDataDirectory().getAbsolutePath());
 		out.info("using preprocessor: " + preprocessor.getName());
@@ -176,6 +203,7 @@ public class ImportCommand implements Command {
 		 */
 		out.info("file import");
 		List<Article> importedArticles = importFiles(files);
+		articleBuffer.save();
 		timer.lap("import");
 
 		/*
@@ -184,13 +212,12 @@ public class ImportCommand implements Command {
 		out.info("writing file index");
 		filebase.close();
 		timer.lap("filebase write");
-		
+
 		/*
 		 * save words
 		 */
 		out.info("saving words");
 		Set<Word> importedWords = wordMap.getNewWords();
-		timer.lap("saving topic refs and indexing");
 		wordMap.create();
 		timer.lap("saving words");
 
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java
index 8264c003..79fa23e8 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java
@@ -1,14 +1,10 @@
 package de.vipra.cmd.option;
 
-import java.io.File;
-
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.bson.types.ObjectId;
 
-import de.vipra.cmd.file.Filebase;
 import de.vipra.util.Config;
-import de.vipra.util.StringUtils;
 import de.vipra.util.model.Article;
 import de.vipra.util.model.Topic;
 import de.vipra.util.model.Word;
@@ -20,14 +16,11 @@ public class StatsCommand implements Command {
 	public static final Logger out = LogManager.getLogger("shellout");
 
 	private Config config;
-	private Filebase filebase;
 	private DatabaseService<Article, ObjectId> dbArticles;
 	private DatabaseService<Topic, ObjectId> dbTopics;
 	private DatabaseService<Word, String> dbWords;
 
 	private void stats() {
-		File modelFile = filebase.getModelFile();
-		out.info("filebase size: " + StringUtils.humanReadableByteCount(modelFile.length(), true));
 		out.info("# of articles: " + dbArticles.count());
 		out.info("# of topics  : " + dbTopics.count());
 		out.info("# of words   : " + dbWords.count());
@@ -36,7 +29,6 @@ public class StatsCommand implements Command {
 	@Override
 	public void run() throws Exception {
 		config = Config.getConfig();
-		filebase = Filebase.getFilebase(config);
 		dbArticles = DatabaseService.getDatabaseService(config, Article.class);
 		dbTopics = DatabaseService.getDatabaseService(config, Topic.class);
 		dbWords = DatabaseService.getDatabaseService(config, Word.class);
diff --git a/vipra-cmd/src/main/resources/config.properties b/vipra-cmd/src/main/resources/config.properties
index 0778073f..f7c0fa9e 100644
--- a/vipra-cmd/src/main/resources/config.properties
+++ b/vipra-cmd/src/main/resources/config.properties
@@ -2,5 +2,5 @@ db.host=localhost
 db.port=27017
 db.name=test
 tm.processor=corenlp
-tm.analyzer=jgibb
+tm.analyzer=dtm
 tm.saveallwords=false
\ No newline at end of file
diff --git a/vipra-util/src/main/java/de/vipra/util/AbstractCache.java b/vipra-util/src/main/java/de/vipra/util/AbstractCache.java
index f80100ac..8e72bda8 100644
--- a/vipra-util/src/main/java/de/vipra/util/AbstractCache.java
+++ b/vipra-util/src/main/java/de/vipra/util/AbstractCache.java
@@ -7,9 +7,9 @@ public interface AbstractCache<T, U> {
 	void put(T t, U u);
 
 	void remove(T t);
-	
+
 	boolean contains(T t);
-	
+
 	void clear();
 
 }
diff --git a/vipra-util/src/main/java/de/vipra/util/Config.java b/vipra-util/src/main/java/de/vipra/util/Config.java
index b03c6999..94cfb3b7 100644
--- a/vipra-util/src/main/java/de/vipra/util/Config.java
+++ b/vipra-util/src/main/java/de/vipra/util/Config.java
@@ -31,13 +31,13 @@ public class Config {
 	 */
 
 	@ConfigKey("db.host")
-	public String databaseHost = Constants.DB_HOST;
+	public String databaseHost = Constants.DATABASE_HOST;
 
 	@ConfigKey("db.port")
-	public int databasePort = Constants.DB_PORT;
+	public int databasePort = Constants.DATABASE_PORT;
 
 	@ConfigKey("db.name")
-	public String databaseName = Constants.DB_NAME;
+	public String databaseName = Constants.DATABASE_NAME;
 
 	@ConfigKey("tm.processor")
 	public Processor processor = Constants.Processor.DEFAULT();
@@ -234,19 +234,18 @@ public class Config {
 	}
 
 	public String hash() {
-		String config = databaseHost + databasePort + databaseName + processor + analyzer + windowResolution
-				+ saveAllWords;
+		String config = databaseHost + databasePort + databaseName + processor + analyzer + saveAllWords;
 		return DigestUtils.md5(config);
 	}
 
 	public static File getGenericDataDir() {
 		File base = PathUtils.appDataDir();
-		return new File(base, Constants.FB_DIR);
+		return new File(base, Constants.FILEBASE_DIR);
 	}
 
 	public static File getGenericConfigDir() {
 		File base = PathUtils.appConfigDir();
-		return new File(base, Constants.FB_DIR);
+		return new File(base, Constants.FILEBASE_DIR);
 	}
 
 	public static Config getConfig() throws IOException, ConfigException {
diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java
index 034893c2..9b338d50 100644
--- a/vipra-util/src/main/java/de/vipra/util/Constants.java
+++ b/vipra-util/src/main/java/de/vipra/util/Constants.java
@@ -3,6 +3,9 @@ package de.vipra.util;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.GregorianCalendar;
 import java.util.List;
 
 public class Constants {
@@ -11,24 +14,29 @@ public class Constants {
 	 * FILEBASE
 	 */
 
-	public static final String FB_DIR = "vipra";
-	public static final Charset FB_ENCODING = StandardCharsets.UTF_8;
+	public static final String FILEBASE_DIR = "vipra";
+	public static final Charset FILEBASE_ENCODING = StandardCharsets.UTF_8;
+	public static final String LINE_SEP = System.lineSeparator();
+
+	/**
+	 * Size of the buffer used while importing files into the database and
+	 * filebase, in number of articles.
+	 */
+	public static final int IMPORT_BUFFER_MAX = 1000;
 
 	/*
 	 * FILES
 	 */
 
 	public static final String CONFIG_FILE = "config.properties";
-	public static final String INDEX_FILE = "index";
-	public static final String VOCAB_FILE = "vocab";
 
 	/*
 	 * DATABASE
 	 */
 
-	public static final String DB_HOST = "localhost";
-	public static final int DB_PORT = 27017;
-	public static final String DB_NAME = "test";
+	public static final String DATABASE_HOST = "localhost";
+	public static final int DATABASE_PORT = 27017;
+	public static final String DATABASE_NAME = "test";
 
 	/*
 	 * ELASTICSEARCH
@@ -248,7 +256,7 @@ public class Constants {
 	 */
 	public static enum Analyzer {
 		JGIBB("jgibb"),
-		DYNNMF("dynnmf");
+		DTM("dtm");
 
 		public final String name;
 
@@ -294,6 +302,24 @@ public class Constants {
 			this.name = def.name;
 		}
 
+		/** Returns the window key of a date: year, year-quarter or year-month. */
+		public String fromDate(Date date) {
+			Calendar c = new GregorianCalendar();
+			c.setTime(date);
+			String str = c.get(Calendar.YEAR) + "";
+			switch (this) {
+				case QUARTERLY:
+					return str + "-" + CalendarUtils.getQuarter(c);
+				case MONTHLY:
+					// Calendar.MONTH is zero-based (JANUARY == 0): shift to 1-12 and
+					// zero-pad so that the keys sort chronologically as strings
+					int month = c.get(Calendar.MONTH) + 1;
+					return str + "-" + (month < 10 ? "0" : "") + month;
+				default:
+					return str;
+			}
+		}
+
 		public static WindowResolution DEFAULT() {
 			return YEARLY;
 		}
diff --git a/vipra-util/src/main/java/de/vipra/util/FileUtils.java b/vipra-util/src/main/java/de/vipra/util/FileUtils.java
index d9b6bbd2..10ab794c 100644
--- a/vipra-util/src/main/java/de/vipra/util/FileUtils.java
+++ b/vipra-util/src/main/java/de/vipra/util/FileUtils.java
@@ -1,12 +1,16 @@
 package de.vipra.util;
 
 import java.io.BufferedInputStream;
+import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.nio.file.Files;
 import java.nio.file.Paths;
+import java.util.Iterator;
 import java.util.List;
 
 public class FileUtils extends org.apache.commons.io.FileUtils {
@@ -19,7 +23,7 @@ public class FileUtils extends org.apache.commons.io.FileUtils {
 	}
 
 	public static List<String> readFile(File file) throws IOException {
-		return Files.readAllLines(Paths.get(file.getAbsolutePath()), Constants.FB_ENCODING);
+		return Files.readAllLines(Paths.get(file.getAbsolutePath()), Constants.FILEBASE_ENCODING);
 	}
 
 	public static InputStream getResource(String name) {
@@ -59,4 +63,58 @@ public class FileUtils extends org.apache.commons.io.FileUtils {
 		}
 	}
 
+	/**
+	 * Lazily iterates over the lines of a file, decoded in the filebase
+	 * encoding. The reader is closed when the end of the file is reached or
+	 * reading fails; a read failure is reported as UncheckedIOException by
+	 * hasNext() and next().
+	 *
+	 * @throws FileNotFoundException
+	 *             if the file cannot be opened for reading
+	 */
+	public static Iterator<String> iterateFileLines(File file) throws FileNotFoundException {
+		final BufferedReader reader = new BufferedReader(
+				new InputStreamReader(new FileInputStream(file), Constants.FILEBASE_ENCODING));
+		return new Iterator<String>() {
+
+			// line read ahead by hasNext(); null while nothing is buffered
+			private String nextLine;
+			private boolean done;
+
+			@Override
+			public boolean hasNext() {
+				if (nextLine == null && !done) {
+					try {
+						nextLine = reader.readLine();
+					} catch (IOException e) {
+						// surface the failure instead of reporting a truncated file
+						throw new java.io.UncheckedIOException(e);
+					} finally {
+						done = nextLine == null;
+						if (done)
+							closeReader();
+					}
+				}
+				return nextLine != null;
+			}
+
+			@Override
+			public String next() {
+				if (!hasNext())
+					throw new java.util.NoSuchElementException();
+				String line = nextLine;
+				nextLine = null;
+				return line;
+			}
+
+			private void closeReader() {
+				try {
+					reader.close();
+				} catch (IOException ignored) {
+					// nothing is left to read, a failed close cannot lose data
+				}
+			}
+		};
+	}
+
 }
diff --git a/vipra-util/src/main/java/de/vipra/util/model/FileModel.java b/vipra-util/src/main/java/de/vipra/util/model/FileModel.java
index de0ade9e..a2e6c82f 100644
--- a/vipra-util/src/main/java/de/vipra/util/model/FileModel.java
+++ b/vipra-util/src/main/java/de/vipra/util/model/FileModel.java
@@ -11,7 +11,7 @@ import de.vipra.util.Constants;
 public abstract class FileModel<IdType> implements Model<IdType> {
 
 	public void writeToFile(File file) throws IOException {
-		FileUtils.writeStringToFile(file, toFileString(), Constants.FB_ENCODING, false);
+		FileUtils.writeStringToFile(file, toFileString(), Constants.FILEBASE_ENCODING, false);
 	}
 
 	public abstract void fromFile(File file) throws IOException;
-- 
GitLab