Commit 396249a9 authored by Eike Cochu

updated dtm analyzer

dtm analyzer: updated model import code (unfinished)
added constants for dtm iterations
parent 1637a1a4
Showing with 231 additions and 75 deletions
@@ -57,7 +57,6 @@ echo "-------------------------------" >> $LOG
cd ./vipra-ui
./build.sh >> $LOG 2>&1
cd ..
-cp -r ./vipra-ui/public ./vipra-backend/src/main/webapp/public
if [ $? -ne 0 ]; then
    echo "error"
    exit 1
...
<?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0">
    <wb-module deploy-name="vipra-backend">
        <wb-resource deploy-path="/" source-path="/target/m2e-wtp/web-resources"/>
+       <wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
        <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
        <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/resources"/>
-       <wb-resource deploy-path="/" source-path="/src/main/webapp"/>
        <dependent-module archiveName="util-0.0.1-SNAPSHOT.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/vipra-util/vipra-util">
            <dependency-type>uses</dependency-type>
        </dependent-module>
...
@@ -81,8 +81,11 @@ public class InfoResource {
        info.put("const.topicautoname", Constants.TOPIC_AUTO_NAMING_WORDS);
        info.put("const.ktopics", Constants.K_TOPICS);
        info.put("const.ktopicwords", Constants.K_TOPIC_WORDS);
-       info.put("const.minimumlike", Constants.MINIMUM_LIKELINESS);
+       info.put("const.minimumlike", Constants.MINIMUM_RELATIVE_PROB);
        info.put("const.topicthresh", Constants.TOPIC_THRESHOLD);
+       info.put("const.dynminiter", Constants.DYNAMIC_MIN_ITER);
+       info.put("const.dynmaxiter", Constants.DYNAMIC_MAX_ITER);
+       info.put("const.statiter", Constants.STATIC_ITER);
        info.put("const.docminfreq", Constants.DOCUMENT_MIN_WORD_FREQ);
        info.put("const.docminlength", Constants.DOCUMENT_MIN_LENGTH);
        info.put("const.charsdisallow", Constants.CHARS_DISALLOWED);
...
@@ -6,12 +6,12 @@ import static de.vipra.cmd.CmdOptions.OPT_DEBUG;
import static de.vipra.cmd.CmdOptions.OPT_DEFAULTS;
import static de.vipra.cmd.CmdOptions.OPT_HELP;
import static de.vipra.cmd.CmdOptions.OPT_IMPORT;
+import static de.vipra.cmd.CmdOptions.OPT_INDEXING;
import static de.vipra.cmd.CmdOptions.OPT_MODELING;
import static de.vipra.cmd.CmdOptions.OPT_SHELL;
import static de.vipra.cmd.CmdOptions.OPT_SILENT;
import static de.vipra.cmd.CmdOptions.OPT_STATS;
import static de.vipra.cmd.CmdOptions.OPT_TEST;
-import static de.vipra.cmd.CmdOptions.OPT_INDEXING;

import java.util.ArrayList;
import java.util.List;
...
@@ -11,7 +11,6 @@ import java.util.List;

import de.vipra.cmd.ex.FilebaseException;
import de.vipra.cmd.file.DTMSequenceIndex.DTMDateIndexEntry;
-import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.FileUtils;
import de.vipra.util.ex.ConfigException;
@@ -27,17 +26,11 @@ public class DTMFilebase extends Filebase {

    public DTMFilebase(File dataDir) throws FilebaseException {
        super(dataDir, "dtm");
-       Config config;
-       try {
-           config = Config.getConfig();
-       } catch (IOException | ConfigException e) {
-           throw new FilebaseException(e);
-       }

        File modelDir = getModelDir();

        try {
-           this.seqindex = new DTMSequenceIndex(modelDir, config.windowResolution, false);
+           this.seqindex = new DTMSequenceIndex(modelDir);
-       } catch (IOException | ParseException e) {
+       } catch (IOException | ConfigException | ParseException e) {
            throw new FilebaseException("could not read date index file", e);
        }
...
@@ -16,9 +16,11 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;

+import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.Constants.WindowResolution;
import de.vipra.util.FileUtils;
+import de.vipra.util.ex.ConfigException;

public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> {
@@ -46,13 +48,19 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> {
    }

    private File file;
-   private static WindowResolution windowResolution;
+   private boolean readonly = false;
+   private WindowResolution windowResolution;
    private static List<DTMDateIndexEntry> entries;
    private static SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT);

-   public DTMSequenceIndex(File modelDir, WindowResolution wr, boolean reread) throws IOException, ParseException {
+   public DTMSequenceIndex(File modelDir) throws IOException, ParseException, ConfigException {
+       this(modelDir, false);
+   }
+
+   public DTMSequenceIndex(File modelDir, boolean reread) throws IOException, ParseException, ConfigException {
        this.file = new File(modelDir, "dates");
-       windowResolution = wr;
+       Config config = Config.getConfig();
+       this.windowResolution = config.windowResolution;

        if (file.exists()) {
            if (entries == null || reread) {
                List<String> dates = FileUtils.readFile(file);
@@ -78,6 +86,8 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> {

    @Override
    public void close() throws IOException {
+       if (readonly)
+           return;

        List<String> windows = new ArrayList<>();
        Map<String, Integer> windowSizes = new HashMap<>();
@@ -111,4 +121,8 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> {
        writer.close();
    }

+   public int size() {
+       return entries.size();
+   }
+
}
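The two constructors above replace the old three-argument one: the window resolution is now read from Config, the one-argument form delegates with reread = false, the new readonly flag lets close() skip rewriting the dates file, and size() exposes the number of sequence entries. A minimal usage sketch, assuming the package shown in the diff and a hypothetical model directory path:

import java.io.File;

import de.vipra.cmd.file.DTMSequenceIndex;

public class SequenceIndexExample {

    public static void main(String[] args) throws Exception {
        // hypothetical model directory; in the project it comes from Config.getDataDirectory()
        File modelDir = new File("/path/to/data/dtm");

        // one-argument constructor: window resolution is read from Config,
        // an existing dates file is reused (reread = false)
        DTMSequenceIndex sequences = new DTMSequenceIndex(modelDir);
        System.out.println("sequence entries: " + sequences.size());

        // new DTMSequenceIndex(modelDir, true) would force a re-read of the dates file
    }
}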
@@ -22,6 +22,10 @@ public class DTMVocabulary implements Closeable, Iterable<String> {
    private static Map<String, Integer> vocablesMap;
    private static int nextIndex = 1;

+   public DTMVocabulary(File modelDir) throws IOException {
+       this(modelDir, false);
+   }
+
    public DTMVocabulary(File modelDir, boolean reread) throws IOException {
        this.file = new File(modelDir, FILE_NAME);
        if (file.exists()) {
@@ -80,6 +84,10 @@ public class DTMVocabulary implements Closeable, Iterable<String> {
        return sb.toString();
    }

+   public String get(int index) {
+       return vocables.get(index);
+   }
+
    @Override
    public void close() throws IOException {
        write();
...
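DTMVocabulary gains a matching one-argument constructor and a get(int index) lookup, which the DTM import further down uses to map a row index of a topic's log-probability file back to the actual term. A small sketch under the same assumptions (project package as in the diff, hypothetical path):

import java.io.File;

import de.vipra.cmd.file.DTMVocabulary;

public class VocabularyLookupExample {

    public static void main(String[] args) throws Exception {
        File modelDir = new File("/path/to/data/dtm"); // hypothetical model directory

        // one-argument constructor delegates to (modelDir, reread = false)
        DTMVocabulary vocab = new DTMVocabulary(modelDir);

        // resolve the term behind the first row of a topic distribution
        System.out.println("term 0: " + vocab.get(0));
    }
}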
@@ -12,11 +12,13 @@ import de.vipra.util.FileUtils;

public class FilebaseIndex implements Closeable, Iterable<String> {

+   public static final String FILE_NAME = "index";
+
    private final File file;
    private final List<String> index;

-   public FilebaseIndex(File file) throws IOException {
-       this.file = file;
+   public FilebaseIndex(File modelDir) throws IOException {
+       this.file = new File(modelDir, FILE_NAME);
        if (file.exists()) {
            index = new ArrayList<>(FileUtils.readFile(file));
        } else {
...
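FilebaseIndex now receives the model directory and derives its file name from the new FILE_NAME constant, in line with the other filebase classes; the JGibbAnalyzer call site below no longer builds the File itself. A short sketch, assuming the size() accessor that the DTM import below relies on and a hypothetical path:

import java.io.File;

import de.vipra.cmd.file.FilebaseIndex;

public class FilebaseIndexExample {

    public static void main(String[] args) throws Exception {
        File modelDir = new File("/path/to/data/dtm"); // hypothetical model directory

        // the class resolves modelDir/index internally via FILE_NAME
        FilebaseIndex index = new FilebaseIndex(modelDir);
        System.out.println("indexed articles: " + index.size());
    }
}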
@@ -2,30 +2,49 @@ package de.vipra.cmd.lda;

import java.io.BufferedReader;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.List;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import org.bson.types.ObjectId;

import de.vipra.cmd.ex.AnalyzerException;
+import de.vipra.cmd.file.DTMSequenceIndex;
+import de.vipra.cmd.file.DTMVocabulary;
+import de.vipra.cmd.file.FilebaseIndex;
import de.vipra.util.Config;
import de.vipra.util.Constants;
+import de.vipra.util.FileUtils;
import de.vipra.util.StringUtils;
import de.vipra.util.ex.ConfigException;
+import de.vipra.util.ex.DatabaseException;
+import de.vipra.util.model.ArticleFull;
+import de.vipra.util.model.TopicFull;
+import de.vipra.util.model.TopicWord;
+import de.vipra.util.model.Word;
+import de.vipra.util.service.MongoService;

public class DTMAnalyzer extends Analyzer {

    public static final Logger log = LogManager.getLogger(DTMAnalyzer.class);

    public static final String NAME = "dtm";

-   public static final int dynamicMinIter = 100;
-   public static final int dynamicMaxIter = 1000;
-   public static final int staticIter = 100;

    private String command;
    private File modelDir;
    private File outDir;
+   private File outDirSeq;
+
+   private DTMVocabulary vocab;
+   private DTMSequenceIndex sequences;
+   private FilebaseIndex index;
+
+   private MongoService<ArticleFull, ObjectId> dbArticles;
+   private MongoService<TopicFull, ObjectId> dbTopics;
+   private MongoService<Word, String> dbWords;

    protected DTMAnalyzer() {
        super("Dynamic Topic Model Analyzer");
@@ -37,7 +56,16 @@ public class DTMAnalyzer extends Analyzer {
            File dataDir = config.getDataDirectory();
            this.modelDir = new File(dataDir, NAME);
            this.outDir = new File(modelDir, "out");
+           this.outDirSeq = new File(outDir, "lda-seq");
+
+           this.vocab = new DTMVocabulary(modelDir);
+           this.sequences = new DTMSequenceIndex(modelDir);
+           index = new FilebaseIndex(modelDir);
+
+           config = Config.getConfig();
+           dbArticles = MongoService.getDatabaseService(config, ArticleFull.class);
+           dbTopics = MongoService.getDatabaseService(config, TopicFull.class);
+           dbWords = MongoService.getDatabaseService(config, Word.class);
-       } catch (ConfigException e) {
+       } catch (ConfigException | IOException | ParseException e) {
            throw new AnalyzerException(e);
        }
@@ -66,11 +94,11 @@ public class DTMAnalyzer extends Analyzer {
                // alpha (default -10)
                "--alpha=0.01",
                // minimum number of iterations
-               "--lda_sequence_min_iter=" + dynamicMinIter,
+               "--lda_sequence_min_iter=" + Constants.DYNAMIC_MIN_ITER,
                // maximum number of iterations
-               "--lda_sequence_max_iter=" + dynamicMaxIter,
+               "--lda_sequence_max_iter=" + Constants.DYNAMIC_MAX_ITER,
                // em iter (default 20)
-               "--lda_max_em_iter=" + staticIter,
+               "--lda_max_em_iter=" + Constants.STATIC_ITER,
                // input file prefix
                "--corpus_prefix=" + corpusPrefix,
                // output directory
@@ -86,6 +114,7 @@ public class DTMAnalyzer extends Analyzer {
            if (!p.isAlive())
                throw new AnalyzerException("dtm process is dead");

+           // read from process output
            BufferedReader in = new BufferedReader(new InputStreamReader(p.getErrorStream()));
            String line;
@@ -99,7 +128,101 @@ public class DTMAnalyzer extends Analyzer {
            in.close();
            p.waitFor();

-           // TODO save model
+           List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS);
+           List<Word> newWords = new ArrayList<>(vocab.size());
+           int sequencesCount = sequences.size();
+
+           // read topic definition files
+           for (int i = 0; i < Constants.K_TOPICS; i++) {
+               File seqFile = new File(outDirSeq, "topic-" + StringUtils.padNumber(i, 3) + "-var-e-log-prob.dat");
+               if (!seqFile.exists()) {
+                   log.error("seq file " + seqFile.getName() + " not found");
+                   continue;
+               }
+
+               int lineCount = FileUtils.countLines(seqFile);
+               int wordsPerSequence = lineCount / sequencesCount;
+               if (wordsPerSequence * sequencesCount != lineCount) {
+                   log.error("unexpected number of words per sequence");
+                   continue;
+               }
+
+               in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile)));
+
+               // read file lines into word x sequence matrix
+               // gather maximum likeliness per sequence
+               double[] maxLikelinesses = new double[sequencesCount];
+               double[][] likelinesses = new double[wordsPerSequence][sequencesCount];
+               for (int idxWord = 0; idxWord < wordsPerSequence; idxWord++) {
+                   for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) {
+                       double likeliness = Double.parseDouble(in.readLine());
+                       likelinesses[idxWord][idxSeq] = likeliness;
+                       if (likeliness > maxLikelinesses[idxSeq])
+                           maxLikelinesses[idxSeq] = likeliness;
+                   }
+               }
+
+               in.close();
+
+               // go through each sequence and gather all words that are above
+               // the minimum relative word likeliness
+               for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) {
+                   double maxLikeliness = maxLikelinesses[idxSeq];
+                   List<TopicWord> newSeqTopicWords = new ArrayList<>(wordsPerSequence);
+                   for (int idxWord = 0; idxWord < wordsPerSequence; idxWord++) {
+                       double likeliness = likelinesses[idxWord][idxSeq];
+                       if (likeliness >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) {
+                           Word newWord = new Word(vocab.get(idxWord));
+                           newWords.add(newWord);
+                           TopicWord newTopicWord = new TopicWord(newWord, likeliness);
+                           newSeqTopicWords.add(newTopicWord);
+                           // TODO gather words for static topic
+                       }
+                   }
+                   // TODO create dynamic topic of sequence
+               }
+               // TODO create static topic
+           }
+
+           // recreate topics and words
+           dbTopics.drop();
+           dbWords.drop();
+           try {
+               dbTopics.createMultiple(newTopics);
+               dbWords.createMultiple(newWords);
+           } catch (DatabaseException e) {
+               throw new AnalyzerException(e);
+           }
+
+           // read gam.dat. It contains the topic proportions per article.
+           File gamFile = new File(outDir, "gam.dat");
+           if (!gamFile.exists()) {
+               throw new AnalyzerException("gam file " + gamFile.getName() + " not found");
+           }
+
+           in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile)));
+           for (int idxArticle = 0; idxArticle < index.size(); idxArticle++) {
+               double topicTotalProportions = 0;
+               double[] topicProportions = new double[Constants.K_TOPICS];
+
+               // gather individual topic proportions and sum of proportions
+               for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
+                   double topicProportion = Double.parseDouble(in.readLine());
+                   topicTotalProportions += topicProportion;
+                   topicProportions[idxTopic] = topicProportion;
+               }
+
+               // normalize proportions
+               for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++)
+                   topicProportions[idxTopic] /= topicTotalProportions;
+           }
+
+           // TODO create topicrefs
        } catch (IOException | InterruptedException e) {
            throw new AnalyzerException(e);
        }
...
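The new import code does two core computations: per sequence it keeps only words whose likeliness is at least MINIMUM_RELATIVE_PROB times that sequence's maximum, and per article it normalizes the raw topic proportions read from gam.dat so they sum to one. A self-contained sketch of just these two steps, using plain arrays instead of the project's model classes and non-negative example values for clarity (the actual var-e-log-prob files hold log values); the class and method names here are made up:

import java.util.ArrayList;
import java.util.List;

public class DtmImportSketch {

    // mirrors Constants.MINIMUM_RELATIVE_PROB from the diff above
    static final double MINIMUM_RELATIVE_PROB = 0.01;

    // returns the word indices kept for one sequence (time slice),
    // given a [word][sequence] matrix of likeliness values
    static List<Integer> keptWords(double[][] likelinesses, int idxSeq) {
        double max = Double.NEGATIVE_INFINITY;
        for (double[] row : likelinesses)
            max = Math.max(max, row[idxSeq]);

        List<Integer> kept = new ArrayList<>();
        for (int idxWord = 0; idxWord < likelinesses.length; idxWord++)
            if (likelinesses[idxWord][idxSeq] >= MINIMUM_RELATIVE_PROB * max)
                kept.add(idxWord);
        return kept;
    }

    // normalizes one article's raw topic proportions (one gam.dat block) in place
    static void normalize(double[] proportions) {
        double total = 0;
        for (double p : proportions)
            total += p;
        for (int i = 0; i < proportions.length; i++)
            proportions[i] /= total;
    }

    public static void main(String[] args) {
        double[][] likelinesses = { { 0.40, 0.10 }, { 0.02, 0.30 }, { 0.001, 0.0005 } };
        System.out.println(keptWords(likelinesses, 0)); // [0, 1]  (0.001 < 0.01 * 0.40)

        double[] proportions = { 2.0, 1.0, 1.0 };
        normalize(proportions);
        System.out.println(proportions[0]); // 0.5
    }
}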
@@ -7,10 +7,8 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
-import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
-import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
@@ -72,6 +70,7 @@ public class JGibbAnalyzer extends Analyzer {
            options.est = !options.estc;
            options.K = Constants.K_TOPICS;
            options.twords = Constants.K_TOPIC_WORDS;
+           options.niters = Constants.STATIC_ITER;

            modelFile = new File(modelDir, NAME);
            options.dfile = modelFile.getName();
@@ -83,7 +82,7 @@
            dbArticles = MongoService.getDatabaseService(config, ArticleFull.class);
            dbTopics = MongoService.getDatabaseService(config, TopicFull.class);
            dbWords = MongoService.getDatabaseService(config, Word.class);
-           index = new FilebaseIndex(new File(modelDir, "index"));
+           index = new FilebaseIndex(modelDir);
        } catch (Exception e) {
            throw new AnalyzerException(e);
        }
@@ -110,36 +109,48 @@ public class JGibbAnalyzer extends Analyzer {
            // the list of new topics
            List<TopicFull> newTopics = new ArrayList<>(options.K);
-           // a map of topic index -> topic. resolves topic ids from tassign file
-           Map<Integer, Topic> newTopicsMap = new HashMap<>(options.K);
            // set of new words
            Set<Word> newWords = new HashSet<>();

-           TopicFull newTopic = null;
            List<TopicWord> topicWords = null;
-           int topicNum = 0;
+           int topicIndex = -1;
+           double[] maxLikelinesses = new double[Constants.K_TOPICS];

-           // for each line
+           // create topics and determine maximum likeliness for each topic
            for (String line : lines) {
                if (!line.startsWith("\t")) {
-                   newTopic = new TopicFull();
-                   topicWords = new ArrayList<>();
+                   topicIndex++;
+                   topicWords = new ArrayList<>(Constants.K_TOPIC_WORDS);
+                   TopicFull newTopic = new TopicFull();
                    newTopic.setWords(topicWords);
                    newTopics.add(newTopic);
-                   newTopicsMap.put(topicNum++, new Topic(newTopic.getId()));
                    continue;
                }

                String[] parts = line.trim().split("\\s+");
                double likeliness = Double.parseDouble(parts[1]);

+               // determine maximum likeliness of this topic
+               if (likeliness > maxLikelinesses[topicIndex])
+                   maxLikelinesses[topicIndex] = likeliness;

                // check word likeliness
-               if (likeliness >= Constants.MINIMUM_LIKELINESS) {
-                   Word newWord = new Word(parts[0]);
-                   TopicWord topicWord = new TopicWord(newWord, likeliness);
-                   topicWords.add(topicWord);
-                   newWords.add(newWord);
-               }
+               topicWords.add(new TopicWord(new Word(parts[0]), likeliness));
            }

+           // filter out words below minimum relative likeliness, add accepted
+           // words to list of new words
+           for (topicIndex = 0; topicIndex < newTopics.size(); topicIndex++) {
+               TopicFull topic = newTopics.get(topicIndex);
+               double maxLikeliness = maxLikelinesses[topicIndex];
+               ArrayList<TopicWord> filteredTopicWords = new ArrayList<>(topic.getWords().size());
+               for (TopicWord word : topic.getWords()) {
+                   if (word.getLikeliness() >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) {
+                       filteredTopicWords.add(word);
+                       newWords.add(word.getWord());
+                   }
+               }
+               topic.setWords(filteredTopicWords);
+           }

            // sort topic words and generate topic name
@@ -189,10 +200,10 @@
                // check if topic above threshold
                if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) {
                    reducedCount += entry.getValue();
-                   Topic topic = newTopicsMap.get(Integer.parseInt(entry.getKey()));
+                   TopicFull topic = newTopics.get(Integer.parseInt(entry.getKey()));
                    TopicRef ref = new TopicRef();
                    ref.setCount(entry.getValue());
-                   ref.setTopic(topic);
+                   ref.setTopic(new Topic(topic.getId()));
                    newTopicRefs.add(ref);
                }
            }
...
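The reworked twords handling above is now two passes: the first groups the word/likeliness lines into topics and records each topic's maximum likeliness, the second drops every word below MINIMUM_RELATIVE_PROB times that maximum. A compact sketch of the same two-pass idea over plain strings (the tab-indented "word likeliness" format follows the loop above; class and method names are made up):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TwordsFilterSketch {

    static final double MINIMUM_RELATIVE_PROB = 0.01; // as in Constants above

    // lines: a topic header line, followed by tab-indented "word likeliness" lines
    static List<List<String>> filter(List<String> lines) {
        // pass 1: group word/likeliness pairs per topic
        List<List<String[]>> topics = new ArrayList<>();
        List<String[]> current = null;
        for (String line : lines) {
            if (!line.startsWith("\t")) {
                current = new ArrayList<>();
                topics.add(current);
                continue;
            }
            current.add(line.trim().split("\\s+"));
        }

        // pass 2: keep only words at or above the per-topic relative threshold
        List<List<String>> filtered = new ArrayList<>();
        for (List<String[]> topic : topics) {
            double max = Double.NEGATIVE_INFINITY;
            for (String[] pair : topic)
                max = Math.max(max, Double.parseDouble(pair[1]));

            List<String> kept = new ArrayList<>();
            for (String[] pair : topic)
                if (Double.parseDouble(pair[1]) >= MINIMUM_RELATIVE_PROB * max)
                    kept.add(pair[0]);
            filtered.add(kept);
        }
        return filtered;
    }

    public static void main(String[] args) {
        List<String> lines = Arrays.asList(
                "Topic 0:",
                "\teconomy 0.20",
                "\tmarket 0.05",
                "\tnoise 0.0001");
        System.out.println(filter(lines)); // [[economy, market]]
    }
}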
@@ -7,9 +7,9 @@ import java.util.Set;
import java.util.stream.Collectors;

import edu.stanford.nlp.ling.CoreAnnotation;
-import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
...
@@ -8,8 +8,8 @@ import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.ling.CoreAnnotation;
-import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
...
@@ -4,5 +4,5 @@ db.name=test
es.host=localhost
es.port=9300
tm.processor=corenlp
-tm.analyzer=jgibb
+tm.analyzer=dtm
tm.dtmpath=/home/eike/repos/master/dtm_release/dtm/main
\ No newline at end of file
+/******************************************************************************
+ * Vipra Application
+ * Configuration
+ ******************************************************************************/
/* globals Vipra */
(function() {
...
@@ -79,7 +79,7 @@ public class Constants {
    /**
     * Minimum likeliness of words. Words with lower likeliness are ignored
     */
-   public static final double MINIMUM_LIKELINESS = 0;
+   public static final double MINIMUM_RELATIVE_PROB = 0.01;

    /**
     * Topics with a share greater or equal to this number are regarded as
@@ -87,6 +87,21 @@
     */
    public static final double TOPIC_THRESHOLD = 0.01;

+   /**
+    * Dynamic minimum iterations. Used for dynamic topic modeling.
+    */
+   public static final int DYNAMIC_MIN_ITER = 100;
+
+   /**
+    * Dynamic maximum iterations. Used for dynamic topic modeling.
+    */
+   public static final int DYNAMIC_MAX_ITER = 1000;
+
+   /**
+    * Static iterations. Used for static topic modeling.
+    */
+   public static final int STATIC_ITER = 100;
+
    /**
     * Minimum word frequency for words to be used for topic modeling. All words
     * below this frequency in a document are filtered out before generating the
...
package de.vipra.util;

-import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
@@ -112,27 +111,12 @@ public class FileUtils extends org.apache.commons.io.FileUtils {
     * @throws IOException
     */
    public static int countLines(File file) throws IOException {
-       if (!file.exists()) {
-           return 0;
-       }
-       InputStream is = new BufferedInputStream(new FileInputStream(file));
-       try {
-           byte[] c = new byte[1024];
-           int count = 0;
-           int readChars = 0;
-           boolean empty = true;
-           while ((readChars = is.read(c)) != -1) {
-               empty = false;
-               for (int i = 0; i < readChars; ++i) {
-                   if (c[i] == '\n') {
-                       ++count;
-                   }
-               }
-           }
-           return (count == 0 && !empty) ? 1 : count;
-       } finally {
-           is.close();
-       }
+       BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
+       int lines = 0;
+       while (reader.readLine() != null)
+           lines++;
+       reader.close();
+       return lines;
    }

    /**
...
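The countLines rewrite swaps byte-level '\n' counting for BufferedReader.readLine(). Two behavioral notes: a final line without a trailing newline is now counted, and a missing file no longer yields 0 but a FileNotFoundException from FileInputStream, which is why the DTM import checks seqFile.exists() first. A small usage sketch (hypothetical path, file name pattern as produced by the dtm binary per the analyzer above):

import java.io.File;
import java.io.IOException;

import de.vipra.util.FileUtils;

public class CountLinesExample {

    public static void main(String[] args) throws IOException {
        // hypothetical location of a per-topic sequence file written by the dtm binary
        File seqFile = new File("/path/to/data/dtm/out/lda-seq/topic-000-var-e-log-prob.dat");

        if (seqFile.exists())
            System.out.println("lines: " + FileUtils.countLines(seqFile));
        else
            System.out.println("missing: " + seqFile.getName());
    }
}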
@@ -81,9 +81,9 @@ public class StringUtils {
        return StringUtils.join(parts);
    }

-   public static String padNumber(int lineCount) {
-       String lc = Integer.toString(lineCount);
-       while (lc.length() < 10) {
+   public static String padNumber(int number, int length) {
+       String lc = Integer.toString(number);
+       while (lc.length() < length) {
            lc = "0" + lc;
        }
        return lc;
...
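padNumber now takes the target width as a parameter instead of hard-coding 10, which is what lets the DTM analyzer build its three-digit topic file names. Quick usage sketch:

import de.vipra.util.StringUtils;

public class PadNumberExample {

    public static void main(String[] args) {
        // matches the call site in DTMAnalyzer: zero-pad the topic index to width 3
        String name = "topic-" + StringUtils.padNumber(7, 3) + "-var-e-log-prob.dat";
        System.out.println(name); // topic-007-var-e-log-prob.dat
    }
}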