updated dtm analyzer, some memory improvement with vocabulary

8efdb22d · Eike Cochu · a4bd5e98 · 8efdb22d · 8efdb22d · 8efdb22d
Commit 8efdb22d authored 9 years ago by Eike Cochu
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java
 package de.vipra.cmd.file;

-import java.io.BufferedWriter;
 import java.io.Closeable;
 import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.OutputStreamWriter;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;

-import de.vipra.util.Constants;
 import de.vipra.util.FileUtils;
 import de.vipra.util.model.Word;

-public class DTMVocabulary implements Closeable {
+public class DTMVocabulary implements Closeable, Iterable<String> {

 	public static final String FILE_NAME = "vocab";

 	private final File file;
-	private static int currVocablesSize;
 	private static List<String> vocables;
-	private static Map<String, Integer> vocablesMap;
-	private static List<String> newVocables = new ArrayList<>();
-	private static Map<String, Integer> newVocablesMap = new HashMap<>();
-	private static int nextIndex = 0;

 	public DTMVocabulary(final File modelDir) throws IOException {
 		this(modelDir, false);
@@ -36,45 +28,32 @@ public class DTMVocabulary implements Closeable {
 		this.file = new File(modelDir, FILE_NAME);
 		if (file.exists()) {
 			if (vocables == null || reread)
-				vocables = new ArrayList<>(FileUtils.readFile(file));
-			if (vocablesMap == null || reread)
-				vocablesMap = new HashMap<>(vocables.size());
-			for (final String vocable : vocables)
-				vocablesMap.put(vocable, nextIndex++);
-			currVocablesSize = vocables.size();
-		} else {
-			if (vocables == null)
-				vocables = new ArrayList<>(0);
-			if (vocablesMap == null)
-				vocablesMap = new HashMap<>(0);
-		}
+				vocables = FileUtils.readFile(file);
+		} else if (vocables == null || reread)
+			vocables = new ArrayList<>(500);
 	}

 	public void write() throws IOException {
-		BufferedWriter out = new BufferedWriter(
-				new OutputStreamWriter(new FileOutputStream(file, false), Constants.FILEBASE_ENCODING));
-		for (String word : vocables)
-			out.write(word + "\n");
-		for (String word : newVocables)
-			out.write(word + "\n");
-		out.close();
+		org.apache.commons.io.FileUtils.writeLines(file, vocables, false);
 	}

+	/**
+	 * Returns the position of the given word in the vocabulary list. If the
+	 * list does not contain the word yet, the word is appended and the index
+	 * of the new last element is returned.
+	 *
+	 * @param word
+	 *            the word to look up
+	 * @return the index of the word in the vocabulary list
+	 */
 	private int index(final String word) {
-		Integer index = vocablesMap.get(word);
-		if (index == null) {
-			index = newVocablesMap.get(word);
-			if (index == null) {
-				index = nextIndex++;
-				newVocablesMap.put(word, index);
-				newVocables.add(word);
-			}
+		int index = vocables.indexOf(word);
+		if (index == -1) {
+			vocables.add(word);
+			index = vocables.size() - 1;
 		}
 		return index;
 	}

+	/**
+	 * Returns the number of words currently held in the vocabulary list.
+	 *
+	 * @return the size of the vocabulary list
+	 */
 	public int size() {
-		return currVocablesSize + newVocablesMap.size();
+		return vocables.size();
+	}
+
+	/**
+	 * Returns the vocabulary entry at the given position wrapped in a new
+	 * Word.
+	 *
+	 * @param index
+	 *            zero-based position in the vocabulary list
+	 * @return a new Word for the entry, or null if the index is negative or
+	 *         not smaller than the vocabulary size
+	 */
+	public Word getWord(final int index) {
+		// negative indices are treated like too-large ones instead of
+		// letting List.get throw an IndexOutOfBoundsException
+		if (index < 0 || index >= vocables.size())
+			return null;
+		return new Word(vocables.get(index));
 	}

 	public String transform(final String[] words) {
@@ -100,23 +79,8 @@ public class DTMVocabulary implements Closeable {
 		return sb.toString();
 	}

-	public String getWordString(final int index) {
-		if (index < currVocablesSize)
-			return vocables.get(index);
-		else
-			return newVocables.get(index - currVocablesSize);
-	}
-
-	public Word getWord(final int index) {
-		String wordStr = getWordString(index);
-		return wordStr == null ? null : new Word(wordStr);
-	}
-
-	public List<Word> getNewWords() {
-		List<Word> words = new ArrayList<>(newVocables.size());
-		for (String vocable : newVocables)
-			words.add(new Word(vocable));
-		return words;
+	/**
+	 * Returns the vocabulary word at the given position by delegating
+	 * directly to the vocabulary list; no bounds check is done here.
+	 *
+	 * @param index
+	 *            zero-based position in the vocabulary list
+	 * @return the word stored at that position
+	 */
+	public String get(final int index) {
+		return vocables.get(index);
 	}

 	@Override
@@ -124,4 +88,9 @@ public class DTMVocabulary implements Closeable {
 		write();
 	}

-}
+	/**
+	 * Returns the iterator of the underlying vocabulary list.
+	 *
+	 * @return an iterator over the vocabulary words
+	 */
+	@Override
+	public Iterator<String> iterator() {
+		return vocables.iterator();
+	}
+
+}
\ No newline at end of file
--- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java
@@ -9,8 +9,10 @@ import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map.Entry;
+import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

@@ -142,10 +144,16 @@ public class DTMAnalyzer extends Analyzer {
 			final int sequencesCount = sequences.size();
 			// collects created topics
 			final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS);
+			// collects created words
+			final Set<Word> newWords = new HashSet<>(wordCount);
 			// collect mapping between words and topics
 			@SuppressWarnings("unchecked")
 			final Tuple<Double, Integer>[] wordTopicMapping = new Tuple[wordCount];

+			log.info("vocabulary size: " + wordCount);
+			log.info("sequences: " + sequencesCount);
+			log.info("topics: " + Constants.K_TOPICS);
+
 			// for each topic file
 			for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
 				final File seqFile = new File(outDirSeq,
@@ -156,8 +164,6 @@ public class DTMAnalyzer extends Analyzer {
 				final List<Sequence> newSequences = new ArrayList<>(sequencesCount);
 				final List<TopicWord> newTopicWords = new ArrayList<>(wordCount);
 				newTopic.setSequences(newSequences);
-				newTopic.setWords(newTopicWords);
-				newTopics.add(newTopic);

 				in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile)));

@@ -176,13 +182,18 @@ public class DTMAnalyzer extends Analyzer {
 				final double[] maxWordLikelinesses = ArrayUtils.findRowMaximum(likelinesses);
 				final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses);
 				final double maxOverallLikeliness = ArrayUtils.findMaximum(maxSeqLikelinesses);
+				final double minAcceptableLikeliness = (maxOverallLikeliness >= 0 ? 1
+						: 2 - Constants.MINIMUM_RELATIVE_PROB) * maxOverallLikeliness;

 				// static topic and word topic mapping
 				// most likely words form the static topic over all sequences
 				for (int idxWord = 0; idxWord < wordCount; idxWord++) {
-					if (maxWordLikelinesses[idxWord] >= Constants.MINIMUM_RELATIVE_PROB * maxOverallLikeliness) {
+					if (maxWordLikelinesses[idxWord] >= minAcceptableLikeliness) {
 						// add word to static topic
-						newTopicWords.add(new TopicWord(vocab.getWord(idxWord), maxWordLikelinesses[idxWord]));
+						final Word word = vocab.getWord(idxWord);
+						newWords.add(word);
+						final TopicWord topicWord = new TopicWord(word, maxWordLikelinesses[idxWord]);
+						newTopicWords.add(topicWord);

 						// check if better word topic mapping than previous
 						final Tuple<Double, Integer> tuple = wordTopicMapping[idxWord];
@@ -193,16 +204,30 @@ public class DTMAnalyzer extends Analyzer {
 					}
 				}

+				if (newTopicWords.isEmpty())
+					continue;
+
+				newTopic.setWords(newTopicWords);
+
+				String msg = "topic with " + newTopicWords.size() + " "
+						+ StringUtils.quantity(newTopicWords.size(), "word") + ", sequences: [";
+
 				// dynamic topics
 				// go through each sequence and gather all words that are above
 				// the minimum relative word likeliness
 				for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) {
 					final double maxSeqLikeliness = maxSeqLikelinesses[idxSeq];
+					final double minAcceptableSeqLikeliness = (maxSeqLikeliness >= 0 ? 1
+							: 2 - Constants.MINIMUM_RELATIVE_PROB) * maxSeqLikeliness;
 					final List<TopicWord> newSeqTopicWords = new ArrayList<>(wordCount);
 					for (int idxWord = 0; idxWord < wordCount; idxWord++) {
 						final double likeliness = likelinesses[idxWord][idxSeq];
-						if (likeliness >= Constants.MINIMUM_RELATIVE_PROB * maxSeqLikeliness)
-							newSeqTopicWords.add(new TopicWord(vocab.getWord(idxWord), likeliness));
+						if (likeliness >= minAcceptableSeqLikeliness) {
+							final Word word = vocab.getWord(idxWord);
+							newWords.add(word);
+							final TopicWord topicWord = new TopicWord(word, likeliness);
+							newSeqTopicWords.add(topicWord);
+						}
 					}
 					Collections.sort(newSeqTopicWords, Comparator.reverseOrder());

@@ -210,14 +235,24 @@ public class DTMAnalyzer extends Analyzer {
 					newSequence.setNumber(idxSeq);
 					newSequence.setWords(newSeqTopicWords);
 					newSequences.add(newSequence);
+
+					msg += " " + newSeqTopicWords.size();
 				}
+
+				log.info(msg + " ]");
+
+				newTopics.add(newTopic);
 			}

+			log.info("creating " + newTopics.size() + " " + StringUtils.quantity(newTopics.size(), "topic"));
+			log.info("creating " + newWords.size() + " " + StringUtils.quantity(newWords.size(), "word"));
+
 			// recreate topics and words
 			dbTopics.drop();
+			dbWords.drop();
 			try {
 				dbTopics.createMultiple(newTopics);
-				dbWords.createMultiple(vocab.getNewWords());
+				dbWords.createMultiple(newWords);
 			} catch (final DatabaseException e) {
 				throw new AnalyzerException(e);
 			}
@@ -265,6 +300,9 @@ public class DTMAnalyzer extends Analyzer {
 					}
 				}

+				log.info("article with " + totalCount + " topic refs and " + reducedCount + " reduced topic refs ("
+						+ (reducedCount - totalCount) + ")");
+
 				// calculate each accepted topic share
 				for (final TopicRef ref : newTopicRefs)
 					ref.setShare((double) ref.getCount() / reducedCount);

--- a/vipra-cmd/src/main/resources/config.properties
+++ b/vipra-cmd/src/main/resources/config.properties
@@ -6,4 +6,4 @@ es.port=9300
 tm.processor=corenlp
 tm.analyzer=dtm
 tm.dtmpath=/home/eike/repos/master/ma-impl/dtm_release/dtm/main
-tm.windowresolution=monthly
\ No newline at end of file
+tm.windowresolution=yearly
\ No newline at end of file
--- a/vipra-util/src/main/java/de/vipra/util/ArrayUtils.java
+++ b/vipra-util/src/main/java/de/vipra/util/ArrayUtils.java
@@ -14,9 +14,9 @@ public class ArrayUtils {
 	}

 	public static double[] findRowMaximum(final double[][] values) {
-		int rows = values.length;
-		int cols = values[0].length;
-		double[] maximum = new double[rows];
+		final int rows = values.length;
+		final int cols = values[0].length;
+		final double[] maximum = new double[rows];
 		Arrays.fill(maximum, Integer.MIN_VALUE);
 		for (int row = 0; row < rows; row++)
 			for (int col = 0; col < cols; col++)
@@ -26,9 +26,9 @@ public class ArrayUtils {
 	}

 	public static double[] findColMaximum(final double[][] values) {
-		int rows = values.length;
-		int cols = values[0].length;
-		double[] maximum = new double[cols];
+		final int rows = values.length;
+		final int cols = values[0].length;
+		final double[] maximum = new double[cols];
 		Arrays.fill(maximum, Integer.MIN_VALUE);
 		for (int row = 0; row < rows; row++)
 			for (int col = 0; col < cols; col++)

--- a/vipra-util/src/main/java/de/vipra/util/Constants.java
+++ b/vipra-util/src/main/java/de/vipra/util/Constants.java
@@ -68,7 +68,7 @@ public class Constants {
 	 * Number of topics to discover with topic modeling, if the selected topic
 	 * modeling library supports this parameter.
 	 */
-	public static final int K_TOPICS = 50;
+	public static final int K_TOPICS = 25;

 	/**
 	 * Number of words in a discovered topic, if the selected topic modeling
@@ -95,7 +95,7 @@ public class Constants {
 	/**
 	 * Dynamic maximum iterations. Used for dynamic topic modeling.
 	 */
-	public static final int DYNAMIC_MAX_ITER = 1000;
+	public static final int DYNAMIC_MAX_ITER = 500;

 	/**
 	 * Static iterations. Used for static topic modeling.

--- a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java
+++ b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java
@@ -21,7 +21,7 @@ public class Sequence implements Comparable<Sequence>, Serializable {
 		return date;
 	}

+	/**
+	 * Sets the date of this sequence. The given Date instance is stored as
+	 * is, without copying.
+	 *
+	 * @param date
+	 *            the date to store
+	 */
-	public void setDate(Date date) {
+	public void setDate(final Date date) {
 		this.date = date;
 	}