Skip to content
Snippets Groups Projects
Commit 8efdb22d authored by Eike Cochu's avatar Eike Cochu
Browse files

updated dtm analyzer, some memory improvement with vocabulary

parent a4bd5e98
No related merge requests found
package de.vipra.cmd.file;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import de.vipra.util.Constants;
import de.vipra.util.FileUtils;
import de.vipra.util.model.Word;
public class DTMVocabulary implements Closeable {
public class DTMVocabulary implements Closeable, Iterable<String> {
public static final String FILE_NAME = "vocab";
private final File file;
private static int currVocablesSize;
private static List<String> vocables;
private static Map<String, Integer> vocablesMap;
private static List<String> newVocables = new ArrayList<>();
private static Map<String, Integer> newVocablesMap = new HashMap<>();
private static int nextIndex = 0;
public DTMVocabulary(final File modelDir) throws IOException {
this(modelDir, false);
......@@ -36,45 +28,32 @@ public class DTMVocabulary implements Closeable {
this.file = new File(modelDir, FILE_NAME);
if (file.exists()) {
if (vocables == null || reread)
vocables = new ArrayList<>(FileUtils.readFile(file));
if (vocablesMap == null || reread)
vocablesMap = new HashMap<>(vocables.size());
for (final String vocable : vocables)
vocablesMap.put(vocable, nextIndex++);
currVocablesSize = vocables.size();
} else {
if (vocables == null)
vocables = new ArrayList<>(0);
if (vocablesMap == null)
vocablesMap = new HashMap<>(0);
}
vocables = FileUtils.readFile(file);
} else if (vocables == null || reread)
vocables = new ArrayList<>(500);
}
/**
 * Writes the vocabulary to {@code file}, replacing any existing content.
 *
 * NOTE(review): this span contains two complete write paths back to back, a
 * manual BufferedWriter loop and a commons-io writeLines call. They look like
 * the before/after sides of a change shown without +/- markers; confirm which
 * one belongs to the current revision.
 *
 * @throws IOException if opening, writing or closing the file fails
 */
public void write() throws IOException {
// Path 1: open the file in overwrite mode (append = false) with the charset
// named by Constants.FILEBASE_ENCODING.
BufferedWriter out = new BufferedWriter(
new OutputStreamWriter(new FileOutputStream(file, false), Constants.FILEBASE_ENCODING));
// One word per line: entries of vocables first, then entries of newVocables.
for (String word : vocables)
out.write(word + "\n");
for (String word : newVocables)
out.write(word + "\n");
// NOTE(review): out is not closed if one of the writes above throws.
out.close();
// Path 2: write every entry of vocables as a line; the trailing false means "do not append".
org.apache.commons.io.FileUtils.writeLines(file, vocables, false);
}
private int index(final String word) {
Integer index = vocablesMap.get(word);
if (index == null) {
index = newVocablesMap.get(word);
if (index == null) {
index = nextIndex++;
newVocablesMap.put(word, index);
newVocables.add(word);
}
int index = vocables.indexOf(word);
if (index == -1) {
vocables.add(word);
index = vocables.size() - 1;
}
return index;
}
/**
 * Returns the number of words in the vocabulary.
 *
 * NOTE(review): two return statements follow each other here, so the second
 * one is unreachable as written. They look like the before/after sides of a
 * change shown without +/- markers; confirm which one is current.
 *
 * @return the vocabulary size
 */
public int size() {
// Variant 1: the value of currVocablesSize plus the number of entries in newVocablesMap.
return currVocablesSize + newVocablesMap.size();
// Variant 2: the number of entries in the vocables list.
return vocables.size();
}
/**
 * Looks up the vocabulary entry at the given position and wraps it in a
 * {@link Word}.
 *
 * @param index
 *            zero-based position in the {@code vocables} list
 * @return a new {@code Word} for the entry, or {@code null} if
 *         {@code index} lies outside the list
 */
public Word getWord(final int index) {
    // Check both ends of the range: an upper-bound-only check lets a negative
    // index through to List.get, which throws instead of yielding null like
    // every other out-of-range index.
    if (index < 0 || index >= vocables.size()) {
        return null;
    }
    return new Word(vocables.get(index));
}
public String transform(final String[] words) {
......@@ -100,23 +79,8 @@ public class DTMVocabulary implements Closeable {
return sb.toString();
}
/**
 * Returns the word stored at a combined vocabulary index.
 *
 * Indices below {@code currVocablesSize} are read from {@code vocables};
 * all other indices are read from {@code newVocables} after subtracting
 * {@code currVocablesSize}.
 *
 * @param index
 *            combined index into both lists
 * @return the word at that index
 */
public String getWordString(final int index) {
    final boolean inNewVocables = index >= currVocablesSize;
    return inNewVocables ? newVocables.get(index - currVocablesSize) : vocables.get(index);
}
/**
 * Resolves the word string for the given index via
 * {@code getWordString} and wraps it in a {@link Word}.
 *
 * @param index
 *            index passed through to {@code getWordString}
 * @return a new {@code Word}, or {@code null} if the lookup yielded
 *         {@code null}
 */
public Word getWord(final int index) {
    final String text = getWordString(index);
    if (text != null) {
        return new Word(text);
    }
    return null;
}
public List<Word> getNewWords() {
List<Word> words = new ArrayList<>(newVocables.size());
for (String vocable : newVocables)
words.add(new Word(vocable));
return words;
public String get(final int index) {
return vocables.get(index);
}
@Override
......@@ -124,4 +88,9 @@ public class DTMVocabulary implements Closeable {
write();
}
}
/**
 * Returns an iterator over the vocabulary entries.
 *
 * @return the iterator obtained from the {@code vocables} list
 */
@Override
public Iterator<String> iterator() {
    final Iterator<String> entries = vocables.iterator();
    return entries;
}
}
\ No newline at end of file
......@@ -9,8 +9,10 @@ import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
......@@ -142,10 +144,16 @@ public class DTMAnalyzer extends Analyzer {
final int sequencesCount = sequences.size();
// collects created topics
final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS);
// collects created words
final Set<Word> newWords = new HashSet<>(wordCount);
// collect mapping between words and topics
@SuppressWarnings("unchecked")
final Tuple<Double, Integer>[] wordTopicMapping = new Tuple[wordCount];
log.info("vocabulary size: " + wordCount);
log.info("sequences: " + sequencesCount);
log.info("topics: " + Constants.K_TOPICS);
// for each topic file
for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
final File seqFile = new File(outDirSeq,
......@@ -156,8 +164,6 @@ public class DTMAnalyzer extends Analyzer {
final List<Sequence> newSequences = new ArrayList<>(sequencesCount);
final List<TopicWord> newTopicWords = new ArrayList<>(wordCount);
newTopic.setSequences(newSequences);
newTopic.setWords(newTopicWords);
newTopics.add(newTopic);
in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile)));
......@@ -176,13 +182,18 @@ public class DTMAnalyzer extends Analyzer {
final double[] maxWordLikelinesses = ArrayUtils.findRowMaximum(likelinesses);
final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses);
final double maxOverallLikeliness = ArrayUtils.findMaximum(maxSeqLikelinesses);
final double minAcceptableLikeliness = (maxOverallLikeliness >= 0 ? 1
: 2 - Constants.MINIMUM_RELATIVE_PROB) * maxOverallLikeliness;
// static topic and word topic mapping
// most likely words form the static topic over all sequences
for (int idxWord = 0; idxWord < wordCount; idxWord++) {
if (maxWordLikelinesses[idxWord] >= Constants.MINIMUM_RELATIVE_PROB * maxOverallLikeliness) {
if (maxWordLikelinesses[idxWord] >= minAcceptableLikeliness) {
// add word to static topic
newTopicWords.add(new TopicWord(vocab.getWord(idxWord), maxWordLikelinesses[idxWord]));
final Word word = vocab.getWord(idxWord);
newWords.add(word);
final TopicWord topicWord = new TopicWord(word, maxWordLikelinesses[idxWord]);
newTopicWords.add(topicWord);
// check if better word topic mapping than previous
final Tuple<Double, Integer> tuple = wordTopicMapping[idxWord];
......@@ -193,16 +204,30 @@ public class DTMAnalyzer extends Analyzer {
}
}
if (newTopicWords.isEmpty())
continue;
newTopic.setWords(newTopicWords);
String msg = "topic with " + newTopicWords.size() + " "
+ StringUtils.quantity(newTopicWords.size(), "word") + ", sequences: [";
// dynamic topics
// go through each sequence and gather all words that are above
// the minimum relative word likeliness
for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) {
final double maxSeqLikeliness = maxSeqLikelinesses[idxSeq];
final double minAcceptableSeqLikeliness = (maxSeqLikeliness >= 0 ? 1
: 2 - Constants.MINIMUM_RELATIVE_PROB) * maxSeqLikeliness;
final List<TopicWord> newSeqTopicWords = new ArrayList<>(wordCount);
for (int idxWord = 0; idxWord < wordCount; idxWord++) {
final double likeliness = likelinesses[idxWord][idxSeq];
if (likeliness >= Constants.MINIMUM_RELATIVE_PROB * maxSeqLikeliness)
newSeqTopicWords.add(new TopicWord(vocab.getWord(idxWord), likeliness));
if (likeliness >= minAcceptableSeqLikeliness) {
final Word word = vocab.getWord(idxWord);
newWords.add(word);
final TopicWord topicWord = new TopicWord(word, likeliness);
newSeqTopicWords.add(topicWord);
}
}
Collections.sort(newSeqTopicWords, Comparator.reverseOrder());
......@@ -210,14 +235,24 @@ public class DTMAnalyzer extends Analyzer {
newSequence.setNumber(idxSeq);
newSequence.setWords(newSeqTopicWords);
newSequences.add(newSequence);
msg += " " + newSeqTopicWords.size();
}
log.info(msg + " ]");
newTopics.add(newTopic);
}
log.info("creating " + newTopics.size() + " " + StringUtils.quantity(newTopics.size(), "topic"));
log.info("creating " + newWords.size() + " " + StringUtils.quantity(newWords.size(), "word"));
// recreate topics and words
dbTopics.drop();
dbWords.drop();
try {
dbTopics.createMultiple(newTopics);
dbWords.createMultiple(vocab.getNewWords());
dbWords.createMultiple(newWords);
} catch (final DatabaseException e) {
throw new AnalyzerException(e);
}
......@@ -265,6 +300,9 @@ public class DTMAnalyzer extends Analyzer {
}
}
log.info("article with " + totalCount + " topic refs and " + reducedCount + " reduced topic refs ("
+ (reducedCount - totalCount) + ")");
// calculate each accepted topic share
for (final TopicRef ref : newTopicRefs)
ref.setShare((double) ref.getCount() / reducedCount);
......
......@@ -6,4 +6,4 @@ es.port=9300
tm.processor=corenlp
tm.analyzer=dtm
tm.dtmpath=/home/eike/repos/master/ma-impl/dtm_release/dtm/main
tm.windowresolution=monthly
\ No newline at end of file
tm.windowresolution=yearly
\ No newline at end of file
......@@ -14,9 +14,9 @@ public class ArrayUtils {
}
public static double[] findRowMaximum(final double[][] values) {
int rows = values.length;
int cols = values[0].length;
double[] maximum = new double[rows];
final int rows = values.length;
final int cols = values[0].length;
final double[] maximum = new double[rows];
Arrays.fill(maximum, Integer.MIN_VALUE);
for (int row = 0; row < rows; row++)
for (int col = 0; col < cols; col++)
......@@ -26,9 +26,9 @@ public class ArrayUtils {
}
public static double[] findColMaximum(final double[][] values) {
int rows = values.length;
int cols = values[0].length;
double[] maximum = new double[cols];
final int rows = values.length;
final int cols = values[0].length;
final double[] maximum = new double[cols];
Arrays.fill(maximum, Integer.MIN_VALUE);
for (int row = 0; row < rows; row++)
for (int col = 0; col < cols; col++)
......
......@@ -68,7 +68,7 @@ public class Constants {
* Number of topics to discover with topic modeling, if the selected topic
* modeling library supports this parameter.
*/
public static final int K_TOPICS = 50;
public static final int K_TOPICS = 25;
/**
* Number of words in a discovered topic, if the selected topic modeling
......@@ -95,7 +95,7 @@ public class Constants {
/**
* Dynamic maximum iterations. Used for dynamic topic modeling.
*/
public static final int DYNAMIC_MAX_ITER = 1000;
public static final int DYNAMIC_MAX_ITER = 500;
/**
* Static iterations. Used for static topic modeling.
......
......@@ -21,7 +21,7 @@ public class Sequence implements Comparable<Sequence>, Serializable {
return date;
}
public void setDate(Date date) {
public void setDate(final Date date) {
this.date = date;
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment