diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java index 3840b76b78ea81fb4844febbbc768c95a69d7efa..9477f00f738c3c7226bbacf72211dff41db3cac3 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java @@ -1,32 +1,24 @@ package de.vipra.cmd.file; -import java.io.BufferedWriter; import java.io.Closeable; import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import de.vipra.util.Constants; import de.vipra.util.FileUtils; import de.vipra.util.model.Word; -public class DTMVocabulary implements Closeable { +public class DTMVocabulary implements Closeable, Iterable<String> { public static final String FILE_NAME = "vocab"; private final File file; - private static int currVocablesSize; private static List<String> vocables; - private static Map<String, Integer> vocablesMap; - private static List<String> newVocables = new ArrayList<>(); - private static Map<String, Integer> newVocablesMap = new HashMap<>(); - private static int nextIndex = 0; public DTMVocabulary(final File modelDir) throws IOException { this(modelDir, false); @@ -36,45 +28,32 @@ public class DTMVocabulary implements Closeable { this.file = new File(modelDir, FILE_NAME); if (file.exists()) { if (vocables == null || reread) - vocables = new ArrayList<>(FileUtils.readFile(file)); - if (vocablesMap == null || reread) - vocablesMap = new HashMap<>(vocables.size()); - for (final String vocable : vocables) - vocablesMap.put(vocable, nextIndex++); - currVocablesSize = vocables.size(); - } else { - if (vocables == null) - vocables = new ArrayList<>(0); - if (vocablesMap == null) - vocablesMap = new HashMap<>(0); - } + vocables = FileUtils.readFile(file); + } else if (vocables == null || reread) + vocables = new ArrayList<>(500); } public void write() throws IOException { - BufferedWriter out = new BufferedWriter( - new OutputStreamWriter(new FileOutputStream(file, false), Constants.FILEBASE_ENCODING)); - for (String word : vocables) - out.write(word + "\n"); - for (String word : newVocables) - out.write(word + "\n"); - out.close(); + org.apache.commons.io.FileUtils.writeLines(file, vocables, false); } private int index(final String word) { - Integer index = vocablesMap.get(word); - if (index == null) { - index = newVocablesMap.get(word); - if (index == null) { - index = nextIndex++; - newVocablesMap.put(word, index); - newVocables.add(word); - } + int index = vocables.indexOf(word); + if (index == -1) { + vocables.add(word); + index = vocables.size() - 1; } return index; } public int size() { - return currVocablesSize + newVocablesMap.size(); + return vocables.size(); + } + + public Word getWord(final int index) { + if (vocables.size() > index) + return new Word(vocables.get(index)); + return null; } public String transform(final String[] words) { @@ -100,23 +79,8 @@ public class DTMVocabulary implements Closeable { return sb.toString(); } - public String getWordString(final int index) { - if (index < currVocablesSize) - return vocables.get(index); - else - return newVocables.get(index - currVocablesSize); - } - - public Word getWord(final int index) { - String wordStr = getWordString(index); - return wordStr == null ? null : new Word(wordStr); - } - - public List<Word> getNewWords() { - List<Word> words = new ArrayList<>(newVocables.size()); - for (String vocable : newVocables) - words.add(new Word(vocable)); - return words; + public String get(final int index) { + return vocables.get(index); } @Override @@ -124,4 +88,9 @@ public class DTMVocabulary implements Closeable { write(); } -} + @Override + public Iterator<String> iterator() { + return vocables.iterator(); + } + +} \ No newline at end of file diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java index 3093cdc978bc100c73c0c26e73d0d2a289596a0b..412c6581fa81d9601d16b0e4e946858e01401281 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java @@ -9,8 +9,10 @@ import java.text.ParseException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; +import java.util.HashSet; import java.util.List; import java.util.Map.Entry; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -142,10 +144,16 @@ public class DTMAnalyzer extends Analyzer { final int sequencesCount = sequences.size(); // collects created topics final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS); + // collects created words + final Set<Word> newWords = new HashSet<>(wordCount); // collect mapping between words and topics @SuppressWarnings("unchecked") final Tuple<Double, Integer>[] wordTopicMapping = new Tuple[wordCount]; + log.info("vocabulary size: " + wordCount); + log.info("sequences: " + sequencesCount); + log.info("topics: " + Constants.K_TOPICS); + // for each topic file for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { final File seqFile = new File(outDirSeq, @@ -156,8 +164,6 @@ public class DTMAnalyzer extends Analyzer { final List<Sequence> newSequences = new ArrayList<>(sequencesCount); final List<TopicWord> newTopicWords = new ArrayList<>(wordCount); newTopic.setSequences(newSequences); - newTopic.setWords(newTopicWords); - newTopics.add(newTopic); in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile))); @@ -176,13 +182,18 @@ public class DTMAnalyzer extends Analyzer { final double[] maxWordLikelinesses = ArrayUtils.findRowMaximum(likelinesses); final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses); final double maxOverallLikeliness = ArrayUtils.findMaximum(maxSeqLikelinesses); + final double minAcceptableLikeliness = (maxOverallLikeliness >= 0 ? 1 + : 2 - Constants.MINIMUM_RELATIVE_PROB) * maxOverallLikeliness; // static topic and word topic mapping // most likely words form the static topic over all sequences for (int idxWord = 0; idxWord < wordCount; idxWord++) { - if (maxWordLikelinesses[idxWord] >= Constants.MINIMUM_RELATIVE_PROB * maxOverallLikeliness) { + if (maxWordLikelinesses[idxWord] >= minAcceptableLikeliness) { // add word to static topic - newTopicWords.add(new TopicWord(vocab.getWord(idxWord), maxWordLikelinesses[idxWord])); + final Word word = vocab.getWord(idxWord); + newWords.add(word); + final TopicWord topicWord = new TopicWord(word, maxWordLikelinesses[idxWord]); + newTopicWords.add(topicWord); // check if better word topic mapping than previous final Tuple<Double, Integer> tuple = wordTopicMapping[idxWord]; @@ -193,16 +204,30 @@ public class DTMAnalyzer extends Analyzer { } } + if (newTopicWords.isEmpty()) + continue; + + newTopic.setWords(newTopicWords); + + String msg = "topic with " + newTopicWords.size() + " " + + StringUtils.quantity(newTopicWords.size(), "word") + ", sequences: ["; + // dynamic topics // go through each sequence and gather all words that are above // the minimum relative word likeliness for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { final double maxSeqLikeliness = maxSeqLikelinesses[idxSeq]; + final double minAcceptableSeqLikeliness = (maxSeqLikeliness >= 0 ? 1 + : 2 - Constants.MINIMUM_RELATIVE_PROB) * maxSeqLikeliness; final List<TopicWord> newSeqTopicWords = new ArrayList<>(wordCount); for (int idxWord = 0; idxWord < wordCount; idxWord++) { final double likeliness = likelinesses[idxWord][idxSeq]; - if (likeliness >= Constants.MINIMUM_RELATIVE_PROB * maxSeqLikeliness) - newSeqTopicWords.add(new TopicWord(vocab.getWord(idxWord), likeliness)); + if (likeliness >= minAcceptableSeqLikeliness) { + final Word word = vocab.getWord(idxWord); + newWords.add(word); + final TopicWord topicWord = new TopicWord(word, likeliness); + newSeqTopicWords.add(topicWord); + } } Collections.sort(newSeqTopicWords, Comparator.reverseOrder()); @@ -210,14 +235,24 @@ public class DTMAnalyzer extends Analyzer { newSequence.setNumber(idxSeq); newSequence.setWords(newSeqTopicWords); newSequences.add(newSequence); + + msg += " " + newSeqTopicWords.size(); } + + log.info(msg + " ]"); + + newTopics.add(newTopic); } + log.info("creating " + newTopics.size() + " " + StringUtils.quantity(newTopics.size(), "topic")); + log.info("creating " + newWords.size() + " " + StringUtils.quantity(newWords.size(), "word")); + // recreate topics and words dbTopics.drop(); + dbWords.drop(); try { dbTopics.createMultiple(newTopics); - dbWords.createMultiple(vocab.getNewWords()); + dbWords.createMultiple(newWords); } catch (final DatabaseException e) { throw new AnalyzerException(e); } @@ -265,6 +300,9 @@ public class DTMAnalyzer extends Analyzer { } } + log.info("article with " + totalCount + " topic refs and " + reducedCount + " reduced topic refs (" + + (reducedCount - totalCount) + ")"); + // calculate each accepted topic share for (final TopicRef ref : newTopicRefs) ref.setShare((double) ref.getCount() / reducedCount); diff --git a/vipra-cmd/src/main/resources/config.properties b/vipra-cmd/src/main/resources/config.properties index ec94fae2b72d95f1c9943877573bc4892ae31b6d..6f38f34439e93fdde6eba975c5f46b7acddeee69 100644 --- a/vipra-cmd/src/main/resources/config.properties +++ b/vipra-cmd/src/main/resources/config.properties @@ -6,4 +6,4 @@ es.port=9300 tm.processor=corenlp tm.analyzer=dtm tm.dtmpath=/home/eike/repos/master/ma-impl/dtm_release/dtm/main -tm.windowresolution=monthly \ No newline at end of file +tm.windowresolution=yearly \ No newline at end of file diff --git a/vipra-util/src/main/java/de/vipra/util/ArrayUtils.java b/vipra-util/src/main/java/de/vipra/util/ArrayUtils.java index bf80400b360a50e8653169ab8f281b1481fe40a0..ce7ea92bbf0298d13251bce61e4b71cb884763cd 100644 --- a/vipra-util/src/main/java/de/vipra/util/ArrayUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/ArrayUtils.java @@ -14,9 +14,9 @@ public class ArrayUtils { } public static double[] findRowMaximum(final double[][] values) { - int rows = values.length; - int cols = values[0].length; - double[] maximum = new double[rows]; + final int rows = values.length; + final int cols = values[0].length; + final double[] maximum = new double[rows]; Arrays.fill(maximum, Integer.MIN_VALUE); for (int row = 0; row < rows; row++) for (int col = 0; col < cols; col++) @@ -26,9 +26,9 @@ public class ArrayUtils { } public static double[] findColMaximum(final double[][] values) { - int rows = values.length; - int cols = values[0].length; - double[] maximum = new double[cols]; + final int rows = values.length; + final int cols = values[0].length; + final double[] maximum = new double[cols]; Arrays.fill(maximum, Integer.MIN_VALUE); for (int row = 0; row < rows; row++) for (int col = 0; col < cols; col++) diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 2d3d58d09d9122f9e855a12b0a8d4131eff1fcb1..2466791f942bfdf3f6a577b80c4920bf247d2e68 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -68,7 +68,7 @@ public class Constants { * Number of topics to discover with topic modeling, if the selected topic * modeling library supports this parameter. */ - public static final int K_TOPICS = 50; + public static final int K_TOPICS = 25; /** * Number of words in a discovered topic, if the selected topic modeling @@ -95,7 +95,7 @@ public class Constants { /** * Dynamic maximum iterations. Used for dynamic topic modeling. */ - public static final int DYNAMIC_MAX_ITER = 1000; + public static final int DYNAMIC_MAX_ITER = 500; /** * Static iterations. Used for static topic modeling. diff --git a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java index bdb0177e15be7087694b3c371fe84f9afc970534..5078dcb31c4e17cebe4b0843b7770440dd18a6aa 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java @@ -21,7 +21,7 @@ public class Sequence implements Comparable<Sequence>, Serializable { return date; } - public void setDate(Date date) { + public void setDate(final Date date) { this.date = date; }