From 44d044b12d327aeed5ae4d332b70ad09761c643d Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Mon, 29 Feb 2016 22:56:44 +0100 Subject: [PATCH] updated dtm analyzer, first version --- .../de/vipra/cmd/file/DTMSequenceIndex.java | 37 ++++----- .../java/de/vipra/cmd/file/DTMVocabulary.java | 2 +- .../java/de/vipra/cmd/lda/DTMAnalyzer.java | 83 +++++++++++-------- .../src/main/resources/config.properties | 5 +- .../src/main/java/de/vipra/util/CountMap.java | 8 ++ 5 files changed, 76 insertions(+), 59 deletions(-) diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java index 8ce1e089..90cf68eb 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java @@ -12,14 +12,13 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Date; -import java.util.HashMap; import java.util.Iterator; import java.util.List; -import java.util.Map; import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.Constants.WindowResolution; +import de.vipra.util.CountMap; import de.vipra.util.FileUtils; import de.vipra.util.ex.ConfigException; @@ -48,11 +47,12 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT } } - private File file; - private boolean readonly = false; - private WindowResolution windowResolution; + private final File file; + private final boolean readonly = false; + private final WindowResolution windowResolution; private static List<DTMDateIndexEntry> entries; private static SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); + private static CountMap<String> windowSizes = new CountMap<>(); public DTMSequenceIndex(File modelDir) throws IOException, ParseException, ConfigException { this(modelDir, false); @@ -66,17 +66,21 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT if (entries == null || reread) { List<String> dates = FileUtils.readFile(file); entries = new ArrayList<>(dates.size()); - for (String date : dates) { - entries.add(new DTMDateIndexEntry(df.parse(date), true, null)); - } + for (String date : dates) + add(df.parse(date)); } } else if (entries == null || reread) { entries = new ArrayList<>(); } } + private void add(Date date) { + add(date, null); + } + public void add(Date date, String line) { - entries.add(new DTMDateIndexEntry(date, false, line)); + entries.add(new DTMDateIndexEntry(date, line == null, line)); + windowSizes.count(windowResolution.fromDate(date)); } @Override @@ -89,21 +93,12 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT public void close() throws IOException { if (readonly) return; - Map<String, Integer> windowSizes = new HashMap<>(); // write date index BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false))); for (DTMDateIndexEntry entry : entries) { writer.write(df.format(entry.date)); writer.write(Constants.LINE_SEP); - - String window = windowResolution.fromDate(entry.date); - Integer count = windowSizes.get(window); - if (count == null) { - windowSizes.put(window, 1); - } else { - windowSizes.put(window, count + 1); - } } writer.close(); @@ -112,7 +107,7 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(seqFile, false))); writer.write(Integer.toString(windowSizes.size())); writer.write(Constants.LINE_SEP); - + // write window sizes String[] windows = windowSizes.keySet().toArray(new String[windowSizes.size()]); Arrays.sort(windows); @@ -120,12 +115,12 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT writer.write(Integer.toString(windowSizes.get(window))); writer.write(Constants.LINE_SEP); } - + writer.close(); } public int size() { - return entries.size(); + return windowSizes.size(); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java index e9ceb68e..3a5c8975 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java @@ -20,7 +20,7 @@ public class DTMVocabulary implements Closeable, Iterable<String> { private File file; private static List<String> vocables; private static Map<String, Integer> vocablesMap; - private static int nextIndex = 1; + private static int nextIndex = 0; public DTMVocabulary(File modelDir) throws IOException { this(modelDir, false); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java index ac283617..71928d84 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java @@ -7,11 +7,10 @@ import java.io.IOException; import java.io.InputStreamReader; import java.text.ParseException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.Comparator; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -27,8 +26,8 @@ import de.vipra.cmd.file.FilebaseIndex; import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.CountMap; -import de.vipra.util.FileUtils; import de.vipra.util.StringUtils; +import de.vipra.util.Tuple; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleFull; @@ -141,7 +140,7 @@ public class DTMAnalyzer extends Analyzer { // TODO find out what proportions are good for and where to store // them - File gamFile = new File(outDir, "gam.dat"); + File gamFile = new File(outDirSeq, "gam.dat"); in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile))); for (int idxArticle = 0; idxArticle < index.size(); idxArticle++) { @@ -166,27 +165,24 @@ public class DTMAnalyzer extends Analyzer { // read topic definition files and create topics - Map<Word, Topic> topicWordMap = new HashMap<>(vocab.size()); - List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS); - List<Word> newWords = new ArrayList<>(vocab.size()); int sequencesCount = sequences.size(); + int wordCount = vocab.size(); + // collects created topics + List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS); + // collects created words + List<Word> newWords = new ArrayList<>(wordCount); + // collect mapping between words and topics + @SuppressWarnings("unchecked") + Tuple<Double, Integer>[] wordTopicMapping = (Tuple<Double, Integer>[]) new Tuple[wordCount]; // for each topic file for (int i = 0; i < Constants.K_TOPICS; i++) { File seqFile = new File(outDirSeq, "topic-" + StringUtils.padNumber(i, 3) + "-var-e-log-prob.dat"); - int lineCount = FileUtils.countLines(seqFile); - int wordsCount = lineCount / sequencesCount; - - if (wordsCount * sequencesCount != lineCount) { - log.error("unexpected number of words per sequence"); - continue; - } - // create new topic TopicFull newTopic = new TopicFull(); List<Sequence> newSequences = new ArrayList<>(sequencesCount); - List<TopicWord> newTopicWords = new ArrayList<>(wordsCount); + List<TopicWord> newTopicWords = new ArrayList<>(wordCount); newTopic.setSequences(newSequences); newTopic.setWords(newTopicWords); newTopics.add(newTopic); @@ -196,9 +192,11 @@ public class DTMAnalyzer extends Analyzer { // read file lines into word x sequence matrix // gather maximum likeliness per sequence and per word double[] maxSeqLikelinesses = new double[sequencesCount]; - double[] maxWordLikelinesses = new double[wordsCount]; - double[][] likelinesses = new double[wordsCount][sequencesCount]; - for (int idxWord = 0; idxWord < wordsCount; idxWord++) { + Arrays.fill(maxSeqLikelinesses, Integer.MIN_VALUE); + double[] maxWordLikelinesses = new double[wordCount]; + Arrays.fill(maxWordLikelinesses, Integer.MIN_VALUE); + double[][] likelinesses = new double[wordCount][sequencesCount]; + for (int idxWord = 0; idxWord < wordCount; idxWord++) { for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { double likeliness = Double.parseDouble(in.readLine()); likelinesses[idxWord][idxSeq] = likeliness; @@ -211,8 +209,18 @@ public class DTMAnalyzer extends Analyzer { in.close(); + // compare to current word <-> topic mapping, accept higher + // likeliness as better + for (int idxWord = 0; idxWord < maxWordLikelinesses.length; idxWord++) { + Tuple<Double, Integer> tuple = wordTopicMapping[idxWord]; + if (tuple == null) + wordTopicMapping[idxWord] = new Tuple<>(maxWordLikelinesses[idxWord], i); + else if (maxWordLikelinesses[idxWord] > tuple.first()) + tuple.setSecond(i); + } + // find maximum overall likeliness - double maxOverallLikeliness = 0; + double maxOverallLikeliness = Integer.MIN_VALUE; for (double likeliness : maxSeqLikelinesses) { if (likeliness > maxOverallLikeliness) maxOverallLikeliness = likeliness; @@ -220,7 +228,7 @@ public class DTMAnalyzer extends Analyzer { // static topic // most likely words form the static topic over all sequences - for (int idxWord = 0; idxWord < wordsCount; idxWord++) { + for (int idxWord = 0; idxWord < wordCount; idxWord++) { if (maxWordLikelinesses[idxWord] >= Constants.MINIMUM_RELATIVE_PROB * maxOverallLikeliness) { Word newWord = new Word(vocab.get(idxWord)); newWords.add(newWord); @@ -234,8 +242,8 @@ public class DTMAnalyzer extends Analyzer { // the minimum relative word likeliness for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { double maxLikeliness = maxSeqLikelinesses[idxSeq]; - List<TopicWord> newSeqTopicWords = new ArrayList<>(wordsCount); - for (int idxWord = 0; idxWord < wordsCount; idxWord++) { + List<TopicWord> newSeqTopicWords = new ArrayList<>(wordCount); + for (int idxWord = 0; idxWord < wordCount; idxWord++) { double likeliness = likelinesses[idxWord][idxSeq]; if (likeliness >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) { Word newWord = new Word(vocab.get(idxWord)); @@ -272,14 +280,18 @@ public class DTMAnalyzer extends Analyzer { // for each article in the model file while ((line = in.readLine()) != null) { - // extract unique word ids and count + // get topic id from word id, count topics CountMap<Integer> countMap = new CountMap<>(); Matcher matcher = wordCountPattern.matcher(line); double totalCount = 0; while (matcher.find()) { - int count = Integer.parseInt(matcher.group(2)); - countMap.count(Integer.parseInt(matcher.group(1)), count); - totalCount += count; + Integer wordId = Integer.parseInt(matcher.group(1)); + Tuple<Double, Integer> wordTopicTuple = wordTopicMapping[wordId]; + if (wordTopicTuple != null) { + int count = Integer.parseInt(matcher.group(2)); + countMap.count(wordTopicTuple.second(), count); + totalCount += count; + } } // create list of topics refs referencing topics with counted @@ -290,14 +302,15 @@ public class DTMAnalyzer extends Analyzer { // check if topic above threshold if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) { reducedCount += entry.getValue(); - TopicFull topic = null; - // TODO find topic of this word - if (topic != null) { - TopicRef ref = new TopicRef(); - ref.setCount(entry.getValue()); - ref.setTopic(new Topic(topic.getId())); - newTopicRefs.add(ref); - } + TopicFull topic = newTopics.get(entry.getKey()); + // TODO words with low relative likeliness are ignored. + // topic references from this file are possibly wrong. + // fix this by checking if the word is actually accepted + // by the referenced topic. + TopicRef ref = new TopicRef(); + ref.setCount(entry.getValue()); + ref.setTopic(new Topic(topic.getId())); + newTopicRefs.add(ref); } } diff --git a/vipra-cmd/src/main/resources/config.properties b/vipra-cmd/src/main/resources/config.properties index 61bfe4a9..ec94fae2 100644 --- a/vipra-cmd/src/main/resources/config.properties +++ b/vipra-cmd/src/main/resources/config.properties @@ -4,5 +4,6 @@ db.name=test es.host=localhost es.port=9300 tm.processor=corenlp -tm.analyzer=jgibb -tm.dtmpath=/home/eike/repos/master/dtm_release/dtm/main \ No newline at end of file +tm.analyzer=dtm +tm.dtmpath=/home/eike/repos/master/ma-impl/dtm_release/dtm/main +tm.windowresolution=monthly \ No newline at end of file diff --git a/vipra-util/src/main/java/de/vipra/util/CountMap.java b/vipra-util/src/main/java/de/vipra/util/CountMap.java index bdb9af1e..7de5817e 100644 --- a/vipra-util/src/main/java/de/vipra/util/CountMap.java +++ b/vipra-util/src/main/java/de/vipra/util/CountMap.java @@ -37,4 +37,12 @@ public class CountMap<T> { return map.size(); } + public Integer get(T key) { + return map.get(key); + } + + public Set<T> keySet() { + return map.keySet(); + } + } -- GitLab