From 6b5802359f9442d7d347987e1ab4327da94b8f12 Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Thu, 31 Mar 2016 01:41:43 +0200 Subject: [PATCH] counting word frequencies per document on import, preparation for tfidf --- vipra | 2 +- .../main/java/de/vipra/cmd/file/Filebase.java | 1 + .../de/vipra/cmd/file/FilebaseWordIndex.java | 43 ++++++++++++++++--- .../src/main/java/de/vipra/util/CountMap.java | 6 ++- 4 files changed, 45 insertions(+), 7 deletions(-) diff --git a/vipra b/vipra index c93b564d..82a0b943 100755 --- a/vipra +++ b/vipra @@ -8,7 +8,7 @@ if [ $? -ne 0 ]; then fi # path -JAR="cmd-0.0.1-SNAPSHOT.jar" +JAR="vipra-cmd-0.0.1-SNAPSHOT.jar" DIR="./vipra-cmd/target" JARFILE="$DIR/$JAR" diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java index e5693aa8..09b461eb 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java @@ -40,6 +40,7 @@ public class Filebase { public void add(final ArticleFull article) throws FilebaseException { newArticles.put(article.getId().toString(), article); idDateIndex.add(article.getId().toString(), article.getDate()); + wordIndex.countWords(article.getWords()); } public void sync() throws IOException, ConfigException { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java index 420d97ad..fc1cb496 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java @@ -1,7 +1,10 @@ package de.vipra.cmd.file; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -9,8 +12,10 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; +import de.vipra.util.Constants; import de.vipra.util.CountMap; import de.vipra.util.FileUtils; +import de.vipra.util.model.ArticleWord; public class FilebaseWordIndex implements Iterable<String> { @@ -20,28 +25,48 @@ public class FilebaseWordIndex implements Iterable<String> { private final File file; private final List<String> words; private final Map<String, Integer> wordIndex; + private final CountMap<String> wordDocumentCount; private int nextIndex = 0; public FilebaseWordIndex(final File modelDir) throws IOException { file = new File(modelDir, FILE_NAME); if (file.exists()) { - words = FileUtils.readFile(file); - wordIndex = new HashMap<>(words.size()); - for (final String word : words) - wordIndex.put(word, nextIndex++); + final List<String> lines = FileUtils.readFile(file); + words = new ArrayList<>(lines.size()); + wordIndex = new HashMap<>(lines.size()); + wordDocumentCount = new CountMap<>(lines.size()); + for (final String line : lines) { + final String[] parts = line.split(","); + words.add(parts[0]); + wordIndex.put(parts[0], nextIndex++); + wordDocumentCount.count(parts[0], Integer.parseInt(parts[1])); + } } else { words = new ArrayList<>(); wordIndex = new HashMap<>(); + wordDocumentCount = new CountMap<>(); } } public void sync() throws IOException { if (!dirty) return; - org.apache.commons.io.FileUtils.writeLines(file, words); + BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false))); + for (String word : words) { + out.write(word); + out.write(","); + out.write(Integer.toString(wordDocumentCount.get(word))); + out.write(Constants.LINE_SEP); + } + out.close(); dirty = false; } + public void countWords(final List<ArticleWord> articleWords) { + for (ArticleWord articleWord : articleWords) + wordDocumentCount.count(articleWord.getWord()); + } + public String transform(final String[] words) { final CountMap<String> countMap = new CountMap<>(); for (final String word : words) @@ -66,6 +91,14 @@ public class FilebaseWordIndex implements Iterable<String> { return index; } + public int getWordDocumentCount(final String word) { + return wordDocumentCount.get(word); + } + + public int getWordDocumentCount(final int wordIndex) { + return getWordDocumentCount(words.get(wordIndex)); + } + public String word(final int index) { return words.get(index); } diff --git a/vipra-util/src/main/java/de/vipra/util/CountMap.java b/vipra-util/src/main/java/de/vipra/util/CountMap.java index 0285cd39..8bca6b16 100644 --- a/vipra-util/src/main/java/de/vipra/util/CountMap.java +++ b/vipra-util/src/main/java/de/vipra/util/CountMap.java @@ -14,7 +14,11 @@ public class CountMap<T> { } public CountMap(final Map<T, Integer> map) { - this.map = map; + this.map = new HashMap<>(map); + } + + public CountMap(final int size) { + this.map = new HashMap<>(size); } public void count(final T t) { -- GitLab