diff --git a/vipra b/vipra
index c93b564d256380af3f9c860a2801398240a9dbd3..82a0b94380184212cfd927124e77037254e7ca76 100755
--- a/vipra
+++ b/vipra
@@ -8,7 +8,7 @@ if [ $? -ne 0 ]; then
 fi
 
 # path
-JAR="cmd-0.0.1-SNAPSHOT.jar"
+JAR="vipra-cmd-0.0.1-SNAPSHOT.jar"
 DIR="./vipra-cmd/target"
 JARFILE="$DIR/$JAR"
 
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java
index e5693aa8683d2ef0529671237bccc317690dfcee..09b461eb8d916b47852c879976e4c5721744a51b 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java
@@ -40,6 +40,7 @@ public class Filebase {
 	public void add(final ArticleFull article) throws FilebaseException {
 		newArticles.put(article.getId().toString(), article);
 		idDateIndex.add(article.getId().toString(), article.getDate());
+		wordIndex.countWords(article.getWords());
 	}
 
 	public void sync() throws IOException, ConfigException {
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java
index 420d97ad5b04610c771c8170be24b8fe3312d75b..fc1cb496dd7d6f8d5cb08796b0d478f4fa02ae07 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java
@@ -1,7 +1,10 @@
 package de.vipra.cmd.file;
 
+import java.io.BufferedWriter;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -9,8 +12,10 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 
+import de.vipra.util.Constants;
 import de.vipra.util.CountMap;
 import de.vipra.util.FileUtils;
+import de.vipra.util.model.ArticleWord;
 
 public class FilebaseWordIndex implements Iterable<String> {
 
@@ -20,28 +25,60 @@ public class FilebaseWordIndex implements Iterable<String> {
 	private final File file;
 	private final List<String> words;
 	private final Map<String, Integer> wordIndex;
+	private final CountMap<String> wordDocumentCount;
 	private int nextIndex = 0;
 
 	public FilebaseWordIndex(final File modelDir) throws IOException {
 		file = new File(modelDir, FILE_NAME);
 		if (file.exists()) {
-			words = FileUtils.readFile(file);
-			wordIndex = new HashMap<>(words.size());
-			for (final String word : words)
-				wordIndex.put(word, nextIndex++);
+			final List<String> lines = FileUtils.readFile(file);
+			words = new ArrayList<>(lines.size());
+			wordIndex = new HashMap<>(lines.size());
+			wordDocumentCount = new CountMap<>(lines.size());
+			for (final String line : lines) {
+				// each line is "word,count"; split on the last comma so the
+				// word itself may contain commas, and accept lines written
+				// before counts were stored (bare word, count 0)
+				final int sep = line.lastIndexOf(',');
+				final String word = sep < 0 ? line : line.substring(0, sep);
+				final int count = sep < 0 ? 0 : Integer.parseInt(line.substring(sep + 1).trim());
+				words.add(word);
+				wordIndex.put(word, nextIndex++);
+				wordDocumentCount.count(word, count);
+			}
 		} else {
 			words = new ArrayList<>();
 			wordIndex = new HashMap<>();
+			wordDocumentCount = new CountMap<>();
 		}
 	}
 
 	public void sync() throws IOException {
 		if (!dirty)
 			return;
-		org.apache.commons.io.FileUtils.writeLines(file, words);
+		// try-with-resources closes the writer even if a write fails
+		try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false)))) {
+			for (final String word : words) {
+				out.write(word);
+				out.write(",");
+				out.write(Integer.toString(wordDocumentCount.get(word)));
+				out.write(Constants.LINE_SEP);
+			}
+		}
 		dirty = false;
 	}
 
+	/**
+	 * Calls wordDocumentCount.count once for every entry of articleWords and
+	 * marks the index dirty so that the next sync() writes the changed counts.
+	 */
+	public void countWords(final List<ArticleWord> articleWords) {
+		for (final ArticleWord articleWord : articleWords)
+			wordDocumentCount.count(articleWord.getWord());
+		if (!articleWords.isEmpty())
+			dirty = true;
+	}
+
 	public String transform(final String[] words) {
 		final CountMap<String> countMap = new CountMap<>();
 		for (final String word : words)
@@ -66,6 +103,16 @@ public class FilebaseWordIndex implements Iterable<String> {
 		return index;
 	}
 
+	/** Returns the value held by wordDocumentCount for the given word. */
+	public int getWordDocumentCount(final String word) {
+		return wordDocumentCount.get(word);
+	}
+
+	/** Returns the value held by wordDocumentCount for the word stored at the given index. */
+	public int getWordDocumentCount(final int wordIndex) {
+		return getWordDocumentCount(words.get(wordIndex));
+	}
+
 	public String word(final int index) {
 		return words.get(index);
 	}
diff --git a/vipra-util/src/main/java/de/vipra/util/CountMap.java b/vipra-util/src/main/java/de/vipra/util/CountMap.java
index 0285cd3912b06acd0f67dc0d0460b118cb1edebf..8bca6b168f763e07c9e3b7642a6f82e573a6e1d7 100644
--- a/vipra-util/src/main/java/de/vipra/util/CountMap.java
+++ b/vipra-util/src/main/java/de/vipra/util/CountMap.java
@@ -14,7 +14,12 @@ public class CountMap<T> {
 	}
 
 	public CountMap(final Map<T, Integer> map) {
-		this.map = map;
+		this.map = new HashMap<>(map);
+	}
+
+	/** Creates an empty count map backed by a HashMap with the given initial capacity. */
+	public CountMap(final int size) {
+		this.map = new HashMap<>(size);
 	}
 
 	public void count(final T t) {