From 6b5802359f9442d7d347987e1ab4327da94b8f12 Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Thu, 31 Mar 2016 01:41:43 +0200
Subject: [PATCH] counting word frequencies per document on import, preparation
 for tfidf

---
 vipra                                         |  2 +-
 .../main/java/de/vipra/cmd/file/Filebase.java |  1 +
 .../de/vipra/cmd/file/FilebaseWordIndex.java  | 43 ++++++++++++++++---
 .../src/main/java/de/vipra/util/CountMap.java |  6 ++-
 4 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/vipra b/vipra
index c93b564d..82a0b943 100755
--- a/vipra
+++ b/vipra
@@ -8,7 +8,7 @@ if [ $? -ne 0 ]; then
 fi
 
 # path
-JAR="cmd-0.0.1-SNAPSHOT.jar"
+JAR="vipra-cmd-0.0.1-SNAPSHOT.jar"
 DIR="./vipra-cmd/target"
 JARFILE="$DIR/$JAR"
 
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java
index e5693aa8..09b461eb 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java
@@ -40,6 +40,7 @@ public class Filebase {
 	public void add(final ArticleFull article) throws FilebaseException {
 		newArticles.put(article.getId().toString(), article);
 		idDateIndex.add(article.getId().toString(), article.getDate());
+		wordIndex.countWords(article.getWords()); // also tally this article's words in the word index
 	}
 
 	public void sync() throws IOException, ConfigException {
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java
index 420d97ad..fc1cb496 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java
@@ -1,7 +1,10 @@
 package de.vipra.cmd.file;
 
+import java.io.BufferedWriter;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -9,8 +12,10 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 
+import de.vipra.util.Constants;
 import de.vipra.util.CountMap;
 import de.vipra.util.FileUtils;
+import de.vipra.util.model.ArticleWord;
 
 public class FilebaseWordIndex implements Iterable<String> {
 
@@ -20,28 +25,48 @@ public class FilebaseWordIndex implements Iterable<String> {
 	private final File file;
 	private final List<String> words;
 	private final Map<String, Integer> wordIndex;
+	private final CountMap<String> wordDocumentCount;
 	private int nextIndex = 0;
 
 	public FilebaseWordIndex(final File modelDir) throws IOException {
 		file = new File(modelDir, FILE_NAME);
 		if (file.exists()) {
-			words = FileUtils.readFile(file);
-			wordIndex = new HashMap<>(words.size());
-			for (final String word : words)
-				wordIndex.put(word, nextIndex++);
+			final List<String> lines = FileUtils.readFile(file);
+			words = new ArrayList<>(lines.size());
+			wordIndex = new HashMap<>(lines.size());
+			wordDocumentCount = new CountMap<>(lines.size());
+			for (final String line : lines) {
+				final String[] parts = line.split(","); // line layout: word,count
+				words.add(parts[0]);
+				wordIndex.put(parts[0], nextIndex++);
+				wordDocumentCount.count(parts[0], parts.length > 1 ? Integer.parseInt(parts[1].trim()) : 0); // word-only lines (the earlier file format) carry no count: start at 0 instead of failing on parts[1]
+			}
 		} else {
 			words = new ArrayList<>();
 			wordIndex = new HashMap<>();
+			wordDocumentCount = new CountMap<>();
 		}
 	}
 
 	public void sync() throws IOException {
 		if (!dirty)
 			return;
-		org.apache.commons.io.FileUtils.writeLines(file, words);
+		try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false)))) {
+			for (final String word : words) { // one "word,count" line per entry of words, overwriting the file
+				out.write(word);
+				out.write(",");
+				out.write(Integer.toString(wordDocumentCount.get(word)));
+				out.write(Constants.LINE_SEP);
+			}
+		} // try-with-resources closes the writer even when a write throws, so the file handle cannot leak
 		dirty = false;
 	}
 
+	public void countWords(final List<ArticleWord> articleWords) {
+		for (final ArticleWord articleWord : articleWords)
+			wordDocumentCount.count(articleWord.getWord());
+		dirty = true; // counts changed: without this flag sync() returns early and the new counts are never written
+	}
 	public String transform(final String[] words) {
 		final CountMap<String> countMap = new CountMap<>();
 		for (final String word : words)
@@ -66,6 +91,14 @@ public class FilebaseWordIndex implements Iterable<String> {
 		return index;
 	}
 
+	public int getWordDocumentCount(final String word) { // look up the stored count by the word itself
+		return wordDocumentCount.get(word);
+	}
+
+	public int getWordDocumentCount(final int wordIndex) { // look up the stored count by position in words
+		return wordDocumentCount.get(words.get(wordIndex));
+	}
+
 	public String word(final int index) {
 		return words.get(index);
 	}
diff --git a/vipra-util/src/main/java/de/vipra/util/CountMap.java b/vipra-util/src/main/java/de/vipra/util/CountMap.java
index 0285cd39..8bca6b16 100644
--- a/vipra-util/src/main/java/de/vipra/util/CountMap.java
+++ b/vipra-util/src/main/java/de/vipra/util/CountMap.java
@@ -14,7 +14,11 @@ public class CountMap<T> {
 	}
 
 	public CountMap(final Map<T, Integer> map) {
-		this.map = map;
+		this.map = new HashMap<>(map); // copy the entries so this CountMap does not share the caller's map
+	}
+
+	public CountMap(final int size) {
+		this.map = new HashMap<>(size); // size is handed to HashMap as its initial capacity
 	}
 
 	public void count(final T t) {
-- 
GitLab