Skip to content
Snippets Groups Projects
Commit 6b580235 authored by Eike Cochu's avatar Eike Cochu
Browse files

counting word frequencies per document on import, preparation for tfidf

parent babcae3b
No related branches found
No related tags found
No related merge requests found
......@@ -8,7 +8,7 @@ if [ $? -ne 0 ]; then
fi
# Jar file name and the directory it is looked up in; JARFILE joins the two into one path.
# NOTE: JAR is assigned twice below, so only the second value is in effect afterwards.
JAR="cmd-0.0.1-SNAPSHOT.jar"
JAR="vipra-cmd-0.0.1-SNAPSHOT.jar"
DIR="./vipra-cmd/target"
JARFILE="$DIR/$JAR"
......
......@@ -40,6 +40,7 @@ public class Filebase {
/**
 * Registers an article with this filebase.
 *
 * The article is stored under the string form of its id, its date is added to
 * the id/date index under the same key, and its words are passed to the word
 * index for counting.
 *
 * @param article the article to add
 * @throws FilebaseException declared by the signature; not thrown directly here
 */
public void add(final ArticleFull article) throws FilebaseException {
    // Both indexes are keyed by the same string id, so compute it once.
    final String articleId = article.getId().toString();
    newArticles.put(articleId, article);
    idDateIndex.add(articleId, article.getDate());
    wordIndex.countWords(article.getWords());
}
public void sync() throws IOException, ConfigException {
......
package de.vipra.cmd.file;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
......@@ -9,8 +12,10 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import de.vipra.util.Constants;
import de.vipra.util.CountMap;
import de.vipra.util.FileUtils;
import de.vipra.util.model.ArticleWord;
public class FilebaseWordIndex implements Iterable<String> {
......@@ -20,28 +25,48 @@ public class FilebaseWordIndex implements Iterable<String> {
private final File file;
private final List<String> words;
private final Map<String, Integer> wordIndex;
private final CountMap<String> wordDocumentCount;
private int nextIndex = 0;
/**
 * Opens the word index stored in {@code FILE_NAME} inside the given model directory.
 *
 * If the file exists, every line is loaded in order; the position of a line
 * becomes the index of its word. A line has the form {@code word,count}. A line
 * without a comma is accepted as a bare word with a count of 0, so that files
 * holding one plain word per line can still be read. If the file does not
 * exist, the index starts empty.
 *
 * @param modelDir directory that contains (or will contain) the index file
 * @throws IOException if the file cannot be read or a line carries a count
 *                     that is not an integer
 */
public FilebaseWordIndex(final File modelDir) throws IOException {
    file = new File(modelDir, FILE_NAME);
    if (file.exists()) {
        final List<String> lines = FileUtils.readFile(file);
        words = new ArrayList<>(lines.size());
        wordIndex = new HashMap<>(lines.size());
        wordDocumentCount = new CountMap<>(lines.size());
        for (final String line : lines) {
            // Split on the LAST comma: sync() writes "word,count", so a word that
            // itself contains commas still round-trips.
            final int sep = line.lastIndexOf(',');
            final String word = sep < 0 ? line : line.substring(0, sep);
            int count = 0; // lines without a count column load with count 0
            if (sep >= 0) {
                try {
                    count = Integer.parseInt(line.substring(sep + 1).trim());
                } catch (final NumberFormatException e) {
                    throw new IOException("Malformed count in word index " + file + ": '" + line + "'", e);
                }
            }
            words.add(word);
            wordIndex.put(word, nextIndex++);
            wordDocumentCount.count(word, count);
        }
    } else {
        words = new ArrayList<>();
        wordIndex = new HashMap<>();
        wordDocumentCount = new CountMap<>();
    }
}
/**
 * Writes the word index to its file, unless the index is not marked dirty.
 *
 * The file is overwritten (not appended to). One line is written per word, in
 * index order: the word, a comma, the word's document count, then
 * {@code Constants.LINE_SEP}. The dirty flag is cleared only after the file
 * was written and closed successfully.
 *
 * @throws IOException if the file cannot be opened or written
 */
public void sync() throws IOException {
    if (!dirty)
        return;
    // try-with-resources flushes and closes the writer even if a write fails,
    // so a failed sync does not leak the file handle.
    try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false)))) {
        for (final String word : words) {
            out.write(word);
            out.write(",");
            out.write(Integer.toString(wordDocumentCount.get(word)));
            out.write(Constants.LINE_SEP);
        }
    }
    dirty = false;
}
/**
 * Increments the document count of every word in the given list by one call
 * to {@code CountMap.count} each.
 *
 * The index is marked dirty afterwards, because sync() skips writing when the
 * dirty flag is not set and the updated counts would otherwise not be saved.
 *
 * @param articleWords words of one article; {@code null} or empty is a no-op
 */
public void countWords(final List<ArticleWord> articleWords) {
    if (articleWords == null || articleWords.isEmpty())
        return;
    for (final ArticleWord articleWord : articleWords)
        wordDocumentCount.count(articleWord.getWord());
    // Counts changed, so the next sync() must rewrite the file.
    dirty = true;
}
public String transform(final String[] words) {
final CountMap<String> countMap = new CountMap<>();
for (final String word : words)
......@@ -66,6 +91,14 @@ public class FilebaseWordIndex implements Iterable<String> {
return index;
}
/**
 * Looks up the stored document count for a word.
 *
 * @param word the word to look up
 * @return the value held for {@code word} in the document count map
 */
public int getWordDocumentCount(final String word) {
    final int documentCount = wordDocumentCount.get(word);
    return documentCount;
}
/**
 * Looks up the stored document count for the word at the given index.
 *
 * @param wordIndex position of the word in the word list
 * @return the document count of the word stored at {@code wordIndex}
 */
public int getWordDocumentCount(final int wordIndex) {
    // Resolve the index to its word first, then reuse the String overload.
    final String indexedWord = words.get(wordIndex);
    return getWordDocumentCount(indexedWord);
}
/**
 * Returns the word stored at the given position of the word list.
 *
 * @param index position in the word list
 * @return the word at {@code index}
 */
public String word(final int index) {
    final String indexedWord = words.get(index);
    return indexedWord;
}
......
......@@ -14,7 +14,11 @@ public class CountMap<T> {
}
/**
 * Creates a count map initialised with the entries of the given map.
 *
 * @param map initial counts; its entries are copied
 * @throws NullPointerException if {@code map} is {@code null}
 */
public CountMap(final Map<T, Integer> map) {
    // Copy instead of keeping the caller's instance, so that this counter and
    // the caller's map cannot modify each other afterwards.
    this.map = new HashMap<>(map);
}
/**
 * Creates an empty count map backed by a {@code HashMap} with the given
 * initial capacity.
 *
 * @param size initial capacity passed to the backing {@code HashMap}
 */
public CountMap(final int size) {
    this.map = new HashMap<T, Integer>(size);
}
public void count(final T t) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment