-
Eike Cochu authored
fixed word,entity menu ids, updated help screenshots, updated hypernym serialization, updated logging, added messages, more fixes
Eike Cochu authored: fixed word,entity menu ids, updated help screenshots, updated hypernym serialization, updated logging, added messages, more fixes
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
FilebaseWordIndex.java 2.40 KiB
package de.vipra.cmd.file;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import de.vipra.util.CountMap;
import de.vipra.util.FileUtils;
import de.vipra.util.StringUtils;
/**
 * File-backed vocabulary that assigns every distinct word an integer index.
 *
 * <p>Words are kept in insertion order and a word's index is its position in that order.
 * The vocabulary is read from the file {@value #FILE_NAME} in the model directory when that
 * file exists, and written back by {@link #sync()}. Words are normalized (lower-cased and
 * trimmed) before they are looked up or stored.
 */
public class FilebaseWordIndex implements Iterable<String> {

    /** Name of the index file inside the model directory. */
    public static final String FILE_NAME = "word.idx";

    /** Set when a word has been added since the index was loaded or last synced. */
    private boolean dirty = false;

    private final File file;

    /** Words in index order: {@code words.get(i)} is the word with index {@code i}. */
    private final List<String> words;

    /** Reverse lookup from normalized word to its index. */
    private final Map<String, Integer> wordIndex;

    /** Normalized tokens collected by {@link #transform(String[])}. */
    private final Set<String> newWords;

    /** Index that will be given to the next previously unseen word. */
    private int nextIndex = 0;

    /**
     * Opens the word index stored in the given model directory, loading it if the index file
     * exists and starting with an empty vocabulary otherwise.
     *
     * <p>Each line of the file contributes one word: the text before the first comma, or the
     * whole line when it contains no comma.
     *
     * @param modelDir directory that contains (or will contain) the index file
     * @throws IOException if the existing index file cannot be read
     */
    public FilebaseWordIndex(final File modelDir) throws IOException {
        file = new File(modelDir, FILE_NAME);
        newWords = new HashSet<>();
        if (file.exists()) {
            final List<String> lines = FileUtils.readFile(file);
            words = new ArrayList<>(lines.size());
            wordIndex = new HashMap<>(lines.size());
            for (final String line : lines) {
                // indexOf instead of split(",")[0]: split returns an empty array for a line
                // that consists only of commas, which made loading such a file crash.
                final int comma = line.indexOf(',');
                final String word = comma < 0 ? line : line.substring(0, comma);
                words.add(word);
                wordIndex.put(word, nextIndex++);
            }
        } else {
            words = new ArrayList<>();
            wordIndex = new HashMap<>();
        }
    }

    /**
     * Writes the word list to the index file if words were added since the index was loaded
     * or last synced; does nothing otherwise.
     *
     * @throws IOException if the index file cannot be written
     */
    public void sync() throws IOException {
        if (!dirty) {
            return;
        }
        org.apache.commons.io.FileUtils.writeLines(file, words);
        dirty = false;
    }

    /**
     * Converts a token array into a sparse count line of the form
     * {@code "<n> <index>:<count> <index>:<count> ..."}, where {@code n} is the size of the
     * count map built from the tokens.
     *
     * <p>{@code null} and blank tokens are skipped. Tokens are normalized before counting so
     * that tokens differing only in case or surrounding whitespace share one entry and each
     * index appears at most once in the result. Words not yet in the index are added to it.
     * Every token accepted by {@code StringUtils.isWord} is also recorded, normalized, in the
     * set returned by {@link #getNewWords()}.
     *
     * @param words tokens to convert; individual elements may be {@code null}
     * @return the count line described above
     */
    public String transform(final String[] words) {
        final CountMap<String> countMap = new CountMap<>();
        for (final String word : words) {
            if (word == null) {
                continue;
            }
            final String normalized = normalize(word);
            if (normalized.isEmpty()) {
                continue;
            }
            // Count by the same key index() uses; counting the case-preserved token produced
            // several entries that all mapped to one index.
            countMap.count(normalized);
            if (StringUtils.isWord(word)) {
                newWords.add(normalized);
            }
        }
        final StringBuilder sb = new StringBuilder();
        sb.append(countMap.size());
        for (final Entry<String, Integer> entry : countMap.entrySet()) {
            sb.append(" ").append(index(entry.getKey())).append(":").append(entry.getValue());
        }
        return sb.toString();
    }

    /**
     * Returns the index of the given word, adding the word to the vocabulary (and marking the
     * index as modified) if it is not present yet.
     *
     * @param word word to look up; normalized before lookup
     * @return the existing or newly assigned index
     */
    public int index(String word) {
        word = normalize(word);
        Integer index = wordIndex.get(word);
        if (index == null) {
            index = nextIndex++;
            words.add(word);
            wordIndex.put(word, index);
            dirty = true;
        }
        return index;
    }

    /**
     * Returns the word stored at the given index.
     *
     * @param index position in the vocabulary
     * @return the word at that position
     * @throws IndexOutOfBoundsException if no word has that index
     */
    public String word(final int index) {
        return words.get(index);
    }

    /**
     * Returns the number of words in the vocabulary.
     *
     * @return the vocabulary size
     */
    public int size() {
        return words.size();
    }

    /**
     * Returns the tokens collected by {@link #transform(String[])}.
     *
     * @return the live internal set, not a copy
     */
    public Set<String> getNewWords() {
        return newWords;
    }

    /**
     * Iterates over the words in index order.
     *
     * @return a read-only iterator; removing through it is not supported because that would
     *         shift word positions without updating the reverse lookup
     */
    @Override
    public Iterator<String> iterator() {
        return java.util.Collections.unmodifiableList(words).iterator();
    }

    /** Lower-cases (independently of the JVM default locale) and trims a word. */
    private static String normalize(final String word) {
        return word.toLowerCase(java.util.Locale.ROOT).trim();
    }
}