Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
FilebaseWordIndex.java 2.40 KiB
package de.vipra.cmd.file;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import de.vipra.util.CountMap;
import de.vipra.util.FileUtils;
import de.vipra.util.StringUtils;

/**
 * File-backed vocabulary that assigns each distinct word a dense integer
 * index.
 *
 * <p>
 * The index is loaded from {@value #FILE_NAME} inside the model directory if
 * that file exists, grows in memory as unknown words are looked up via
 * {@link #index(String)}, and is written back by {@link #sync()}. A word's
 * index is its position in the word list, i.e. its line number in the file
 * (starting at 0).
 *
 * <p>
 * All lookups are case-insensitive and ignore surrounding whitespace: words
 * are lower-cased and trimmed before they are stored or searched.
 */
public class FilebaseWordIndex implements Iterable<String> {

	/** Name of the index file inside the model directory. */
	public static final String FILE_NAME = "word.idx";

	/** Set when the in-memory index has entries not yet written by sync(). */
	private boolean dirty = false;
	private final File file;
	/** Words by index; position in this list is the word's index. */
	private final List<String> words;
	/** Reverse lookup from normalized word to its index. */
	private final Map<String, Integer> wordIndex;
	/** Normalized words collected by transform(), see getNewWords(). */
	private final Set<String> newWords;
	private int nextIndex = 0;

	/**
	 * Opens the word index stored in the given model directory.
	 *
	 * <p>
	 * If the index file exists, every line contributes one word: the text
	 * before the first comma, or the whole line if it contains no comma.
	 * Anything after the first comma is ignored. If the file does not exist an
	 * empty index is created; nothing is written until {@link #sync()}.
	 *
	 * @param modelDir
	 *            directory that contains (or will contain) the index file
	 * @throws IOException
	 *             if the existing index file cannot be read
	 */
	public FilebaseWordIndex(final File modelDir) throws IOException {
		file = new File(modelDir, FILE_NAME);
		newWords = new HashSet<>();
		if (file.exists()) {
			final List<String> lines = FileUtils.readFile(file);
			words = new ArrayList<>(lines.size());
			wordIndex = new HashMap<>(lines.size());
			for (final String line : lines) {
				// Take the prefix up to the first comma instead of
				// split(",")[0]: split drops trailing empty strings, so a line
				// consisting only of commas yields an empty array and [0]
				// would throw. Every line still occupies one slot so that the
				// line number keeps matching the word index.
				final int comma = line.indexOf(',');
				final String word = comma < 0 ? line : line.substring(0, comma);
				words.add(word);
				wordIndex.put(word, nextIndex++);
			}
		} else {
			words = new ArrayList<>();
			wordIndex = new HashMap<>();
		}
	}

	/**
	 * Writes the word list to the index file, one word per line, if words were
	 * added since the index was loaded or last synced. Does nothing otherwise.
	 *
	 * @throws IOException
	 *             if the index file cannot be written
	 */
	public void sync() throws IOException {
		if (!dirty) {
			return;
		}
		org.apache.commons.io.FileUtils.writeLines(file, words);
		dirty = false;
	}

	/**
	 * Converts a list of words into a sparse count line of the form
	 * {@code "<distinct> <index>:<count> <index>:<count> ..."}.
	 *
	 * <p>
	 * {@code null} and blank entries are skipped. Words are normalized
	 * (lower-cased, trimmed) before counting, so case variants of the same
	 * word are merged into a single {@code index:count} pair and the leading
	 * number is the count of distinct indexes. Words not yet in the index are
	 * added to it. Every entry accepted by {@code StringUtils.isWord} is also
	 * recorded in the set returned by {@link #getNewWords()}.
	 *
	 * @param words
	 *            the words of one document; must not be {@code null}
	 * @return the count line; {@code "0"} if no usable word was given
	 */
	public String transform(final String[] words) {
		final CountMap<String> countMap = new CountMap<>();
		for (final String word : words) {
			if (word == null) {
				continue;
			}
			final String normalized = normalize(word);
			if (normalized.isEmpty()) {
				continue;
			}
			// Count under the same key that index() resolves, otherwise
			// "Foo" and "foo" become two entries sharing one index.
			countMap.count(normalized);
			// isWord is evaluated on the raw input, as before.
			if (StringUtils.isWord(word)) {
				newWords.add(normalized);
			}
		}

		final StringBuilder sb = new StringBuilder();
		sb.append(countMap.size());
		for (final Entry<String, Integer> entry : countMap.entrySet()) {
			sb.append(" ").append(index(entry.getKey())).append(":").append(entry.getValue());
		}

		return sb.toString();
	}

	/**
	 * Returns the index of the given word, adding the word to the index (and
	 * marking the index as needing a {@link #sync()}) if it is not yet known.
	 *
	 * @param word
	 *            the word to look up; normalized before lookup, must not be
	 *            {@code null}
	 * @return the index of the word
	 */
	public int index(String word) {
		word = normalize(word);
		Integer index = wordIndex.get(word);
		if (index == null) {
			index = nextIndex++;
			words.add(word);
			wordIndex.put(word, index);
			dirty = true;
		}
		return index;
	}

	/**
	 * Returns the word stored at the given index.
	 *
	 * @param index
	 *            a word index in the range {@code 0 .. size() - 1}
	 * @return the word at that index
	 * @throws IndexOutOfBoundsException
	 *             if the index is out of range
	 */
	public String word(final int index) {
		return words.get(index);
	}

	/**
	 * @return the number of entries in the index
	 */
	public int size() {
		return words.size();
	}

	/**
	 * Returns the normalized words collected by {@link #transform(String[])}.
	 * Words are collected whether or not they were already part of the index.
	 *
	 * @return the live internal set, not a copy
	 */
	public Set<String> getNewWords() {
		return newWords;
	}

	/**
	 * Iterates over the words in index order. The iterator is that of the
	 * backing list; removing elements through it would leave the word lookup
	 * map out of step with the list.
	 */
	@Override
	public Iterator<String> iterator() {
		return words.iterator();
	}

	/**
	 * Canonical form under which words are stored and looked up. Lower-casing
	 * uses the root locale so that the result does not depend on the JVM's
	 * default locale (fully qualified to avoid touching the import list).
	 */
	private static String normalize(final String word) {
		return word.toLowerCase(java.util.Locale.ROOT).trim();
	}

}