Commit 396249a9 authored by Eike Cochu

updated dtm analyzer

dtm analyzer: updated model import code (unfinished)
added constants for dtm iterations
parent 1637a1a4
Showing with 231 additions and 75 deletions
@@ -57,7 +57,6 @@ echo "-------------------------------" >> $LOG
cd ./vipra-ui
./build.sh >> $LOG 2>&1
cd ..
-cp -r ./vipra-ui/public ./vipra-backend/src/main/webapp/public
if [ $? -ne 0 ]; then
    echo "error"
    exit 1
...
<?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0">
    <wb-module deploy-name="vipra-backend">
        <wb-resource deploy-path="/" source-path="/target/m2e-wtp/web-resources"/>
+       <wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
        <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
        <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/resources"/>
-       <wb-resource deploy-path="/" source-path="/src/main/webapp"/>
        <dependent-module archiveName="util-0.0.1-SNAPSHOT.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/vipra-util/vipra-util">
            <dependency-type>uses</dependency-type>
        </dependent-module>
...
@@ -81,8 +81,11 @@ public class InfoResource {
        info.put("const.topicautoname", Constants.TOPIC_AUTO_NAMING_WORDS);
        info.put("const.ktopics", Constants.K_TOPICS);
        info.put("const.ktopicwords", Constants.K_TOPIC_WORDS);
-       info.put("const.minimumlike", Constants.MINIMUM_LIKELINESS);
+       info.put("const.minimumlike", Constants.MINIMUM_RELATIVE_PROB);
        info.put("const.topicthresh", Constants.TOPIC_THRESHOLD);
+       info.put("const.dynminiter", Constants.DYNAMIC_MIN_ITER);
+       info.put("const.dynmaxiter", Constants.DYNAMIC_MAX_ITER);
+       info.put("const.statiter", Constants.STATIC_ITER);
        info.put("const.docminfreq", Constants.DOCUMENT_MIN_WORD_FREQ);
        info.put("const.docminlength", Constants.DOCUMENT_MIN_LENGTH);
        info.put("const.charsdisallow", Constants.CHARS_DISALLOWED);
...
@@ -6,12 +6,12 @@ import static de.vipra.cmd.CmdOptions.OPT_DEBUG;
import static de.vipra.cmd.CmdOptions.OPT_DEFAULTS;
import static de.vipra.cmd.CmdOptions.OPT_HELP;
import static de.vipra.cmd.CmdOptions.OPT_IMPORT;
+import static de.vipra.cmd.CmdOptions.OPT_INDEXING;
import static de.vipra.cmd.CmdOptions.OPT_MODELING;
import static de.vipra.cmd.CmdOptions.OPT_SHELL;
import static de.vipra.cmd.CmdOptions.OPT_SILENT;
import static de.vipra.cmd.CmdOptions.OPT_STATS;
import static de.vipra.cmd.CmdOptions.OPT_TEST;
-import static de.vipra.cmd.CmdOptions.OPT_INDEXING;

import java.util.ArrayList;
import java.util.List;
...
@@ -11,7 +11,6 @@ import java.util.List;

import de.vipra.cmd.ex.FilebaseException;
import de.vipra.cmd.file.DTMSequenceIndex.DTMDateIndexEntry;
-import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.FileUtils;
import de.vipra.util.ex.ConfigException;
@@ -27,17 +26,11 @@ public class DTMFilebase extends Filebase {

    public DTMFilebase(File dataDir) throws FilebaseException {
        super(dataDir, "dtm");
-       Config config;
-       try {
-           config = Config.getConfig();
-       } catch (IOException | ConfigException e) {
-           throw new FilebaseException(e);
-       }

        File modelDir = getModelDir();

        try {
-           this.seqindex = new DTMSequenceIndex(modelDir, config.windowResolution, false);
+           this.seqindex = new DTMSequenceIndex(modelDir);
-       } catch (IOException | ParseException e) {
+       } catch (IOException | ConfigException | ParseException e) {
            throw new FilebaseException("could not read date index file", e);
        }
...
@@ -16,9 +16,11 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;

+import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.Constants.WindowResolution;
import de.vipra.util.FileUtils;
+import de.vipra.util.ex.ConfigException;

public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> {
@@ -46,13 +48,19 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> {
    }

    private File file;
-   private static WindowResolution windowResolution;
+   private boolean readonly = false;
+   private WindowResolution windowResolution;
    private static List<DTMDateIndexEntry> entries;
    private static SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT);

-   public DTMSequenceIndex(File modelDir, WindowResolution wr, boolean reread) throws IOException, ParseException {
+   public DTMSequenceIndex(File modelDir) throws IOException, ParseException, ConfigException {
+       this(modelDir, false);
+   }
+
+   public DTMSequenceIndex(File modelDir, boolean reread) throws IOException, ParseException, ConfigException {
        this.file = new File(modelDir, "dates");
-       windowResolution = wr;
+       Config config = Config.getConfig();
+       this.windowResolution = config.windowResolution;

        if (file.exists()) {
            if (entries == null || reread) {
                List<String> dates = FileUtils.readFile(file);
@@ -78,6 +86,8 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> {

    @Override
    public void close() throws IOException {
+       if (readonly)
+           return;

        List<String> windows = new ArrayList<>();
        Map<String, Integer> windowSizes = new HashMap<>();
@@ -111,4 +121,8 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> {
        writer.close();
    }

+   public int size() {
+       return entries.size();
+   }
+
}
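The two constructors above replace the old three-argument one: the window resolution is now read from Config, the one-argument form delegates with reread = false, the new readonly flag lets close() skip rewriting the dates file, and size() exposes the number of sequence entries. A minimal usage sketch, assuming the package shown in the diff and a hypothetical model directory path:

import java.io.File;

import de.vipra.cmd.file.DTMSequenceIndex;

public class SequenceIndexExample {

    public static void main(String[] args) throws Exception {
        // hypothetical model directory; in the project it comes from Config.getDataDirectory()
        File modelDir = new File("/path/to/data/dtm");

        // one-argument constructor: window resolution is read from Config,
        // an existing dates file is reused (reread = false)
        DTMSequenceIndex sequences = new DTMSequenceIndex(modelDir);
        System.out.println("sequence entries: " + sequences.size());

        // new DTMSequenceIndex(modelDir, true) would force a re-read of the dates file
    }
}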
@@ -22,6 +22,10 @@ public class DTMVocabulary implements Closeable, Iterable<String> {
    private static Map<String, Integer> vocablesMap;
    private static int nextIndex = 1;

+   public DTMVocabulary(File modelDir) throws IOException {
+       this(modelDir, false);
+   }
+
    public DTMVocabulary(File modelDir, boolean reread) throws IOException {
        this.file = new File(modelDir, FILE_NAME);
        if (file.exists()) {
@@ -80,6 +84,10 @@ public class DTMVocabulary implements Closeable, Iterable<String> {
        return sb.toString();
    }

+   public String get(int index) {
+       return vocables.get(index);
+   }
+
    @Override
    public void close() throws IOException {
        write();
...
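DTMVocabulary gains a matching one-argument constructor and a get(int index) lookup, which the DTM import further down uses to map a row index of a topic's log-probability file back to the actual term. A small sketch under the same assumptions (project package as in the diff, hypothetical path):

import java.io.File;

import de.vipra.cmd.file.DTMVocabulary;

public class VocabularyLookupExample {

    public static void main(String[] args) throws Exception {
        File modelDir = new File("/path/to/data/dtm"); // hypothetical model directory

        // one-argument constructor delegates to (modelDir, reread = false)
        DTMVocabulary vocab = new DTMVocabulary(modelDir);

        // resolve the term behind the first row of a topic distribution
        System.out.println("term 0: " + vocab.get(0));
    }
}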
@@ -12,11 +12,13 @@ import de.vipra.util.FileUtils;

public class FilebaseIndex implements Closeable, Iterable<String> {

+   public static final String FILE_NAME = "index";
+
    private final File file;
    private final List<String> index;

-   public FilebaseIndex(File file) throws IOException {
-       this.file = file;
+   public FilebaseIndex(File modelDir) throws IOException {
+       this.file = new File(modelDir, FILE_NAME);
        if (file.exists()) {
            index = new ArrayList<>(FileUtils.readFile(file));
        } else {
...
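FilebaseIndex now receives the model directory and derives its file name from the new FILE_NAME constant, in line with the other filebase classes; the JGibbAnalyzer call site below no longer builds the File itself. A short sketch, assuming the size() accessor that the DTM import below relies on and a hypothetical path:

import java.io.File;

import de.vipra.cmd.file.FilebaseIndex;

public class FilebaseIndexExample {

    public static void main(String[] args) throws Exception {
        File modelDir = new File("/path/to/data/dtm"); // hypothetical model directory

        // the class resolves modelDir/index internally via FILE_NAME
        FilebaseIndex index = new FilebaseIndex(modelDir);
        System.out.println("indexed articles: " + index.size());
    }
}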
@@ -2,30 +2,49 @@ package de.vipra.cmd.lda;

import java.io.BufferedReader;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.List;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import org.bson.types.ObjectId;

import de.vipra.cmd.ex.AnalyzerException;
+import de.vipra.cmd.file.DTMSequenceIndex;
+import de.vipra.cmd.file.DTMVocabulary;
+import de.vipra.cmd.file.FilebaseIndex;
import de.vipra.util.Config;
import de.vipra.util.Constants;
+import de.vipra.util.FileUtils;
import de.vipra.util.StringUtils;
import de.vipra.util.ex.ConfigException;
+import de.vipra.util.ex.DatabaseException;
+import de.vipra.util.model.ArticleFull;
+import de.vipra.util.model.TopicFull;
+import de.vipra.util.model.TopicWord;
+import de.vipra.util.model.Word;
+import de.vipra.util.service.MongoService;

public class DTMAnalyzer extends Analyzer {

    public static final Logger log = LogManager.getLogger(DTMAnalyzer.class);

    public static final String NAME = "dtm";

-   public static final int dynamicMinIter = 100;
-   public static final int dynamicMaxIter = 1000;
-   public static final int staticIter = 100;

    private String command;
    private File modelDir;
    private File outDir;
+   private File outDirSeq;
+
+   private DTMVocabulary vocab;
+   private DTMSequenceIndex sequences;
+   private FilebaseIndex index;
+
+   private MongoService<ArticleFull, ObjectId> dbArticles;
+   private MongoService<TopicFull, ObjectId> dbTopics;
+   private MongoService<Word, String> dbWords;

    protected DTMAnalyzer() {
        super("Dynamic Topic Model Analyzer");
@@ -37,7 +56,16 @@ public class DTMAnalyzer extends Analyzer {
            File dataDir = config.getDataDirectory();
            this.modelDir = new File(dataDir, NAME);
            this.outDir = new File(modelDir, "out");
+           this.outDirSeq = new File(outDir, "lda-seq");
+
+           this.vocab = new DTMVocabulary(modelDir);
+           this.sequences = new DTMSequenceIndex(modelDir);
+           index = new FilebaseIndex(modelDir);
+
+           config = Config.getConfig();
+           dbArticles = MongoService.getDatabaseService(config, ArticleFull.class);
+           dbTopics = MongoService.getDatabaseService(config, TopicFull.class);
+           dbWords = MongoService.getDatabaseService(config, Word.class);
-       } catch (ConfigException e) {
+       } catch (ConfigException | IOException | ParseException e) {
            throw new AnalyzerException(e);
        }
@@ -66,11 +94,11 @@ public class DTMAnalyzer extends Analyzer {
                // alpha (default -10)
                "--alpha=0.01",
                // minimum number of iterations
-               "--lda_sequence_min_iter=" + dynamicMinIter,
+               "--lda_sequence_min_iter=" + Constants.DYNAMIC_MIN_ITER,
                // maximum number of iterations
-               "--lda_sequence_max_iter=" + dynamicMaxIter,
+               "--lda_sequence_max_iter=" + Constants.DYNAMIC_MAX_ITER,
                // em iter (default 20)
-               "--lda_max_em_iter=" + staticIter,
+               "--lda_max_em_iter=" + Constants.STATIC_ITER,
                // input file prefix
                "--corpus_prefix=" + corpusPrefix,
                // output directory
@@ -86,6 +114,7 @@ public class DTMAnalyzer extends Analyzer {
            if (!p.isAlive())
                throw new AnalyzerException("dtm process is dead");

+           // read from process output
            BufferedReader in = new BufferedReader(new InputStreamReader(p.getErrorStream()));
            String line;
@@ -99,7 +128,101 @@ public class DTMAnalyzer extends Analyzer {
            in.close();
            p.waitFor();

-           // TODO save model
+           List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS);
+           List<Word> newWords = new ArrayList<>(vocab.size());
+           int sequencesCount = sequences.size();
+
+           // read topic definition files
+           for (int i = 0; i < Constants.K_TOPICS; i++) {
+               File seqFile = new File(outDirSeq, "topic-" + StringUtils.padNumber(i, 3) + "-var-e-log-prob.dat");
+               if (!seqFile.exists()) {
+                   log.error("seq file " + seqFile.getName() + " not found");
+                   continue;
+               }
+
+               int lineCount = FileUtils.countLines(seqFile);
+               int wordsPerSequence = lineCount / sequencesCount;
+               if (wordsPerSequence * sequencesCount != lineCount) {
+                   log.error("unexpected number of words per sequence");
+                   continue;
+               }
+
+               in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile)));
+
+               // read file lines into word x sequence matrix
+               // gather maximum likeliness per sequence
+               double[] maxLikelinesses = new double[sequencesCount];
+               double[][] likelinesses = new double[wordsPerSequence][sequencesCount];
+               for (int idxWord = 0; idxWord < wordsPerSequence; idxWord++) {
+                   for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) {
+                       double likeliness = Double.parseDouble(in.readLine());
+                       likelinesses[idxWord][idxSeq] = likeliness;
+                       if (likeliness > maxLikelinesses[idxSeq])
+                           maxLikelinesses[idxSeq] = likeliness;
+                   }
+               }
+
+               in.close();
+
+               // go through each sequence and gather all words that are above
+               // the minimum relative word likeliness
+               for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) {
+                   double maxLikeliness = maxLikelinesses[idxSeq];
+                   List<TopicWord> newSeqTopicWords = new ArrayList<>(wordsPerSequence);
+                   for (int idxWord = 0; idxWord < wordsPerSequence; idxWord++) {
+                       double likeliness = likelinesses[idxWord][idxSeq];
+                       if (likeliness >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) {
+                           Word newWord = new Word(vocab.get(idxWord));
+                           newWords.add(newWord);
+                           TopicWord newTopicWord = new TopicWord(newWord, likeliness);
+                           newSeqTopicWords.add(newTopicWord);
+                           // TODO gather words for static topic
+                       }
+                   }
+                   // TODO create dynamic topic of sequence
+               }
+               // TODO create static topic
+           }
+
+           // recreate topics and words
+           dbTopics.drop();
+           dbWords.drop();
+           try {
+               dbTopics.createMultiple(newTopics);
+               dbWords.createMultiple(newWords);
+           } catch (DatabaseException e) {
+               throw new AnalyzerException(e);
+           }
+
+           // read gam.dat. It contains the topic proportions per article.
+           File gamFile = new File(outDir, "gam.dat");
+           if (!gamFile.exists()) {
+               throw new AnalyzerException("gam file " + gamFile.getName() + " not found");
+           }
+
+           in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile)));
+           for (int idxArticle = 0; idxArticle < index.size(); idxArticle++) {
+               double topicTotalProportions = 0;
+               double[] topicProportions = new double[Constants.K_TOPICS];
+
+               // gather individual topic proportions and sum of proportions
+               for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
+                   double topicProportion = Double.parseDouble(in.readLine());
+                   topicTotalProportions += topicProportion;
+                   topicProportions[idxTopic] = topicProportion;
+               }
+
+               // normalize proportions
+               for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++)
+                   topicProportions[idxTopic] /= topicTotalProportions;
+           }
+
+           // TODO create topicrefs
        } catch (IOException | InterruptedException e) {
            throw new AnalyzerException(e);
        }
...
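The new import code does two core computations: per sequence it keeps only words whose likeliness is at least MINIMUM_RELATIVE_PROB times that sequence's maximum, and per article it normalizes the raw topic proportions read from gam.dat so they sum to one. A self-contained sketch of just these two steps, using plain arrays instead of the project's model classes and non-negative example values for clarity (the actual var-e-log-prob files hold log values); the class and method names here are made up:

import java.util.ArrayList;
import java.util.List;

public class DtmImportSketch {

    // mirrors Constants.MINIMUM_RELATIVE_PROB from the diff above
    static final double MINIMUM_RELATIVE_PROB = 0.01;

    // returns the word indices kept for one sequence (time slice),
    // given a [word][sequence] matrix of likeliness values
    static List<Integer> keptWords(double[][] likelinesses, int idxSeq) {
        double max = Double.NEGATIVE_INFINITY;
        for (double[] row : likelinesses)
            max = Math.max(max, row[idxSeq]);

        List<Integer> kept = new ArrayList<>();
        for (int idxWord = 0; idxWord < likelinesses.length; idxWord++)
            if (likelinesses[idxWord][idxSeq] >= MINIMUM_RELATIVE_PROB * max)
                kept.add(idxWord);
        return kept;
    }

    // normalizes one article's raw topic proportions (one gam.dat block) in place
    static void normalize(double[] proportions) {
        double total = 0;
        for (double p : proportions)
            total += p;
        for (int i = 0; i < proportions.length; i++)
            proportions[i] /= total;
    }

    public static void main(String[] args) {
        double[][] likelinesses = { { 0.40, 0.10 }, { 0.02, 0.30 }, { 0.001, 0.0005 } };
        System.out.println(keptWords(likelinesses, 0)); // [0, 1]  (0.001 < 0.01 * 0.40)

        double[] proportions = { 2.0, 1.0, 1.0 };
        normalize(proportions);
        System.out.println(proportions[0]); // 0.5
    }
}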
@@ -7,10 +7,8 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
-import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
-import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
@@ -72,6 +70,7 @@ public class JGibbAnalyzer extends Analyzer {
            options.est = !options.estc;
            options.K = Constants.K_TOPICS;
            options.twords = Constants.K_TOPIC_WORDS;
+           options.niters = Constants.STATIC_ITER;

            modelFile = new File(modelDir, NAME);
            options.dfile = modelFile.getName();
@@ -83,7 +82,7 @@
            dbArticles = MongoService.getDatabaseService(config, ArticleFull.class);
            dbTopics = MongoService.getDatabaseService(config, TopicFull.class);
            dbWords = MongoService.getDatabaseService(config, Word.class);
-           index = new FilebaseIndex(new File(modelDir, "index"));
+           index = new FilebaseIndex(modelDir);
        } catch (Exception e) {
            throw new AnalyzerException(e);
        }
@@ -110,36 +109,48 @@ public class JGibbAnalyzer extends Analyzer {
            // the list of new topics
            List<TopicFull> newTopics = new ArrayList<>(options.K);
-           // a map of topic index -> topic. resolves topic ids from tassign file
-           Map<Integer, Topic> newTopicsMap = new HashMap<>(options.K);
            // set of new words
            Set<Word> newWords = new HashSet<>();

-           TopicFull newTopic = null;
            List<TopicWord> topicWords = null;
-           int topicNum = 0;
+           int topicIndex = -1;
+           double[] maxLikelinesses = new double[Constants.K_TOPICS];

-           // for each line
+           // create topics and determine maximum likeliness for each topic
            for (String line : lines) {
                if (!line.startsWith("\t")) {
-                   newTopic = new TopicFull();
-                   topicWords = new ArrayList<>();
+                   topicIndex++;
+                   topicWords = new ArrayList<>(Constants.K_TOPIC_WORDS);
+                   TopicFull newTopic = new TopicFull();
                    newTopic.setWords(topicWords);
                    newTopics.add(newTopic);
-                   newTopicsMap.put(topicNum++, new Topic(newTopic.getId()));
                    continue;
                }

                String[] parts = line.trim().split("\\s+");
                double likeliness = Double.parseDouble(parts[1]);

+               // determine maximum likeliness of this topic
+               if (likeliness > maxLikelinesses[topicIndex])
+                   maxLikelinesses[topicIndex] = likeliness;

                // check word likeliness
-               if (likeliness >= Constants.MINIMUM_LIKELINESS) {
-                   Word newWord = new Word(parts[0]);
-                   TopicWord topicWord = new TopicWord(newWord, likeliness);
-                   topicWords.add(topicWord);
-                   newWords.add(newWord);
-               }
+               topicWords.add(new TopicWord(new Word(parts[0]), likeliness));
            }

+           // filter out words below minimum relative likeliness, add accepted
+           // words to list of new words
+           for (topicIndex = 0; topicIndex < newTopics.size(); topicIndex++) {
+               TopicFull topic = newTopics.get(topicIndex);
+               double maxLikeliness = maxLikelinesses[topicIndex];
+               ArrayList<TopicWord> filteredTopicWords = new ArrayList<>(topic.getWords().size());
+               for (TopicWord word : topic.getWords()) {
+                   if (word.getLikeliness() >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) {
+                       filteredTopicWords.add(word);
+                       newWords.add(word.getWord());
+                   }
+               }
+               topic.setWords(filteredTopicWords);
+           }

            // sort topic words and generate topic name
@@ -189,10 +200,10 @@
                // check if topic above threshold
                if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) {
                    reducedCount += entry.getValue();
-                   Topic topic = newTopicsMap.get(Integer.parseInt(entry.getKey()));
+                   TopicFull topic = newTopics.get(Integer.parseInt(entry.getKey()));
                    TopicRef ref = new TopicRef();
                    ref.setCount(entry.getValue());
-                   ref.setTopic(topic);
+                   ref.setTopic(new Topic(topic.getId()));
                    newTopicRefs.add(ref);
                }
            }
...
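The reworked twords handling above is now two passes: the first groups the word/likeliness lines into topics and records each topic's maximum likeliness, the second drops every word below MINIMUM_RELATIVE_PROB times that maximum. A compact sketch of the same two-pass idea over plain strings (the tab-indented "word likeliness" format follows the loop above; class and method names are made up):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TwordsFilterSketch {

    static final double MINIMUM_RELATIVE_PROB = 0.01; // as in Constants above

    // lines: a topic header line, followed by tab-indented "word likeliness" lines
    static List<List<String>> filter(List<String> lines) {
        // pass 1: group word/likeliness pairs per topic
        List<List<String[]>> topics = new ArrayList<>();
        List<String[]> current = null;
        for (String line : lines) {
            if (!line.startsWith("\t")) {
                current = new ArrayList<>();
                topics.add(current);
                continue;
            }
            current.add(line.trim().split("\\s+"));
        }

        // pass 2: keep only words at or above the per-topic relative threshold
        List<List<String>> filtered = new ArrayList<>();
        for (List<String[]> topic : topics) {
            double max = Double.NEGATIVE_INFINITY;
            for (String[] pair : topic)
                max = Math.max(max, Double.parseDouble(pair[1]));

            List<String> kept = new ArrayList<>();
            for (String[] pair : topic)
                if (Double.parseDouble(pair[1]) >= MINIMUM_RELATIVE_PROB * max)
                    kept.add(pair[0]);
            filtered.add(kept);
        }
        return filtered;
    }

    public static void main(String[] args) {
        List<String> lines = Arrays.asList(
                "Topic 0:",
                "\teconomy 0.20",
                "\tmarket 0.05",
                "\tnoise 0.0001");
        System.out.println(filter(lines)); // [[economy, market]]
    }
}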
@@ -7,9 +7,9 @@ import java.util.Set;
import java.util.stream.Collectors;

import edu.stanford.nlp.ling.CoreAnnotation;
-import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
...
@@ -8,8 +8,8 @@ import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.ling.CoreAnnotation;
-import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
...
@@ -4,5 +4,5 @@ db.name=test
es.host=localhost
es.port=9300
tm.processor=corenlp
-tm.analyzer=jgibb
+tm.analyzer=dtm
tm.dtmpath=/home/eike/repos/master/dtm_release/dtm/main
\ No newline at end of file
+/******************************************************************************
+ * Vipra Application
+ * Configuration
+ ******************************************************************************/
/* globals Vipra */
(function() {
...
@@ -79,7 +79,7 @@ public class Constants {
    /**
     * Minimum likeliness of words. Words with lower likeliness are ignored
     */
-   public static final double MINIMUM_LIKELINESS = 0;
+   public static final double MINIMUM_RELATIVE_PROB = 0.01;

    /**
     * Topics with a share greater or equal to this number are regarded as
@@ -87,6 +87,21 @@
     */
    public static final double TOPIC_THRESHOLD = 0.01;

+   /**
+    * Dynamic minimum iterations. Used for dynamic topic modeling.
+    */
+   public static final int DYNAMIC_MIN_ITER = 100;
+
+   /**
+    * Dynamic maximum iterations. Used for dynamic topic modeling.
+    */
+   public static final int DYNAMIC_MAX_ITER = 1000;
+
+   /**
+    * Static iterations. Used for static topic modeling.
+    */
+   public static final int STATIC_ITER = 100;
+
    /**
     * Minimum word frequency for words to be used for topic modeling. All words
     * below this frequency in a document are filtered out before generating the
...
package de.vipra.util;

-import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
@@ -112,27 +111,12 @@ public class FileUtils extends org.apache.commons.io.FileUtils {
     * @throws IOException
     */
    public static int countLines(File file) throws IOException {
-       if (!file.exists()) {
-           return 0;
-       }
-       InputStream is = new BufferedInputStream(new FileInputStream(file));
-       try {
-           byte[] c = new byte[1024];
-           int count = 0;
-           int readChars = 0;
-           boolean empty = true;
-           while ((readChars = is.read(c)) != -1) {
-               empty = false;
-               for (int i = 0; i < readChars; ++i) {
-                   if (c[i] == '\n') {
-                       ++count;
-                   }
-               }
-           }
-           return (count == 0 && !empty) ? 1 : count;
-       } finally {
-           is.close();
-       }
+       BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
+       int lines = 0;
+       while (reader.readLine() != null)
+           lines++;
+       reader.close();
+       return lines;
    }

    /**
...
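The countLines rewrite swaps byte-level '\n' counting for BufferedReader.readLine(). Two behavioral notes: a final line without a trailing newline is now counted, and a missing file no longer yields 0 but a FileNotFoundException from FileInputStream, which is why the DTM import checks seqFile.exists() first. A small usage sketch (hypothetical path, file name pattern as produced by the dtm binary per the analyzer above):

import java.io.File;
import java.io.IOException;

import de.vipra.util.FileUtils;

public class CountLinesExample {

    public static void main(String[] args) throws IOException {
        // hypothetical location of a per-topic sequence file written by the dtm binary
        File seqFile = new File("/path/to/data/dtm/out/lda-seq/topic-000-var-e-log-prob.dat");

        if (seqFile.exists())
            System.out.println("lines: " + FileUtils.countLines(seqFile));
        else
            System.out.println("missing: " + seqFile.getName());
    }
}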
@@ -81,9 +81,9 @@ public class StringUtils {
        return StringUtils.join(parts);
    }

-   public static String padNumber(int lineCount) {
-       String lc = Integer.toString(lineCount);
-       while (lc.length() < 10) {
+   public static String padNumber(int number, int length) {
+       String lc = Integer.toString(number);
+       while (lc.length() < length) {
            lc = "0" + lc;
        }
        return lc;
...
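padNumber now takes the target width as a parameter instead of hard-coding 10, which is what lets the DTM analyzer build its three-digit topic file names. Quick usage sketch:

import de.vipra.util.StringUtils;

public class PadNumberExample {

    public static void main(String[] args) {
        // matches the call site in DTMAnalyzer: zero-pad the topic index to width 3
        String name = "topic-" + StringUtils.padNumber(7, 3) + "-var-e-log-prob.dat";
        System.out.println(name); // topic-007-var-e-log-prob.dat
    }
}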