diff --git a/build.sh b/build.sh index 3c49ceb7dfedc8c8b081fda04336b1f76cf6fb82..f700a5ce55358f313861149226b15299459b809a 100755 --- a/build.sh +++ b/build.sh @@ -57,7 +57,6 @@ echo "-------------------------------" >> $LOG cd ./vipra-ui ./build.sh >> $LOG 2>&1 cd .. -cp -r ./vipra-ui/public ./vipra-backend/src/main/webapp/public if [ $? -ne 0 ]; then echo "error" exit 1 diff --git a/vipra-backend/.settings/org.eclipse.wst.common.component b/vipra-backend/.settings/org.eclipse.wst.common.component index 363ce15daea32cc3701cf69aeba0762672248c68..e7bb02f746329d8c18868f884bb4b9ca909b2514 100644 --- a/vipra-backend/.settings/org.eclipse.wst.common.component +++ b/vipra-backend/.settings/org.eclipse.wst.common.component @@ -1,9 +1,9 @@ <?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0"> <wb-module deploy-name="vipra-backend"> <wb-resource deploy-path="/" source-path="/target/m2e-wtp/web-resources"/> - <wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/> <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/> <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/resources"/> + <wb-resource deploy-path="/" source-path="/src/main/webapp"/> <dependent-module archiveName="util-0.0.1-SNAPSHOT.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/vipra-util/vipra-util"> <dependency-type>uses</dependency-type> </dependent-module> diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java index 9bd18963a1a7dbef12767cd988119f16bde5c307..a34bcd54dca65c54622cff90da89cebf83826aac 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java @@ -81,8 +81,11 @@ public class InfoResource { info.put("const.topicautoname", Constants.TOPIC_AUTO_NAMING_WORDS); info.put("const.ktopics", Constants.K_TOPICS); info.put("const.ktopicwords", Constants.K_TOPIC_WORDS); - info.put("const.minimumlike", Constants.MINIMUM_LIKELINESS); + info.put("const.minimumlike", Constants.MINIMUM_RELATIVE_PROB); info.put("const.topicthresh", Constants.TOPIC_THRESHOLD); + info.put("const.dynminiter", Constants.DYNAMIC_MIN_ITER); + info.put("const.dynmaxiter", Constants.DYNAMIC_MAX_ITER); + info.put("const.statiter", Constants.STATIC_ITER); info.put("const.docminfreq", Constants.DOCUMENT_MIN_WORD_FREQ); info.put("const.docminlength", Constants.DOCUMENT_MIN_LENGTH); info.put("const.charsdisallow", Constants.CHARS_DISALLOWED); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java index 70842fdfb932a45b41f6439d519b90e51d75afb4..c57aa58d6180c9a08c57acb1678658a17ae1a122 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java @@ -6,12 +6,12 @@ import static de.vipra.cmd.CmdOptions.OPT_DEBUG; import static de.vipra.cmd.CmdOptions.OPT_DEFAULTS; import static de.vipra.cmd.CmdOptions.OPT_HELP; import static de.vipra.cmd.CmdOptions.OPT_IMPORT; +import static de.vipra.cmd.CmdOptions.OPT_INDEXING; import static de.vipra.cmd.CmdOptions.OPT_MODELING; import static de.vipra.cmd.CmdOptions.OPT_SHELL; import static de.vipra.cmd.CmdOptions.OPT_SILENT; import static de.vipra.cmd.CmdOptions.OPT_STATS; import static de.vipra.cmd.CmdOptions.OPT_TEST; -import static de.vipra.cmd.CmdOptions.OPT_INDEXING; import java.util.ArrayList; import java.util.List; diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java index 4e3cccb39d460af8b3548104ccbdec852b1d5144..828c6fb2b701fe22808acab8a31697b040bde595 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java @@ -11,7 +11,6 @@ import java.util.List; import de.vipra.cmd.ex.FilebaseException; import de.vipra.cmd.file.DTMSequenceIndex.DTMDateIndexEntry; -import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.FileUtils; import de.vipra.util.ex.ConfigException; @@ -27,17 +26,11 @@ public class DTMFilebase extends Filebase { public DTMFilebase(File dataDir) throws FilebaseException { super(dataDir, "dtm"); - Config config; - try { - config = Config.getConfig(); - } catch (IOException | ConfigException e) { - throw new FilebaseException(e); - } File modelDir = getModelDir(); try { - this.seqindex = new DTMSequenceIndex(modelDir, config.windowResolution, false); - } catch (IOException | ParseException e) { + this.seqindex = new DTMSequenceIndex(modelDir); + } catch (IOException | ConfigException | ParseException e) { throw new FilebaseException("could not read date index file", e); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java index 7dc6b2228accb5f33657d3cf91736c277cfb317a..2dfe89ec7e4e08daf483bc083eb02dd364829efd 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java @@ -16,9 +16,11 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.Constants.WindowResolution; import de.vipra.util.FileUtils; +import de.vipra.util.ex.ConfigException; public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> { @@ -46,13 +48,19 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT } private File file; - private static WindowResolution windowResolution; + private boolean readonly = false; + private WindowResolution windowResolution; private static List<DTMDateIndexEntry> entries; private static SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); - public DTMSequenceIndex(File modelDir, WindowResolution wr, boolean reread) throws IOException, ParseException { + public DTMSequenceIndex(File modelDir) throws IOException, ParseException, ConfigException { + this(modelDir, false); + } + + public DTMSequenceIndex(File modelDir, boolean reread) throws IOException, ParseException, ConfigException { this.file = new File(modelDir, "dates"); - windowResolution = wr; + Config config = Config.getConfig(); + this.windowResolution = config.windowResolution; if (file.exists()) { if (entries == null || reread) { List<String> dates = FileUtils.readFile(file); @@ -78,6 +86,8 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT @Override public void close() throws IOException { + if (readonly) + return; List<String> windows = new ArrayList<>(); Map<String, Integer> windowSizes = new HashMap<>(); @@ -111,4 +121,8 @@ public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DT writer.close(); } + public int size() { + return entries.size(); + } + } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java index fc056a3a3d9a40e8541e62e7556a51d561c83380..e9ceb68e2c4ba88d27fcb2f1fac0247ef04f5c49 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java @@ -22,6 +22,10 @@ public class DTMVocabulary implements Closeable, Iterable<String> { private static Map<String, Integer> vocablesMap; private static int nextIndex = 1; + public DTMVocabulary(File modelDir) throws IOException { + this(modelDir, false); + } + public DTMVocabulary(File modelDir, boolean reread) throws IOException { this.file = new File(modelDir, FILE_NAME); if (file.exists()) { @@ -80,6 +84,10 @@ public class DTMVocabulary implements Closeable, Iterable<String> { return sb.toString(); } + public String get(int index) { + return vocables.get(index); + } + @Override public void close() throws IOException { write(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java index 28f7a47a2dcf99657f50f95acd1d6df9df598d5b..5fe238eeff45571c6b6933975e6761899880dd6a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java @@ -12,11 +12,13 @@ import de.vipra.util.FileUtils; public class FilebaseIndex implements Closeable, Iterable<String> { + public static final String FILE_NAME = "index"; + private final File file; private final List<String> index; - public FilebaseIndex(File file) throws IOException { - this.file = file; + public FilebaseIndex(File modelDir) throws IOException { + this.file = new File(modelDir, FILE_NAME); if (file.exists()) { index = new ArrayList<>(FileUtils.readFile(file)); } else { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java index 1e922e5e1c80f332f954c65e9e99211529546f2a..63e128133c1add26f31b9cfa4ea07fb406d7de85 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java @@ -2,30 +2,49 @@ package de.vipra.cmd.lda; import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.List; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.bson.types.ObjectId; import de.vipra.cmd.ex.AnalyzerException; +import de.vipra.cmd.file.DTMSequenceIndex; +import de.vipra.cmd.file.DTMVocabulary; +import de.vipra.cmd.file.FilebaseIndex; import de.vipra.util.Config; import de.vipra.util.Constants; +import de.vipra.util.FileUtils; import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; +import de.vipra.util.ex.DatabaseException; +import de.vipra.util.model.ArticleFull; +import de.vipra.util.model.TopicFull; +import de.vipra.util.model.TopicWord; +import de.vipra.util.model.Word; +import de.vipra.util.service.MongoService; public class DTMAnalyzer extends Analyzer { public static final Logger log = LogManager.getLogger(DTMAnalyzer.class); - public static final String NAME = "dtm"; - public static final int dynamicMinIter = 100; - public static final int dynamicMaxIter = 1000; - public static final int staticIter = 100; + public static final String NAME = "dtm"; private String command; private File modelDir; private File outDir; + private File outDirSeq; + private DTMVocabulary vocab; + private DTMSequenceIndex sequences; + private FilebaseIndex index; + private MongoService<ArticleFull, ObjectId> dbArticles; + private MongoService<TopicFull, ObjectId> dbTopics; + private MongoService<Word, String> dbWords; protected DTMAnalyzer() { super("Dynamic Topic Model Analyzer"); @@ -37,7 +56,16 @@ public class DTMAnalyzer extends Analyzer { File dataDir = config.getDataDirectory(); this.modelDir = new File(dataDir, NAME); this.outDir = new File(modelDir, "out"); - } catch (ConfigException e) { + this.outDirSeq = new File(outDir, "lda-seq"); + this.vocab = new DTMVocabulary(modelDir); + this.sequences = new DTMSequenceIndex(modelDir); + index = new FilebaseIndex(modelDir); + + config = Config.getConfig(); + dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); + dbTopics = MongoService.getDatabaseService(config, TopicFull.class); + dbWords = MongoService.getDatabaseService(config, Word.class); + } catch (ConfigException | IOException | ParseException e) { throw new AnalyzerException(e); } @@ -66,11 +94,11 @@ public class DTMAnalyzer extends Analyzer { // alpha (default -10) "--alpha=0.01", // minimum number if iterations - "--lda_sequence_min_iter=" + dynamicMinIter, + "--lda_sequence_min_iter=" + Constants.DYNAMIC_MIN_ITER, // maximum number of iterations - "--lda_sequence_max_iter=" + dynamicMaxIter, + "--lda_sequence_max_iter=" + Constants.DYNAMIC_MAX_ITER, // em iter (default 20) - "--lda_max_em_iter=" + staticIter, + "--lda_max_em_iter=" + Constants.STATIC_ITER, // input file prefix "--corpus_prefix=" + corpusPrefix, // output directory @@ -86,6 +114,7 @@ public class DTMAnalyzer extends Analyzer { if (!p.isAlive()) throw new AnalyzerException("dtm process is dead"); + // read from process output BufferedReader in = new BufferedReader(new InputStreamReader(p.getErrorStream())); String line; @@ -99,7 +128,101 @@ public class DTMAnalyzer extends Analyzer { in.close(); p.waitFor(); - // TODO save model + List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS); + List<Word> newWords = new ArrayList<>(vocab.size()); + int sequencesCount = sequences.size(); + + // read topic definition files + for (int i = 0; i < Constants.K_TOPICS; i++) { + File seqFile = new File(outDirSeq, "topic-" + StringUtils.padNumber(i, 3) + "-var-e-log-prob.dat"); + if (!seqFile.exists()) { + log.error("seq file " + seqFile.getName() + " not found"); + continue; + } + + int lineCount = FileUtils.countLines(seqFile); + int wordsPerSequence = lineCount / sequencesCount; + + if (wordsPerSequence * sequencesCount != lineCount) { + log.error("unexpected number of words per sequence"); + continue; + } + + in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile))); + + // read file lines into word x sequence matrix + // gather maximum likeliness per sequence + double[] maxLikelinesses = new double[sequencesCount]; + double[][] likelinesses = new double[wordsPerSequence][sequencesCount]; + for (int idxWord = 0; idxWord < wordsPerSequence; idxWord++) { + for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { + double likeliness = Double.parseDouble(in.readLine()); + likelinesses[idxWord][idxSeq] = likeliness; + if (likeliness > maxLikelinesses[idxSeq]) + maxLikelinesses[idxSeq] = likeliness; + } + } + + in.close(); + + // go through each sequence and gather all words that are above + // the minimum relative word likeliness + for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { + double maxLikeliness = maxLikelinesses[idxSeq]; + List<TopicWord> newSeqTopicWords = new ArrayList<>(wordsPerSequence); + for (int idxWord = 0; idxWord < wordsPerSequence; idxWord++) { + double likeliness = likelinesses[idxWord][idxSeq]; + if (likeliness >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) { + Word newWord = new Word(vocab.get(idxWord)); + newWords.add(newWord); + TopicWord newTopicWord = new TopicWord(newWord, likeliness); + newSeqTopicWords.add(newTopicWord); + + // TODO gather words for static topic + } + } + + // TODO create dynamic topic of sequence + } + + // TODO create static topic + } + + // recreate topics and words + dbTopics.drop(); + dbWords.drop(); + try { + dbTopics.createMultiple(newTopics); + dbWords.createMultiple(newWords); + } catch (DatabaseException e) { + throw new AnalyzerException(e); + } + + // read gam.dat. It contains the topic proportions per article. + + File gamFile = new File(outDir, "gam.dat"); + if (!gamFile.exists()) { + throw new AnalyzerException("gam file " + gamFile.getName() + " not found"); + } + in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile))); + + for (int idxArticle = 0; idxArticle < index.size(); idxArticle++) { + double topicTotalProportions = 0; + double[] topicProportions = new double[Constants.K_TOPICS]; + + // gather individual topic proportions and sum of proportions + for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + double topicProportion = Double.parseDouble(in.readLine()); + topicTotalProportions += topicProportion; + topicProportions[idxTopic] = topicProportion; + } + + // normalize proportions + for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) + topicProportions[idxTopic] /= topicTotalProportions; + } + + // TODO create topicrefs } catch (IOException | InterruptedException e) { throw new AnalyzerException(e); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java index fd8ce1baf4658474c05f9791fd4b1f21aec03161..98f50d2f4bccb7b528570b4c1c7f32b610987217 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java @@ -7,10 +7,8 @@ import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.regex.Matcher; @@ -72,6 +70,7 @@ public class JGibbAnalyzer extends Analyzer { options.est = !options.estc; options.K = Constants.K_TOPICS; options.twords = Constants.K_TOPIC_WORDS; + options.niters = Constants.STATIC_ITER; modelFile = new File(modelDir, NAME); options.dfile = modelFile.getName(); @@ -83,7 +82,7 @@ public class JGibbAnalyzer extends Analyzer { dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); dbTopics = MongoService.getDatabaseService(config, TopicFull.class); dbWords = MongoService.getDatabaseService(config, Word.class); - index = new FilebaseIndex(new File(modelDir, "index")); + index = new FilebaseIndex(modelDir); } catch (Exception e) { throw new AnalyzerException(e); } @@ -110,36 +109,48 @@ public class JGibbAnalyzer extends Analyzer { // the list of new topics List<TopicFull> newTopics = new ArrayList<>(options.K); - // a map of topic index -> topic. resolves topic ids from tassign file - Map<Integer, Topic> newTopicsMap = new HashMap<>(options.K); // set of new words Set<Word> newWords = new HashSet<>(); - TopicFull newTopic = null; List<TopicWord> topicWords = null; - int topicNum = 0; + int topicIndex = -1; + double[] maxLikelinesses = new double[Constants.K_TOPICS]; - // for each line + // create topics and determine maximum likeliness for each topic for (String line : lines) { if (!line.startsWith("\t")) { - newTopic = new TopicFull(); - topicWords = new ArrayList<>(); + topicIndex++; + topicWords = new ArrayList<>(Constants.K_TOPIC_WORDS); + TopicFull newTopic = new TopicFull(); newTopic.setWords(topicWords); newTopics.add(newTopic); - newTopicsMap.put(topicNum++, new Topic(newTopic.getId())); continue; } String[] parts = line.trim().split("\\s+"); double likeliness = Double.parseDouble(parts[1]); + // determine maximum likeliness of this topic + if (likeliness > maxLikelinesses[topicIndex]) + maxLikelinesses[topicIndex] = likeliness; + // check word likeliness - if (likeliness >= Constants.MINIMUM_LIKELINESS) { - Word newWord = new Word(parts[0]); - TopicWord topicWord = new TopicWord(newWord, likeliness); - topicWords.add(topicWord); - newWords.add(newWord); + topicWords.add(new TopicWord(new Word(parts[0]), likeliness)); + } + + // filter out words below minimum relative likeliness, add accepted + // words to list of new words + for (topicIndex = 0; topicIndex < newTopics.size(); topicIndex++) { + TopicFull topic = newTopics.get(topicIndex); + double maxLikeliness = maxLikelinesses[topicIndex]; + ArrayList<TopicWord> filteredTopicWords = new ArrayList<>(topic.getWords().size()); + for (TopicWord word : topic.getWords()) { + if (word.getLikeliness() >= Constants.MINIMUM_RELATIVE_PROB * maxLikeliness) { + filteredTopicWords.add(word); + newWords.add(word.getWord()); + } } + topic.setWords(filteredTopicWords); } // sort topic words and generate topic name @@ -189,10 +200,10 @@ public class JGibbAnalyzer extends Analyzer { // check if topic above threshold if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) { reducedCount += entry.getValue(); - Topic topic = newTopicsMap.get(Integer.parseInt(entry.getKey())); + TopicFull topic = newTopics.get(Integer.parseInt(entry.getKey())); TopicRef ref = new TopicRef(); ref.setCount(entry.getValue()); - ref.setTopic(topic); + ref.setTopic(new Topic(topic.getId())); newTopicRefs.add(ref); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java index 8f339c497633b848658ed3801cc7d5191cf4f898..ae39104e90f29e498ce4816b7ff5395f8d5b24fb 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java @@ -7,9 +7,9 @@ import java.util.Set; import java.util.stream.Collectors; import edu.stanford.nlp.ling.CoreAnnotation; -import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.Annotator; diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/StopwordsAnnotator.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/StopwordsAnnotator.java index 7d5ab90a920e88a8051b84d464d102184014c074..42185b9acbce33159c4d67c7e97c33ec10705463 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/StopwordsAnnotator.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/StopwordsAnnotator.java @@ -8,8 +8,8 @@ import java.util.Properties; import java.util.Set; import edu.stanford.nlp.ling.CoreAnnotation; -import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.Annotator; diff --git a/vipra-cmd/src/main/resources/config.properties b/vipra-cmd/src/main/resources/config.properties index 61bfe4a9fb5a057b497c5947af9db341ff191c81..80c62240811d3f1eeb9f19f17962b32649f6a382 100644 --- a/vipra-cmd/src/main/resources/config.properties +++ b/vipra-cmd/src/main/resources/config.properties @@ -4,5 +4,5 @@ db.name=test es.host=localhost es.port=9300 tm.processor=corenlp -tm.analyzer=jgibb +tm.analyzer=dtm tm.dtmpath=/home/eike/repos/master/dtm_release/dtm/main \ No newline at end of file diff --git a/vipra-ui/app/js/config.js b/vipra-ui/app/js/config.js index 07d0630b43a8c0e5f606aefdd057965e14b62618..92cbc66abd00f4f4e449251ed23d2645b53cf6da 100644 --- a/vipra-ui/app/js/config.js +++ b/vipra-ui/app/js/config.js @@ -1,3 +1,7 @@ +/****************************************************************************** + * Vipra Application + * Configuration + ******************************************************************************/ /* globals Vipra */ (function() { diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index c5bd5390ece80923acb9022e4d024e0f1a2103e9..c719c6785a2f4827d9f027e1ccc70a811231a650 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -75,11 +75,11 @@ public class Constants { * library supports this parameter. */ public static final int K_TOPIC_WORDS = 50; - + /** * Minimum likeliness of words. Words with lower likeliness are ignored */ - public static final double MINIMUM_LIKELINESS = 0; + public static final double MINIMUM_RELATIVE_PROB = 0.01; /** * Topics with a share greater or equal to this number are regarded as @@ -87,6 +87,21 @@ public class Constants { */ public static final double TOPIC_THRESHOLD = 0.01; + /** + * Dynamic minimum iterations. Used for dynamic topic modeling. + */ + public static final int DYNAMIC_MIN_ITER = 100; + + /** + * Dynamic maximum iterations. Used for dynamic topic modeling. + */ + public static final int DYNAMIC_MAX_ITER = 1000; + + /** + * Static iterations. Used for static topic modeling. + */ + public static final int STATIC_ITER = 100; + /** * Minimum word frequency for words to be used for topic modeling. All words * below this frequency in a document are filtered out before generating the diff --git a/vipra-util/src/main/java/de/vipra/util/FileUtils.java b/vipra-util/src/main/java/de/vipra/util/FileUtils.java index 1b26ea291e01333846f40450c7bed53e970eda1d..03b22281138b4646bb2c5bf840a531dcb2b0fc8b 100644 --- a/vipra-util/src/main/java/de/vipra/util/FileUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/FileUtils.java @@ -1,6 +1,5 @@ package de.vipra.util; -import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -112,27 +111,12 @@ public class FileUtils extends org.apache.commons.io.FileUtils { * @throws IOException */ public static int countLines(File file) throws IOException { - if (!file.exists()) { - return 0; - } - InputStream is = new BufferedInputStream(new FileInputStream(file)); - try { - byte[] c = new byte[1024]; - int count = 0; - int readChars = 0; - boolean empty = true; - while ((readChars = is.read(c)) != -1) { - empty = false; - for (int i = 0; i < readChars; ++i) { - if (c[i] == '\n') { - ++count; - } - } - } - return (count == 0 && !empty) ? 1 : count; - } finally { - is.close(); - } + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); + int lines = 0; + while (reader.readLine() != null) + lines++; + reader.close(); + return lines; } /** diff --git a/vipra-util/src/main/java/de/vipra/util/StringUtils.java b/vipra-util/src/main/java/de/vipra/util/StringUtils.java index 9d82a4c217d70a003861081c30519288172ea929..3f823beff07ff6b626763cd4b12d51b01928242e 100644 --- a/vipra-util/src/main/java/de/vipra/util/StringUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/StringUtils.java @@ -81,9 +81,9 @@ public class StringUtils { return StringUtils.join(parts); } - public static String padNumber(int lineCount) { - String lc = Integer.toString(lineCount); - while (lc.length() < 10) { + public static String padNumber(int number, int length) { + String lc = Integer.toString(number); + while (lc.length() < length) { lc = "0" + lc; } return lc;