diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java index 1a3d18310883ad428a639d6f9e5701f1eb2cb791..466ec1003d16bd59a9849ef5397e29524d9d7470 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java @@ -70,10 +70,10 @@ public class InfoResource { info.put("const.ktopics", Constants.K_TOPICS); info.put("const.ktopicwords", Constants.K_TOPIC_WORDS); info.put("const.decaylambda", Constants.RISING_DECAY_LAMBDA); - info.put("const.minrelprob", Constants.MINIMUM_RELATIVE_PROB); + info.put("const.minrelprob", Constants.MIN_RELATIVE_PROB); info.put("const.minshare", Constants.MINIMUM_SHARE); info.put("const.maxsimdocs", Constants.MAX_SIMILAR_DOCUMENTS); - info.put("const.maxdiv", Constants.MAX_DIVERGENCE); + info.put("const.maxdiv", Constants.MAX_SIMILAR_DOCUMENTS_DIVERGENCE); info.put("const.dynminiter", Constants.DYNAMIC_MIN_ITER); info.put("const.dynmaxiter", Constants.DYNAMIC_MAX_ITER); info.put("const.statiter", Constants.STATIC_ITER); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java index f03df9e06a1864b2c6700ade50260999da580e23..3d9bf43085a2592e24a7b776d4e6173c7a5fc272 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java @@ -17,9 +17,9 @@ import org.mongodb.morphia.logging.slf4j.SLF4JLoggerImplFactory; import de.vipra.cmd.option.ClearCommand; import de.vipra.cmd.option.Command; -import de.vipra.cmd.option.EditModelCommand; import de.vipra.cmd.option.CreateModelCommand; import de.vipra.cmd.option.DeleteModelCommand; +import de.vipra.cmd.option.EditModelCommand; import de.vipra.cmd.option.ImportCommand; import de.vipra.cmd.option.IndexingCommand; import de.vipra.cmd.option.ListModelsCommand; diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java index 35eafe21061391bc096f815886d624e823c88375..2448ee904794bfdf6d7f1308310ef6cef0d3806f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java @@ -23,7 +23,6 @@ import de.vipra.cmd.file.FilebaseWindowIndex; import de.vipra.cmd.file.FilebaseWordIndex; import de.vipra.util.ArrayUtils; import de.vipra.util.Config; -import de.vipra.util.Constants; import de.vipra.util.ModelConfig; import de.vipra.util.MongoUtils; import de.vipra.util.StringUtils; @@ -83,7 +82,7 @@ public class Analyzer { final String[] parameters = { // number of topics - "--ntopics=" + Constants.K_TOPICS, + "--ntopics=" + modelConfig.getkTopics(), // topc modeling mode "--mode=fit", // random seed (0 for pseudo random) @@ -95,11 +94,11 @@ public class Analyzer { // alpha (default 0.01) "--alpha=0.01", // minimum number if iterations - "--lda_sequence_min_iter=" + Constants.DYNAMIC_MIN_ITER, + "--lda_sequence_min_iter=" + modelConfig.getDynamicMinIterations(), // maximum number of iterations - "--lda_sequence_max_iter=" + Constants.DYNAMIC_MAX_ITER, + "--lda_sequence_max_iter=" + modelConfig.getDynamicMaxIterations(), // em iter (default 20) - "--lda_max_em_iter=" + Constants.STATIC_ITER, + "--lda_max_em_iter=" + modelConfig.getStaticIterations(), // input file prefix "--corpus_prefix=" + modelDir.getAbsolutePath() + File.separator + "dtm", // output directory @@ -141,6 +140,7 @@ public class Analyzer { dbWindows.deleteMultiple(builder); dbTopicModels.deleteSingle(modelConfig.getName()); + final int topicCount = modelConfig.getkTopics(); final int wordCount = wordIndex.size(); final int sequencesCount = windowIndex.size(); final int articlesCount = idDateIndex.size(); @@ -152,17 +152,17 @@ public class Analyzer { throw new AnalyzerException("file not found: " + gamFile.getAbsolutePath()); in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile))); - final double[][] topicDistributions = new double[articlesCount][Constants.K_TOPICS]; + final double[][] topicDistributions = new double[articlesCount][topicCount]; for (int idxArticle = 0; idxArticle < articlesCount; idxArticle++) { // read distributions into matrix and sum double topicDistributionSum = 0; - for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + for (int idxTopic = 0; idxTopic < topicCount; idxTopic++) { final double topicDistribution = Double.parseDouble(in.readLine()); topicDistributions[idxArticle][idxTopic] = topicDistribution; topicDistributionSum += topicDistribution; } // normalize distributions by sum - for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + for (int idxTopic = 0; idxTopic < topicCount; idxTopic++) { topicDistributions[idxArticle][idxTopic] /= topicDistributionSum; } } @@ -173,14 +173,14 @@ public class Analyzer { final TopicModelFull newTopicModel = new TopicModelFull(modelConfig.getName()); final List<Window> newWindows = new ArrayList<>(sequencesCount); - final List<SequenceFull> newSequences = new ArrayList<>(Constants.K_TOPICS * sequencesCount); - final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS); + final List<SequenceFull> newSequences = new ArrayList<>(topicCount * sequencesCount); + final List<TopicFull> newTopics = new ArrayList<>(topicCount); log.info("vocabulary size: " + wordCount); log.info("sequences: " + sequencesCount); - log.info("topics: " + Constants.K_TOPICS); + log.info("topics: " + topicCount); - final boolean seqRelativeCutoff = Constants.MINIMUM_RELATIVE_PROB > 0; + final boolean seqRelativeCutoff = modelConfig.getMinRelativeProbability() > 0; // create sequence windows for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { @@ -188,13 +188,13 @@ public class Analyzer { newWindow.setId(idxSeq); newWindow.setStartDate(windowIndex.startDate(idxSeq)); newWindow.setEndDate(windowIndex.endDate(idxSeq)); - newWindow.setWindowResolution(Constants.WINDOW_RESOLUTION); + newWindow.setWindowResolution(modelConfig.getWindowResolution()); newWindow.setModel(new TopicModel(newTopicModel.getId())); newWindows.add(newWindow); } // for each topic - for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + for (int idxTopic = 0; idxTopic < topicCount; idxTopic++) { final File seqFile = new File(outDirSeq, "topic-" + StringUtils.padNumber(idxTopic, 3) + "-var-e-log-prob.dat"); if (!seqFile.exists()) { in.close(); @@ -235,7 +235,7 @@ public class Analyzer { for (int idxSeq = 0, sequenceOffset = 0; idxSeq < sequencesCount; idxSeq++) { // calculate relative cutoff probability final double maxSeqLikeliness = maxSeqLikelinesses[idxSeq]; - final double minRelativeSeqLikeliness = Constants.MINIMUM_RELATIVE_PROB * Math.abs(maxSeqLikeliness); + final double minRelativeSeqLikeliness = modelConfig.getMinRelativeProbability() * Math.abs(maxSeqLikeliness); // collect words final List<TopicWord> newSeqTopicWords = new ArrayList<>(wordCount); @@ -254,7 +254,7 @@ public class Analyzer { Collections.sort(newSeqTopicWords, Comparator.reverseOrder()); // collect top words - topTopicWords.addAll(newSeqTopicWords.subList(0, Math.min(newSeqTopicWords.size(), Constants.TOPIC_AUTO_NAMING_WORDS))); + topTopicWords.addAll(newSeqTopicWords.subList(0, Math.min(newSeqTopicWords.size(), modelConfig.getTopicAutoNamingWords()))); } // calculate topic sequence relevance @@ -318,7 +318,7 @@ public class Analyzer { } else { fallingRelevance += Math.abs(relevanceDiff); } - risingDecayRelevance += Math.exp(-Constants.RISING_DECAY_LAMBDA * (sequencesCount - idxSeq2 + 1)) * relevanceDiff; + risingDecayRelevance += Math.exp(-modelConfig.getRisingDecayLambda() * (sequencesCount - idxSeq2 + 1)) * relevanceDiff; } newTopic.setRisingRelevance(risingRelevance); newTopic.setFallingRelevance(fallingRelevance); @@ -345,8 +345,8 @@ public class Analyzer { // create topic references double reducedShare = 0; - final List<TopicShare> newTopicRefs = new ArrayList<>(Constants.K_TOPICS); - for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + final List<TopicShare> newTopicRefs = new ArrayList<>(topicCount); + for (int idxTopic = 0; idxTopic < topicCount; idxTopic++) { if (topicDistribution[idxTopic] > 0.01) { reducedShare += topicDistribution[idxTopic]; final TopicShare newTopicRef = new TopicShare(); @@ -366,7 +366,7 @@ public class Analyzer { continue; final double divergence = ArrayUtils.jsDivergence(topicDistributions[idxArticle], topicDistributions[idxArticle2]); - if (divergence > Constants.MAX_DIVERGENCE) + if (divergence > modelConfig.getMaxSimilarDocumentsDivergence()) continue; final SimilarArticle similarArticle = new SimilarArticle(); @@ -377,8 +377,8 @@ public class Analyzer { Collections.sort(similarArticles); - if (similarArticles.size() > Constants.MAX_SIMILAR_DOCUMENTS) - similarArticles.subList(Constants.MAX_SIMILAR_DOCUMENTS, similarArticles.size()).clear(); + if (similarArticles.size() > modelConfig.getMaxSimilarDocuments()) + similarArticles.subList(modelConfig.getMaxSimilarDocuments(), similarArticles.size()).clear(); // update article diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java index 8024a5ed43af6560b5023f613333ddb794e261fb..fc93cfc452a9f86cfed253d2abf58a000ee2fd41 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java @@ -6,9 +6,6 @@ import org.apache.commons.io.IOUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializationFeature; - import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.FileUtils; @@ -31,16 +28,13 @@ public class CreateModelCommand implements Command { final Config config = Config.getConfig(); - final ObjectMapper mapper = new ObjectMapper(); final String modelConfigString; if (config.getModelConfigTemplate() == null) { modelConfigString = IOUtils.toString(FileUtils.getResource(Constants.MODEL_FILE)); } else { - modelConfigString = mapper.writeValueAsString(config.getModelConfigTemplate()); + modelConfigString = Config.mapper.writeValueAsString(config.getModelConfigTemplate()); } - mapper.enable(SerializationFeature.INDENT_OUTPUT); - for (final String name : names) { if (name.toLowerCase().equals("all")) throw new Exception("invalid model name: " + name); @@ -50,9 +44,9 @@ public class CreateModelCommand implements Command { if (!modelDir.mkdirs()) throw new Exception("could not create model directory: " + modelDir.getAbsolutePath()); final File modelConfigFile = new File(modelDir, Constants.MODEL_FILE); - final ModelConfig modelConfig = mapper.readValue(modelConfigString, ModelConfig.class); + final ModelConfig modelConfig = Config.mapper.readValue(modelConfigString, ModelConfig.class); modelConfig.setName(name); - org.apache.commons.io.FileUtils.write(modelConfigFile, mapper.writeValueAsString(modelConfig)); + org.apache.commons.io.FileUtils.write(modelConfigFile, Config.mapper.writeValueAsString(modelConfig)); config.getModelConfigs().put(name, modelConfig); log.info("model created: " + name); } diff --git a/vipra-cmd/src/main/resources/model.json b/vipra-cmd/src/main/resources/model.json index 628e3533c7d28c35d9f53941d70e24f3c3621738..0eed5c3637e873744b64831a81f89ea2b6b1b532 100644 --- a/vipra-cmd/src/main/resources/model.json +++ b/vipra-cmd/src/main/resources/model.json @@ -4,6 +4,11 @@ "dynamicMinIterations": 100, "dynamicMaxIterations": 1000, "staticIterations": 100, + "topicAutoNamingWords": 4, + "maxSimilarDocuments": 20, + "minRelativeProbability": 0.01, + "risingDecayLambda": 0.0, + "maxSimilarDocumentsDivergence": 0.25, "windowResolution": "YEAR", "processorMode": "TEXT" } \ No newline at end of file diff --git a/vipra-util/src/main/java/de/vipra/util/Config.java b/vipra-util/src/main/java/de/vipra/util/Config.java index e190121308f636fbc79564cd6073dad516e0be0a..8026369566975b6f9934c724e8cb2ef57a0b3f29 100644 --- a/vipra-util/src/main/java/de/vipra/util/Config.java +++ b/vipra-util/src/main/java/de/vipra/util/Config.java @@ -11,7 +11,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.core.JsonParser.Feature; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; import de.vipra.util.ex.ConfigException; import de.vipra.util.model.Model; @@ -20,6 +22,12 @@ import de.vipra.util.service.MongoService; public class Config { public static final Logger log = LoggerFactory.getLogger(Config.class); + public static final ObjectMapper mapper = new ObjectMapper(); + + static { + mapper.enable(SerializationFeature.INDENT_OUTPUT); + mapper.enable(Feature.ALLOW_COMMENTS); + } private static Config instance; @@ -222,7 +230,6 @@ public class Config { if (configDir.exists() && !configFile.exists()) org.apache.commons.io.FileUtils.write(configFile, config); - final ObjectMapper mapper = new ObjectMapper(); instance = mapper.readValue(config, Config.class); if (instance == null) diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index f526c9788bef0728b739026976e0214732bd3b3b..c9b1d39433de5e5108b7ea4bed1e1f6d8bd34ffb 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -90,7 +90,7 @@ public class Constants { * Minimum likeliness of words. Words with lower likeliness are ignored. * Default 0.01. */ - public static final double MINIMUM_RELATIVE_PROB = 0.01; + public static final double MIN_RELATIVE_PROB = 0.01; /** * The minimum share of a topic to be accepted for an article. Topic shares @@ -107,7 +107,7 @@ public class Constants { * Maximum divergence between a document and similar documents. Lower values * mean more similar documents (less divergence). Default 1.0. */ - public static final double MAX_DIVERGENCE = 0.25; + public static final double MAX_SIMILAR_DOCUMENTS_DIVERGENCE = 0.25; /** * Dynamic minimum iterations. Used for dynamic topic modeling. Default 100. diff --git a/vipra-util/src/main/java/de/vipra/util/ModelConfig.java b/vipra-util/src/main/java/de/vipra/util/ModelConfig.java index c8e482f99fbf2d890f654eba0f0e541e1e635cd3..e89dd8b5dc6a2114f98da0d2ba3b4e4f247bdf48 100644 --- a/vipra-util/src/main/java/de/vipra/util/ModelConfig.java +++ b/vipra-util/src/main/java/de/vipra/util/ModelConfig.java @@ -8,12 +8,17 @@ import de.vipra.util.Constants.WindowResolution; public class ModelConfig { private String name; - private final int kTopics = Constants.K_TOPICS; - private final int dynamicMinIterations = Constants.DYNAMIC_MIN_ITER; - private final int dynamicMaxIterations = Constants.DYNAMIC_MAX_ITER; - private final int staticIterations = Constants.STATIC_ITER; - private final WindowResolution windowResolution = Constants.WINDOW_RESOLUTION; - private final ProcessorMode processorMode = Constants.PROCESSOR_MODE; + private int kTopics = Constants.K_TOPICS; + private int dynamicMinIterations = Constants.DYNAMIC_MIN_ITER; + private int dynamicMaxIterations = Constants.DYNAMIC_MAX_ITER; + private int staticIterations = Constants.STATIC_ITER; + private int topicAutoNamingWords = Constants.TOPIC_AUTO_NAMING_WORDS; + private int maxSimilarDocuments = Constants.MAX_SIMILAR_DOCUMENTS; + private double minRelativeProbability = Constants.MIN_RELATIVE_PROB; + private double risingDecayLambda = Constants.RISING_DECAY_LAMBDA; + private double maxSimilarDocumentsDivergence = Constants.MAX_SIMILAR_DOCUMENTS_DIVERGENCE; + private WindowResolution windowResolution = Constants.WINDOW_RESOLUTION; + private ProcessorMode processorMode = Constants.PROCESSOR_MODE; public String getName() { return name; @@ -27,26 +32,90 @@ public class ModelConfig { return kTopics; } + public void setkTopics(final int kTopics) { + this.kTopics = kTopics; + } + public int getDynamicMinIterations() { return dynamicMinIterations; } + public void setDynamicMinIterations(final int dynamicMinIterations) { + this.dynamicMinIterations = dynamicMinIterations; + } + public int getDynamicMaxIterations() { return dynamicMaxIterations; } + public void setDynamicMaxIterations(final int dynamicMaxIterations) { + this.dynamicMaxIterations = dynamicMaxIterations; + } + public int getStaticIterations() { return staticIterations; } + public void setStaticIterations(final int staticIterations) { + this.staticIterations = staticIterations; + } + + public int getTopicAutoNamingWords() { + return topicAutoNamingWords; + } + + public void setTopicAutoNamingWords(final int topicAutoNamingWords) { + this.topicAutoNamingWords = topicAutoNamingWords; + } + + public int getMaxSimilarDocuments() { + return maxSimilarDocuments; + } + + public void setMaxSimilarDocuments(final int maxSimilarDocuments) { + this.maxSimilarDocuments = maxSimilarDocuments; + } + + public double getMinRelativeProbability() { + return minRelativeProbability; + } + + public void setMinRelativeProbability(final double minRelativeProbability) { + this.minRelativeProbability = minRelativeProbability; + } + + public double getRisingDecayLambda() { + return risingDecayLambda; + } + + public void setRisingDecayLambda(final double risingDecayLambda) { + this.risingDecayLambda = risingDecayLambda; + } + + public double getMaxSimilarDocumentsDivergence() { + return maxSimilarDocumentsDivergence; + } + + public void setMaxSimilarDocumentsDivergence(final double maxSimilarDocumentsDivergence) { + this.maxSimilarDocumentsDivergence = maxSimilarDocumentsDivergence; + } + public WindowResolution getWindowResolution() { return windowResolution; } + public void setWindowResolution(final WindowResolution windowResolution) { + this.windowResolution = windowResolution; + } + public ProcessorMode getProcessorMode() { return processorMode; } + public void setProcessorMode(final ProcessorMode processorMode) { + this.processorMode = processorMode; + } + public File getModelDir(final File dataDir) { return new File(dataDir, name); }