From e719e5ac47ba07aef68f4da4d739517ddea481ff Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Wed, 23 Mar 2016 19:20:17 +0100 Subject: [PATCH] added spotlight analyzer, updated cmd options added spotlight analyzer updated cmd options with new characters and help removed log4j from cmd project, using custom solution added jansi for console colors and effects fixed bug where window index is read in wrong, ignoring first line renamed ModelConfig to TopicModelCOnfig multiple small changes --- .../vipra/rest/resource/ArticleResource.java | 7 +- .../de/vipra/rest/resource/InfoResource.java | 4 +- .../vipra/rest/resource/SearchResource.java | 6 +- .../vipra/rest/resource/SequenceResource.java | 7 +- .../de/vipra/rest/resource/TopicResource.java | 7 +- .../vipra/rest/resource/WindowResource.java | 7 +- vipra-cmd/pom.xml | 19 +- vipra-cmd/runcfg/CMD - Clear.launch | 4 +- vipra-cmd/runcfg/CMD - Config.launch | 18 -- vipra-cmd/runcfg/CMD - Help.launch | 2 +- vipra-cmd/runcfg/CMD - Import 1.launch | 4 +- vipra-cmd/runcfg/CMD - Import 10.launch | 4 +- vipra-cmd/runcfg/CMD - Import 2.launch | 4 +- vipra-cmd/runcfg/CMD - Import 25.launch | 4 +- vipra-cmd/runcfg/CMD - Import All.launch | 4 +- vipra-cmd/runcfg/CMD - Indexing.launch | 4 +- vipra-cmd/runcfg/CMD - Modeling.launch | 4 +- vipra-cmd/runcfg/CMD - Reread.launch | 4 +- vipra-cmd/runcfg/CMD - Test.launch | 2 +- .../java/de/vipra/cmd/CommandLineOptions.java | 34 +-- .../src/main/java/de/vipra/cmd/Main.java | 44 ++-- .../main/java/de/vipra/cmd/file/Filebase.java | 4 +- .../vipra/cmd/file/FilebaseWindowIndex.java | 1 - .../main/java/de/vipra/cmd/lda/Analyzer.java | 76 ++++--- .../de/vipra/cmd/option/ClearCommand.java | 36 +--- .../vipra/cmd/option/CreateModelCommand.java | 33 ++- .../vipra/cmd/option/DeleteModelCommand.java | 14 +- .../de/vipra/cmd/option/EditModelCommand.java | 28 --- .../de/vipra/cmd/option/ImportCommand.java | 194 +++++++++--------- .../de/vipra/cmd/option/IndexingCommand.java | 24 +-- 
.../vipra/cmd/option/ListModelsCommand.java | 20 +- .../de/vipra/cmd/option/ModelingCommand.java | 19 +- .../java/de/vipra/cmd/option/TestCommand.java | 13 +- .../cmd/plugin/ClassNameRegexFilter.java | 77 ------- .../java/de/vipra/cmd/text/Processor.java | 10 +- .../de/vipra/cmd/text/SpotlightAnalyzer.java | 53 +++++ .../de/vipra/cmd/text/SpotlightResource.java | 95 +++++++++ .../de/vipra/cmd/text/SpotlightResponse.java | 22 ++ vipra-cmd/src/main/resources/config.json | 9 + vipra-cmd/src/main/resources/log4j2.xml | 14 -- vipra-cmd/src/main/resources/log4j2dev.xml | 15 -- vipra-cmd/src/main/resources/model.json | 14 -- vipra-ui/app/html/about.html | 30 +-- vipra-ui/app/html/articles/index.html | 2 - vipra-ui/app/html/index.html | 2 +- vipra-ui/app/index.html | 48 ++++- vipra-ui/app/js/controllers.js | 127 ++++++++---- vipra-ui/app/js/directives.js | 8 + vipra-ui/app/js/factories.js | 4 + vipra-ui/app/less/app.less | 5 + vipra-util/pom.xml | 7 + .../src/main/java/de/vipra/util/Config.java | 58 +++--- .../main/java/de/vipra/util/ConsoleUtils.java | 38 ++++ .../main/java/de/vipra/util/Constants.java | 43 ++-- .../main/java/de/vipra/util/StringUtils.java | 25 +++ .../src/main/java/de/vipra/util/URLUtils.java | 23 +++ .../java/de/vipra/util/model/ArticleFull.java | 19 +- .../java/de/vipra/util/model/Sequence.java | 13 +- .../de/vipra/util/model/SequenceFull.java | 12 +- .../java/de/vipra/util/model/TextEntity.java | 38 ++++ .../java/de/vipra/util/model/TopicFull.java | 20 +- .../java/de/vipra/util/model/TopicModel.java | 9 +- .../TopicModelConfig.java} | 72 ++++++- .../de/vipra/util/model/TopicModelFull.java | 41 +++- .../main/java/de/vipra/util/model/Window.java | 14 +- .../de/vipra/util/service/MongoService.java | 14 +- .../java/de/vipra/util/service/Service.java | 13 ++ 67 files changed, 967 insertions(+), 682 deletions(-) delete mode 100644 vipra-cmd/runcfg/CMD - Config.launch delete mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/option/EditModelCommand.java delete 
mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/plugin/ClassNameRegexFilter.java create mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightAnalyzer.java create mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResource.java create mode 100644 vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java delete mode 100644 vipra-cmd/src/main/resources/log4j2.xml delete mode 100644 vipra-cmd/src/main/resources/log4j2dev.xml delete mode 100644 vipra-cmd/src/main/resources/model.json create mode 100644 vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java create mode 100644 vipra-util/src/main/java/de/vipra/util/URLUtils.java create mode 100644 vipra-util/src/main/java/de/vipra/util/model/TextEntity.java rename vipra-util/src/main/java/de/vipra/util/{ModelConfig.java => model/TopicModelConfig.java} (54%) diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/ArticleResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/ArticleResource.java index ff1c102d..74c809ec 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/ArticleResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/ArticleResource.java @@ -34,6 +34,7 @@ import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleFull; +import de.vipra.util.model.TopicModel; import de.vipra.util.service.MongoService; import de.vipra.util.service.Service.QueryBuilder; @@ -52,7 +53,7 @@ public class ArticleResource { @GET @Produces(MediaType.APPLICATION_JSON) - public Response getArticles(@QueryParam("model") final String model, @QueryParam("skip") final Integer skip, + public Response getArticles(@QueryParam("topicModel") final String topicModel, @QueryParam("skip") final Integer skip, @QueryParam("limit") final Integer limit, @QueryParam("sort") @DefaultValue("date") final String sortBy, @QueryParam("fields") final String fields, @QueryParam("word") final 
String word) { final ResponseWrapper<List<ArticleFull>> res = new ResponseWrapper<>(); @@ -65,8 +66,8 @@ public class ArticleResource { if (fields != null && !fields.isEmpty()) query.fields(true, StringUtils.getFields(fields)); - if (model != null && !model.isEmpty()) - query.criteria("model.id", model); + if (topicModel != null && !topicModel.isEmpty()) + query.criteria("topicModel", new TopicModel(topicModel)); if (word != null && !word.isEmpty()) query.criteria("words.word.id", word); diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java index 466ec100..7a5c47d7 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java @@ -68,17 +68,15 @@ public class InfoResource { info.put("const.esboosttitles", Constants.ES_BOOST_TITLES); info.put("const.topicautoname", Constants.TOPIC_AUTO_NAMING_WORDS); info.put("const.ktopics", Constants.K_TOPICS); - info.put("const.ktopicwords", Constants.K_TOPIC_WORDS); info.put("const.decaylambda", Constants.RISING_DECAY_LAMBDA); info.put("const.minrelprob", Constants.MIN_RELATIVE_PROB); - info.put("const.minshare", Constants.MINIMUM_SHARE); info.put("const.maxsimdocs", Constants.MAX_SIMILAR_DOCUMENTS); info.put("const.maxdiv", Constants.MAX_SIMILAR_DOCUMENTS_DIVERGENCE); info.put("const.dynminiter", Constants.DYNAMIC_MIN_ITER); info.put("const.dynmaxiter", Constants.DYNAMIC_MAX_ITER); info.put("const.statiter", Constants.STATIC_ITER); - info.put("const.docminfreq", Constants.DOCUMENT_MIN_WORD_FREQ); info.put("const.docminlength", Constants.DOCUMENT_MIN_LENGTH); + info.put("const.docminwordfreq", Constants.DOCUMENT_MIN_WORD_FREQ); info.put("const.charsdisallow", Constants.CHARS_DISALLOWED); info.put("const.regexemail", Constants.REGEX_EMAIL); info.put("const.regexurl", Constants.REGEX_URL); diff --git 
a/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java index 434b58ee..18a39fd0 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java @@ -45,7 +45,7 @@ public class SearchResource { @GET @Produces(MediaType.APPLICATION_JSON) - public Response doSearch(@QueryParam("model") final String model, @QueryParam("skip") Integer skip, @QueryParam("limit") Integer limit, + public Response doSearch(@QueryParam("topicModel") final String topicModel, @QueryParam("skip") Integer skip, @QueryParam("limit") Integer limit, @QueryParam("fields") final String fields, @QueryParam("query") final String query) { final ResponseWrapper<List<ArticleFull>> res = new ResponseWrapper<>(); @@ -59,8 +59,8 @@ public class SearchResource { return res.noContent(); String indexName = "_all"; - if (model != null && !model.isEmpty()) - indexName = model + "-articles"; + if (topicModel != null && !topicModel.isEmpty()) + indexName = topicModel + "-articles"; SearchResponse response = null; try { diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/SequenceResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/SequenceResource.java index b22aed18..e2c66f9a 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/SequenceResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/SequenceResource.java @@ -24,6 +24,7 @@ import de.vipra.util.MongoUtils; import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; import de.vipra.util.model.SequenceFull; +import de.vipra.util.model.TopicModel; import de.vipra.util.service.MongoService; import de.vipra.util.service.Service.QueryBuilder; @@ -39,7 +40,7 @@ public class SequenceResource { @GET @Produces(MediaType.APPLICATION_JSON) - public Response getSequences(@QueryParam("model") final String model, 
@QueryParam("skip") final Integer skip, + public Response getSequences(@QueryParam("topicModel") final String topicModel, @QueryParam("skip") final Integer skip, @QueryParam("limit") final Integer limit, @QueryParam("sort") @DefaultValue("id") final String sortBy, @QueryParam("fields") final String fields) { final ResponseWrapper<List<SequenceFull>> res = new ResponseWrapper<>(); @@ -52,8 +53,8 @@ public class SequenceResource { if (fields != null && !fields.isEmpty()) query.fields(true, StringUtils.getFields(fields)); - if (model != null && !model.isEmpty()) - query.criteria("model.id", model); + if (topicModel != null && !topicModel.isEmpty()) + query.criteria("topicModel", new TopicModel(topicModel)); final List<SequenceFull> sequences = dbSequences.getMultiple(query); diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java index c749773e..3f422d62 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java @@ -30,6 +30,7 @@ import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleFull; import de.vipra.util.model.Topic; import de.vipra.util.model.TopicFull; +import de.vipra.util.model.TopicModel; import de.vipra.util.service.MongoService; import de.vipra.util.service.Service.QueryBuilder; @@ -47,7 +48,7 @@ public class TopicResource { @GET @Produces(MediaType.APPLICATION_JSON) - public Response getTopics(@QueryParam("model") final String model, @QueryParam("skip") final Integer skip, + public Response getTopics(@QueryParam("topicModel") final String topicModel, @QueryParam("skip") final Integer skip, @QueryParam("limit") final Integer limit, @QueryParam("sort") @DefaultValue("name") final String sortBy, @QueryParam("fields") final String fields) { final ResponseWrapper<List<TopicFull>> res = new ResponseWrapper<>(); @@ -60,8 +61,8 @@ public class 
TopicResource { if (fields != null && !fields.isEmpty()) query.fields(true, StringUtils.getFields(fields)); - if (model != null && !model.isEmpty()) - query.criteria("model.id", model); + if (topicModel != null && !topicModel.isEmpty()) + query.criteria("topicModel", new TopicModel(topicModel)); final List<TopicFull> topics = dbTopics.getMultiple(query); diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/WindowResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/WindowResource.java index 8528a5d7..6b3bed96 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/WindowResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/WindowResource.java @@ -18,6 +18,7 @@ import de.vipra.rest.model.ResponseWrapper; import de.vipra.util.Config; import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; +import de.vipra.util.model.TopicModel; import de.vipra.util.model.Window; import de.vipra.util.service.MongoService; import de.vipra.util.service.Service.QueryBuilder; @@ -34,7 +35,7 @@ public class WindowResource { @GET @Produces(MediaType.APPLICATION_JSON) - public Response getWindows(@QueryParam("model") final String model, @QueryParam("skip") final Integer skip, + public Response getWindows(@QueryParam("topicModel") final String topicModel, @QueryParam("skip") final Integer skip, @QueryParam("limit") final Integer limit, @QueryParam("sort") @DefaultValue("startDate") final String sortBy, @QueryParam("fields") final String fields) { final ResponseWrapper<List<Window>> res = new ResponseWrapper<>(); @@ -47,8 +48,8 @@ public class WindowResource { if (fields != null && !fields.isEmpty()) query.fields(true, StringUtils.getFields(fields)); - if (model != null && !model.isEmpty()) - query.criteria("model.id", model); + if (topicModel != null && !topicModel.isEmpty()) + query.criteria("topicModel", new TopicModel(topicModel)); final List<Window> windows = dbWindows.getMultiple(query); diff --git a/vipra-cmd/pom.xml 
b/vipra-cmd/pom.xml index 5c87afc8..9b81219d 100644 --- a/vipra-cmd/pom.xml +++ b/vipra-cmd/pom.xml @@ -15,7 +15,7 @@ <maven.compiler.target>1.8</maven.compiler.target> <maven.compiler.source>1.8</maven.compiler.source> <maven.build.timestamp.format>yyMMdd_HHmm</maven.build.timestamp.format> - <buildDate>${maven.build.timestamp}</buildDate> + <buildDate>${maven.build.timestamp}</buildDate> </properties> <scm> @@ -52,23 +52,6 @@ <classifier>models</classifier> </dependency> - <!-- Logging --> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-api</artifactId> - <version>2.5</version> - </dependency> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-core</artifactId> - <version>2.5</version> - </dependency> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-slf4j-impl</artifactId> - <version>2.5</version> - </dependency> - <!-- MongoDB Database Adapter --> <dependency> <groupId>org.mongodb.morphia</groupId> diff --git a/vipra-cmd/runcfg/CMD - Clear.launch b/vipra-cmd/runcfg/CMD - Clear.launch index 054ed66a..9f94416a 100644 --- a/vipra-cmd/runcfg/CMD - Clear.launch +++ b/vipra-cmd/runcfg/CMD - Clear.launch @@ -11,8 +11,8 @@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="--clear"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-cC yearly"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute 
key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml -ea"/> </launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Config.launch b/vipra-cmd/runcfg/CMD - Config.launch deleted file mode 100644 index ba73d8cf..00000000 --- a/vipra-cmd/runcfg/CMD - Config.launch +++ /dev/null @@ -1,18 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<launchConfiguration type="org.eclipse.jdt.launching.localJavaApplication"> -<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS"> -<listEntry value="/vipra-cmd/src/main/java/de/vipra/cmd/Main.java"/> -</listAttribute> -<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES"> -<listEntry value="1"/> -</listAttribute> -<listAttribute key="org.eclipse.debug.ui.favoriteGroups"> -<listEntry value="org.eclipse.debug.ui.launchGroup.run"/> -</listAttribute> -<stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-o"/> -<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> -<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> -</launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Help.launch b/vipra-cmd/runcfg/CMD - Help.launch index 2528a35c..a18a81cf 100644 --- a/vipra-cmd/runcfg/CMD - Help.launch +++ b/vipra-cmd/runcfg/CMD - Help.launch @@ -14,5 +14,5 @@ <stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-h"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" 
value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> </launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Import 1.launch b/vipra-cmd/runcfg/CMD - Import 1.launch index 62653261..a907ac70 100644 --- a/vipra-cmd/runcfg/CMD - Import 1.launch +++ b/vipra-cmd/runcfg/CMD - Import 1.launch @@ -11,8 +11,8 @@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-i /home/eike/repos/master/ma-impl/vm/data/test-1.json"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-AI /home/eike/repos/master/ma-impl/vm/data/test-1.json"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> </launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Import 10.launch b/vipra-cmd/runcfg/CMD - Import 10.launch index 1d6c112b..f3178f97 100644 --- a/vipra-cmd/runcfg/CMD - Import 10.launch +++ b/vipra-cmd/runcfg/CMD - Import 10.launch @@ -11,8 +11,8 @@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-i 
/home/eike/repos/master/ma-impl/vm/data/test-10.json"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-AI /home/eike/repos/master/ma-impl/vm/data/test-10.json"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> </launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Import 2.launch b/vipra-cmd/runcfg/CMD - Import 2.launch index 89c246dd..16d1dbf5 100644 --- a/vipra-cmd/runcfg/CMD - Import 2.launch +++ b/vipra-cmd/runcfg/CMD - Import 2.launch @@ -11,8 +11,8 @@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-i /home/eike/repos/master/ma-impl/vm/data/test-2.json"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-AI /home/eike/repos/master/ma-impl/vm/data/test-2.json"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> </launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Import 25.launch b/vipra-cmd/runcfg/CMD - Import 25.launch index 60e6ec53..b548bdd7 100644 --- a/vipra-cmd/runcfg/CMD - Import 25.launch +++ b/vipra-cmd/runcfg/CMD - Import 25.launch @@ -11,8 +11,8 
@@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-i /home/eike/repos/master/ma-impl/vm/data/test-25.json"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-AI /home/eike/repos/master/ma-impl/vm/data/test-25.json"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> </launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Import All.launch b/vipra-cmd/runcfg/CMD - Import All.launch index 750a14c5..0e8a0c85 100644 --- a/vipra-cmd/runcfg/CMD - Import All.launch +++ b/vipra-cmd/runcfg/CMD - Import All.launch @@ -11,8 +11,8 @@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-i /home/eike/repos/master/ma-impl/vm/data/data.json"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-AI /home/eike/repos/master/ma-impl/vm/data/data.json"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute 
key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> </launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Indexing.launch b/vipra-cmd/runcfg/CMD - Indexing.launch index 801f3d35..0e3fecb1 100644 --- a/vipra-cmd/runcfg/CMD - Indexing.launch +++ b/vipra-cmd/runcfg/CMD - Indexing.launch @@ -11,8 +11,8 @@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-e"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-Ai"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> </launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Modeling.launch b/vipra-cmd/runcfg/CMD - Modeling.launch index 9560a948..b4a690b5 100644 --- a/vipra-cmd/runcfg/CMD - Modeling.launch +++ b/vipra-cmd/runcfg/CMD - Modeling.launch @@ -11,8 +11,8 @@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-g"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-AM"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute 
key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> </launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Reread.launch b/vipra-cmd/runcfg/CMD - Reread.launch index 3d3851ee..c7c1d7de 100644 --- a/vipra-cmd/runcfg/CMD - Reread.launch +++ b/vipra-cmd/runcfg/CMD - Reread.launch @@ -11,8 +11,8 @@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-r"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-Ar"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> </launchConfiguration> diff --git a/vipra-cmd/runcfg/CMD - Test.launch b/vipra-cmd/runcfg/CMD - Test.launch index 2e69b94c..b36db2e1 100644 --- a/vipra-cmd/runcfg/CMD - Test.launch +++ b/vipra-cmd/runcfg/CMD - Test.launch @@ -14,5 +14,5 @@ <stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-t"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> -<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> </launchConfiguration> diff --git 
a/vipra-cmd/src/main/java/de/vipra/cmd/CommandLineOptions.java b/vipra-cmd/src/main/java/de/vipra/cmd/CommandLineOptions.java index 0e83857d..ac6610e5 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/CommandLineOptions.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/CommandLineOptions.java @@ -11,17 +11,18 @@ public class CommandLineOptions { public static final Option CLEAR = Option.builder("c").longOpt("clear").desc("clear the database and models").build(); public static final Option DEBUG = Option.builder("d").longOpt("debug").desc("show debug information").build(); - public static final Option EDIT = Option.builder("e").longOpt("edit").desc("edit a specific model").hasArg().argName("model").build(); public static final Option HELP = Option.builder("h").longOpt("help").desc("show this help").build(); - public static final Option INDEX = Option.builder("i").longOpt("index").desc("create the search index").build(); + public static final Option INDEX = Option.builder("i").longOpt("index").desc("create the search index on selected models").build(); public static final Option LIST = Option.builder("l").longOpt("list").desc("list available models").build(); public static final Option REREAD = Option.builder("r").longOpt("reread").desc("reread generated models").build(); public static final Option SILENT = Option.builder("s").longOpt("silent").desc("suppress all output").build(); public static final Option TEST = Option.builder("t").longOpt("test").desc("test database connections").build(); + public static final Option ALL = Option.builder("A").longOpt("all").desc("select all models, short for -S all").build(); public static final Option CREATE = Option.builder("C").longOpt("create").desc("create new models").hasArgs().argName("models...").build(); - public static final Option DELETE = Option.builder("d").longOpt("delete").desc("delete existing models").hasArgs().argName("models...").build(); - public static final Option IMPORT = 
Option.builder("I").longOpt("import").desc("import data from json").hasArgs().argName("models...").build(); - public static final Option MODEL = Option.builder("M").longOpt("model").desc("generate models from database").build(); + public static final Option DELETE = Option.builder("D").longOpt("delete").desc("delete existing models").hasArgs().argName("models...").build(); + public static final Option IMPORT = Option.builder("I").longOpt("import").desc("import data from json into selected models").hasArgs() + .argName("models...").build(); + public static final Option MODEL = Option.builder("M").longOpt("model").desc("generate topics on selected models").build(); public static final Option SELECT = Option.builder("S").longOpt("select").desc("select models").hasArgs().argName("models...").build(); private final Options options; @@ -29,7 +30,7 @@ public class CommandLineOptions { private final String cmdName = "vipra"; public CommandLineOptions() { - final Option[] optionsArray = { CLEAR, DEBUG, EDIT, HELP, INDEX, LIST, REREAD, SILENT, TEST, CREATE, DELETE, IMPORT, MODEL, SELECT }; + final Option[] optionsArray = { CLEAR, DEBUG, HELP, INDEX, LIST, REREAD, SILENT, TEST, ALL, CREATE, DELETE, IMPORT, MODEL, SELECT }; options = new Options(); for (final Option option : optionsArray) options.addOption(option); @@ -60,14 +61,6 @@ public class CommandLineOptions { return hasOption(DEBUG) && !hasOption(SILENT); } - public boolean isEdit() { - return hasOption(EDIT); - } - - public String modelToEdit() { - return getOptionValue(EDIT); - } - public boolean isHelp() { return hasOption(HELP); } @@ -109,6 +102,10 @@ public class CommandLineOptions { return hasOption(CREATE); } + public boolean isAll() { + return hasOption(ALL); + } + public String[] modelsToCreate() { return getOptionValues(CREATE); } @@ -142,13 +139,16 @@ public class CommandLineOptions { } public String[] selectedModels() { - return getOptionValues(SELECT); + if (isAll()) + return new String[] { "all" }; + else 
+ return getOptionValues(SELECT); } private void checkDependencies() throws ParseException { - if (isImport() || isModel() || isIndex() || isReread() || isDelete()) { + if (isImport() || isModel() || isIndex() || isReread()) { // these options require at least one selected model - if (!isSelect()) + if (!isSelect() && !isAll()) throw new ParseException("select at least one model"); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java index 3d9bf430..a6c4906d 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java @@ -4,14 +4,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.ListIterator; -import java.util.Map.Entry; - -import org.apache.logging.log4j.Level; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.apache.logging.log4j.core.LoggerContext; -import org.apache.logging.log4j.core.config.Configuration; -import org.apache.logging.log4j.core.config.LoggerConfig; + import org.mongodb.morphia.logging.MorphiaLoggerFactory; import org.mongodb.morphia.logging.slf4j.SLF4JLoggerImplFactory; @@ -19,18 +12,16 @@ import de.vipra.cmd.option.ClearCommand; import de.vipra.cmd.option.Command; import de.vipra.cmd.option.CreateModelCommand; import de.vipra.cmd.option.DeleteModelCommand; -import de.vipra.cmd.option.EditModelCommand; import de.vipra.cmd.option.ImportCommand; import de.vipra.cmd.option.IndexingCommand; import de.vipra.cmd.option.ListModelsCommand; import de.vipra.cmd.option.ModelingCommand; import de.vipra.cmd.option.TestCommand; +import de.vipra.util.ConsoleUtils; import de.vipra.util.ex.ConfigException; public class Main { - public static final Logger log = LogManager.getLogger(Main.class); - static { // set morphia log level MorphiaLoggerFactory.registerLogger(SLF4JLoggerImplFactory.class); @@ -43,7 +34,7 @@ public class Main { try { opts.parse(args); 
} catch (final Exception e) { - log.error(e.getMessage()); + ConsoleUtils.error(e.getMessage()); opts.printHelp(); return; } @@ -54,24 +45,19 @@ public class Main { } // logger configuration - final LoggerContext loggerContext = (LoggerContext) LogManager.getContext(false); - final Configuration loggerConfigs = loggerContext.getConfiguration(); - - if (opts.isDebug()) - loggerConfigs.getLoggerConfig(LogManager.ROOT_LOGGER_NAME).setLevel(Level.DEBUG); - if (opts.isSilent()) { - for (final Entry<String, LoggerConfig> loggerConfig : loggerConfigs.getLoggers().entrySet()) - loggerConfig.getValue().setLevel(Level.OFF); - } + ConsoleUtils.setSilent(opts.isSilent()); - loggerContext.updateLoggers(); + // commands final List<Command> commands = new ArrayList<>(); if (opts.isTest()) commands.add(new TestCommand()); + if (opts.isClear()) + commands.add(new ClearCommand()); + if (opts.isCreate()) commands.add(new CreateModelCommand(opts.modelsToCreate())); @@ -81,12 +67,6 @@ public class Main { if (opts.isList()) commands.add(new ListModelsCommand()); - if (opts.isEdit()) - commands.add(new EditModelCommand(opts.modelToEdit())); - - if (opts.isClear()) - commands.add(new ClearCommand()); - if (opts.isImport()) commands.add(new ImportCommand(opts.selectedModels(), opts.filesToImport())); @@ -96,7 +76,8 @@ public class Main { if (opts.isIndex()) commands.add(new IndexingCommand(opts.selectedModels())); - // run commands + // run + if (commands.size() > 0) { for (final ListIterator<Command> it = commands.listIterator(); it.hasNext();) { final Command c = it.next(); @@ -105,10 +86,9 @@ public class Main { } catch (final Exception e) { final Throwable cause = e.getCause(); if (cause != null) - log.error(cause.getMessage()); + ConsoleUtils.error(cause.getMessage()); else - log.error(e.getMessage()); - log.debug(e.getMessage(), e); + ConsoleUtils.error(e.getMessage()); } } } else { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java 
b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java index 7eb81319..e5693aa8 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java @@ -13,9 +13,9 @@ import java.util.HashMap; import java.util.Map; import de.vipra.util.Constants; -import de.vipra.util.ModelConfig; import de.vipra.util.ex.ConfigException; import de.vipra.util.model.ArticleFull; +import de.vipra.util.model.TopicModelConfig; public class Filebase { @@ -28,7 +28,7 @@ public class Filebase { private final FilebaseWordIndex wordIndex; private final FilebaseWindowIndex windowIndex; - public Filebase(final ModelConfig modelConfig, final File dataDir) throws ParseException, IOException { + public Filebase(final TopicModelConfig modelConfig, final File dataDir) throws ParseException, IOException { modelDir = new File(dataDir, modelConfig.getName()); file = new File(modelDir, FILE_NAME); newArticles = new HashMap<>(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWindowIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWindowIndex.java index 436d4978..4559a99f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWindowIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWindowIndex.java @@ -45,7 +45,6 @@ public class FilebaseWindowIndex { windowMap = new CountMap<>(); if (winFile.exists()) { final BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(winFile))); - in.readLine(); String line = null; while ((line = in.readLine()) != null) { final String[] parts = line.split(","); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java index 2448ee90..458af103 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java @@ -13,8 +13,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import 
org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; import org.bson.types.ObjectId; import de.vipra.cmd.file.FilebaseIDDateIndex; @@ -23,7 +21,7 @@ import de.vipra.cmd.file.FilebaseWindowIndex; import de.vipra.cmd.file.FilebaseWordIndex; import de.vipra.util.ArrayUtils; import de.vipra.util.Config; -import de.vipra.util.ModelConfig; +import de.vipra.util.ConsoleUtils; import de.vipra.util.MongoUtils; import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; @@ -36,6 +34,7 @@ import de.vipra.util.model.SimilarArticle; import de.vipra.util.model.Topic; import de.vipra.util.model.TopicFull; import de.vipra.util.model.TopicModel; +import de.vipra.util.model.TopicModelConfig; import de.vipra.util.model.TopicModelFull; import de.vipra.util.model.TopicShare; import de.vipra.util.model.TopicWord; @@ -45,8 +44,6 @@ import de.vipra.util.service.Service.QueryBuilder; public class Analyzer { - public static final Logger log = LogManager.getLogger(Analyzer.class); - private final Config config; private final File dataDir; private final File dtmBinary; @@ -73,10 +70,10 @@ public class Analyzer { throw new AnalyzerException("dtm binary not found at path: " + config.getDtmPath() + ", check config key 'tm.dtmpath'"); } - public void analyze(final ModelConfig modelConfig, final boolean reread) + public void analyze(final TopicModelConfig modelConfig, final boolean reread) throws AnalyzerException, DatabaseException, ParseException, IOException, InterruptedException { - final File modelDir = new File(dataDir, modelConfig.getName()); + final File modelDir = modelConfig.getModelDir(dataDir); final File outDir = new File(modelDir, "out"); final File outDirSeq = new File(outDir, "lda-seq"); @@ -106,8 +103,6 @@ public class Analyzer { final String command = dtmBinary.getAbsolutePath() + " " + StringUtils.join(parameters, " "); - final FilebaseWindowIndex windowIndex = new FilebaseWindowIndex(modelDir, modelConfig.getWindowResolution()); - 
BufferedReader in; if (!reread) { @@ -122,7 +117,7 @@ public class Analyzer { int iteration = 0; while ((line = in.readLine()) != null) { if (line.contains("EM iter")) { - log.info("iteration " + iteration++); + ConsoleUtils.info("iteration " + iteration++); } } @@ -130,20 +125,18 @@ public class Analyzer { p.waitFor(); } - final FilebaseWordIndex wordIndex = new FilebaseWordIndex(modelConfig.getModelDir(dataDir)); - final FilebaseIDDateIndex idDateIndex = new FilebaseIDDateIndex(modelConfig.getModelDir(dataDir)); - - final QueryBuilder builder = QueryBuilder.builder().criteria("model.id", modelConfig.getName()); - dbArticles.deleteMultiple(builder); - dbTopics.deleteMultiple(builder); - dbSequences.deleteMultiple(builder); - dbWindows.deleteMultiple(builder); - dbTopicModels.deleteSingle(modelConfig.getName()); + final FilebaseWordIndex wordIndex = new FilebaseWordIndex(modelDir); + final FilebaseIDDateIndex idDateIndex = new FilebaseIDDateIndex(modelDir); + final FilebaseWindowIndex windowIndex = new FilebaseWindowIndex(modelDir, modelConfig.getWindowResolution()); final int topicCount = modelConfig.getkTopics(); - final int wordCount = wordIndex.size(); + assert topicCount > 0; + final int sequencesCount = windowIndex.size(); + assert sequencesCount > 0; + final int articlesCount = idDateIndex.size(); + final int wordCount = wordIndex.size(); // read topic distributions @@ -171,14 +164,14 @@ public class Analyzer { // read topic definition files and create topics - final TopicModelFull newTopicModel = new TopicModelFull(modelConfig.getName()); + final TopicModelFull topicModel = new TopicModelFull(modelConfig.getName(), modelConfig); final List<Window> newWindows = new ArrayList<>(sequencesCount); final List<SequenceFull> newSequences = new ArrayList<>(topicCount * sequencesCount); final List<TopicFull> newTopics = new ArrayList<>(topicCount); - log.info("vocabulary size: " + wordCount); - log.info("sequences: " + sequencesCount); - log.info("topics: " + 
topicCount); + ConsoleUtils.info("vocabulary size: " + wordCount); + ConsoleUtils.info("sequences: " + sequencesCount); + ConsoleUtils.info("topics: " + topicCount); final boolean seqRelativeCutoff = modelConfig.getMinRelativeProbability() > 0; @@ -189,7 +182,7 @@ public class Analyzer { newWindow.setStartDate(windowIndex.startDate(idxSeq)); newWindow.setEndDate(windowIndex.endDate(idxSeq)); newWindow.setWindowResolution(modelConfig.getWindowResolution()); - newWindow.setModel(new TopicModel(newTopicModel.getId())); + newWindow.setTopicModel(new TopicModel(topicModel.getId())); newWindows.add(newWindow); } @@ -205,7 +198,7 @@ public class Analyzer { final TopicFull newTopic = new TopicFull(); final List<Sequence> newTopicSequences = new ArrayList<>(sequencesCount); newTopic.setSequences(newTopicSequences); - newTopic.setModel(new TopicModel(newTopicModel.getId())); + newTopic.setTopicModel(new TopicModel(topicModel.getId())); newTopics.add(newTopic); in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile))); @@ -271,7 +264,7 @@ public class Analyzer { newSequenceFull.setRelevance(relevance); newSequenceFull.setRelevanceChange(relevance - prevRelevance); newSequenceFull.setTopic(new Topic(newTopic.getId())); - newSequenceFull.setModel(new TopicModel(newTopicModel.getId())); + newSequenceFull.setTopicModel(new TopicModel(topicModel.getId())); newSequences.add(newSequenceFull); newTopicSequences.add(new Sequence(newSequenceFull.getId())); @@ -294,7 +287,7 @@ public class Analyzer { // sort topic words and generate topic name final List<TopicWord> topTopicWordsList = new ArrayList<>(topTopicWords); Collections.sort(topTopicWordsList); - newTopic.setName(TopicFull.getNameFromWords(topTopicWordsList)); + newTopic.setName(TopicFull.getNameFromWords(modelConfig.getTopicAutoNamingWords(), topTopicWordsList)); // calculate average final double average = relevanceSum / sequencesCount; @@ -325,14 +318,17 @@ public class Analyzer { 
newTopic.setRisingDecayRelevance(risingDecayRelevance); } - // create topics and words - try { - dbWindows.createMultiple(newWindows); - dbSequences.createMultiple(newSequences); - dbTopics.createMultiple(newTopics); - } catch (final DatabaseException e) { - throw new AnalyzerException(e); - } + // recreate windows, sequences and topics + + final QueryBuilder builder = QueryBuilder.builder().criteria("topicModel", new TopicModel(modelConfig.getName())); + + dbWindows.deleteMultiple(builder); + dbSequences.deleteMultiple(builder); + dbTopics.deleteMultiple(builder); + + dbWindows.createMultiple(newWindows); + dbSequences.createMultiple(newSequences); + dbTopics.createMultiple(newTopics); // create topic references and store document similarities @@ -392,19 +388,19 @@ public class Analyzer { // update article with topic references (partial update) final ArticleFull article = new ArticleFull(); article.setId(entry.getId()); - article.setModel(new TopicModel(newTopicModel.getId())); + article.setTopicModel(new TopicModel(topicModel.getId())); article.setTopics(newTopicRefs); article.setSimilarArticles(similarArticles); try { - dbArticles.updateSingle(article, "model", "topics", "similarArticles"); + dbArticles.updateSingle(article, "topicModel", "topics", "similarArticles"); } catch (final DatabaseException e) { - log.error(e); + ConsoleUtils.error(e); } } } - dbTopicModels.createSingle(newTopicModel); + dbTopicModels.replaceSingle(topicModel); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java index b731886a..76d2c1bd 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java @@ -4,60 +4,34 @@ import java.io.File; import java.io.IOException; import org.apache.commons.io.FileUtils; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import 
org.bson.types.ObjectId; import org.elasticsearch.client.Client; import de.vipra.util.Config; +import de.vipra.util.ConsoleUtils; import de.vipra.util.ESClient; -import de.vipra.util.model.Article; -import de.vipra.util.model.Sequence; -import de.vipra.util.model.Topic; -import de.vipra.util.model.TopicModel; -import de.vipra.util.model.Window; import de.vipra.util.service.MongoService; public class ClearCommand implements Command { - public static final Logger log = LogManager.getLogger(ClearCommand.class); - private Config config; - private MongoService<Article, ObjectId> dbArticles; - private MongoService<Topic, ObjectId> dbTopics; - private MongoService<Sequence, ObjectId> dbSequences; - private MongoService<Window, Integer> dbWindows; - private MongoService<TopicModel, String> dbTopicModels; private Client elasticClient; private void clear() throws Exception { config = Config.getConfig(); - dbArticles = MongoService.getDatabaseService(config, Article.class); - dbTopics = MongoService.getDatabaseService(config, Topic.class); - dbSequences = MongoService.getDatabaseService(config, Sequence.class); - dbWindows = MongoService.getDatabaseService(config, Window.class); - dbTopicModels = MongoService.getDatabaseService(config, TopicModel.class); elasticClient = ESClient.getClient(config); - - log.info("clearing database"); - dbArticles.drop(); - dbTopics.drop(); - dbSequences.drop(); - dbWindows.drop(); - dbTopicModels.drop(); - - log.info("clearing index"); + MongoService.dropDatabase(config); elasticClient.admin().indices().prepareDelete("_all").get(); try { - log.info("clearing filebase"); final File dataDir = config.getDataDirectory(); if (dataDir.exists() && dataDir.isDirectory()) { FileUtils.deleteDirectory(dataDir); } } catch (final IOException e) { - log.warn("could not delete data directory: " + config.getDataDirectory().getAbsolutePath()); + ConsoleUtils.warn("could not delete data directory: " + config.getDataDirectory().getAbsolutePath()); } + + 
ConsoleUtils.info("cleared"); } @Override diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java index fc93cfc4..8d11e4f0 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java @@ -2,19 +2,14 @@ package de.vipra.cmd.option; import java.io.File; -import org.apache.commons.io.IOUtils; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - import de.vipra.util.Config; -import de.vipra.util.Constants; -import de.vipra.util.FileUtils; -import de.vipra.util.ModelConfig; +import de.vipra.util.ConsoleUtils; +import de.vipra.util.model.TopicModelConfig; +import de.vipra.util.model.TopicModelFull; +import de.vipra.util.service.MongoService; public class CreateModelCommand implements Command { - public static final Logger log = LogManager.getLogger(CreateModelCommand.class); - private final String[] names; public CreateModelCommand(final String[] names) { @@ -27,12 +22,14 @@ public class CreateModelCommand implements Command { return; final Config config = Config.getConfig(); + final MongoService<TopicModelFull, String> dbTopicModels = MongoService.getDatabaseService(config, TopicModelFull.class); - final String modelConfigString; - if (config.getModelConfigTemplate() == null) { - modelConfigString = IOUtils.toString(FileUtils.getResource(Constants.MODEL_FILE)); + final TopicModelConfig modelConfig; + + if (config.getModelConfigTemplate() != null) { + modelConfig = new TopicModelConfig(config.getModelConfigTemplate()); } else { - modelConfigString = Config.mapper.writeValueAsString(config.getModelConfigTemplate()); + modelConfig = new TopicModelConfig(); } for (final String name : names) { @@ -43,12 +40,12 @@ public class CreateModelCommand implements Command { throw new Exception("model with that name already exists: " + name); if 
(!modelDir.mkdirs()) throw new Exception("could not create model directory: " + modelDir.getAbsolutePath()); - final File modelConfigFile = new File(modelDir, Constants.MODEL_FILE); - final ModelConfig modelConfig = Config.mapper.readValue(modelConfigString, ModelConfig.class); + modelConfig.setName(name); - org.apache.commons.io.FileUtils.write(modelConfigFile, Config.mapper.writeValueAsString(modelConfig)); - config.getModelConfigs().put(name, modelConfig); - log.info("model created: " + name); + final TopicModelFull topicModel = new TopicModelFull(name, modelConfig); + dbTopicModels.createSingle(topicModel); + config.getTopicModelConfigs().put(name, modelConfig); + ConsoleUtils.info("model created: " + name); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteModelCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteModelCommand.java index 7d10cf6a..8c771a0a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteModelCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteModelCommand.java @@ -1,16 +1,15 @@ package de.vipra.cmd.option; import java.io.File; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; +import java.util.Arrays; import de.vipra.util.Config; +import de.vipra.util.ConsoleUtils; +import de.vipra.util.model.TopicModel; +import de.vipra.util.service.MongoService; public class DeleteModelCommand implements Command { - public static final Logger log = LogManager.getLogger(DeleteModelCommand.class); - private final String[] names; public DeleteModelCommand(final String[] names) { @@ -20,13 +19,16 @@ public class DeleteModelCommand implements Command { @Override public void run() throws Exception { final Config config = Config.getConfig(); + final MongoService<TopicModel, String> dbTopicModels = MongoService.getDatabaseService(config, TopicModel.class); + for (final String name : names) { final File modelDir = new File(config.getDataDirectory(), name); if 
(modelDir.exists()) { org.apache.commons.io.FileUtils.deleteDirectory(modelDir); - log.info("model deleted: " + name); + ConsoleUtils.info("model deleted: " + name); } } + dbTopicModels.deleteMultiple(Arrays.asList(names)); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/EditModelCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/EditModelCommand.java deleted file mode 100644 index 510757b1..00000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/EditModelCommand.java +++ /dev/null @@ -1,28 +0,0 @@ -package de.vipra.cmd.option; - -import java.awt.Desktop; -import java.io.File; - -import de.vipra.util.Config; -import de.vipra.util.ModelConfig; -import de.vipra.util.ex.ConfigException; - -public class EditModelCommand implements Command { - - private final File configFile; - - public EditModelCommand(final String model) throws ConfigException { - final Config config = Config.getConfig(); - final ModelConfig modelConfig = config.getModelConfigs().get(model); - configFile = modelConfig.getConfigFile(config.getDataDirectory()); - } - - @Override - public void run() throws Exception { - if (!configFile.exists()) - throw new Exception("missing model configuration file: " + configFile.getAbsolutePath()); - - Desktop.getDesktop().open(configFile); - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 94211ee1..17a18c6c 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -6,13 +6,9 @@ import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; import java.util.ArrayList; +import java.util.EnumSet; import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; 
import org.bson.types.ObjectId; import org.json.simple.JSONArray; import org.json.simple.JSONObject; @@ -24,15 +20,23 @@ import de.vipra.cmd.file.FilebaseException; import de.vipra.cmd.text.ProcessedText; import de.vipra.cmd.text.Processor; import de.vipra.cmd.text.ProcessorException; +import de.vipra.cmd.text.SpotlightAnalyzer; +import de.vipra.cmd.text.SpotlightResource; +import de.vipra.cmd.text.SpotlightResponse; import de.vipra.util.Config; +import de.vipra.util.ConsoleUtils; import de.vipra.util.Constants; -import de.vipra.util.ModelConfig; +import de.vipra.util.Constants.ProcessorMode; import de.vipra.util.StringUtils; import de.vipra.util.Timer; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleFull; import de.vipra.util.model.ArticleStats; +import de.vipra.util.model.TextEntity; +import de.vipra.util.model.TopicModel; +import de.vipra.util.model.TopicModelConfig; +import de.vipra.util.model.TopicModelFull; import de.vipra.util.service.MongoService; public class ImportCommand implements Command { @@ -61,82 +65,18 @@ public class ImportCommand implements Command { } } - public static class ImportTask implements Runnable { - - private final JSONObject object; - private final Processor processor; - private final ArticleBuffer buffer; - private final Filebase filebase; - - public ImportTask(final JSONObject object, final Processor processor, final ArticleBuffer buffer, final Filebase filebase) { - this.object = object; - this.processor = processor; - this.buffer = buffer; - this.filebase = filebase; - } - - @Override - public void run() { - final ArticleFull article = articleFromJSON(object); - - try { - // preprocess text - final ProcessedText processedText = processor.process(article.getText()); - article.setProcessedText(processedText.getWords()); - article.setWords(processedText.getArticleWords()); - - // generate article stats - final ArticleStats stats = new ArticleStats(); - 
stats.setWordCount(processedText.getWordCount()); - stats.setProcessedWordCount(processedText.getReducedWordCount()); - stats.setReductionRatio(processedText.getReductionRatio()); - article.setStats(stats); - - // add article to mongodb - buffer.add(article); - - // add article to filebase if long enough - if (processedText.getReducedWordCount() >= Constants.DOCUMENT_MIN_LENGTH) - filebase.add(article); - - log.info("imported \"" + object.get("title")); - } catch (final ProcessorException e) { - log.error("could not preprocess text of article '" + article.getTitle() + "'"); - } catch (final DatabaseException e) { - log.error("could not save processed article in the database '" + article.getTitle() + "'"); - } catch (final FilebaseException e) { - log.error("could not save processed article in the filebase '" + article.getTitle() + "'"); - } - } - - private ArticleFull articleFromJSON(final JSONObject obj) { - final ArticleFull article = new ArticleFull(); - article.setId(new ObjectId()); - if (obj.containsKey("title")) - article.setTitle(obj.get("title").toString()); - if (obj.containsKey("text")) - article.setText(obj.get("text").toString()); - if (obj.containsKey("url")) - article.setUrl(obj.get("url").toString()); - if (obj.containsKey("date")) - article.setDate(obj.get("date").toString()); - return article; - } - - } - - public static final Logger log = LogManager.getLogger(ImportCommand.class); - private final String[] models; - private final int threadCount; private final List<File> files = new ArrayList<>(); private final JSONParser parser = new JSONParser(); private Config config; private MongoService<ArticleFull, ObjectId> dbArticles; + private MongoService<TopicModelFull, String> dbTopicModels; + private TopicModelConfig modelConfig; + private SpotlightAnalyzer spotlightAnalyzer; private Filebase filebase; private Processor processor; private ArticleBuffer buffer; - private ExecutorService executor; + private TopicModelFull topicModel; /** * Import 
command to import articles into the database, do topic modeling @@ -149,7 +89,6 @@ public class ImportCommand implements Command { */ public ImportCommand(final String[] models, final String[] paths) { this.models = models; - threadCount = Runtime.getRuntime().availableProcessors() * 10; addPaths(paths); } @@ -159,13 +98,13 @@ public class ImportCommand implements Command { } } - public void addPaths(final File[] paths) { + private void addPaths(final File[] paths) { for (final File path : paths) { addPath(path); } } - public void addPath(final File file) { + private void addPath(final File file) { if (file.isFile()) { files.add(file); } else if (file.isDirectory()) { @@ -180,6 +119,20 @@ public class ImportCommand implements Command { } } + private ArticleFull articleFromJSON(final JSONObject obj) { + final ArticleFull article = new ArticleFull(); + article.setId(new ObjectId()); + if (obj.containsKey("title")) + article.setTitle(obj.get("title").toString()); + if (obj.containsKey("text")) + article.setText(obj.get("text").toString()); + if (obj.containsKey("url")) + article.setUrl(obj.get("url").toString()); + if (obj.containsKey("date")) + article.setDate(obj.get("date").toString()); + return article; + } + /** * import a single article into the database and filebase * @@ -188,8 +141,61 @@ public class ImportCommand implements Command { * @throws Exception */ private void importArticle(final JSONObject object) { - final ImportTask task = new ImportTask(object, processor, buffer, filebase); - executor.execute(task); + final ArticleFull article = articleFromJSON(object); + + if (EnumSet.of(ProcessorMode.ENTITIES, ProcessorMode.TEXT_WITH_ENTITIES).contains(modelConfig.getProcessorMode())) { + try { + final SpotlightResponse spotlightResponse = spotlightAnalyzer.analyze(article.getText()); + + final List<TextEntity> textEntities = new ArrayList<>(spotlightResponse.getResources().size()); + final StringBuilder sb = new StringBuilder(); + + for (final 
SpotlightResource sr : spotlightResponse.getResources()) { + textEntities.add(new TextEntity(sr.getSurfaceForm(), sr.getUri())); + + for (final String type : sr.getTypes()) { + final String[] parts = type.split(":"); + sb.append(" ").append(parts[parts.length - 1]); + } + } + + // TODO do sth with this + } catch (final IOException e) { + ConsoleUtils.error("could not analyze text with spotlight: " + e.getMessage()); + } + } + + try { + // preprocess text + final ProcessedText processedText = processor.process(modelConfig, article.getText()); + + if (processedText.getReducedWordCount() < modelConfig.getDocumentMinimumLength()) { + ConsoleUtils.info(" skipped \"" + object.get("title")); + } else { + article.setProcessedText(processedText.getWords()); + article.setWords(processedText.getArticleWords()); + article.setTopicModel(new TopicModel(topicModel.getId())); + + // generate article stats + final ArticleStats stats = new ArticleStats(); + stats.setWordCount(processedText.getWordCount()); + stats.setProcessedWordCount(processedText.getReducedWordCount()); + stats.setReductionRatio(processedText.getReductionRatio()); + article.setStats(stats); + + // add article to data- and filebase + buffer.add(article); + filebase.add(article); + + ConsoleUtils.info("imported \"" + object.get("title")); + } + } catch (final ProcessorException e) { + ConsoleUtils.error("could not preprocess text of article '" + article.getTitle() + "'"); + } catch (final DatabaseException e) { + ConsoleUtils.error("could not save processed article in the database '" + article.getTitle() + "'"); + } catch (final FilebaseException e) { + ConsoleUtils.error("could not save processed article in the filebase '" + article.getTitle() + "'"); + } } /** @@ -214,7 +220,7 @@ public class ImportCommand implements Command { importArticle((JSONObject) data); imported++; } else { - log.error("unknown data format"); + ConsoleUtils.error("unknown data format"); } return imported; @@ -228,13 +234,17 @@ public class 
ImportCommand implements Command { return imported; } - private void importForModel(final ModelConfig modelConfig) + private void importForModel(final TopicModelConfig modelConfig) throws java.text.ParseException, IOException, ConfigException, ParseException, InterruptedException, DatabaseException { + this.modelConfig = modelConfig; + if (this.modelConfig.getProcessorMode() == ProcessorMode.ENTITIES || this.modelConfig.getProcessorMode() == ProcessorMode.TEXT_WITH_ENTITIES) + spotlightAnalyzer = new SpotlightAnalyzer(modelConfig); + buffer = new ArticleBuffer(dbArticles); filebase = new Filebase(modelConfig, config.getDataDirectory()); + topicModel = new TopicModelFull(modelConfig.getName(), modelConfig); - log.info("using data directory: " + config.getDataDirectory().getAbsolutePath()); - log.info("using " + threadCount + " " + StringUtils.quantity(threadCount, "thread")); + dbTopicModels.replaceSingle(topicModel); final Timer timer = new Timer(); timer.restart(); @@ -242,36 +252,28 @@ public class ImportCommand implements Command { /* * import files into database and filebase */ - log.info("file import"); - final int imported = importFiles(files); - executor.shutdown(); - executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + importFiles(files); buffer.save(); - timer.lap("import"); /* * write filebase */ - log.info("writing file index"); filebase.sync(); - timer.lap("filebase write"); /* * run information */ - log.info("imported " + imported + " new " + StringUtils.quantity(imported, "article")); - log.info(timer.toString()); - log.info("done in " + StringUtils.timeString(timer.total())); + ConsoleUtils.info("done in " + StringUtils.timeString(timer.total())); } @Override public void run() throws java.text.ParseException, IOException, ParseException, InterruptedException, DatabaseException, Exception { config = Config.getConfig(); dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); + dbTopicModels = 
MongoService.getDatabaseService(config, TopicModelFull.class); processor = new Processor(); - executor = Executors.newFixedThreadPool(threadCount); - for (final String model : models) { - importForModel(config.getModelConfig(model)); + for (final TopicModelConfig modelConfig : config.getTopicModelConfigs(models)) { + importForModel(modelConfig); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/IndexingCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/IndexingCommand.java index e72bed65..3f58c23f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/IndexingCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/IndexingCommand.java @@ -4,27 +4,25 @@ import java.io.IOException; import java.text.ParseException; import java.util.Map; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; import org.bson.types.ObjectId; import org.elasticsearch.client.Client; +import org.elasticsearch.index.IndexNotFoundException; import de.vipra.cmd.file.FilebaseIDDateIndex; import de.vipra.cmd.file.FilebaseIDDateIndexEntry; import de.vipra.util.Config; +import de.vipra.util.ConsoleUtils; import de.vipra.util.ESClient; import de.vipra.util.ESSerializer; -import de.vipra.util.ModelConfig; import de.vipra.util.MongoUtils; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleFull; +import de.vipra.util.model.TopicModelConfig; import de.vipra.util.service.MongoService; public class IndexingCommand implements Command { - public static final Logger log = LogManager.getLogger(IndexingCommand.class); - private final String[] models; private Config config; private MongoService<ArticleFull, ObjectId> dbArticles; @@ -35,26 +33,28 @@ public class IndexingCommand implements Command { this.models = models; } - private void indexForModel(final ModelConfig modelConfig) throws ParseException, IOException, ConfigException, DatabaseException { + private void indexForModel(final 
TopicModelConfig modelConfig) throws ParseException, IOException, ConfigException, DatabaseException { final FilebaseIDDateIndex index = new FilebaseIDDateIndex(modelConfig.getModelDir(config.getDataDirectory())); final String indexName = modelConfig.getName() + "-articles"; - // clear index - // elasticClient.admin().indices().prepareDelete("_all").get(); - elasticClient.admin().indices().prepareDelete(indexName).get(); + try { + // clear index + elasticClient.admin().indices().prepareDelete(indexName).get(); + } catch (final IndexNotFoundException e) {} for (final FilebaseIDDateIndexEntry entry : index) { // get article from database final ArticleFull article = dbArticles.getSingle(MongoUtils.objectId(entry.getId()), true); if (article == null) { - log.error("no article found in db for id " + entry.getId()); + ConsoleUtils.error("no article found in db for id: " + entry.getId()); continue; } // index article final Map<String, Object> source = elasticSerializer.serialize(article); elasticClient.prepareIndex(indexName, "article", article.getId().toString()).setSource(source).get(); + ConsoleUtils.info("indexed \"" + article.getTitle() + "\""); } elasticClient.close(); @@ -66,8 +66,8 @@ public class IndexingCommand implements Command { dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); elasticClient = ESClient.getClient(config); elasticSerializer = new ESSerializer<>(ArticleFull.class); - for (final String model : models) { - indexForModel(config.getModelConfig(model)); + for (final TopicModelConfig modelConfig : config.getTopicModelConfigs(models)) { + indexForModel(modelConfig); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ListModelsCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ListModelsCommand.java index d203417e..54e4df82 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ListModelsCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ListModelsCommand.java @@ -2,22 +2,26 @@ package 
de.vipra.cmd.option; import java.util.Map.Entry; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; +import org.fusesource.jansi.Ansi; import de.vipra.util.Config; -import de.vipra.util.ModelConfig; +import de.vipra.util.ConsoleUtils; +import de.vipra.util.model.TopicModelConfig; +import edu.stanford.nlp.util.StringUtils; public class ListModelsCommand implements Command { - public static final Logger log = LogManager.getLogger(ListModelsCommand.class); - @Override public void run() throws Exception { - log.info("existing models:"); + ConsoleUtils.info("existing models:"); final Config config = Config.getConfig(); - for (final Entry<String, ModelConfig> entry : config.getModelConfigs().entrySet()) - log.info(" " + entry.getValue().getName()); + int longestModelName = 0; + for (final Entry<String, TopicModelConfig> entry : config.getTopicModelConfigs().entrySet()) + longestModelName = Math.max(longestModelName, entry.getValue().getName().length()); + for (final Entry<String, TopicModelConfig> entry : config.getTopicModelConfigs().entrySet()) + ConsoleUtils + .info(" " + Ansi.ansi().a(Ansi.Attribute.INTENSITY_BOLD).a(StringUtils.pad(entry.getValue().getName(), longestModelName)).reset() + + " " + entry.getValue().toString()); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java index 490ef4cb..3a61ed45 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java @@ -3,22 +3,18 @@ package de.vipra.cmd.option; import java.io.IOException; import java.text.ParseException; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - import de.vipra.cmd.lda.Analyzer; import de.vipra.cmd.lda.AnalyzerException; import de.vipra.util.Config; -import de.vipra.util.ModelConfig; +import de.vipra.util.ConsoleUtils; import 
de.vipra.util.StringUtils; import de.vipra.util.Timer; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; +import de.vipra.util.model.TopicModelConfig; public class ModelingCommand implements Command { - public static final Logger log = LogManager.getLogger(ModelingCommand.class); - private final String[] models; private final boolean reread; @@ -27,7 +23,7 @@ public class ModelingCommand implements Command { this.reread = reread; } - private void modelForModel(final ModelConfig modelConfig) + private void modelForModel(final TopicModelConfig modelConfig) throws AnalyzerException, ConfigException, DatabaseException, ParseException, IOException, InterruptedException { final Analyzer analyzer = new Analyzer(); @@ -37,22 +33,21 @@ public class ModelingCommand implements Command { /* * do topic modeling */ - log.info("topic modeling"); + ConsoleUtils.info("topic modeling"); analyzer.analyze(modelConfig, reread); timer.lap("topic modeling"); /* * run information */ - log.info(timer.toString()); - log.info("done in " + StringUtils.timeString(timer.total())); + ConsoleUtils.info("done in " + StringUtils.timeString(timer.total())); } @Override public void run() throws Exception { final Config config = Config.getConfig(); - for (final String model : models) { - modelForModel(config.getModelConfig(model)); + for (final TopicModelConfig modelConfig : config.getTopicModelConfigs(models)) { + modelForModel(modelConfig); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/TestCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/TestCommand.java index b9b1418c..38a8b374 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/TestCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/TestCommand.java @@ -1,39 +1,36 @@ package de.vipra.cmd.option; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; import org.bson.types.ObjectId; import 
org.elasticsearch.client.transport.NoNodeAvailableException; import org.elasticsearch.client.transport.TransportClient; import de.vipra.util.Config; +import de.vipra.util.ConsoleUtils; import de.vipra.util.ESClient; import de.vipra.util.model.Article; import de.vipra.util.service.MongoService; public class TestCommand implements Command { - public static final Logger log = LogManager.getLogger(TestCommand.class); - @Override public void run() throws Exception { // test if configuration readable - log.info("reading configuration..."); + ConsoleUtils.info("reading configuration..."); final Config config = Config.getConfig(); // test if database is accessible - log.info("testing mongodb connection..."); + ConsoleUtils.info("testing mongodb connection..."); final MongoService<Article, ObjectId> dbArticles = MongoService.getDatabaseService(config, Article.class); dbArticles.count(null); // test if elasticsearch is accessible - log.info("testing elasticsearch connection..."); + ConsoleUtils.info("testing elasticsearch connection..."); final TransportClient esclient = ESClient.getClient(config); if (esclient.connectedNodes().isEmpty()) { throw new NoNodeAvailableException("no elasticsearch nodes available"); } - log.info("all tests passed"); + ConsoleUtils.info("all tests passed"); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/plugin/ClassNameRegexFilter.java b/vipra-cmd/src/main/java/de/vipra/cmd/plugin/ClassNameRegexFilter.java deleted file mode 100644 index 76c416a8..00000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/plugin/ClassNameRegexFilter.java +++ /dev/null @@ -1,77 +0,0 @@ -package de.vipra.cmd.plugin; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.logging.log4j.core.LogEvent; -import org.apache.logging.log4j.core.config.plugins.Plugin; -import org.apache.logging.log4j.core.config.plugins.PluginAttribute; -import org.apache.logging.log4j.core.config.plugins.PluginFactory; -import 
org.apache.logging.log4j.core.filter.AbstractFilter; - -/** - * http://rohithag.blogspot.de/2014/04/log4j2-separate-log-files-by.html - */ -@Plugin(name = "ClassNameRegexFilter", category = "Core", elementType = "filter", printObject = true) -public final class ClassNameRegexFilter extends AbstractFilter { - - private static final long serialVersionUID = -6931373371808638290L; - - private final Pattern pattern; - - private ClassNameRegexFilter(final Pattern pattern, final Result onMatch, final Result onMismatch) { - super(onMatch, onMismatch); - this.pattern = pattern; - } - - @Override - public Result filter(final LogEvent event) { - return filter(event.getLoggerName()); - } - - private Result filter(final String className) { - if (className == null) { - return onMismatch; - } - final Matcher m = pattern.matcher(className); - return m.matches() ? onMatch : onMismatch; - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder(); - sb.append("pattern=").append(pattern.toString()); - return sb.toString(); - } - - /** - * Create a Filter that matches a regular expression. - * - * @param regex - * The regular expression to match. - * @param match - * The action to perform when a match occurs. - * @param mismatch - * The action to perform when a mismatch occurs. - * @return The Log4jRegexFilter. 
- */ - @PluginFactory - public static ClassNameRegexFilter createFilter(@PluginAttribute("regex") final String regex, @PluginAttribute("onMatch") final String match, - @PluginAttribute("onMismatch") final String mismatch) { - if (regex == null) { - LOGGER.error("A regular expression must be provided for RegexFilter"); - return null; - } - Pattern pattern; - try { - pattern = Pattern.compile(regex); - } catch (final Exception ex) { - LOGGER.error("RegexFilter caught exception compiling pattern: " + regex + " cause: " + ex.getMessage()); - return null; - } - final Result onMatch = Result.toResult(match); - final Result onMismatch = Result.toResult(mismatch); - - return new ClassNameRegexFilter(pattern, onMatch, onMismatch); - } -} \ No newline at end of file diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java index 01e1c45a..41723aa8 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java @@ -3,10 +3,8 @@ package de.vipra.cmd.text; import java.util.List; import java.util.Properties; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - import de.vipra.util.Constants; +import de.vipra.util.model.TopicModelConfig; import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; @@ -18,8 +16,6 @@ import edu.stanford.nlp.util.StringUtils; public class Processor { - public static final Logger log = LogManager.getLogger(Processor.class); - private final StanfordCoreNLP nlp; public Processor() { @@ -38,7 +34,7 @@ public class Processor { nlp = new StanfordCoreNLP(props); } - public ProcessedText process(final String input) throws ProcessorException { + public ProcessedText process(final TopicModelConfig modelConfig, final String input) throws ProcessorException { 
final Annotation doc = new Annotation(input.toLowerCase()); nlp.annotate(doc); final StringBuilder sb = new StringBuilder(); @@ -55,7 +51,7 @@ public class Processor { if (b == null || !b) { // filter out infrequent words final Long count = word.get(FrequencyAnnotator.class); - if (count != null && count >= Constants.DOCUMENT_MIN_WORD_FREQ) { + if (count != null && count >= modelConfig.getDocumentMinimumWordFrequency()) { final String lemma = word.get(LemmaAnnotation.class); // collect unique words sb.append(lemma).append(" "); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightAnalyzer.java new file mode 100644 index 00000000..7c041f68 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightAnalyzer.java @@ -0,0 +1,53 @@ +package de.vipra.cmd.text; + +import java.io.BufferedReader; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLEncoder; + +import de.vipra.util.Config; +import de.vipra.util.URLUtils; +import de.vipra.util.ex.ConfigException; +import de.vipra.util.model.TopicModelConfig; + +public class SpotlightAnalyzer { + + private final URL spotlightUrl; + private final TopicModelConfig modelConfig; + + public SpotlightAnalyzer(final TopicModelConfig modelConfig) throws MalformedURLException, ConfigException { + final Config config = Config.getConfig(); + spotlightUrl = new URL(URLUtils.concat(config.getSpotlightUrl(), "/rest/annotate")); + this.modelConfig = modelConfig; + } + + public SpotlightResponse analyze(String text) throws IOException { + text = "confidence=" + modelConfig.getSpotlightConfidence() + "&support=" + modelConfig.getSpotlightSupport() + "&text=" + + URLEncoder.encode(text, "UTF-8"); + + final HttpURLConnection connection = (HttpURLConnection) spotlightUrl.openConnection(); + 
connection.setRequestMethod("POST"); + connection.setRequestProperty("Content-Length", Integer.toString(text.getBytes().length)); + connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); + connection.setRequestProperty("Accept", "application/json"); + connection.setUseCaches(false); + connection.setDoOutput(true); + + final DataOutputStream out = new DataOutputStream(connection.getOutputStream()); + out.writeBytes(text); + out.close(); + + final BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream())); + final StringBuilder result = new StringBuilder(); + String line = null; + while ((line = in.readLine()) != null) + result.append(line); + + return Config.mapper.readValue(result.toString(), SpotlightResponse.class); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResource.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResource.java new file mode 100644 index 00000000..baf54d3d --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResource.java @@ -0,0 +1,95 @@ +package de.vipra.cmd.text; + +import java.util.Arrays; +import java.util.List; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonSetter; + +@JsonIgnoreProperties(ignoreUnknown = true) +public class SpotlightResource { + + @JsonProperty("@URI") + private String uri; + + @JsonProperty("@support") + private int support; + + @JsonProperty("@types") + private List<String> types; + + @JsonProperty("@surfaceForm") + private String surfaceForm; + + @JsonProperty("@offset") + private int offset; + + @JsonProperty("@similarityScore") + private double similarityScore; + + @JsonProperty("@percentageOfSecondRank") + private double percentageOfSecondRank; + + public String getUri() { + return uri; + } + + public void setUri(final String uri) { + this.uri = uri; + } + + public int getSupport() { 
+ return support; + } + + public void setSupport(final int support) { + this.support = support; + } + + public List<String> getTypes() { + return types; + } + + public void setTypes(final List<String> types) { + this.types = types; + } + + @JsonSetter("@types") + public void setTypes(final String types) { + this.types = Arrays.asList(types.split(",")); + } + + public String getSurfaceForm() { + return surfaceForm; + } + + public void setSurfaceForm(final String surfaceForm) { + this.surfaceForm = surfaceForm; + } + + public int getOffset() { + return offset; + } + + public void setOffset(final int offset) { + this.offset = offset; + } + + public double getSimilarityScore() { + return similarityScore; + } + + public void setSimilarityScore(final double similarityScore) { + this.similarityScore = similarityScore; + } + + public double getPercentageOfSecondRank() { + return percentageOfSecondRank; + } + + public void setPercentageOfSecondRank(final double percentageOfSecondRank) { + this.percentageOfSecondRank = percentageOfSecondRank; + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java new file mode 100644 index 00000000..0cb0ce0c --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java @@ -0,0 +1,22 @@ +package de.vipra.cmd.text; + +import java.util.List; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; + +@JsonIgnoreProperties(ignoreUnknown = true) +public class SpotlightResponse { + + @JsonProperty("Resources") + private List<SpotlightResource> resources; + + public List<SpotlightResource> getResources() { + return resources; + } + + public void setResources(final List<SpotlightResource> resources) { + this.resources = resources; + } + +} diff --git a/vipra-cmd/src/main/resources/config.json b/vipra-cmd/src/main/resources/config.json index 137c93ce..ba2f39ec 100644 
--- a/vipra-cmd/src/main/resources/config.json +++ b/vipra-cmd/src/main/resources/config.json @@ -13,6 +13,15 @@ "dynamicMinIterations": 100, "dynamicMaxIterations": 1000, "staticIterations": 100, + "topicAutoNamingWords": 4, + "maxSimilarDocuments": 10, + "documentMinimumLength": 10, + "documentMinimumWordFrequency": 5, + "spotlightSupport": 0, + "spotlightConfidence": 0.5, + "minRelativeProbability": 0.01, + "risingDecayLambda": 0.0, + "maxSimilarDocumentsDivergence": 0.25, "windowResolution": "YEAR", "processorMode": "TEXT" } diff --git a/vipra-cmd/src/main/resources/log4j2.xml b/vipra-cmd/src/main/resources/log4j2.xml deleted file mode 100644 index 88658479..00000000 --- a/vipra-cmd/src/main/resources/log4j2.xml +++ /dev/null @@ -1,14 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<Configuration packages="de.vipra.cmd.plugin"> - <Appenders> - <Console name="Console" target="SYSTEM_OUT"> - <PatternLayout pattern="%highlight{%-5level - %msg%n}{FATAL=red,ERROR=red,WARN=red,INFO=normal,DEBUG=normal,TRACE=normal}" /> - <ClassNameRegexFilter regex="de.vipra.*" onMatch="ACCEPT" onMismatch="DENY"/> - </Console> - </Appenders> - <Loggers> - <Root level="INFO"> - <AppenderRef ref="Console" /> - </Root> - </Loggers> -</Configuration> \ No newline at end of file diff --git a/vipra-cmd/src/main/resources/log4j2dev.xml b/vipra-cmd/src/main/resources/log4j2dev.xml deleted file mode 100644 index 8c371647..00000000 --- a/vipra-cmd/src/main/resources/log4j2dev.xml +++ /dev/null @@ -1,15 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<Configuration> - <Appenders> - <Console name="Console" target="SYSTEM_OUT"> - <PatternLayout pattern="%d{HH:mm:ss.SSS} %-5level %logger{36} - %msg%n" /> - </Console> - </Appenders> - <Loggers> - <Root level="ALL"> - <AppenderRef ref="Console" /> - </Root> - <Logger name="org.mongodb" level="ERROR" /> - <Logger name="org.elasticsearch.transport.netty" level="ERROR" /> - </Loggers> -</Configuration> \ No newline at end of file diff --git 
a/vipra-cmd/src/main/resources/model.json b/vipra-cmd/src/main/resources/model.json deleted file mode 100644 index 0eed5c36..00000000 --- a/vipra-cmd/src/main/resources/model.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "name": "", - "kTopics": 20, - "dynamicMinIterations": 100, - "dynamicMaxIterations": 1000, - "staticIterations": 100, - "topicAutoNamingWords": 4, - "maxSimilarDocuments": 20, - "minRelativeProbability": 0.01, - "risingDecayLambda": 0.0, - "maxSimilarDocumentsDivergence": 0.25, - "windowResolution": "YEAR", - "processorMode": "TEXT" -} \ No newline at end of file diff --git a/vipra-ui/app/html/about.html b/vipra-ui/app/html/about.html index 35f097b5..a7343ea7 100644 --- a/vipra-ui/app/html/about.html +++ b/vipra-ui/app/html/about.html @@ -140,15 +140,6 @@ The number of topics to be generated in the topic modeling process. </td> </tr> - <tr> - <th>K topic words</th> - <td ng-bind-template="{{::info.const.ktopicwords}}"></td> - </tr> - <tr class="well"> - <td colspan="2"> - The maximum number of words that are associated to a single topic. - </td> - </tr> <tr> <th>Rising decay weight</th> <td ng-bind-template="{{::info.const.decaylambda}}"></td> @@ -168,15 +159,6 @@ <it>maximum_probability * minimum_relative_probability</it>. </td> </tr> - <tr> - <th>Minimum share</th> - <td ng-bind-template="{{::info.const.minshare}}"></td> - </tr> - <tr class="well"> - <td colspan="2"> - The minimum share of a topic to be accepted for an article. Topic shares are renormalized after rejecting topics below this threshold. 
- </td> - </tr> <tr> <th>Maximum similar documents</th> <td ng-bind-template="{{::info.const.maxsimdocs}}"></td> @@ -213,21 +195,21 @@ </td> </tr> <tr> - <th>Word minimum frequency</th> - <td ng-bind-template="{{::info.const.docminfreq}}"></td> + <th>Document minimum word count</th> + <td ng-bind-template="{{::info.const.docminlength}}"></td> </tr> <tr class="well"> <td colspan="2"> - The minimum word frequency for unique words in an article to be used in the topic modeling process. Unique words with a lower frequency are ignored. + The minimum article word count. Articles with fewer words are not included in the topic modeling process. </td> </tr> <tr> - <th>Document minimum word count</th> - <td ng-bind-template="{{::info.const.docminlength}}"></td> + <th>Document minimum word frequency</th> + <td ng-bind-template="{{::info.const.docminwordfreq}}"></td> </tr> <tr class="well"> <td colspan="2"> - The minimum article word count. Articles with less words are not included in the topic modeling process. + The minimum article word frequency. Words that occur less than this frequency are stripped from the article.
</td> </tr> <tr> diff --git a/vipra-ui/app/html/articles/index.html b/vipra-ui/app/html/articles/index.html index 681ebf3e..e284dcbe 100644 --- a/vipra-ui/app/html/articles/index.html +++ b/vipra-ui/app/html/articles/index.html @@ -25,8 +25,6 @@ <tr ng-repeat="article in articles"> <td> <a ui-sref="articles.show({id: article.id})" ng-bind="::article.title"></a> - - <small class="text-muted" ng-bind-template="[{{::Vipra.formatDate(article.date)}}]"></small> </td> </tr> </tbody> diff --git a/vipra-ui/app/html/index.html b/vipra-ui/app/html/index.html index ec0dbe4d..d27f4c3d 100644 --- a/vipra-ui/app/html/index.html +++ b/vipra-ui/app/html/index.html @@ -54,4 +54,4 @@ </div> </div> </div> -<div ng-cloak ui-view></div> +<div ng-cloak ui-view></div> \ No newline at end of file diff --git a/vipra-ui/app/index.html b/vipra-ui/app/index.html index 64f09a52..49d4af3a 100644 --- a/vipra-ui/app/index.html +++ b/vipra-ui/app/index.html @@ -70,6 +70,11 @@ </li> </ul> <ul class="nav navbar-nav navbar-right"> + <li> + <a data-toggle="modal" data-target="#topicModelModal"> + Models + </a> + </li> <li ui-sref-active="active"> <a ui-sref="about"> About @@ -81,7 +86,48 @@ </div> <!-- /.container-fluid --> </nav> - <div class="main" ui-view ng-cloak></div> + <div class="main" ui-view ng-cloak ng-show="topicModel"></div> + + <div id="topicModelModal" class="modal fade" tabindex="-1" role="dialog" data-backdrop="static" data-keyboard="false" bs-modal> + <div class="modal-dialog modal-lg"> + <div class="modal-content"> + <div class="modal-header"> + <button type="button" class="close" data-dismiss="modal" aria-label="Close" ng-show="topicModel"><span aria-hidden="true">×</span></button> + <h4 class="modal-title">Topic Models</h4> + </div> + <div class="modal-body"> + <ul class="list-group" ng-show="topicModels.length"> + <button type="button" class="list-group-item" ng-repeat="model in topicModels" ng-click="changeTopicModel(model)" ng-class="{active:topicModel.id===model.id}"> + <span 
class="badge" ng-bind="model.modelConfig.kTopics"></span> + <span ng-bind="model.id"></span> + </button> + </ul> + <p class="text-center" ng-show="loading.any"> + Loading... + </p> + <p ng-hide="topicModels.length || loading.any"> + No topic models in the database. Create a topic model and import data into it to begin. + </p> + <h4>Quick start</h4> + <ol> + <li> + Create a model: + <pre>vipra -C some_model</pre> + </li> + <li> + Import data into it: + <pre>vipra -S some_model -I data.json</pre> + </li> + <li> + Generate topic data: + <pre>vipra -S some_model -Mi</pre> + </li> + </ol> + </div> + </div> + </div> + </div> + </body> </html> diff --git a/vipra-ui/app/js/controllers.js b/vipra-ui/app/js/controllers.js index 58d8fd9e..5fb343e0 100644 --- a/vipra-ui/app/js/controllers.js +++ b/vipra-ui/app/js/controllers.js @@ -2,7 +2,7 @@ * Vipra Application * Controllers ******************************************************************************/ -/* globals angular, Vipra, moment, vis, console, prompt, randomColor, Highcharts */ +/* globals angular, Vipra, moment, vis, console, prompt, randomColor, Highcharts, $ */ (function() { "use strict"; @@ -12,9 +12,26 @@ 'vipra.factories' ]); - app.controller('RootController', ['$scope', '$state', function($scope, $state) { - $scope.$state = $state; - }]); + app.controller('RootController', ['$scope', '$state', 'TopicModelFactory', + function($scope, $state, TopicModelFactory) { + + $scope.$state = $state; + + TopicModelFactory.query({ + fields: 'modelConfig' + }, function(data) { + $scope.topicModels = data; + }, function(err) { + $scope.errors = err; + }); + + $scope.changeTopicModel = function(topicModel) { + $scope.topicModel = topicModel; + $('#topicModelModal').modal('hide'); + }; + + } + ]); /** * Index controller @@ -24,29 +41,37 @@ $scope.search = $location.search().query; - ArticleFactory.query({ - limit: 3, - sort: '-created' - }, function(data) { - $scope.latestArticles = data; - }, function(err) { - $scope.errors 
= err; - }); + $scope.$watch('topicModel', function(topicModel) { + if(!topicModel) return; + + ArticleFactory.query({ + topicModel: topicModel.id, + limit: 3, + sort: '-created' + }, function(data) { + $scope.latestArticles = data; + }, function(err) { + $scope.errors = err; + }); - TopicFactory.query({ - limit: 3, - sort: '-created' - }, function(data) { - $scope.latestTopics = data; - }, function(err) { - $scope.errors = err; + TopicFactory.query({ + topicModel: topicModel.id, + limit: 3, + sort: '-created' + }, function(data) { + $scope.latestTopics = data; + }, function(err) { + $scope.errors = err; + }); }); - $scope.$watch('search', function() { - if ($scope.search) { + $scope.$watchGroup(['search', 'topicModel'], function() { + if ($scope.search && $scope.topicModel) { $location.search('query', $scope.search); $scope.searching = true; + SearchFactory.query({ + topicModel: $scope.topicModel.id, limit: 10, query: $scope.search }, function(data) { @@ -501,8 +526,11 @@ $scope.page = Math.max($location.search().page || 1, 1); $scope.limit = 100; - $scope.$watchGroup(['page', 'opts.sortkey', 'opts.sortdir'], function() { + $scope.$watchGroup(['page', 'opts.sortkey', 'opts.sortdir', 'topicModel'], function() { + if(!$scope.topicModel) return; + TopicFactory.query({ + topicModel: $scope.topicModel.id, skip: ($scope.page - 1) * $scope.limit, limit: $scope.limit, sort: ($scope.opts.sortdir ? 
'' : '-') + $scope.opts.sortkey @@ -530,15 +558,20 @@ sortwords: '-likeliness' }; - TopicFactory.get({ - id: $stateParams.id - }, function(data) { - $scope.topic = data; - $scope.topicCreated = Vipra.formatDateTime($scope.topic.created); - $scope.topicModified = Vipra.formatDateTime($scope.topic.modified); - $scope.redrawGraph(); - }, function(err) { - $scope.errors = err; + $scope.$watch('topicModel', function() { + if(!$scope.topicModel) return; + + TopicFactory.get({ + id: $stateParams.id, + topicModel: $scope.topicModel.id + }, function(data) { + $scope.topic = data; + $scope.topicCreated = Vipra.formatDateTime($scope.topic.created); + $scope.topicModified = Vipra.formatDateTime($scope.topic.modified); + $scope.redrawGraph(); + }, function(err) { + $scope.errors = err; + }); }); $scope.redrawGraph = function() { @@ -568,7 +601,9 @@ $scope.endRename = function(save) { delete $scope.renameErrors; if (save) { - TopicFactory.update({ id: $scope.topic.id }, $scope.topic, function(data) { + TopicFactory.update({ + id: $scope.topic.id + }, $scope.topic, function(data) { $scope.topic = data; $scope.isRename = false; }, function(err) { @@ -590,17 +625,18 @@ $scope.$watch('opts.seqstyle', $scope.redrawGraph); $scope.$watch('opts.chartstyle', $scope.redrawGraph); - $scope.$watch('sequenceId', function(sequence) { - if (sequence) { - SequenceFactory.get({ - id: sequence, - topWords: 20 - }, function(data) { - $scope.sequence = data; - }, function(err) { - $scope.errors = err; - }); - } + $scope.$watchGroup(['sequenceId', 'topicModel'], function() { + if(!$scope.sequenceId || !$scope.topicModel) return; + + SequenceFactory.get({ + id: $scope.sequenceId, + topicModel: $scope.topicModel.id, + topWords: 20 + }, function(data) { + $scope.sequence = data; + }, function(err) { + $scope.errors = err; + }); }); } ]); @@ -619,9 +655,12 @@ $scope.page = Math.max($location.search().page || 1, 1); $scope.limit = 100; - $scope.$watchGroup(['page', 'opts.sortkey', 'opts.sortdir'], 
function() { + $scope.$watchGroup(['page', 'opts.sortkey', 'opts.sortdir', 'topicModel'], function() { + if(!$scope.topicModel) return; + TopicFactory.articles({ id: $stateParams.id, + topicModel: $scope.topicModel.id, skip: ($scope.page - 1) * $scope.limit, limit: $scope.limit, sort: ($scope.opts.sortdir ? '' : '-') + $scope.opts.sortkey diff --git a/vipra-ui/app/js/directives.js b/vipra-ui/app/js/directives.js index 404d439d..82900e4e 100644 --- a/vipra-ui/app/js/directives.js +++ b/vipra-ui/app/js/directives.js @@ -146,6 +146,14 @@ } ]); + app.directive('bsModal', [function() { + return { + link: function($scope, $elem) { + $elem.modal(); + } + }; + }]); + app.directive('sequenceDropdown', [function() { return { scope: { diff --git a/vipra-ui/app/js/factories.js b/vipra-ui/app/js/factories.js index b65bb073..f5ff9043 100644 --- a/vipra-ui/app/js/factories.js +++ b/vipra-ui/app/js/factories.js @@ -34,6 +34,10 @@ return $resource(Vipra.config.restUrl + '/info'); }]); + app.factory('TopicModelFactory', ['$resource', function($resource) { + return $resource(Vipra.config.restUrl + '/topicmodels'); + }]); + // https://gist.github.com/Fluidbyte/4718380 app.factory('Store', ['$state', function($state) { return function(key, value) { diff --git a/vipra-ui/app/less/app.less b/vipra-ui/app/less/app.less index 8f1d3a1c..d3e6ef59 100644 --- a/vipra-ui/app/less/app.less +++ b/vipra-ui/app/less/app.less @@ -371,6 +371,11 @@ topic-menu { display: inline-block; } +[bs-list] > li { + .pointer; + +} + @-moz-keyframes spin { 100% { -moz-transform: rotateY(360deg); diff --git a/vipra-util/pom.xml b/vipra-util/pom.xml index 1519155c..4fd1e610 100644 --- a/vipra-util/pom.xml +++ b/vipra-util/pom.xml @@ -28,6 +28,13 @@ <version>2.4</version> </dependency> + <!-- Jansi --> + <dependency> + <groupId>org.fusesource.jansi</groupId> + <artifactId>jansi</artifactId> + <version>1.11</version> + </dependency> + <!-- SLF4j logging --> <dependency> <groupId>org.slf4j</groupId> diff --git 
a/vipra-util/src/main/java/de/vipra/util/Config.java b/vipra-util/src/main/java/de/vipra/util/Config.java index 80263695..2aace5d8 100644 --- a/vipra-util/src/main/java/de/vipra/util/Config.java +++ b/vipra-util/src/main/java/de/vipra/util/Config.java @@ -3,7 +3,10 @@ package de.vipra.util; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.commons.io.IOUtils; @@ -17,6 +20,8 @@ import com.fasterxml.jackson.databind.SerializationFeature; import de.vipra.util.ex.ConfigException; import de.vipra.util.model.Model; +import de.vipra.util.model.TopicModelConfig; +import de.vipra.util.model.TopicModelFull; import de.vipra.util.service.MongoService; public class Config { @@ -37,12 +42,12 @@ public class Config { private String databaseName = Constants.DATABASE_NAME; private String elasticSearchHost = Constants.ES_HOST; private int elasticSearchPort = Constants.ES_PORT; - private ModelConfig modelConfigTemplate = new ModelConfig(); + private TopicModelConfig modelConfigTemplate = new TopicModelConfig(); private String spotlightUrl; private String dtmPath; @JsonIgnore - private Map<String, ModelConfig> modelConfigs; + private Map<String, TopicModelConfig> topicModelConfigs; public String getDatabaseHost() { return databaseHost; @@ -100,30 +105,40 @@ public class Config { this.dtmPath = dtmPath; } - public Map<String, ModelConfig> getModelConfigs() { - return modelConfigs; + public Map<String, TopicModelConfig> getTopicModelConfigs() { + return topicModelConfigs; } - public ModelConfig getModelConfig(final String name) throws Exception { - final ModelConfig modelConfig = modelConfigs.get(name); + public Collection<TopicModelConfig> getTopicModelConfigs(final String[] names) throws Exception { + final List<TopicModelConfig> topicModelConfigs = new ArrayList<>(names.length); + for (final String name : 
names) { + if (name.equalsIgnoreCase("all")) + return this.topicModelConfigs.values(); + topicModelConfigs.add(getTopicModelConfig(name)); + } + return topicModelConfigs; + } + + public TopicModelConfig getTopicModelConfig(final String name) throws Exception { + final TopicModelConfig modelConfig = topicModelConfigs.get(name); if (modelConfig == null) throw new Exception("unknown model: " + name); return modelConfig; } - public void setModelConfigs(final Map<String, ModelConfig> modelConfigs) { - this.modelConfigs = modelConfigs; + public void setTopicModelConfigs(final Map<String, TopicModelConfig> topicModelConfigs) { + this.topicModelConfigs = topicModelConfigs; } public void setDataDirectory(final String dataDirectory) { this.dataDirectory = dataDirectory; } - public ModelConfig getModelConfigTemplate() { + public TopicModelConfig getModelConfigTemplate() { return modelConfigTemplate; } - public void setModelConfigTemplate(final ModelConfig modelConfigTemplate) { + public void setModelConfigTemplate(final TopicModelConfig modelConfigTemplate) { this.modelConfigTemplate = modelConfigTemplate; } @@ -235,22 +250,13 @@ public class Config { if (instance == null) throw new ConfigException("could not read configuration"); - // read model configurations - final File dataDir = instance.getDataDirectory(); - final Map<String, ModelConfig> modelConfigs = new HashMap<>(); - for (final File file : dataDir.listFiles()) { - if (file.isDirectory()) { - final File modelConfigFile = new File(file, Constants.MODEL_FILE); - if (!modelConfigFile.exists()) - throw new ConfigException("missing model configuration file: " + modelConfigFile.getAbsolutePath()); - final ModelConfig configDtm = mapper.readValue(modelConfigFile, ModelConfig.class); - if (configDtm.getName() == null || configDtm.getName().isEmpty()) - throw new ConfigException("models must have a name: " + modelConfigFile.getAbsolutePath()); - modelConfigs.put(configDtm.getName(), configDtm); - } - } - 
instance.modelConfigs = modelConfigs; - + // read topic model configs + final MongoService<TopicModelFull, String> dbTopicModels = MongoService.getDatabaseService(instance, TopicModelFull.class); + final List<TopicModelFull> topicModels = dbTopicModels.getAll(); + final Map<String, TopicModelConfig> topicModelConfigs = new HashMap<>(topicModels.size()); + for (final TopicModelFull topicModel : topicModels) + topicModelConfigs.put(topicModel.getId(), topicModel.getModelConfig()); + instance.setTopicModelConfigs(topicModelConfigs); } catch (final IOException e) { throw new ConfigException(e); } diff --git a/vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java b/vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java new file mode 100644 index 00000000..7e2ee2f0 --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java @@ -0,0 +1,38 @@ +package de.vipra.util; + +import org.fusesource.jansi.Ansi; +import org.fusesource.jansi.Ansi.Color; + +public class ConsoleUtils { + + private static boolean silent = false; + private static int pad = 5; + + public static void setSilent(final boolean s) { + silent = s; + } + + public static void info(final String msg) { + if (!silent) + System.out.println(label("INFO") + " - " + msg); + } + + public static void warn(final String msg) { + if (!silent) + System.out.println(label("WARN") + " - " + msg); + } + + public static void error(final String msg) { + if (!silent) + System.err.println(label("ERROR") + " - " + Ansi.ansi().fg(Color.RED).a(msg).reset()); + } + + public static void error(final Throwable t) { + error(t.getMessage()); + } + + private static String label(final String label) { + return StringUtils.pad(label, pad); + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index c9b1d394..ea6d2591 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ 
-30,7 +30,6 @@ public class Constants { */ public static final String CONFIG_FILE = "config.json"; - public static final String MODEL_FILE = "model.json"; /* * DATABASE @@ -73,16 +72,10 @@ public class Constants { */ public static final int K_TOPICS = 20; - /** - * Number of words in a discovered topic, if the selected topic modeling - * library supports this parameter. Default 50. - */ - public static final int K_TOPIC_WORDS = 50; - /** * This value is a weight to the rising decay caulculation of topic * relevances. The higher this value, the more focus is put on later - * sequences containing more recent documents. Default 0. + * sequences containing more recent documents. Default 0.0. */ public static final double RISING_DECAY_LAMBDA = 0.0; @@ -93,19 +86,13 @@ public class Constants { public static final double MIN_RELATIVE_PROB = 0.01; /** - * The minimum share of a topic to be accepted for an article. Topic shares - * are renormalized after rejecting topics below this threshold. - */ - public static final double MINIMUM_SHARE = 0.01; - - /** - * Maximum number of similar documents for each document. + * Maximum number of similar documents for each document. Default 10. */ - public static final int MAX_SIMILAR_DOCUMENTS = 20; + public static final int MAX_SIMILAR_DOCUMENTS = 10; /** * Maximum divergence between a document and similar documents. Lower values - * mean more similar documents (less divergence). Default 1.0. + * mean more similar documents (less divergence). Default 0.25. */ public static final double MAX_SIMILAR_DOCUMENTS_DIVERGENCE = 0.25; @@ -126,16 +113,26 @@ public class Constants { public static final int STATIC_ITER = 200; /** - * Minimum word frequency for words to be used for topic modeling. All words - * below this frequency in a document are filtered out before generating the - * topic model. Default 10. + * Minimum number of words per document. Default 10.
+ */ + public static final int DOCUMENT_MIN_LENGTH = 10; + + /** + * Minimum word frequency for a word to be accepted. Default 5. */ - public static final int DOCUMENT_MIN_WORD_FREQ = 10; + public static final int DOCUMENT_MIN_WORD_FREQ = 5; /** - * Minumum number of words per document. Default 10. + * Minimum number of DBpedia inlinks for an entity annotation to be + * accepted. Default 0. */ - public static final int DOCUMENT_MIN_LENGTH = 10; + public static final int SPOTLIGHT_SUPPORT = 0; + + /** + * Disambiguation confidence. Eliminates top n percent of low-confidence + * annotations. Ranges from 0 to 1. Default 0.5. + */ + public static final double SPOTLIGHT_CONFIDENCE = 0.5; /** * The dynamic topic modeling window resolution to be used. This value is diff --git a/vipra-util/src/main/java/de/vipra/util/StringUtils.java b/vipra-util/src/main/java/de/vipra/util/StringUtils.java index 295b1dde..c054ac0a 100644 --- a/vipra-util/src/main/java/de/vipra/util/StringUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/StringUtils.java @@ -187,4 +187,29 @@ public class StringUtils { return sb.toString(); } + public static String pad(final String str, final int length, final String pad, final boolean left) { + if (str.length() >= length) + return str; + final StringBuilder sb = new StringBuilder(); + if (!left) + sb.append(str); + for (int i = 0; i < length - str.length(); i++) + sb.append(pad); + if (left) + sb.append(str); + return sb.toString(); + } + + public static String pad(final String str, final int length, final String pad) { + return pad(str, length, pad, false); + } + + public static String pad(final String str, final int length) { + return pad(str, length, " ", false); + } + + public static String pad(final String str, final int length, final boolean left) { + return pad(str, length, " ", left); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/URLUtils.java b/vipra-util/src/main/java/de/vipra/util/URLUtils.java new file mode 100644 index
00000000..d02544ed --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/URLUtils.java @@ -0,0 +1,23 @@ +package de.vipra.util; + +import java.net.MalformedURLException; + +public class URLUtils { + + public static String concat(String url, final String path) throws MalformedURLException { + if (url.endsWith("/")) { + if (path.startsWith("/")) + url += path.substring(1); + else + url += path; + } else { + if (path.startsWith("/")) + url += path; + else + url += "/" + path; + } + + return url; + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java b/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java index d66d4b8b..179af172 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java @@ -15,6 +15,7 @@ import org.mongodb.morphia.annotations.Id; import org.mongodb.morphia.annotations.Index; import org.mongodb.morphia.annotations.Indexes; import org.mongodb.morphia.annotations.PrePersist; +import org.mongodb.morphia.annotations.Reference; import org.mongodb.morphia.annotations.Transient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -50,9 +51,9 @@ public class ArticleFull implements Model<ObjectId>, Serializable { @ElasticIndex("date") private Date date; - @Embedded + @Reference @QueryIgnore(multi = true) - private TopicModel model; + private TopicModel topicModel; @Embedded @QueryIgnore(multi = true) @@ -150,17 +151,17 @@ public class ArticleFull implements Model<ObjectId>, Serializable { } } - public TopicModel getModel() { - return model; + public TopicModel getTopicModel() { + return topicModel; } - @ElasticIndex("model") - public String serializeModel() { - return model.getId(); + @ElasticIndex("topicmodel") + public String serializeTopicModel() { + return topicModel.getId(); } - public void setModel(final TopicModel model) { - this.model = model; + public void setTopicModel(final TopicModel topicModel) { + this.topicModel = 
topicModel; } public List<TopicShare> getTopics() { diff --git a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java index ee2e0c77..35752e9e 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java @@ -6,6 +6,7 @@ import org.bson.types.ObjectId; import org.mongodb.morphia.annotations.Embedded; import org.mongodb.morphia.annotations.Entity; import org.mongodb.morphia.annotations.Id; +import org.mongodb.morphia.annotations.Reference; import de.vipra.util.an.QueryIgnore; @@ -16,9 +17,9 @@ public class Sequence implements Model<ObjectId>, Comparable<Sequence>, Serializ @Id private ObjectId id = new ObjectId(); - @Embedded + @Reference @QueryIgnore(multi = true) - private TopicModel model; + private TopicModel topicModel; @Embedded private Window window; @@ -43,12 +44,12 @@ public class Sequence implements Model<ObjectId>, Comparable<Sequence>, Serializ this.id = id; } - public TopicModel getModel() { - return model; + public TopicModel getTopicModel() { + return topicModel; } - public void setModel(final TopicModel model) { - this.model = model; + public void setTopicModel(final TopicModel topicModel) { + this.topicModel = topicModel; } public Window getWindow() { diff --git a/vipra-util/src/main/java/de/vipra/util/model/SequenceFull.java b/vipra-util/src/main/java/de/vipra/util/model/SequenceFull.java index 91201a0f..8efdbb74 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/SequenceFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/SequenceFull.java @@ -18,9 +18,9 @@ public class SequenceFull implements Model<ObjectId>, Comparable<SequenceFull>, @Id private ObjectId id = new ObjectId(); - @Embedded + @Reference @QueryIgnore(multi = true) - private TopicModel model; + private TopicModel topicModel; @Embedded private Window window; @@ -47,12 +47,12 @@ public class SequenceFull implements Model<ObjectId>, 
Comparable<SequenceFull>, this.id = id; } - public TopicModel getModel() { - return model; + public TopicModel getTopicModel() { + return topicModel; } - public void setModel(final TopicModel model) { - this.model = model; + public void setTopicModel(final TopicModel topicModel) { + this.topicModel = topicModel; } public Window getWindow() { diff --git a/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java b/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java new file mode 100644 index 00000000..d0bba63f --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java @@ -0,0 +1,38 @@ +package de.vipra.util.model; + +import java.io.Serializable; + +import org.mongodb.morphia.annotations.Embedded; + +@SuppressWarnings("serial") +@Embedded +public class TextEntity implements Serializable { + + private String entity; + + private String url; + + public TextEntity() {} + + public TextEntity(final String entity, final String url) { + this.entity = entity; + this.url = url; + } + + public String getEntity() { + return entity; + } + + public void setEntity(final String entity) { + this.entity = entity; + } + + public String getUrl() { + return url; + } + + public void setUrl(final String url) { + this.url = url; + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java index 260b998a..f875db28 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java @@ -6,13 +6,11 @@ import java.util.Date; import java.util.List; import org.bson.types.ObjectId; -import org.mongodb.morphia.annotations.Embedded; import org.mongodb.morphia.annotations.Entity; import org.mongodb.morphia.annotations.Id; import org.mongodb.morphia.annotations.PrePersist; import org.mongodb.morphia.annotations.Reference; -import de.vipra.util.Constants; import de.vipra.util.MongoUtils; import 
de.vipra.util.StringUtils; import de.vipra.util.an.QueryIgnore; @@ -24,9 +22,9 @@ public class TopicFull implements Model<ObjectId>, Serializable { @Id private ObjectId id = new ObjectId(); - @Embedded + @Reference @QueryIgnore(multi = true) - private TopicModel model; + private TopicModel topicModel; private String name; @@ -67,12 +65,12 @@ public class TopicFull implements Model<ObjectId>, Serializable { this.id = MongoUtils.objectId(id); } - public TopicModel getModel() { - return model; + public TopicModel getTopicModel() { + return topicModel; } - public void setModel(final TopicModel model) { - this.model = model; + public void setTopicModel(final TopicModel topicModel) { + this.topicModel = topicModel; } public String getName() { @@ -154,10 +152,10 @@ public class TopicFull implements Model<ObjectId>, Serializable { created = modified; } - public static String getNameFromWords(final List<TopicWord> words) { + public static String getNameFromWords(final int wordsNum, final List<TopicWord> words) { String name = null; if (words != null && words.size() > 0) { - final int size = Math.min(Constants.TOPIC_AUTO_NAMING_WORDS, words.size()); + final int size = Math.min(wordsNum, words.size()); final List<String> topWords = new ArrayList<>(size); for (int i = 0; i < size; i++) { topWords.add(words.get(i).getWord()); @@ -188,7 +186,7 @@ public class TopicFull implements Model<ObjectId>, Serializable { @Override public String toString() { - return "TopicFull [id=" + id + ", model=" + model + ", name=" + name + ", sequences=" + sequences + ", avgRelevance=" + avgRelevance + return "TopicFull [id=" + id + ", model=" + topicModel + ", name=" + name + ", sequences=" + sequences + ", avgRelevance=" + avgRelevance + ", varRelevance=" + varRelevance + ", risingRelevance=" + risingRelevance + ", fallingRelevance=" + fallingRelevance + ", risingDecayRelevance=" + risingDecayRelevance + ", created=" + created + ", modified=" + modified + "]"; } diff --git 
a/vipra-util/src/main/java/de/vipra/util/model/TopicModel.java b/vipra-util/src/main/java/de/vipra/util/model/TopicModel.java index 3cb061df..e205a16b 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicModel.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicModel.java @@ -6,8 +6,8 @@ import org.mongodb.morphia.annotations.Entity; import org.mongodb.morphia.annotations.Id; @SuppressWarnings("serial") -@Entity(noClassnameStored = true) -public class TopicModel implements Model<String>, Serializable { +@Entity(value = "topicmodels", noClassnameStored = true) +public class TopicModel implements Model<String>, Comparable<TopicModel>, Serializable { @Id private String id; @@ -26,6 +26,11 @@ public class TopicModel implements Model<String>, Serializable { @Override public void setId(final String id) { this.id = id; + } + + @Override + public int compareTo(final TopicModel o) { + return id.compareTo(o.getId()); }; } diff --git a/vipra-util/src/main/java/de/vipra/util/ModelConfig.java b/vipra-util/src/main/java/de/vipra/util/model/TopicModelConfig.java similarity index 54% rename from vipra-util/src/main/java/de/vipra/util/ModelConfig.java rename to vipra-util/src/main/java/de/vipra/util/model/TopicModelConfig.java index e89dd8b5..b916ea46 100644 --- a/vipra-util/src/main/java/de/vipra/util/ModelConfig.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicModelConfig.java @@ -1,11 +1,17 @@ -package de.vipra.util; +package de.vipra.util.model; import java.io.File; +import java.io.Serializable; +import org.mongodb.morphia.annotations.Embedded; + +import de.vipra.util.Constants; import de.vipra.util.Constants.ProcessorMode; import de.vipra.util.Constants.WindowResolution; -public class ModelConfig { +@SuppressWarnings("serial") +@Embedded +public class TopicModelConfig implements Serializable { private String name; private int kTopics = Constants.K_TOPICS; @@ -14,12 +20,36 @@ public class ModelConfig { private int staticIterations = 
Constants.STATIC_ITER; private int topicAutoNamingWords = Constants.TOPIC_AUTO_NAMING_WORDS; private int maxSimilarDocuments = Constants.MAX_SIMILAR_DOCUMENTS; + private int documentMinimumLength = Constants.DOCUMENT_MIN_LENGTH; + private int documentMinimumWordFrequency = Constants.DOCUMENT_MIN_WORD_FREQ; + private int spotlightSupport = Constants.SPOTLIGHT_SUPPORT; + private double spotlightConfidence = Constants.SPOTLIGHT_CONFIDENCE; private double minRelativeProbability = Constants.MIN_RELATIVE_PROB; private double risingDecayLambda = Constants.RISING_DECAY_LAMBDA; private double maxSimilarDocumentsDivergence = Constants.MAX_SIMILAR_DOCUMENTS_DIVERGENCE; private WindowResolution windowResolution = Constants.WINDOW_RESOLUTION; private ProcessorMode processorMode = Constants.PROCESSOR_MODE; + public TopicModelConfig() {} + + public TopicModelConfig(final TopicModelConfig topicModelConfig) { + kTopics = topicModelConfig.getkTopics(); + dynamicMinIterations = topicModelConfig.getDynamicMinIterations(); + dynamicMaxIterations = topicModelConfig.getDynamicMaxIterations(); + staticIterations = topicModelConfig.getStaticIterations(); + topicAutoNamingWords = topicModelConfig.getTopicAutoNamingWords(); + maxSimilarDocuments = topicModelConfig.getMaxSimilarDocuments(); + documentMinimumLength = topicModelConfig.getDocumentMinimumLength(); + documentMinimumWordFrequency = topicModelConfig.getDocumentMinimumWordFrequency(); + spotlightSupport = topicModelConfig.getSpotlightSupport(); + spotlightConfidence = topicModelConfig.getSpotlightConfidence(); + minRelativeProbability = topicModelConfig.getMinRelativeProbability(); + risingDecayLambda = topicModelConfig.getRisingDecayLambda(); + maxSimilarDocumentsDivergence = topicModelConfig.getMaxSimilarDocumentsDivergence(); + windowResolution = topicModelConfig.getWindowResolution(); + processorMode = topicModelConfig.getProcessorMode(); + } + public String getName() { return name; } @@ -68,6 +98,22 @@ public class ModelConfig { 
this.topicAutoNamingWords = topicAutoNamingWords; } + public int getDocumentMinimumLength() { + return documentMinimumLength; + } + + public void setDocumentMinimumLength(final int documentMinimumLength) { + this.documentMinimumLength = documentMinimumLength; + } + + public int getDocumentMinimumWordFrequency() { + return documentMinimumWordFrequency; + } + + public void setDocumentMinimumWordFrequency(final int documentMinimumWordFrequency) { + this.documentMinimumWordFrequency = documentMinimumWordFrequency; + } + public int getMaxSimilarDocuments() { return maxSimilarDocuments; } @@ -76,6 +122,22 @@ public class ModelConfig { this.maxSimilarDocuments = maxSimilarDocuments; } + public int getSpotlightSupport() { + return spotlightSupport; + } + + public void setSpotlightSupport(final int spotlightSupport) { + this.spotlightSupport = spotlightSupport; + } + + public double getSpotlightConfidence() { + return spotlightConfidence; + } + + public void setSpotlightConfidence(final double spotlightConfidence) { + this.spotlightConfidence = spotlightConfidence; + } + public double getMinRelativeProbability() { return minRelativeProbability; } @@ -120,8 +182,10 @@ public class ModelConfig { return new File(dataDir, name); } - public File getConfigFile(final File dataDir) { - return new File(getModelDir(dataDir), Constants.MODEL_FILE); + @Override + public String toString() { + return "[window=" + windowResolution + ", mode=" + processorMode + ", k=" + kTopics + ", iter=" + staticIterations + "/" + + dynamicMinIterations + "-" + dynamicMaxIterations + "]"; } } diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicModelFull.java b/vipra-util/src/main/java/de/vipra/util/model/TopicModelFull.java index b82838c7..13a0a138 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicModelFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicModelFull.java @@ -1,21 +1,37 @@ package de.vipra.util.model; +import java.io.Serializable; + +import 
org.mongodb.morphia.annotations.Embedded; import org.mongodb.morphia.annotations.Entity; import org.mongodb.morphia.annotations.Id; +import de.vipra.util.an.QueryIgnore; + @SuppressWarnings("serial") -@Entity(noClassnameStored = true) -public class TopicModelFull implements Model<String> { +@Entity(value = "topicmodels", noClassnameStored = true) +public class TopicModelFull implements Model<String>, Comparable<TopicModelFull>, Serializable { @Id private String id; + private String description; + + @Embedded + @QueryIgnore(multi = true) + private TopicModelConfig modelConfig; + public TopicModelFull() {} public TopicModelFull(final String id) { this.id = id; } + public TopicModelFull(final String id, final TopicModelConfig modelConfig) { + this.id = id; + this.modelConfig = modelConfig; + } + @Override public String getId() { return id; @@ -26,4 +42,25 @@ public class TopicModelFull implements Model<String> { this.id = id; } + public String getDescription() { + return description; + } + + public void setDescription(final String description) { + this.description = description; + } + + public TopicModelConfig getModelConfig() { + return modelConfig; + } + + public void setModelConfig(final TopicModelConfig modelConfig) { + this.modelConfig = modelConfig; + } + + @Override + public int compareTo(final TopicModelFull o) { + return id.compareTo(o.getId()); + }; + } diff --git a/vipra-util/src/main/java/de/vipra/util/model/Window.java b/vipra-util/src/main/java/de/vipra/util/model/Window.java index 3ab0ba58..da4e4f5c 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Window.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Window.java @@ -3,9 +3,9 @@ package de.vipra.util.model; import java.io.Serializable; import java.util.Date; -import org.mongodb.morphia.annotations.Embedded; import org.mongodb.morphia.annotations.Entity; import org.mongodb.morphia.annotations.Id; +import org.mongodb.morphia.annotations.Reference; import 
de.vipra.util.Constants.WindowResolution; import de.vipra.util.an.QueryIgnore; @@ -17,9 +17,9 @@ public class Window implements Model<Integer>, Serializable, Comparable<Window> @Id private Integer id; - @Embedded + @Reference @QueryIgnore(multi = true) - private TopicModel model; + private TopicModel topicModel; private Date startDate; @@ -37,12 +37,12 @@ public class Window implements Model<Integer>, Serializable, Comparable<Window> this.id = id; } - public TopicModel getModel() { - return model; + public TopicModel getTopicModel() { + return topicModel; } - public void setModel(final TopicModel model) { - this.model = model; + public void setTopicModel(final TopicModel model) { + topicModel = model; } public Date getStartDate() { diff --git a/vipra-util/src/main/java/de/vipra/util/service/MongoService.java b/vipra-util/src/main/java/de/vipra/util/service/MongoService.java index 714cbf64..99cafa80 100644 --- a/vipra-util/src/main/java/de/vipra/util/service/MongoService.java +++ b/vipra-util/src/main/java/de/vipra/util/service/MongoService.java @@ -131,6 +131,7 @@ public class MongoService<Type extends Model<IdType>, IdType> implements Service if (t == null) throw new DatabaseException(new NullPointerException("entities are null")); final List<Type> list = ListUtils.toList(t); + datastore.save(list); return list; } @@ -185,7 +186,7 @@ public class MongoService<Type extends Model<IdType>, IdType> implements Service } @Override - public void updateSingle(final Type t, final String... fields) throws DatabaseException { + public void updateSingle(final Type t, final boolean upsert, final String... 
fields) throws DatabaseException { if (t == null) throw new DatabaseException(new NullPointerException("entity is null")); if (t.getId() == null) @@ -215,10 +216,15 @@ public class MongoService<Type extends Model<IdType>, IdType> implements Service } } if (!noChanges) - datastore.update(query, ops); + datastore.update(query, ops, upsert); } } + @Override + public void updateSingle(final Type t, final String... fields) throws DatabaseException { + updateSingle(t, false, fields); + } + @Override public void drop() { datastore.getCollection(clazz).drop(); @@ -247,4 +253,8 @@ public class MongoService<Type extends Model<IdType>, IdType> implements Service return new MongoService<Type, IdType>(mongo, clazz); } + public static void dropDatabase(final Config config) throws ConfigException { + config.getMongo().getClient().dropDatabase(config.getDatabaseName()); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/service/Service.java b/vipra-util/src/main/java/de/vipra/util/service/Service.java index cc58335c..839e6989 100644 --- a/vipra-util/src/main/java/de/vipra/util/service/Service.java +++ b/vipra-util/src/main/java/de/vipra/util/service/Service.java @@ -122,6 +122,19 @@ public interface Service<Type extends Model<IdType>, IdType, E extends Exception */ void replaceMultiple(Iterable<Type> ts) throws E; + /** + * Updates a single entity in the database + * + * @param t + * Entity to be updated + * @param upsert + * true to insert if not exists + * @param fields + * Fields to be updated + * @throws E + */ + void updateSingle(Type t, boolean upsert, String... fields) throws E; + /** * Updates a single entity in the database * -- GitLab