diff --git a/vipra-backend/.settings/org.eclipse.jdt.core.prefs b/vipra-backend/.settings/org.eclipse.jdt.core.prefs index bf52a73f0cb4d7fde47825bf5e19ce3b293969af..7f75c624478b28a64f4921be52b50e939a2c477c 100644 --- a/vipra-backend/.settings/org.eclipse.jdt.core.prefs +++ b/vipra-backend/.settings/org.eclipse.jdt.core.prefs @@ -282,7 +282,7 @@ org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line=false org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line=false org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line=false org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line=false -org.eclipse.jdt.core.formatter.lineSplit=120 +org.eclipse.jdt.core.formatter.lineSplit=150 org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column=false org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column=false org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body=0 diff --git a/vipra-backend/src/main/java/de/vipra/rest/provider/CORSResponseFilter.java b/vipra-backend/src/main/java/de/vipra/rest/provider/CORSResponseFilter.java index c93f8553f999e1cce639de3c8df1bbbb9e54ea05..f2ee89b30bf2acdf825051ae6595efe73ecef6bb 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/provider/CORSResponseFilter.java +++ b/vipra-backend/src/main/java/de/vipra/rest/provider/CORSResponseFilter.java @@ -20,8 +20,7 @@ public class CORSResponseFilter implements ContainerResponseFilter { } @Override - public void filter(final ContainerRequestContext request, final ContainerResponseContext response) - throws IOException { + public void filter(final ContainerRequestContext request, final ContainerResponseContext response) throws IOException { response.getHeaders().add("Access-Control-Allow-Origin", "*"); response.getHeaders().add("Access-Control-Allow-Headers", "Origin, Content-Type, Accept, Authorization"); response.getHeaders().add("Access-Control-Allow-Credentials", "true"); diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/ArticleResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/ArticleResource.java index e235dc29642ffc27070e858d8bef547e1e6daafb..ff1c102db0b20bf19818850efe779e8605992d75 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/ArticleResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/ArticleResource.java @@ -52,9 +52,9 @@ public class ArticleResource { @GET @Produces(MediaType.APPLICATION_JSON) - public Response getArticles(@QueryParam("skip") final Integer skip, @QueryParam("limit") final Integer limit, - @QueryParam("sort") @DefaultValue("date") final String sortBy, @QueryParam("fields") final String fields, - @QueryParam("word") final String word) { + public Response getArticles(@QueryParam("model") final String model, @QueryParam("skip") final Integer skip, + @QueryParam("limit") final Integer limit, @QueryParam("sort") @DefaultValue("date") final String sortBy, + @QueryParam("fields") final String fields, @QueryParam("word") final String word) { final ResponseWrapper<List<ArticleFull>> res = new ResponseWrapper<>(); if (res.hasErrors()) @@ -65,6 +65,9 @@ public class ArticleResource { if (fields != null && !fields.isEmpty()) query.fields(true, StringUtils.getFields(fields)); + if (model != null && !model.isEmpty()) + query.criteria("model.id", model); + if (word != null && !word.isEmpty()) query.criteria("words.word.id", word); @@ -90,8 +93,7 @@ public class ArticleResource { public Response getArticle(@PathParam("id") final String id, @QueryParam("fields") final String fields) { final ResponseWrapper<ArticleFull> res = new ResponseWrapper<>(); if (id == null || id.trim().length() == 0) { - res.addError(new APIError(Response.Status.BAD_REQUEST, "ID is empty", - String.format(Messages.BAD_REQUEST, "id cannot be empty"))); + res.addError(new APIError(Response.Status.BAD_REQUEST, "ID is empty", String.format(Messages.BAD_REQUEST, "id cannot be empty"))); return res.badRequest(); } @@ -107,8 +109,7 @@ public class ArticleResource { if (article != null) { return res.ok(article); } else { - res.addError(new APIError(Response.Status.NOT_FOUND, "Resource not found", - String.format(Messages.NOT_FOUND, "article", id))); + res.addError(new APIError(Response.Status.NOT_FOUND, "Resource not found", String.format(Messages.NOT_FOUND, "article", id))); return res.notFound(); } } @@ -146,8 +147,7 @@ public class ArticleResource { final int del = deleted > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) deleted; switch (del) { case 0: - res.addError(new APIError(Response.Status.NOT_FOUND, "Article not found", - String.format(Messages.NOT_FOUND, "article", id))); + res.addError(new APIError(Response.Status.NOT_FOUND, "Article not found", String.format(Messages.NOT_FOUND, "article", id))); return res.notFound(); case 1: return res.noContent(); diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java index b00357b72ab94e51e70672897c434ab05ab0c87c..1a3d18310883ad428a639d6f9e5701f1eb2cb791 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java @@ -19,7 +19,6 @@ import de.vipra.util.NestedMap; import de.vipra.util.StringUtils; import de.vipra.util.model.Article; import de.vipra.util.model.Topic; -import de.vipra.util.model.Word; import de.vipra.util.service.MongoService; @Path("info") @@ -38,7 +37,6 @@ public class InfoResource { final Config config = Config.getConfig(); final MongoService<Article, ObjectId> dbArticles = MongoService.getDatabaseService(config, Article.class); final MongoService<Topic, ObjectId> dbTopics = MongoService.getDatabaseService(config, Topic.class); - final MongoService<Word, String> dbWords = MongoService.getDatabaseService(config, Word.class); // vm info info.put("vm.starttime", rb.getStartTime()); @@ -62,11 +60,8 @@ public class InfoResource { // database info info.put("db.articles", dbArticles.count(null)); info.put("db.topics", dbTopics.count(null)); - info.put("db.words", dbWords.count(null)); // constants - info.put("const.analyzer", Constants.ANALYZER); - info.put("const.processor", Constants.PROCESSOR); info.put("const.windowres", Constants.WINDOW_RESOLUTION); info.put("const.importbuf", Constants.IMPORT_BUFFER_MAX); info.put("const.esboosttopics", Constants.ES_BOOST_TOPICS); diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java index bd0d0f63b6085be4e29c4f8cb613a5ce8292aa2e..434b58ee7243627b06208f8cf854db5e9b3cb655 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java @@ -45,7 +45,7 @@ public class SearchResource { @GET @Produces(MediaType.APPLICATION_JSON) - public Response doSearch(@QueryParam("skip") Integer skip, @QueryParam("limit") Integer limit, + public Response doSearch(@QueryParam("model") final String model, @QueryParam("skip") Integer skip, @QueryParam("limit") Integer limit, @QueryParam("fields") final String fields, @QueryParam("query") final String query) { final ResponseWrapper<List<ArticleFull>> res = new ResponseWrapper<>(); @@ -58,11 +58,15 @@ public class SearchResource { if (query == null || query.isEmpty() || limit == 0) return res.noContent(); + String indexName = "_all"; + if (model != null && !model.isEmpty()) + indexName = model + "-articles"; + SearchResponse response = null; try { - response = client.prepareSearch("articles") - .setQuery(QueryBuilders.multiMatchQuery(query, "topics^" + Constants.ES_BOOST_TOPICS, - "title^" + Constants.ES_BOOST_TITLES, "_all")) + response = client.prepareSearch(indexName) + .setQuery( + QueryBuilders.multiMatchQuery(query, "topics^" + Constants.ES_BOOST_TOPICS, "title^" + Constants.ES_BOOST_TITLES, "_all")) .setFrom(skip).setSize(limit).execute().actionGet(); } catch (final Exception e) { e.printStackTrace(); diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/SequenceResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/SequenceResource.java index 9fa08d25de4dc25155a297097ed71144aea763c3..b22aed1860b69d60f86aa8701308e5bf6a76bcaa 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/SequenceResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/SequenceResource.java @@ -25,6 +25,7 @@ import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; import de.vipra.util.model.SequenceFull; import de.vipra.util.service.MongoService; +import de.vipra.util.service.Service.QueryBuilder; @Path("sequences") public class SequenceResource { @@ -38,16 +39,23 @@ public class SequenceResource { @GET @Produces(MediaType.APPLICATION_JSON) - public Response getSequences(@QueryParam("skip") final Integer skip, @QueryParam("limit") final Integer limit, - @QueryParam("sort") @DefaultValue("id") final String sortBy, @QueryParam("fields") final String fields) { + public Response getSequences(@QueryParam("model") final String model, @QueryParam("skip") final Integer skip, + @QueryParam("limit") final Integer limit, @QueryParam("sort") @DefaultValue("id") final String sortBy, + @QueryParam("fields") final String fields) { final ResponseWrapper<List<SequenceFull>> res = new ResponseWrapper<>(); if (res.hasErrors()) return Response.status(Response.Status.BAD_REQUEST).entity(res).build(); try { - final List<SequenceFull> sequences = dbSequences.getMultiple(skip, limit, sortBy, - StringUtils.getFields(fields)); + final QueryBuilder query = QueryBuilder.builder().skip(skip).limit(limit).sortBy(sortBy); + if (fields != null && !fields.isEmpty()) + query.fields(true, StringUtils.getFields(fields)); + + if (model != null && !model.isEmpty()) + query.criteria("model.id", model); + + final List<SequenceFull> sequences = dbSequences.getMultiple(query); if ((skip != null && skip > 0) || (limit != null && limit > 0)) res.addHeader("total", dbSequences.count(null)); @@ -69,8 +77,7 @@ public class SequenceResource { @QueryParam("topWords") final Integer topWords) throws ConfigException, IOException { final ResponseWrapper<SequenceFull> res = new ResponseWrapper<>(); if (id == null || id.trim().length() == 0) { - res.addError(new APIError(Response.Status.BAD_REQUEST, "ID is empty", - String.format(Messages.BAD_REQUEST, "id cannot be empty"))); + res.addError(new APIError(Response.Status.BAD_REQUEST, "ID is empty", String.format(Messages.BAD_REQUEST, "id cannot be empty"))); return res.badRequest(); } @@ -88,8 +95,7 @@ public class SequenceResource { sequence.getWords().subList(topWords, sequence.getWords().size()).clear(); return res.ok(sequence); } else { - res.addError(new APIError(Response.Status.NOT_FOUND, "Resource not found", - String.format(Messages.NOT_FOUND, "sequence", id))); + res.addError(new APIError(Response.Status.NOT_FOUND, "Resource not found", String.format(Messages.NOT_FOUND, "sequence", id))); return res.notFound(); } } diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/TopicModelResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/TopicModelResource.java new file mode 100644 index 0000000000000000000000000000000000000000..493929b45c6e893dbc02c5a8a6abfe600bd2166b --- /dev/null +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/TopicModelResource.java @@ -0,0 +1,93 @@ +package de.vipra.rest.resource; + +import java.io.IOException; +import java.util.List; + +import javax.servlet.ServletContext; +import javax.ws.rs.DefaultValue; +import javax.ws.rs.GET; +import javax.ws.rs.Path; +import javax.ws.rs.PathParam; +import javax.ws.rs.Produces; +import javax.ws.rs.QueryParam; +import javax.ws.rs.core.Context; +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; + +import de.vipra.rest.Messages; +import de.vipra.rest.model.APIError; +import de.vipra.rest.model.ResponseWrapper; +import de.vipra.util.Config; +import de.vipra.util.StringUtils; +import de.vipra.util.ex.ConfigException; +import de.vipra.util.model.TopicModelFull; +import de.vipra.util.service.MongoService; +import de.vipra.util.service.Service.QueryBuilder; + +@Path("topicmodels") +public class TopicModelResource { + + final MongoService<TopicModelFull, String> dbTopicModels; + + public TopicModelResource(@Context final ServletContext servletContext) throws ConfigException, IOException { + final Config config = Config.getConfig(); + dbTopicModels = MongoService.getDatabaseService(config, TopicModelFull.class); + } + + @GET + @Produces(MediaType.APPLICATION_JSON) + public Response getTopicModels(@QueryParam("skip") final Integer skip, @QueryParam("limit") final Integer limit, + @QueryParam("sort") @DefaultValue("id") final String sortBy, @QueryParam("fields") final String fields) { + final ResponseWrapper<List<TopicModelFull>> res = new ResponseWrapper<>(); + + if (res.hasErrors()) + return Response.status(Response.Status.BAD_REQUEST).entity(res).build(); + + try { + final QueryBuilder query = QueryBuilder.builder().skip(skip).limit(limit).sortBy(sortBy); + if (fields != null && !fields.isEmpty()) + query.fields(true, StringUtils.getFields(fields)); + + final List<TopicModelFull> topicModels = dbTopicModels.getMultiple(query); + + if ((skip != null && skip > 0) || (limit != null && limit > 0)) + res.addHeader("total", dbTopicModels.count(null)); + else + res.addHeader("total", topicModels.size()); + + return res.ok(topicModels); + } catch (final Exception e) { + e.printStackTrace(); + res.addError(new APIError(Response.Status.BAD_REQUEST, "Error", e.getMessage())); + return Response.status(Response.Status.BAD_REQUEST).entity(res).build(); + } + } + + @GET + @Produces(MediaType.APPLICATION_JSON) + @Path("{id}") + public Response getTopicModel(@PathParam("id") final String id, @QueryParam("fields") final String fields) { + final ResponseWrapper<TopicModelFull> res = new ResponseWrapper<>(); + if (id == null || id.trim().length() == 0) { + res.addError(new APIError(Response.Status.BAD_REQUEST, "ID is empty", String.format(Messages.BAD_REQUEST, "id cannot be empty"))); + return res.badRequest(); + } + + TopicModelFull topicModel; + try { + topicModel = dbTopicModels.getSingle(id, false, StringUtils.getFields(fields)); + } catch (final Exception e) { + e.printStackTrace(); + res.addError(new APIError(Response.Status.BAD_REQUEST, "Error", e.getMessage())); + return res.badRequest(); + } + + if (topicModel != null) { + return res.ok(topicModel); + } else { + res.addError(new APIError(Response.Status.NOT_FOUND, "Resource not found", String.format(Messages.NOT_FOUND, "topicmodel", id))); + return res.notFound(); + } + } + +} diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java index c7f665a9d1833a0be7ce8b366701bba6b0c5c3cc..c749773e1e8779f299d1de512d81aea54eaa3fc8 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java @@ -47,15 +47,23 @@ public class TopicResource { @GET @Produces(MediaType.APPLICATION_JSON) - public Response getTopics(@QueryParam("skip") final Integer skip, @QueryParam("limit") final Integer limit, - @QueryParam("sort") @DefaultValue("name") final String sortBy, @QueryParam("fields") final String fields) { + public Response getTopics(@QueryParam("model") final String model, @QueryParam("skip") final Integer skip, + @QueryParam("limit") final Integer limit, @QueryParam("sort") @DefaultValue("name") final String sortBy, + @QueryParam("fields") final String fields) { final ResponseWrapper<List<TopicFull>> res = new ResponseWrapper<>(); if (res.hasErrors()) return Response.status(Response.Status.BAD_REQUEST).entity(res).build(); try { - final List<TopicFull> topics = dbTopics.getMultiple(skip, limit, sortBy, StringUtils.getFields(fields)); + final QueryBuilder query = QueryBuilder.builder().skip(skip).limit(limit).sortBy(sortBy); + if (fields != null && !fields.isEmpty()) + query.fields(true, StringUtils.getFields(fields)); + + if (model != null && !model.isEmpty()) + query.criteria("model.id", model); + + final List<TopicFull> topics = dbTopics.getMultiple(query); if ((skip != null && skip > 0) || (limit != null && limit > 0)) res.addHeader("total", dbTopics.count(null)); @@ -73,12 +81,10 @@ public class TopicResource { @GET @Produces(MediaType.APPLICATION_JSON) @Path("{id}") - public Response getTopic(@PathParam("id") final String id, @QueryParam("fields") final String fields) - throws ConfigException, IOException { + public Response getTopic(@PathParam("id") final String id, @QueryParam("fields") final String fields) throws ConfigException, IOException { final ResponseWrapper<TopicFull> res = new ResponseWrapper<>(); if (id == null || id.trim().length() == 0) { - res.addError(new APIError(Response.Status.BAD_REQUEST, "ID is empty", - String.format(Messages.BAD_REQUEST, "id cannot be empty"))); + res.addError(new APIError(Response.Status.BAD_REQUEST, "ID is empty", String.format(Messages.BAD_REQUEST, "id cannot be empty"))); return res.badRequest(); } @@ -94,8 +100,7 @@ public class TopicResource { if (topic != null) { return res.ok(topic); } else { - res.addError(new APIError(Response.Status.NOT_FOUND, "Resource not found", - String.format(Messages.NOT_FOUND, "topic", id))); + res.addError(new APIError(Response.Status.NOT_FOUND, "Resource not found", String.format(Messages.NOT_FOUND, "topic", id))); return res.notFound(); } } @@ -103,14 +108,13 @@ public class TopicResource { @GET @Produces(MediaType.APPLICATION_JSON) @Path("{id}/articles") - public Response getArticles(@PathParam("id") final String id, @QueryParam("skip") final Integer skip, - @QueryParam("limit") final Integer limit, @QueryParam("sort") @DefaultValue("title") final String sortBy, - @QueryParam("fields") final String fields, @Context final UriInfo uriInfo) { + public Response getArticles(@PathParam("id") final String id, @QueryParam("skip") final Integer skip, @QueryParam("limit") final Integer limit, + @QueryParam("sort") @DefaultValue("title") final String sortBy, @QueryParam("fields") final String fields, + @Context final UriInfo uriInfo) { final ResponseWrapper<List<ArticleFull>> res = new ResponseWrapper<>(); try { final Topic topic = new Topic(MongoUtils.objectId(id)); - final QueryBuilder query = QueryBuilder.builder().criteria("topics.topic", topic).skip(skip).limit(limit) - .sortBy(sortBy); + final QueryBuilder query = QueryBuilder.builder().criteria("topics.topic", topic).skip(skip).limit(limit).sortBy(sortBy); if (fields != null && !fields.isEmpty()) query.fields(true, StringUtils.getFields(fields)); diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/WindowResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/WindowResource.java index bf3b922ffcf5fb36ebca83b54534788f93406f01..8528a5d7d9fd31c30185d4e315f56050b063e90a 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/WindowResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/WindowResource.java @@ -4,9 +4,11 @@ import java.io.IOException; import java.util.List; import javax.servlet.ServletContext; +import javax.ws.rs.DefaultValue; import javax.ws.rs.GET; import javax.ws.rs.Path; import javax.ws.rs.Produces; +import javax.ws.rs.QueryParam; import javax.ws.rs.core.Context; import javax.ws.rs.core.MediaType; import javax.ws.rs.core.Response; @@ -14,9 +16,11 @@ import javax.ws.rs.core.Response; import de.vipra.rest.model.APIError; import de.vipra.rest.model.ResponseWrapper; import de.vipra.util.Config; +import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; import de.vipra.util.model.Window; import de.vipra.util.service.MongoService; +import de.vipra.util.service.Service.QueryBuilder; @Path("windows") public class WindowResource { @@ -30,14 +34,29 @@ public class WindowResource { @GET @Produces(MediaType.APPLICATION_JSON) - public Response getWindows() { + public Response getWindows(@QueryParam("model") final String model, @QueryParam("skip") final Integer skip, + @QueryParam("limit") final Integer limit, @QueryParam("sort") @DefaultValue("startDate") final String sortBy, + @QueryParam("fields") final String fields) { final ResponseWrapper<List<Window>> res = new ResponseWrapper<>(); if (res.hasErrors()) return Response.status(Response.Status.BAD_REQUEST).entity(res).build(); try { - final List<Window> windows = dbWindows.getAll(); + final QueryBuilder query = QueryBuilder.builder().skip(skip).limit(limit).sortBy(sortBy); + if (fields != null && !fields.isEmpty()) + query.fields(true, StringUtils.getFields(fields)); + + if (model != null && !model.isEmpty()) + query.criteria("model.id", model); + + final List<Window> windows = dbWindows.getMultiple(query); + + if ((skip != null && skip > 0) || (limit != null && limit > 0)) + res.addHeader("total", dbWindows.count(null)); + else + res.addHeader("total", windows.size()); + return res.ok(windows); } catch (final Exception e) { e.printStackTrace(); diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/WordResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/WordResource.java deleted file mode 100644 index d08b101cbf40bae1ec096d660e0801708b963425..0000000000000000000000000000000000000000 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/WordResource.java +++ /dev/null @@ -1,118 +0,0 @@ -package de.vipra.rest.resource; - -import java.io.IOException; -import java.util.List; - -import javax.servlet.ServletContext; -import javax.ws.rs.Consumes; -import javax.ws.rs.DefaultValue; -import javax.ws.rs.GET; -import javax.ws.rs.Path; -import javax.ws.rs.PathParam; -import javax.ws.rs.Produces; -import javax.ws.rs.QueryParam; -import javax.ws.rs.core.Context; -import javax.ws.rs.core.MediaType; -import javax.ws.rs.core.Response; - -import org.bson.types.ObjectId; - -import de.vipra.rest.Messages; -import de.vipra.rest.model.APIError; -import de.vipra.rest.model.ResponseWrapper; -import de.vipra.util.Config; -import de.vipra.util.StringUtils; -import de.vipra.util.ex.ConfigException; -import de.vipra.util.model.TopicFull; -import de.vipra.util.model.Word; -import de.vipra.util.service.MongoService; -import de.vipra.util.service.Service.QueryBuilder; - -@Path("words") -public class WordResource { - - final MongoService<Word, String> dbWords; - final MongoService<TopicFull, ObjectId> dbTopics; - - public WordResource(@Context final ServletContext servletContext) throws ConfigException, IOException { - final Config config = Config.getConfig(); - dbWords = MongoService.getDatabaseService(config, Word.class); - dbTopics = MongoService.getDatabaseService(config, TopicFull.class); - } - - @GET - @Produces(MediaType.APPLICATION_JSON) - public Response getWords(@QueryParam("skip") final Integer skip, @QueryParam("limit") final Integer limit, - @QueryParam("sort") @DefaultValue("id") final String sortBy, @QueryParam("fields") final String fields) { - final ResponseWrapper<List<Word>> res = new ResponseWrapper<>(); - - if (res.hasErrors()) - return res.badRequest(); - - try { - final List<Word> words = dbWords.getMultiple(skip, limit, sortBy, StringUtils.getFields(fields)); - - if ((skip != null && skip > 0) || (limit != null && limit > 0)) - res.addHeader("total", dbWords.count(null)); - else - res.addHeader("total", words.size()); - - return res.ok(words); - } catch (final Exception e) { - e.printStackTrace(); - res.addError(new APIError(Response.Status.BAD_REQUEST, "Error", e.getMessage())); - return res.badRequest(); - } - } - - @GET - @Produces(MediaType.APPLICATION_JSON) - @Consumes(MediaType.APPLICATION_JSON) - @Path("{id}") - public Response getWord(@PathParam("id") final String id, @QueryParam("fields") final String fields) { - final ResponseWrapper<Word> res = new ResponseWrapper<>(); - if (id == null || id.trim().length() == 0) { - res.addError(new APIError(Response.Status.BAD_REQUEST, "ID is empty", - String.format(Messages.BAD_REQUEST, "id cannot be empty"))); - return res.badRequest(); - } - - Word word; - try { - word = dbWords.getSingle(id, false, StringUtils.getFields(fields)); - } catch (final Exception e) { - e.printStackTrace(); - res.addError(new APIError(Response.Status.BAD_REQUEST, "Error", e.getMessage())); - return res.badRequest(); - } - - if (word != null) { - return res.ok(word); - } else { - final String msg = String.format(Messages.NOT_FOUND, "id", id); - res.addError(new APIError(Response.Status.NOT_FOUND, "Resource not found", msg)); - return res.notFound(); - } - } - - @GET - @Produces(MediaType.APPLICATION_JSON) - @Consumes(MediaType.APPLICATION_JSON) - @Path("{id}/topics") - public Response getWordTopics(@PathParam("id") final String id, @QueryParam("fields") final String fields) { - final ResponseWrapper<List<TopicFull>> res = new ResponseWrapper<>(); - try { - final Word word = new Word(id); - final QueryBuilder query = QueryBuilder.builder().fields(true, "id", "name").criteria("words.word", word); - if (fields != null && !fields.isEmpty()) - query.fields(true, StringUtils.getFields(fields)); - final List<TopicFull> topics = dbTopics.getMultiple(query); - return res.ok(topics); - } catch (final Exception e) { - e.printStackTrace(); - res.addError(new APIError(Response.Status.BAD_REQUEST, "Error", e.getMessage())); - return res.badRequest(); - } - } - -} diff --git a/vipra-backend/src/main/java/de/vipra/rest/serializer/ObjectIdDeserializer.java b/vipra-backend/src/main/java/de/vipra/rest/serializer/ObjectIdDeserializer.java index d3d6c3d3a05536ed1c26c65261539d1b86bc5053..5b7b391cff12e320e3620a5689df8c3604ecffa3 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/serializer/ObjectIdDeserializer.java +++ b/vipra-backend/src/main/java/de/vipra/rest/serializer/ObjectIdDeserializer.java @@ -14,8 +14,7 @@ import de.vipra.util.MongoUtils; public class ObjectIdDeserializer extends JsonDeserializer<ObjectId> { @Override - public ObjectId deserialize(final JsonParser p, final DeserializationContext ctxt) - throws IOException, JsonProcessingException { + public ObjectId deserialize(final JsonParser p, final DeserializationContext ctxt) throws IOException, JsonProcessingException { return MongoUtils.objectId(p.getValueAsString()); } diff --git a/vipra-backend/src/main/java/de/vipra/ws/WebSocket.java b/vipra-backend/src/main/java/de/vipra/ws/WebSocket.java index ee8e5f00e02942b578a0e8b1d455522ae68318ba..5cfd37f8f617f20c07ddd0b9a0222acb6417e3c9 100644 --- a/vipra-backend/src/main/java/de/vipra/ws/WebSocket.java +++ b/vipra-backend/src/main/java/de/vipra/ws/WebSocket.java @@ -46,8 +46,7 @@ public class WebSocket { } @OnMessage - public void handleMessage(final String input, final Session session) - throws JsonParseException, JsonMappingException, IOException { + public void handleMessage(final String input, final Session session) throws JsonParseException, JsonMappingException, IOException { log.trace("message received"); try { final WebSocketMessage msg = mapper.readValue(input, WebSocketMessage.class); diff --git a/vipra-backend/src/main/resources/config.json b/vipra-backend/src/main/resources/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b5a9ad2919414e3006df5a8acbbfeeeee475bf8b --- /dev/null +++ b/vipra-backend/src/main/resources/config.json @@ -0,0 +1,7 @@ +{ + "databaseHost": "127.0.0.1", + "databasePort": 27017, + "databaseName": "test", + "elasticSearchHost": "127.0.0.1", + "elasticSearchPort": 9300 +} \ No newline at end of file diff --git a/vipra-backend/src/main/resources/config.properties b/vipra-backend/src/main/resources/config.properties deleted file mode 100644 index 0ca6de7ca9d55514c174287484d9da96e410addb..0000000000000000000000000000000000000000 --- a/vipra-backend/src/main/resources/config.properties +++ /dev/null @@ -1,6 +0,0 @@ -db.host=localhost -db.port=27017 -db.name=test -es.host=localhost -es.port=9300 -tm.dtmpath=/home/eike/repos/master/dtm_release/dtm/main \ No newline at end of file diff --git a/vipra-cmd/.settings/org.eclipse.jdt.core.prefs b/vipra-cmd/.settings/org.eclipse.jdt.core.prefs index bf52a73f0cb4d7fde47825bf5e19ce3b293969af..7f75c624478b28a64f4921be52b50e939a2c477c 100644 --- a/vipra-cmd/.settings/org.eclipse.jdt.core.prefs +++ b/vipra-cmd/.settings/org.eclipse.jdt.core.prefs @@ -282,7 +282,7 @@ org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line=false org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line=false org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line=false org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line=false -org.eclipse.jdt.core.formatter.lineSplit=120 +org.eclipse.jdt.core.formatter.lineSplit=150 org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column=false org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column=false org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body=0 diff --git a/vipra-cmd/runcfg/CMD - Clear.launch b/vipra-cmd/runcfg/CMD - Clear.launch index c3bf3fd22b7ad771914ff3db5f914d87cdd36900..054ed66ad23da2f5d9078266ac4c921e2db471fb 100644 --- a/vipra-cmd/runcfg/CMD - Clear.launch +++ b/vipra-cmd/runcfg/CMD - Clear.launch @@ -11,7 +11,7 @@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-cn"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="--clear"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> diff --git a/vipra-cmd/runcfg/CMD - Modeling.launch b/vipra-cmd/runcfg/CMD - Modeling.launch index fcb96a478a92d07d5613a2de4d94c5e26d456879..9560a948281cff94a0c57ab56a3b27264e02007c 100644 --- a/vipra-cmd/runcfg/CMD - Modeling.launch +++ b/vipra-cmd/runcfg/CMD - Modeling.launch @@ -11,7 +11,7 @@ </listAttribute> <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> -<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-m"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-g"/> <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> <stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java b/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java index 7498b4a2ee97e8d508c9ad3cccc171f1fef8ff11..9d7209b33b58399395d0ff66f5dbda692140fb74 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java @@ -17,16 +17,8 @@ public class CmdOptions extends Options { public static final String OPT_SHELL = "x"; public static final String OPT_SHELL_LONG = "shell"; - public static final String OPT_CLEAR = "c"; public static final String OPT_CLEAR_LONG = "clear"; - public static final String OPT_STATS = "p"; - public static final String OPT_STATS_LONG = "print-stats"; - - public static final String OPT_DEFAULTS = "n"; - public static final String OPT_DEFAULTS_LONG = "defaults"; - - public static final String OPT_DEBUG = "d"; public static final String OPT_DEBUG_LONG = "debug"; public static final String OPT_TEST = "t"; @@ -35,11 +27,8 @@ public class CmdOptions extends Options { public static final String OPT_SILENT = "s"; public static final String OPT_SILENT_LONG = "silent"; - public static final String OPT_CONFIG = "o"; - public static final String OPT_CONFIG_LONG = "config"; - - public static final String OPT_MODELING = "m"; - public static final String OPT_MODELING_LONG = "modeling"; + public static final String OPT_MODELING = "g"; + public static final String OPT_MODELING_LONG = "gen-model"; public static final String OPT_INDEXING = "e"; public static final String OPT_INDEXING_LONG = "indexing"; @@ -47,24 +36,41 @@ public class CmdOptions extends Options { public static final String OPT_REREAD = "r"; public static final String OPT_REREAD_LONG = "reread"; + public static final String OPT_CREATE_MODEL = "c"; + public static final String OPT_CREATE_MODEL_LONG = "create-model"; + + public static final String OPT_DELETE_MODEL = "d"; + public static final String OPT_DELETE_MODEL_LONG = "delete-model"; + + public static final String OPT_CHOOSE_MODEL = "m"; + public static final String OPT_CHOOSE_MODEL_LONG = "model"; + + public static final String OPT_LIST_MODELS = "l"; + public static final String OPT_LIST_MODELS_LONG = "list-models"; + + public static final String OPT_CONFIG_MODEL = "o"; + public static final String OPT_CONFIG_MODEL_LONG = "config-model"; + public CmdOptions() { addOption(Option.builder(OPT_HELP).longOpt(OPT_HELP_LONG).desc("print this message").build()); - addOption(Option.builder(OPT_SHELL).longOpt(OPT_SHELL_LONG).hasArg(true).argName("name") - .desc("run from a shell script").build()); - addOption(Option.builder(OPT_IMPORT).longOpt(OPT_IMPORT_LONG).hasArgs().argName("files/dirs...") - .desc("import articles into the database").build()); - addOption(Option.builder(OPT_CLEAR).longOpt(OPT_CLEAR_LONG).desc("clear database and filebase").build()); - addOption(Option.builder(OPT_STATS).longOpt(OPT_STATS_LONG).desc("gather database and filebase information") + addOption(Option.builder(OPT_SHELL).longOpt(OPT_SHELL_LONG).hasArg(true).argName("name").desc("run from a shell script").build()); + addOption(Option.builder(OPT_IMPORT).longOpt(OPT_IMPORT_LONG).hasArgs().argName("files/dirs...").desc("import articles into the database") .build()); - addOption(Option.builder(OPT_DEFAULTS).longOpt(OPT_DEFAULTS_LONG).desc("accept default decisions").build()); - addOption(Option.builder(OPT_DEBUG).longOpt(OPT_DEBUG_LONG).desc("show debug information").build()); + addOption(Option.builder().longOpt(OPT_CLEAR_LONG).desc("clear database and filebase").build()); + addOption(Option.builder().longOpt(OPT_DEBUG_LONG).desc("show debug information").build()); addOption(Option.builder(OPT_TEST).longOpt(OPT_TEST_LONG).desc("system tests").build()); addOption(Option.builder(OPT_SILENT).longOpt(OPT_SILENT_LONG).desc("mute all output").build()); - addOption(Option.builder(OPT_CONFIG).longOpt(OPT_CONFIG_LONG).desc("show configuration").build()); addOption(Option.builder(OPT_MODELING).longOpt(OPT_MODELING_LONG).desc("regenerate topic model").build()); addOption(Option.builder(OPT_INDEXING).longOpt(OPT_INDEXING_LONG).desc("regenerate search index").build()); - addOption(Option.builder(OPT_REREAD).longOpt(OPT_REREAD_LONG) - .desc("reread model files, ignored when remodeling").build()); + addOption(Option.builder(OPT_REREAD).longOpt(OPT_REREAD_LONG).desc("reread model files").build()); + addOption(Option.builder(OPT_CREATE_MODEL).longOpt(OPT_CREATE_MODEL_LONG).hasArgs().argName("name/s...").desc("create a new topic model") + .build()); + addOption(Option.builder(OPT_DELETE_MODEL).longOpt(OPT_DELETE_MODEL_LONG).hasArgs().argName("name/s...") + .desc("delete an existing topic model").build()); + addOption(Option.builder(OPT_CHOOSE_MODEL).longOpt(OPT_CHOOSE_MODEL_LONG).hasArgs().argName("name/s...") + .desc("choose topic model(s) for further actions").build()); + addOption(Option.builder(OPT_LIST_MODELS).longOpt(OPT_LIST_MODELS_LONG).desc("list existing models").build()); + addOption(Option.builder(OPT_CONFIG_MODEL).longOpt(OPT_CONFIG_MODEL_LONG).hasArg().argName("name").desc("configure a model").build()); } public void printHelp(final String cmd) { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java index e50d9f9bbe16219b9eb722bced3578e348c0c799..d004f7620cc00373d99d78ea215433dc4588359b 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java @@ -1,19 +1,22 @@ package de.vipra.cmd; -import static de.vipra.cmd.CmdOptions.OPT_CLEAR; -import static de.vipra.cmd.CmdOptions.OPT_CONFIG; -import static de.vipra.cmd.CmdOptions.OPT_DEBUG; -import static de.vipra.cmd.CmdOptions.OPT_DEFAULTS; +import static de.vipra.cmd.CmdOptions.OPT_CHOOSE_MODEL; +import static de.vipra.cmd.CmdOptions.OPT_CLEAR_LONG; +import static de.vipra.cmd.CmdOptions.OPT_CONFIG_MODEL; +import static de.vipra.cmd.CmdOptions.OPT_CREATE_MODEL; +import static de.vipra.cmd.CmdOptions.OPT_DEBUG_LONG; +import static de.vipra.cmd.CmdOptions.OPT_DELETE_MODEL; import static de.vipra.cmd.CmdOptions.OPT_HELP; import static de.vipra.cmd.CmdOptions.OPT_IMPORT; import static de.vipra.cmd.CmdOptions.OPT_INDEXING; +import static de.vipra.cmd.CmdOptions.OPT_LIST_MODELS; import static de.vipra.cmd.CmdOptions.OPT_MODELING; import static de.vipra.cmd.CmdOptions.OPT_REREAD; import static de.vipra.cmd.CmdOptions.OPT_SHELL; import static de.vipra.cmd.CmdOptions.OPT_SILENT; -import static de.vipra.cmd.CmdOptions.OPT_STATS; import static de.vipra.cmd.CmdOptions.OPT_TEST; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.ListIterator; @@ -37,12 +40,17 @@ import com.mongodb.MongoTimeoutException; import de.vipra.cmd.option.ClearCommand; import de.vipra.cmd.option.Command; -import de.vipra.cmd.option.ConfigCommand; +import de.vipra.cmd.option.ConfigModelCommand; +import de.vipra.cmd.option.CreateModelCommand; +import de.vipra.cmd.option.DeleteModelCommand; import de.vipra.cmd.option.ImportCommand; import de.vipra.cmd.option.IndexingCommand; +import de.vipra.cmd.option.ListModelsCommand; import de.vipra.cmd.option.ModelingCommand; -import de.vipra.cmd.option.StatsCommand; import de.vipra.cmd.option.TestCommand; +import de.vipra.util.Config; +import de.vipra.util.ConfigDtm; +import de.vipra.util.ex.ConfigException; public class Main { @@ -55,7 +63,7 @@ public class Main { System.err.close(); } - public static void main(final String[] args) { + public static void main(final String[] args) throws IOException, ConfigException { final CommandLineParser parser = new DefaultParser(); final CmdOptions options = new CmdOptions(); String cmd = "vipra-cmd.jar"; @@ -71,7 +79,7 @@ public class Main { if (cline.hasOption(OPT_SHELL)) { cmd = cline.getOptionValue(OPT_SHELL); if (cmd == null) { - cmd = "vipra-cmd.sh"; + cmd = "vipra.sh"; } } @@ -84,7 +92,7 @@ public class Main { final LoggerContext loggerContext = (LoggerContext) LogManager.getContext(false); final Configuration loggerConfigs = loggerContext.getConfiguration(); - if (cline.hasOption(OPT_DEBUG)) + if (cline.hasOption(OPT_DEBUG_LONG)) loggerConfigs.getLoggerConfig(LogManager.ROOT_LOGGER_NAME).setLevel(Level.DEBUG); if (cline.hasOption(OPT_SILENT)) { @@ -94,32 +102,72 @@ public class Main { loggerContext.updateLoggers(); - // check if default decisions should be auto chosen - final boolean defaults = cline.hasOption(OPT_DEFAULTS); - // get commands final List<Command> commands = new ArrayList<>(); - if (cline.hasOption(OPT_CONFIG)) - commands.add(new ConfigCommand()); - if (cline.hasOption(OPT_TEST)) commands.add(new TestCommand()); - if (cline.hasOption(OPT_CLEAR)) - commands.add(new ClearCommand(defaults)); + if (cline.hasOption(OPT_CREATE_MODEL)) + commands.add(new CreateModelCommand(cline.getOptionValues(OPT_CREATE_MODEL))); + + if (cline.hasOption(OPT_DELETE_MODEL)) + commands.add(new DeleteModelCommand(cline.getOptionValues(OPT_DELETE_MODEL))); + + if (cline.hasOption(OPT_LIST_MODELS)) + commands.add(new ListModelsCommand()); + + if (cline.hasOption(OPT_CONFIG_MODEL)) + commands.add(new ConfigModelCommand(cline.getOptionValue(OPT_CONFIG_MODEL))); + + final Config config = Config.getConfig(); + final List<ConfigDtm> configDtms = new ArrayList<>(); + if (cline.hasOption(OPT_CHOOSE_MODEL)) { + for (final String model : cline.getOptionValues(OPT_CHOOSE_MODEL)) { + if (model.toLowerCase().equals("all")) { + configDtms.clear(); + for (final Entry<String, ConfigDtm> entry : config.getDtmConfigurations().entrySet()) + configDtms.add(entry.getValue()); + break; + } else { + final ConfigDtm configDtm = config.getDtmConfigurations().get(model); + if (configDtm == null) { + log.error("unknown model: " + model); + return; + } + configDtms.add(configDtm); + } + } + } else if (config.isDefaultAllModels()) { + configDtms.clear(); + for (final Entry<String, ConfigDtm> entry : config.getDtmConfigurations().entrySet()) + configDtms.add(entry.getValue()); + } + + if (cline.hasOption(OPT_IMPORT) || cline.hasOption(OPT_MODELING) || cline.hasOption(OPT_REREAD) || cline.hasOption(OPT_INDEXING)) { + if (!cline.hasOption(OPT_CHOOSE_MODEL) || configDtms.isEmpty()) { + log.error("no models chosen"); + return; + } + } + + if (cline.hasOption(OPT_CLEAR_LONG)) + commands.add(new ClearCommand()); - if (cline.hasOption(OPT_IMPORT)) - commands.add(new ImportCommand(cline.getOptionValues(OPT_IMPORT))); + if (cline.hasOption(OPT_IMPORT)) { + for (final ConfigDtm configDtm : configDtms) + commands.add(new ImportCommand(configDtm, cline.getOptionValues(OPT_IMPORT))); + } - if (cline.hasOption(OPT_MODELING) || cline.hasOption(OPT_REREAD)) - commands.add(new ModelingCommand(!cline.hasOption(OPT_MODELING) && cline.hasOption(OPT_REREAD))); + if (cline.hasOption(OPT_MODELING) || cline.hasOption(OPT_REREAD)) { + final boolean reread = !cline.hasOption(OPT_MODELING) && cline.hasOption(OPT_REREAD); + for (final ConfigDtm configDtm : configDtms) + commands.add(new ModelingCommand(configDtm, reread)); + } if (cline.hasOption(OPT_INDEXING)) - commands.add(new IndexingCommand()); - - if (cline.hasOption(OPT_STATS)) - commands.add(new StatsCommand()); + for (final ConfigDtm configDtm : configDtms) + commands.add(new IndexingCommand(configDtm)); // run commands if (commands.size() > 0) { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java deleted file mode 100644 index fd81162a0bb9f98ae7efa49d23607c8916cb78a4..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java +++ /dev/null @@ -1,106 +0,0 @@ -package de.vipra.cmd.file; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import de.vipra.cmd.ex.FilebaseException; -import de.vipra.cmd.file.DTMIndex.ArticleDate; -import de.vipra.util.Constants; -import de.vipra.util.FileUtils; -import de.vipra.util.model.ArticleFull; - -public class DTMFilebase extends Filebase { - - public static final String FILE_NAME = "dtm-mult.dat"; - - private final DTMIndex seqindex; - private final DTMVocabulary vocab; - private final File modelFile; - - public DTMFilebase(final File dataDir) throws FilebaseException { - super(dataDir, "dtm"); - - final File modelDir = getModelDir(); - try { - seqindex = new DTMIndex(modelDir); - } catch (IOException | ParseException e) { - throw new FilebaseException("could not read date index file", e); - } - - try { - vocab = new DTMVocabulary(modelDir, false); - } catch (final IOException e) { - throw new FilebaseException("could not read vocabulary file", e); - } - modelFile = getModelFile(FILE_NAME); - } - - @Override - public synchronized void write(final List<ArticleFull> articles) throws IOException { - if (!articles.isEmpty()) { - for (int i = 0; i < articles.size(); i++) - seqindex.add(articles.get(i).getDate(), i); - - // use temp file - final File modelFileTmp = getModelFile(FILE_NAME + ".tmp"); - Iterator<String> lines = null; - if (modelFile.exists()) - lines = FileUtils.iterateFileLines(modelFile); - - // concatenates the existing model file with new article entries in - // the temp file. The existing model is expected to be sorted - // correctly, therefore the file can be iterated sequentially. - // Because the database id index is created by the abstract - // filebase, it needs to be recreated with the new order - final BufferedWriter writer = new BufferedWriter( - new OutputStreamWriter(new FileOutputStream(modelFileTmp))); - final FilebaseIndex index = getIndex(); - final Iterator<String> currIndex = index.iterator(); - final List<String> newIndex = new ArrayList<>(); - for (final ArticleDate articleDate : seqindex.getArticleDates()) { - if (articleDate.isNew()) { - final ArticleFull article = articles.get(articleDate.index); - newIndex.add(article.getId().toString()); - writer.write(vocab.transform(article.getProcessedText())); - } else { - if (lines == null) { - writer.close(); - throw new IOException("index inconsistency: missing article file"); - } - writer.write(lines.next()); - newIndex.add(currIndex.next()); - } - writer.write(Constants.LINE_SEP); - } - writer.close(); - - // reset index to new order imposed by article dates - index.set(newIndex); - - // replace model file by temp file - if (modelFile.exists() && !modelFile.delete()) - throw new IOException("could not delete file " + modelFile.getAbsolutePath()); - if (!modelFileTmp.renameTo(modelFile)) - throw new IOException( - "could not rename tmp file " + modelFileTmp.getAbsolutePath() + " to " + modelFile.getName()); - - } - } - - @Override - public synchronized void close() throws IOException { - super.close(); - - // write vocabulary and windows - vocab.close(); - seqindex.close(); - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMIndex.java deleted file mode 100644 index 600c897048760abac0dfd9c285c2a3c1d6da350b..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMIndex.java +++ /dev/null @@ -1,155 +0,0 @@ -package de.vipra.cmd.file; - -import java.io.BufferedWriter; -import java.io.Closeable; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import de.vipra.util.Constants; -import de.vipra.util.FileUtils; - -public class DTMIndex implements Closeable { - - public static final class SequenceCount implements Comparable<SequenceCount> { - - public Date startDate; - public Date endDate; - public int count = 1; - - @Override - public int compareTo(final SequenceCount o) { - return startDate.compareTo(o.startDate); - } - } - - public static final class ArticleDate implements Comparable<ArticleDate> { - - public final Date date; - public final int index; - - public ArticleDate(final Date date, final int index) { - this.date = date; - this.index = index; - } - - @Override - public int compareTo(final ArticleDate o) { - return date.compareTo(o.date); - } - - public boolean isNew() { - return index != -1; - } - } - - public static final String DATE_FILE_NAME = "dates"; - public static final String SEQ_FILE_NAME = "dtm-seq.dat"; - - private static final SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); - - private final File inFile; - private final File outFile; - private final Map<Date, SequenceCount> sequenceMap = new HashMap<>(); - private final List<SequenceCount> sequenceList = new ArrayList<>(); - private final List<ArticleDate> articleDates = new ArrayList<>(); - - public DTMIndex(final File modelDir) throws IOException, ParseException { - inFile = new File(modelDir, DATE_FILE_NAME); - outFile = new File(modelDir, SEQ_FILE_NAME); - - if (inFile.exists()) { - final List<String> lines = FileUtils.readFile(inFile); - for (final String line : lines) { - add(df.parse(line)); - } - } - } - - private void add(final Date date) { - add(date, -1); - } - - public void add(final Date date, final int newArticleIndex) { - final Date startDate = Constants.WINDOW_RESOLUTION.startDate(date); - SequenceCount sequence = sequenceMap.get(startDate); - if (sequence == null) { - sequence = new SequenceCount(); - sequence.startDate = startDate; - sequence.endDate = Constants.WINDOW_RESOLUTION.endDate(date); - sequenceMap.put(startDate, sequence); - sequenceList.add(sequence); - Collections.sort(sequenceList); - } else { - sequence.count++; - } - final ArticleDate sequenceDate = new ArticleDate(date, newArticleIndex); - articleDates.add(sequenceDate); - } - - public SequenceCount getSequence(final int index) { - return sequenceList.get(index); - } - - public Date getStartDate(final int index) { - return getSequence(index).startDate; - } - - public Date getEndDate(final int index) { - return getSequence(index).endDate; - } - - public int sequenceCount() { - return sequenceMap.size(); - } - - public int entryCount() { - return articleDates.size(); - } - - public List<ArticleDate> getArticleDates() { - Collections.sort(articleDates); - return articleDates; - } - - public int articleIndexStartForSequence(final int sequenceIndex) { - int offset = 0; - for (int i = 0; i < sequenceIndex; i++) - offset += sequenceList.get(i).count; - return offset; - } - - @Override - public void close() throws IOException { - // write date index - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(inFile, false))); - Collections.sort(articleDates); - for (final ArticleDate entry : articleDates) { - writer.write(df.format(entry.date)); - writer.write(Constants.LINE_SEP); - } - writer.close(); - - // write window index - writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile, false))); - writer.write(Integer.toString(sequenceMap.size())); - writer.write(Constants.LINE_SEP); - - // write window sizes - for (final SequenceCount sequence : sequenceList) { - writer.write(Integer.toString(sequence.count)); - writer.write(Constants.LINE_SEP); - } - writer.close(); - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java deleted file mode 100644 index ae4bcb2e1d0f4926e855583e5cb9c7223fe0d104..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java +++ /dev/null @@ -1,96 +0,0 @@ -package de.vipra.cmd.file; - -import java.io.Closeable; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; - -import de.vipra.util.FileUtils; -import de.vipra.util.model.Word; - -public class DTMVocabulary implements Closeable, Iterable<String> { - - public static final String FILE_NAME = "vocab"; - - private final File file; - private static List<String> vocables; - - public DTMVocabulary(final File modelDir) throws IOException { - this(modelDir, false); - } - - public DTMVocabulary(final File modelDir, final boolean reread) throws IOException { - file = new File(modelDir, FILE_NAME); - if (file.exists()) { - if (vocables == null || reread) - vocables = FileUtils.readFile(file); - } else if (vocables == null || reread) - vocables = new ArrayList<>(500); - } - - public void write() throws IOException { - org.apache.commons.io.FileUtils.writeLines(file, vocables, false); - } - - private int index(final String word) { - int index = vocables.indexOf(word); - if (index == -1) { - vocables.add(word); - index = vocables.size() - 1; - } - return index; - } - - public int size() { - return vocables.size(); - } - - public Word getWord(final int index) { - if (vocables.size() > index) - return new Word(vocables.get(index)); - return null; - } - - public String transform(final String[] words) { - // count unique words - final Map<String, Integer> wordMap = new HashMap<>(words.length); - for (final String word : words) { - final Integer count = wordMap.get(word); - if (count == null) - wordMap.put(word, 1); - else - wordMap.put(word, count + 1); - } - - // assemble string - // <unique word count> <index1>:<count1> <index2>:<count2> ... - final StringBuilder sb = new StringBuilder(); - sb.append(wordMap.size()); - for (final Entry<String, Integer> e : wordMap.entrySet()) { - final int index = index(e.getKey()); - sb.append(" ").append(index).append(":").append(e.getValue()); - } - - return sb.toString(); - } - - public String get(final int index) { - return vocables.get(index); - } - - @Override - public void close() throws IOException { - write(); - } - - @Override - public Iterator<String> iterator() { - return vocables.iterator(); - } - -} \ No newline at end of file diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java index 9a8db6e34714d258e09e64f75ba63d3857778800..1e8cacdcffcaf2ff7de2437f8ef602bb69ce5688 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java @@ -1,94 +1,99 @@ package de.vipra.cmd.file; -import java.io.Closeable; +import java.io.BufferedReader; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.text.ParseException; +import java.util.HashMap; +import java.util.Map; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import de.vipra.cmd.ex.FilebaseException; -import de.vipra.util.Config; +import de.vipra.util.ConfigDtm; import de.vipra.util.Constants; import de.vipra.util.ex.ConfigException; import de.vipra.util.model.ArticleFull; -public abstract class Filebase implements Closeable { +public class Filebase { - public static final Logger log = LogManager.getLogger(Filebase.class); + public static final String FILE_NAME = "dtm-mult.dat"; - private final String modelName; private final File modelDir; - private final FilebaseIndex index; - private final List<ArticleFull> articles; - - public Filebase(final File dataDir, final String modelName) throws FilebaseException { - this.modelName = modelName; - modelDir = new File(dataDir, modelName); - if (!modelDir.exists()) { - if (!modelDir.mkdirs()) { - throw new FilebaseException("could not create model directory: " + modelDir.getAbsolutePath()); - } - } - try { - index = new FilebaseIndex(modelDir); - } catch (final IOException e) { - throw new FilebaseException("could not read index: " + e.getMessage()); - } - articles = new ArrayList<>(Constants.IMPORT_BUFFER_MAX); - } - - public File getModelDir() { - return modelDir; - } - - public File getModelFile(final String fileName) { - if (fileName != null) - return new File(modelDir, fileName); - return new File(modelDir, modelName); - } - - public FilebaseIndex getIndex() { - return index; - } - - public List<ArticleFull> getArticles() { - return articles; - } - - @Override - public synchronized void close() throws IOException { - write(articles); - index.close(); + private final File file; + private final Map<String, ArticleFull> newArticles; + private final FilebaseIDDateIndex idDateIndex; + private final FilebaseWordIndex wordIndex; + private final FilebaseWindowIndex windowIndex; + + public Filebase(final ConfigDtm configDtm, final File dataDir) throws ParseException, IOException { + modelDir = new File(dataDir, configDtm.getName()); + file = new File(modelDir, FILE_NAME); + newArticles = new HashMap<>(); + idDateIndex = new FilebaseIDDateIndex(modelDir); + wordIndex = new FilebaseWordIndex(modelDir); + windowIndex = new FilebaseWindowIndex(modelDir, configDtm.getWindowResolution()); } public void add(final ArticleFull article) throws FilebaseException { - index.add(article.getId().toString()); - articles.add(article); - - if (articles.size() >= Constants.IMPORT_BUFFER_MAX) { - try { - log.info("buffer filled, writing filebase"); - write(articles); - articles.clear(); - } catch (final IOException e) { - throw new FilebaseException(e); - } - } + newArticles.put(article.getId().toString(), article); + idDateIndex.add(article.getId().toString(), article.getDate()); } - public abstract void write(List<ArticleFull> articles) throws IOException; + public void sync() throws IOException, ConfigException { + if (newArticles.isEmpty()) + return; + if (!modelDir.exists() && !modelDir.mkdirs()) + throw new FilebaseException("could not create data directory: " + modelDir.getAbsolutePath()); + + // delete tmp file if exists + final File tmpFile = new File(modelDir, FILE_NAME + ".tmp"); + if (tmpFile.exists() && !tmpFile.delete()) + throw new FilebaseException("could not delete tmp file: " + tmpFile.getAbsolutePath()); + + // old model reader + BufferedReader in = null; + if (file.exists()) + in = new BufferedReader(new InputStreamReader(new FileInputStream(file))); + + // tmp model writer + final BufferedWriter outModel = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpFile, false))); + + // merge existing model and new entries + for (final FilebaseIDDateIndexEntry entry : idDateIndex) { + if (entry.isNew()) { + final ArticleFull newArticle = newArticles.get(entry.getId()); + outModel.write(wordIndex.transform(newArticle.getProcessedText())); + outModel.write(Constants.LINE_SEP); + } else { + if (in == null) { + outModel.close(); + throw new FilebaseException("filebase inconsistency: missing article with id: " + entry.getId()); + } + outModel.write(in.readLine()); + outModel.write(Constants.LINE_SEP); + } - public static Filebase getFilebase(final Config config) throws FilebaseException, ConfigException { - final File dataDir = config.getDataDirectory(); - switch (Constants.ANALYZER) { - case DTM: - return new DTMFilebase(dataDir); - default: - return null; + windowIndex.add(entry.getDate()); } + + // close buffers + if (in != null) + in.close(); + outModel.close(); + + // move tmp file + if (file.exists() && !file.delete()) + throw new FilebaseException("could not delete model file: " + file.getAbsolutePath()); + if (!tmpFile.renameTo(file)) + throw new FilebaseException("could not rename tmp file: " + tmpFile.getAbsolutePath()); + + // sync indexes + idDateIndex.sync(); + wordIndex.sync(); + windowIndex.sync(); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseException.java similarity index 71% rename from vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java rename to vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseException.java index c74dc60ada0c74718f1c014f52d74d4c63949baa..a591e977f803ab5dce6a7c0ca69272323a67a73c 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/ex/FilebaseException.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseException.java @@ -1,6 +1,8 @@ -package de.vipra.cmd.ex; +package de.vipra.cmd.file; -public class FilebaseException extends Exception { +import java.io.IOException; + +public class FilebaseException extends IOException { private static final long serialVersionUID = 1L; diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIDDateIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIDDateIndex.java new file mode 100644 index 0000000000000000000000000000000000000000..5d5a1acded1ef90172694f0e8444972de5a5b5df --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIDDateIndex.java @@ -0,0 +1,85 @@ +package de.vipra.cmd.file; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.Iterator; +import java.util.List; + +import de.vipra.util.Constants; + +public class FilebaseIDDateIndex implements Iterable<FilebaseIDDateIndexEntry> { + + public static final String FILE_NAME = "iddate.idx"; + + private boolean sorted = true; + private boolean dirty = false; + private final File file; + private final List<FilebaseIDDateIndexEntry> entries; + + public FilebaseIDDateIndex(final File modelDir) throws ParseException, IOException { + file = new File(modelDir, FILE_NAME); + entries = new ArrayList<>(); + if (file.exists()) { + final BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file))); + String line = null; + while ((line = in.readLine()) != null) { + final String[] parts = line.split(","); + entries.add(new FilebaseIDDateIndexEntry(parts[0], new Date(Long.parseLong(parts[1])), false)); + } + in.close(); + } + } + + public void sync() throws IOException { + if (!dirty) + return; + if (!sorted) + sort(); + final BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false))); + for (final FilebaseIDDateIndexEntry entry : entries) { + out.write(entry.getId()); + out.write(","); + out.write(Long.toString(entry.getDate().getTime())); + out.write(Constants.LINE_SEP); + } + out.close(); + dirty = false; + } + + public void add(final String id, final Date date) { + entries.add(new FilebaseIDDateIndexEntry(id, date, true)); + sorted = false; + dirty = true; + } + + public FilebaseIDDateIndexEntry get(final int index) { + return entries.get(index); + } + + public void sort() { + Collections.sort(entries); + sorted = true; + } + + public int size() { + return entries.size(); + } + + @Override + public Iterator<FilebaseIDDateIndexEntry> iterator() { + if (!sorted) + sort(); + return entries.iterator(); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIDDateIndexEntry.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIDDateIndexEntry.java new file mode 100644 index 0000000000000000000000000000000000000000..94b9995cec522974f4c3f02a0a3436450283585c --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIDDateIndexEntry.java @@ -0,0 +1,34 @@ +package de.vipra.cmd.file; + +import java.util.Date; + +public class FilebaseIDDateIndexEntry implements Comparable<FilebaseIDDateIndexEntry> { + + private final String id; + private final Date date; + private final boolean isNew; + + public FilebaseIDDateIndexEntry(final String id, final Date date, final boolean isNew) { + this.id = id; + this.date = date; + this.isNew = isNew; + } + + public String getId() { + return id; + } + + public Date getDate() { + return date; + } + + public boolean isNew() { + return isNew; + } + + @Override + public int compareTo(final FilebaseIDDateIndexEntry o) { + return date.compareTo(o.getDate()); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java deleted file mode 100644 index a533f822c7e24a5c526d8881970c3699f4d31f98..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java +++ /dev/null @@ -1,72 +0,0 @@ -package de.vipra.cmd.file; - -import java.io.Closeable; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import de.vipra.util.Constants; -import de.vipra.util.FileUtils; - -public class FilebaseIndex implements Closeable, Iterable<String> { - - public static final String FILE_NAME = "index"; - - private final File file; - private List<String> index; - - public FilebaseIndex(final File modelDir) throws IOException { - file = new File(modelDir, FILE_NAME); - if (file.exists()) { - index = new ArrayList<>(FileUtils.readFile(file)); - } else { - index = new ArrayList<>(); - } - } - - public void write() throws IOException { - org.apache.commons.io.FileUtils.writeLines(file, Constants.FILEBASE_ENCODING.name(), index, null, false); - } - - public int add(final String id) { - int i = indexOf(id); - if (i == -1) { - index.add(id); - i = index.size() - 1; - } - return i; - } - - public void set(final List<String> index) { - this.index = index; - } - - public int indexOf(final String id) { - return index.indexOf(id); - } - - public String get(final int i) { - return index.get(i); - } - - public boolean remove(final String id) { - return index.remove(id); - } - - public int size() { - return index.size(); - } - - @Override - public void close() throws IOException { - write(); - } - - @Override - public Iterator<String> iterator() { - return index.iterator(); - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWindowIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWindowIndex.java new file mode 100644 index 0000000000000000000000000000000000000000..436d4978947c26575dd57c61d6682cd3a556af1d --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWindowIndex.java @@ -0,0 +1,138 @@ +package de.vipra.cmd.file; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.List; + +import org.apache.commons.io.FileUtils; + +import de.vipra.util.Constants; +import de.vipra.util.Constants.WindowResolution; +import de.vipra.util.CountMap; + +public class FilebaseWindowIndex { + + public static final String MULT_FILE_NAME = "dtm-mult.dat"; + public static final String SEQ_FILE_NAME = "dtm-seq.dat"; + public static final String WIN_FILE_NAME = "windows.idx"; + + private boolean seqDirty = false; + private boolean winDirty = false; + private final File modelDir; + private final File seqFile; + private final File winFile; + private final WindowResolution windowResolution; + private List<Integer> windowSizes; + private List<Date> windowDates; + private final CountMap<Date> windowMap; + + public FilebaseWindowIndex(final File modelDir, final WindowResolution windowResolution) throws NumberFormatException, IOException { + this.modelDir = modelDir; + seqFile = new File(modelDir, SEQ_FILE_NAME); + winFile = new File(modelDir, WIN_FILE_NAME); + this.windowResolution = windowResolution; + windowSizes = new ArrayList<>(); + windowDates = new ArrayList<>(); + windowMap = new CountMap<>(); + if (winFile.exists()) { + final BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(winFile))); + in.readLine(); + String line = null; + while ((line = in.readLine()) != null) { + final String[] parts = line.split(","); + final Date date = new Date(Long.parseLong(parts[0])); + final int count = Integer.parseInt(parts[1]); + windowSizes.add(count); + windowDates.add(date); + windowMap.count(date, count); + } + in.close(); + } + } + + public void sync() throws IOException { + if (!winDirty) + return; + + if (!modelDir.exists() && !modelDir.mkdirs()) + throw new FilebaseException("could not create model directory: " + modelDir.getAbsolutePath()); + + final BufferedWriter winOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(winFile, false))); + final BufferedWriter seqOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(seqFile, false))); + + final List<Date> dates = new ArrayList<>(windowMap.keySet()); + Collections.sort(dates); + + seqOut.write(Integer.toString(windowMap.size())); + seqOut.write(Constants.LINE_SEP); + + for (final Date date : dates) { + final int windowSize = windowMap.get(date); + winOut.write(Long.toString(date.getTime())); + winOut.write(","); + winOut.write(Integer.toString(windowSize)); + winOut.write(Constants.LINE_SEP); + seqOut.write(Integer.toString(windowSize)); + seqOut.write(Constants.LINE_SEP); + } + + winOut.close(); + seqOut.close(); + + winDirty = false; + } + + public void add(final Date date) { + windowMap.count(windowResolution.startDate(date)); + winDirty = true; + seqDirty = true; + } + + public int size() { + return windowSizes.size(); + } + + public int windowSize(final int index) { + if (seqDirty) + resizeWindows(); + return windowSizes.get(index); + } + + public Date startDate(final int index) { + if (seqDirty) + resizeWindows(); + return windowResolution.startDate(windowDates.get(index)); + } + + public Date endDate(final int index) { + if (seqDirty) + resizeWindows(); + return windowResolution.endDate(windowDates.get(index)); + } + + public void copy(final File modelFile) throws IOException { + FileUtils.copyFile(modelFile, new File(modelDir, MULT_FILE_NAME)); + } + + private void resizeWindows() { + final List<Date> dates = new ArrayList<>(windowMap.keySet()); + Collections.sort(dates); + windowSizes = new ArrayList<>(windowMap.size()); + windowDates = new ArrayList<>(windowMap.size()); + for (final Date date : dates) { + windowSizes.add(windowMap.get(date)); + windowDates.add(date); + } + seqDirty = false; + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java new file mode 100644 index 0000000000000000000000000000000000000000..420d97ad5b04610c771c8170be24b8fe3312d75b --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java @@ -0,0 +1,82 @@ +package de.vipra.cmd.file; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import de.vipra.util.CountMap; +import de.vipra.util.FileUtils; + +public class FilebaseWordIndex implements Iterable<String> { + + public static final String FILE_NAME = "word.idx"; + + private boolean dirty = false; + private final File file; + private final List<String> words; + private final Map<String, Integer> wordIndex; + private int nextIndex = 0; + + public FilebaseWordIndex(final File modelDir) throws IOException { + file = new File(modelDir, FILE_NAME); + if (file.exists()) { + words = FileUtils.readFile(file); + wordIndex = new HashMap<>(words.size()); + for (final String word : words) + wordIndex.put(word, nextIndex++); + } else { + words = new ArrayList<>(); + wordIndex = new HashMap<>(); + } + } + + public void sync() throws IOException { + if (!dirty) + return; + org.apache.commons.io.FileUtils.writeLines(file, words); + dirty = false; + } + + public String transform(final String[] words) { + final CountMap<String> countMap = new CountMap<>(); + for (final String word : words) + countMap.count(word); + + final StringBuilder sb = new StringBuilder(); + sb.append(countMap.size()); + for (final Entry<String, Integer> entry : countMap.entrySet()) + sb.append(" ").append(index(entry.getKey())).append(":").append(entry.getValue()); + + return sb.toString(); + } + + public int index(final String word) { + Integer index = wordIndex.get(word); + if (index == null) { + index = nextIndex++; + words.add(word); + wordIndex.put(word, index); + dirty = true; + } + return index; + } + + public String word(final int index) { + return words.get(index); + } + + public int size() { + return words.size(); + } + + @Override + public Iterator<String> iterator() { + return words.iterator(); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java index b62453ce717e9c4751305301abd7027f89b975cc..6f8126bef6a1e7f9da17c33f7f3e0cd9e7c6eb20 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java @@ -1,36 +1,410 @@ package de.vipra.cmd.lda; -import de.vipra.cmd.ex.AnalyzerException; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.bson.types.ObjectId; + +import de.vipra.cmd.file.FilebaseIDDateIndex; +import de.vipra.cmd.file.FilebaseIDDateIndexEntry; +import de.vipra.cmd.file.FilebaseWindowIndex; +import de.vipra.cmd.file.FilebaseWordIndex; +import de.vipra.util.ArrayUtils; import de.vipra.util.Config; +import de.vipra.util.ConfigDtm; import de.vipra.util.Constants; +import de.vipra.util.MongoUtils; +import de.vipra.util.StringUtils; +import de.vipra.util.ex.ConfigException; +import de.vipra.util.ex.DatabaseException; +import de.vipra.util.model.Article; +import de.vipra.util.model.ArticleFull; +import de.vipra.util.model.Sequence; +import de.vipra.util.model.SequenceFull; +import de.vipra.util.model.SimilarArticle; +import de.vipra.util.model.Topic; +import de.vipra.util.model.TopicFull; +import de.vipra.util.model.TopicModel; +import de.vipra.util.model.TopicModelFull; +import de.vipra.util.model.TopicShare; +import de.vipra.util.model.TopicWord; +import de.vipra.util.model.Window; +import de.vipra.util.service.MongoService; +import de.vipra.util.service.Service.QueryBuilder; -public abstract class Analyzer { +public class Analyzer { - private final String name; + public static final Logger log = LogManager.getLogger(Analyzer.class); - protected Analyzer(final String name) { - this.name = name; - } + private final Config config; + private final File dataDir; + private final File dtmBinary; + private final MongoService<ArticleFull, ObjectId> dbArticles; + private final MongoService<TopicFull, ObjectId> dbTopics; + private final MongoService<SequenceFull, ObjectId> dbSequences; + private final MongoService<Window, Integer> dbWindows; + private final MongoService<TopicModelFull, String> dbTopicModels; - public String getName() { - return name; + public Analyzer() throws AnalyzerException, ConfigException { + config = Config.getConfig(); + dataDir = config.getDataDirectory(); + dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); + dbTopics = MongoService.getDatabaseService(config, TopicFull.class); + dbSequences = MongoService.getDatabaseService(config, SequenceFull.class); + dbWindows = MongoService.getDatabaseService(config, Window.class); + dbTopicModels = MongoService.getDatabaseService(config, TopicModelFull.class); + + // check for binary + if (config.getDtmPath() == null || config.getDtmPath().isEmpty()) + throw new AnalyzerException("dtm path not configured"); + dtmBinary = new File(config.getDtmPath()); + if (dtmBinary == null || !dtmBinary.exists()) + throw new AnalyzerException("dtm binary not found at path: " + config.getDtmPath() + ", check config key 'tm.dtmpath'"); } - public abstract void init(Config config) throws AnalyzerException; + public void analyze(final ConfigDtm configDtm, final boolean reread) + throws AnalyzerException, DatabaseException, ParseException, IOException, InterruptedException { + + final File modelDir = new File(dataDir, configDtm.getName()); + final File outDir = new File(modelDir, "out"); + final File outDirSeq = new File(outDir, "lda-seq"); + + final String[] parameters = { + // number of topics + "--ntopics=" + Constants.K_TOPICS, + // topc modeling mode + "--mode=fit", + // random seed (0 for pseudo random) + "--rng_seed=0", + // initialize model with lda + "--initialize_lda=true", + // top chain var (default 0.005) + "--top_chain_var=0.005", + // alpha (default 0.01) + "--alpha=0.01", + // minimum number if iterations + "--lda_sequence_min_iter=" + Constants.DYNAMIC_MIN_ITER, + // maximum number of iterations + "--lda_sequence_max_iter=" + Constants.DYNAMIC_MAX_ITER, + // em iter (default 20) + "--lda_max_em_iter=" + Constants.STATIC_ITER, + // input file prefix + "--corpus_prefix=" + modelDir.getAbsolutePath() + File.separator + "dtm", + // output directory + "--outname=" + outDir.getAbsolutePath() }; + + final String command = dtmBinary.getAbsolutePath() + " " + StringUtils.join(parameters, " "); + + final FilebaseWindowIndex windowIndex = new FilebaseWindowIndex(modelDir, configDtm.getWindowResolution()); + + BufferedReader in; + + if (!reread) { + final Process p = Runtime.getRuntime().exec(command, null); + if (!p.isAlive()) + throw new AnalyzerException("dtm process is dead"); + + // read from process output + in = new BufferedReader(new InputStreamReader(p.getErrorStream())); + + String line; + int iteration = 0; + while ((line = in.readLine()) != null) { + if (line.contains("EM iter")) { + log.info("iteration " + iteration++); + } + } + + in.close(); + p.waitFor(); + } + + final FilebaseWordIndex wordIndex = new FilebaseWordIndex(configDtm.getModelDir(dataDir)); + final FilebaseIDDateIndex idDateIndex = new FilebaseIDDateIndex(configDtm.getModelDir(dataDir)); + + final QueryBuilder builder = QueryBuilder.builder().criteria("model.id", configDtm.getName()); + dbArticles.deleteMultiple(builder); + dbTopics.deleteMultiple(builder); + dbSequences.deleteMultiple(builder); + dbWindows.deleteMultiple(builder); + dbTopicModels.deleteSingle(configDtm.getName()); + + final int wordCount = wordIndex.size(); + final int sequencesCount = windowIndex.size(); + final int articlesCount = idDateIndex.size(); + + // read topic distributions - public abstract void analyze(boolean reread) throws AnalyzerException; + final File gamFile = new File(outDirSeq, "gam.dat"); + if (!gamFile.exists()) + throw new AnalyzerException("file not found: " + gamFile.getAbsolutePath()); + in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile))); - public static Analyzer getAnalyzer(final Config config) throws AnalyzerException { - Analyzer analyzer = null; - switch (Constants.ANALYZER) { - case DTM: - analyzer = new DTMAnalyzer(); - break; - default: - return null; + final double[][] topicDistributions = new double[articlesCount][Constants.K_TOPICS]; + for (int idxArticle = 0; idxArticle < articlesCount; idxArticle++) { + // read distributions into matrix and sum + double topicDistributionSum = 0; + for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + final double topicDistribution = Double.parseDouble(in.readLine()); + topicDistributions[idxArticle][idxTopic] = topicDistribution; + topicDistributionSum += topicDistribution; + } + // normalize distributions by sum + for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + topicDistributions[idxArticle][idxTopic] /= topicDistributionSum; + } } - analyzer.init(config); - return analyzer; + + in.close(); + + // read topic definition files and create topics + + final TopicModelFull newTopicModel = new TopicModelFull(configDtm.getName()); + final List<Window> newWindows = new ArrayList<>(sequencesCount); + final List<SequenceFull> newSequences = new ArrayList<>(Constants.K_TOPICS * sequencesCount); + final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS); + + log.info("vocabulary size: " + wordCount); + log.info("sequences: " + sequencesCount); + log.info("topics: " + Constants.K_TOPICS); + + final boolean seqRelativeCutoff = Constants.MINIMUM_RELATIVE_PROB > 0; + + // create sequence windows + for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { + final Window newWindow = new Window(); + newWindow.setId(idxSeq); + newWindow.setStartDate(windowIndex.startDate(idxSeq)); + newWindow.setEndDate(windowIndex.endDate(idxSeq)); + newWindow.setWindowResolution(Constants.WINDOW_RESOLUTION); + newWindow.setModel(new TopicModel(newTopicModel.getId())); + newWindows.add(newWindow); + } + + // for each topic + for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + final File seqFile = new File(outDirSeq, "topic-" + StringUtils.padNumber(idxTopic, 3) + "-var-e-log-prob.dat"); + if (!seqFile.exists()) { + in.close(); + throw new AnalyzerException("file not found: " + seqFile.getAbsolutePath()); + } + + // create new topic + final TopicFull newTopic = new TopicFull(); + final List<Sequence> newTopicSequences = new ArrayList<>(sequencesCount); + newTopic.setSequences(newTopicSequences); + newTopic.setModel(new TopicModel(newTopicModel.getId())); + newTopics.add(newTopic); + + in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile))); + + // read file lines into word x sequence matrix + // gather maximum likeliness per sequence and per word + final double[][] likelinesses = new double[wordCount][sequencesCount]; + for (int idxWord = 0; idxWord < wordCount; idxWord++) { + for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { + likelinesses[idxWord][idxSeq] = Double.parseDouble(in.readLine()); + } + } + + in.close(); + + // find maximum + final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses); + + // collect top words in each sequence for topic name + final Set<TopicWord> topTopicWords = new HashSet<>(); + + final double[] relevances = new double[sequencesCount]; + double relevanceSum = 0; + double prevRelevance = 0; + + // for each sequence + for (int idxSeq = 0, sequenceOffset = 0; idxSeq < sequencesCount; idxSeq++) { + // calculate relative cutoff probability + final double maxSeqLikeliness = maxSeqLikelinesses[idxSeq]; + final double minRelativeSeqLikeliness = Constants.MINIMUM_RELATIVE_PROB * Math.abs(maxSeqLikeliness); + + // collect words + final List<TopicWord> newSeqTopicWords = new ArrayList<>(wordCount); + for (int idxWord = 0; idxWord < wordCount; idxWord++) { + final double likeliness = likelinesses[idxWord][idxSeq]; + // check if word acceptable + if (!seqRelativeCutoff || (maxSeqLikeliness >= 0 && likeliness >= minRelativeSeqLikeliness) + || (maxSeqLikeliness < 0 && Math.abs(likeliness) >= minRelativeSeqLikeliness)) { + final String word = wordIndex.word(idxWord); + final TopicWord topicWord = new TopicWord(word, likeliness); + newSeqTopicWords.add(topicWord); + } + } + + if (!newSeqTopicWords.isEmpty()) { + Collections.sort(newSeqTopicWords, Comparator.reverseOrder()); + + // collect top words + topTopicWords.addAll(newSeqTopicWords.subList(0, Math.min(newSeqTopicWords.size(), Constants.TOPIC_AUTO_NAMING_WORDS))); + } + + // calculate topic sequence relevance + final int sequenceSize = windowIndex.windowSize(idxSeq); + double seqTopicDistribution = 0; + for (int idxArticle = sequenceOffset; idxArticle < sequenceOffset + sequenceSize; idxArticle++) + seqTopicDistribution += topicDistributions[idxArticle][idxTopic]; + final double relevance = seqTopicDistribution / sequenceSize; + + // create sequence + final SequenceFull newSequenceFull = new SequenceFull(); + newSequenceFull.setWindow(newWindows.get(idxSeq)); + newSequenceFull.setWords(newSeqTopicWords); + newSequenceFull.setRelevance(relevance); + newSequenceFull.setRelevanceChange(relevance - prevRelevance); + newSequenceFull.setTopic(new Topic(newTopic.getId())); + newSequenceFull.setModel(new TopicModel(newTopicModel.getId())); + newSequences.add(newSequenceFull); + newTopicSequences.add(new Sequence(newSequenceFull.getId())); + + // sequence offset is current position in list of sequences + // of this topic + sequenceOffset += sequenceSize; + + // relevance is summed up to calculate average later on + relevanceSum += relevance; + + // relevances are gathered to calculate variance and + // rising/falling relevance + relevances[idxSeq] = relevance; + + // previous relevance is remembered to calculate difference + // to next relevance + prevRelevance = relevance; + } + + // sort topic words and generate topic name + final List<TopicWord> topTopicWordsList = new ArrayList<>(topTopicWords); + Collections.sort(topTopicWordsList); + newTopic.setName(TopicFull.getNameFromWords(topTopicWordsList)); + + // calculate average + final double average = relevanceSum / sequencesCount; + newTopic.setAvgRelevance(average); + + // calculate variance + double variance = 0; + for (final double relevance : relevances) + variance += Math.pow(relevance - average, 2); + newTopic.setVarRelevance(variance / sequencesCount); + + // calculate rising/falling/rising-decay relevances + double risingRelevance = 0; + double fallingRelevance = 0; + double risingDecayRelevance = 0; + prevRelevance = relevances[0]; + for (int idxSeq2 = 1; idxSeq2 < relevances.length; idxSeq2++) { + final double relevanceDiff = relevances[idxSeq2] - prevRelevance; + if (relevanceDiff > 0) { + risingRelevance += relevanceDiff; + } else { + fallingRelevance += Math.abs(relevanceDiff); + } + risingDecayRelevance += Math.exp(-Constants.RISING_DECAY_LAMBDA * (sequencesCount - idxSeq2 + 1)) * relevanceDiff; + } + newTopic.setRisingRelevance(risingRelevance); + newTopic.setFallingRelevance(fallingRelevance); + newTopic.setRisingDecayRelevance(risingDecayRelevance); + } + + // create topics and words + try { + dbWindows.createMultiple(newWindows); + dbSequences.createMultiple(newSequences); + dbTopics.createMultiple(newTopics); + } catch (final DatabaseException e) { + throw new AnalyzerException(e); + } + + // create topic references and store document similarities + + int idxArticle = -1; + for (final FilebaseIDDateIndexEntry entry : idDateIndex) { + idxArticle++; + + final double[] topicDistribution = topicDistributions[idxArticle]; + + // create topic references + + double reducedShare = 0; + final List<TopicShare> newTopicRefs = new ArrayList<>(Constants.K_TOPICS); + for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { + if (topicDistribution[idxTopic] > 0.01) { + reducedShare += topicDistribution[idxTopic]; + final TopicShare newTopicRef = new TopicShare(); + final TopicFull topicFull = newTopics.get(idxTopic); + newTopicRef.setTopic(new Topic(topicFull.getId())); + newTopicRef.setShare(topicDistribution[idxTopic]); + newTopicRefs.add(newTopicRef); + } + } + + // calculate divergences + + final List<SimilarArticle> similarArticles = new ArrayList<>(articlesCount - 1); + + for (int idxArticle2 = 0; idxArticle2 < articlesCount; idxArticle2++) { + if (idxArticle == idxArticle2) + continue; + + final double divergence = ArrayUtils.jsDivergence(topicDistributions[idxArticle], topicDistributions[idxArticle2]); + if (divergence > Constants.MAX_DIVERGENCE) + continue; + + final SimilarArticle similarArticle = new SimilarArticle(); + similarArticle.setArticle(new Article(MongoUtils.objectId(idDateIndex.get(idxArticle2).getId()))); + similarArticle.setDivergence(divergence); + similarArticles.add(similarArticle); + } + + Collections.sort(similarArticles); + + if (similarArticles.size() > Constants.MAX_SIMILAR_DOCUMENTS) + similarArticles.subList(Constants.MAX_SIMILAR_DOCUMENTS, similarArticles.size()).clear(); + + // update article + + if (!newTopicRefs.isEmpty()) { + // renormalize share + for (final TopicShare newTopicRef : newTopicRefs) + newTopicRef.setShare(newTopicRef.getShare() / reducedShare); + + Collections.sort(newTopicRefs, Comparator.reverseOrder()); + + // update article with topic references (partial update) + final ArticleFull article = new ArticleFull(); + article.setId(entry.getId()); + article.setModel(new TopicModel(newTopicModel.getId())); + article.setTopics(newTopicRefs); + article.setSimilarArticles(similarArticles); + + try { + dbArticles.updateSingle(article, "model", "topics", "similarArticles"); + } catch (final DatabaseException e) { + log.error(e); + } + } + } + + dbTopicModels.createSingle(newTopicModel); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/ex/AnalyzerException.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/AnalyzerException.java similarity index 89% rename from vipra-cmd/src/main/java/de/vipra/cmd/ex/AnalyzerException.java rename to vipra-cmd/src/main/java/de/vipra/cmd/lda/AnalyzerException.java index 99566f56ef4d5e20a7c4a5a4637d2ce5b00c830f..756aad149b09056fa2ea2e84685db0524bcd8d4d 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/ex/AnalyzerException.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/AnalyzerException.java @@ -1,4 +1,4 @@ -package de.vipra.cmd.ex; +package de.vipra.cmd.lda; public class AnalyzerException extends Exception { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java deleted file mode 100644 index fc9aa5758f27decd50d7c1094a8de487b4252144..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java +++ /dev/null @@ -1,419 +0,0 @@ -package de.vipra.cmd.lda; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.bson.types.ObjectId; - -import de.vipra.cmd.ex.AnalyzerException; -import de.vipra.cmd.file.DTMIndex; -import de.vipra.cmd.file.DTMVocabulary; -import de.vipra.cmd.file.FilebaseIndex; -import de.vipra.util.ArrayUtils; -import de.vipra.util.Config; -import de.vipra.util.Constants; -import de.vipra.util.MongoUtils; -import de.vipra.util.StringUtils; -import de.vipra.util.ex.ConfigException; -import de.vipra.util.ex.DatabaseException; -import de.vipra.util.model.Article; -import de.vipra.util.model.ArticleFull; -import de.vipra.util.model.Sequence; -import de.vipra.util.model.SequenceFull; -import de.vipra.util.model.SimilarArticle; -import de.vipra.util.model.Topic; -import de.vipra.util.model.TopicFull; -import de.vipra.util.model.TopicRef; -import de.vipra.util.model.TopicWord; -import de.vipra.util.model.Window; -import de.vipra.util.model.Word; -import de.vipra.util.service.MongoService; - -public class DTMAnalyzer extends Analyzer { - - public static final Logger log = LogManager.getLogger(DTMAnalyzer.class); - - public static final String NAME = "dtm"; - - private String command; - private File modelDir; - private File outDir; - private File outDirSeq; - private DTMVocabulary vocab; - private DTMIndex seqindex; - private FilebaseIndex idindex; - private MongoService<ArticleFull, ObjectId> dbArticles; - private MongoService<TopicFull, ObjectId> dbTopics; - private MongoService<SequenceFull, ObjectId> dbSequences; - private MongoService<Word, String> dbWords; - private MongoService<Window, Integer> dbWindows; - - protected DTMAnalyzer() { - super("Dynamic Topic Model Analyzer"); - } - - @Override - public void init(final Config config) throws AnalyzerException { - try { - final File dataDir = config.getDataDirectory(); - modelDir = new File(dataDir, NAME); - outDir = new File(modelDir, "out"); - outDirSeq = new File(outDir, "lda-seq"); - vocab = new DTMVocabulary(modelDir); - idindex = new FilebaseIndex(modelDir); - seqindex = new DTMIndex(modelDir); - dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); - dbTopics = MongoService.getDatabaseService(config, TopicFull.class); - dbSequences = MongoService.getDatabaseService(config, SequenceFull.class); - dbWords = MongoService.getDatabaseService(config, Word.class); - dbWindows = MongoService.getDatabaseService(config, Window.class); - } catch (ConfigException | IOException | ParseException e) { - throw new AnalyzerException(e); - } - - // check for binary - File dtmBinary = null; - if (config.dtmPath != null) - dtmBinary = new File(config.dtmPath); - if (dtmBinary == null || !dtmBinary.exists()) - throw new AnalyzerException( - "dtm binary not found at path: " + config.dtmPath + ", check config key 'tm.dtmpath'"); - - final String corpusPrefix = modelDir.getAbsolutePath() + File.separator + NAME; - final String outname = outDir.getAbsolutePath(); - - final String[] parameters = { - // number of topics - "--ntopics=" + Constants.K_TOPICS, - // topc modeling mode - "--mode=fit", - // random seed (0 for pseudo random) - "--rng_seed=0", - // initialize model with lda - "--initialize_lda=true", - // top chain var (default 0.005) - "--top_chain_var=0.005", - // alpha (default -10) - "--alpha=0.01", - // minimum number if iterations - "--lda_sequence_min_iter=" + Constants.DYNAMIC_MIN_ITER, - // maximum number of iterations - "--lda_sequence_max_iter=" + Constants.DYNAMIC_MAX_ITER, - // em iter (default 20) - "--lda_max_em_iter=" + Constants.STATIC_ITER, - // input file prefix - "--corpus_prefix=" + corpusPrefix, - // output directory - "--outname=" + outname }; - - command = dtmBinary.getAbsolutePath() + " " + StringUtils.join(parameters, " "); - } - - @Override - public void analyze(final boolean reread) throws AnalyzerException { - try { - BufferedReader in; - - if (!reread) { - final Process p = Runtime.getRuntime().exec(command, null); - if (!p.isAlive()) - throw new AnalyzerException("dtm process is dead"); - - // read from process output - in = new BufferedReader(new InputStreamReader(p.getErrorStream())); - - String line; - int iteration = 0; - while ((line = in.readLine()) != null) { - if (line.contains("EM iter")) { - log.info("iteration " + iteration++); - } - } - - in.close(); - p.waitFor(); - } - - final int wordCount = vocab.size(); - final int sequencesCount = seqindex.sequenceCount(); - final int articlesCount = idindex.size(); - - // read topic distributions - - final File gamFile = new File(outDirSeq, "gam.dat"); - in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile))); - - final double[][] topicDistributions = new double[articlesCount][Constants.K_TOPICS]; - for (int idxArticle = 0; idxArticle < articlesCount; idxArticle++) { - // read distributions into matrix and sum - double topicDistributionSum = 0; - for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { - final double topicDistribution = Double.parseDouble(in.readLine()); - topicDistributions[idxArticle][idxTopic] = topicDistribution; - topicDistributionSum += topicDistribution; - } - // normalize distributions by sum - for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { - topicDistributions[idxArticle][idxTopic] /= topicDistributionSum; - } - } - - in.close(); - - // read topic definition files and create topics - - final List<Window> newWindows = new ArrayList<>(sequencesCount); - final List<SequenceFull> newSequences = new ArrayList<>(Constants.K_TOPICS * sequencesCount); - final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS); - final Set<Word> newWords = new HashSet<>(wordCount); - - log.info("vocabulary size: " + wordCount); - log.info("sequences: " + sequencesCount); - log.info("topics: " + Constants.K_TOPICS); - - final boolean seqRelativeCutoff = Constants.MINIMUM_RELATIVE_PROB > 0; - - // create sequence windows - for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { - final Window newWindow = new Window(); - newWindow.setId(idxSeq); - newWindow.setStartDate(seqindex.getStartDate(idxSeq)); - newWindow.setEndDate(seqindex.getEndDate(idxSeq)); - newWindow.setWindowResolution(Constants.WINDOW_RESOLUTION); - newWindows.add(newWindow); - } - - // for each topic - for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { - final File seqFile = new File(outDirSeq, - "topic-" + StringUtils.padNumber(idxTopic, 3) + "-var-e-log-prob.dat"); - - // create new topic - final TopicFull newTopic = new TopicFull(); - final List<Sequence> newTopicSequences = new ArrayList<>(sequencesCount); - newTopic.setSequences(newTopicSequences); - newTopics.add(newTopic); - - in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile))); - - // read file lines into word x sequence matrix - // gather maximum likeliness per sequence and per word - final double[][] likelinesses = new double[wordCount][sequencesCount]; - for (int idxWord = 0; idxWord < wordCount; idxWord++) { - for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { - likelinesses[idxWord][idxSeq] = Double.parseDouble(in.readLine()); - } - } - - in.close(); - - // find maximum - final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses); - - // collect top words in each sequence for topic name - final Set<TopicWord> topTopicWords = new HashSet<>(); - - final double[] relevances = new double[sequencesCount]; - double relevanceSum = 0; - double prevRelevance = 0; - - // for each sequence - for (int idxSeq = 0, sequenceOffset = 0; idxSeq < sequencesCount; idxSeq++) { - // calculate relative cutoff probability - final double maxSeqLikeliness = maxSeqLikelinesses[idxSeq]; - final double minRelativeSeqLikeliness = Constants.MINIMUM_RELATIVE_PROB - * Math.abs(maxSeqLikeliness); - - // collect words - final List<TopicWord> newSeqTopicWords = new ArrayList<>(wordCount); - for (int idxWord = 0; idxWord < wordCount; idxWord++) { - final double likeliness = likelinesses[idxWord][idxSeq]; - // check if word acceptable - if (!seqRelativeCutoff || (maxSeqLikeliness >= 0 && likeliness >= minRelativeSeqLikeliness) - || (maxSeqLikeliness < 0 && Math.abs(likeliness) >= minRelativeSeqLikeliness)) { - final Word word = vocab.getWord(idxWord); - newWords.add(word); - final TopicWord topicWord = new TopicWord(word, likeliness); - newSeqTopicWords.add(topicWord); - } - } - - if (!newSeqTopicWords.isEmpty()) { - Collections.sort(newSeqTopicWords, Comparator.reverseOrder()); - - // collect top words - topTopicWords.addAll(newSeqTopicWords.subList(0, - Math.min(newSeqTopicWords.size(), Constants.TOPIC_AUTO_NAMING_WORDS))); - } - - // calculate topic sequence relevance - final int sequenceSize = seqindex.getSequence(idxSeq).count; - double seqTopicDistribution = 0; - for (int idxArticle = sequenceOffset; idxArticle < sequenceOffset + sequenceSize; idxArticle++) - seqTopicDistribution += topicDistributions[idxArticle][idxTopic]; - final double relevance = seqTopicDistribution / sequenceSize; - - // create sequence - final SequenceFull newSequenceFull = new SequenceFull(); - newSequenceFull.setWindow(newWindows.get(idxSeq)); - newSequenceFull.setWords(newSeqTopicWords); - newSequenceFull.setRelevance(relevance); - newSequenceFull.setRelevanceChange(relevance - prevRelevance); - newSequenceFull.setTopic(new Topic(newTopic.getId())); - newSequences.add(newSequenceFull); - newTopicSequences.add(new Sequence(newSequenceFull.getId())); - - // sequence offset is current position in list of sequences - // of this topic - sequenceOffset += sequenceSize; - - // relevance is summed up to calculate average later on - relevanceSum += relevance; - - // relevances are gathered to calculate variance and - // rising/falling relevance - relevances[idxSeq] = relevance; - - // previous relevance is remembered to calculate difference - // to next relevance - prevRelevance = relevance; - } - - // sort topic words and generate topic name - final List<TopicWord> topTopicWordsList = new ArrayList<>(topTopicWords); - Collections.sort(topTopicWordsList); - newTopic.setName(TopicFull.getNameFromWords(topTopicWordsList)); - - // calculate average - final double average = relevanceSum / sequencesCount; - newTopic.setAvgRelevance(average); - - // calculate variance - double variance = 0; - for (final double relevance : relevances) - variance += Math.pow(relevance - average, 2); - newTopic.setVarRelevance(variance / sequencesCount); - - // calculate rising/falling/rising-decay relevances - double risingRelevance = 0; - double fallingRelevance = 0; - double risingDecayRelevance = 0; - prevRelevance = relevances[0]; - for (int idxSeq2 = 1; idxSeq2 < relevances.length; idxSeq2++) { - final double relevanceDiff = relevances[idxSeq2] - prevRelevance; - if (relevanceDiff > 0) { - risingRelevance += relevanceDiff; - } else { - fallingRelevance += Math.abs(relevanceDiff); - } - risingDecayRelevance += Math.exp(-Constants.RISING_DECAY_LAMBDA * (sequencesCount - idxSeq2 + 1)) - * relevanceDiff; - } - newTopic.setRisingRelevance(risingRelevance); - newTopic.setFallingRelevance(fallingRelevance); - newTopic.setRisingDecayRelevance(risingDecayRelevance); - } - - // recreate topics and words - dbWindows.drop(); - dbSequences.drop(); - dbTopics.drop(); - dbWords.drop(); - try { - dbWindows.createMultiple(newWindows); - dbSequences.createMultiple(newSequences); - dbTopics.createMultiple(newTopics); - dbWords.createMultiple(newWords); - } catch (final DatabaseException e) { - throw new AnalyzerException(e); - } - - // create topic references and store document similarities - - int idxArticle = -1; - for (final String articleId : idindex) { - idxArticle++; - - final double[] topicDistribution = topicDistributions[idxArticle]; - - // create topic references - - double reducedShare = 0; - final List<TopicRef> newTopicRefs = new ArrayList<>(Constants.K_TOPICS); - for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { - if (topicDistribution[idxTopic] > 0.01) { - reducedShare += topicDistribution[idxTopic]; - final TopicRef newTopicRef = new TopicRef(); - final TopicFull topicFull = newTopics.get(idxTopic); - newTopicRef.setTopic(new Topic(topicFull.getId())); - newTopicRef.setShare(topicDistribution[idxTopic]); - newTopicRefs.add(newTopicRef); - } - } - - // calculate divergences - - final List<SimilarArticle> similarArticles = new ArrayList<>(articlesCount - 1); - - for (int idxArticle2 = 0; idxArticle2 < articlesCount; idxArticle2++) { - if (idxArticle == idxArticle2) - continue; - - final double divergence = ArrayUtils.jsDivergence(topicDistributions[idxArticle], - topicDistributions[idxArticle2]); - if (divergence > Constants.MAX_DIVERGENCE) - continue; - - final SimilarArticle similarArticle = new SimilarArticle(); - similarArticle.setArticle(new Article(MongoUtils.objectId(idindex.get(idxArticle2)))); - similarArticle.setDivergence(divergence); - similarArticles.add(similarArticle); - } - - Collections.sort(similarArticles); - - if (similarArticles.size() > Constants.MAX_SIMILAR_DOCUMENTS) - similarArticles.subList(Constants.MAX_SIMILAR_DOCUMENTS, similarArticles.size()).clear(); - - // update article - - if (!newTopicRefs.isEmpty()) { - // renormalize share - for (final TopicRef newTopicRef : newTopicRefs) - newTopicRef.setShare(newTopicRef.getShare() / reducedShare); - - Collections.sort(newTopicRefs, Comparator.reverseOrder()); - - // update article with topic references (partial update) - final ArticleFull article = new ArticleFull(); - article.setId(articleId); - article.setTopics(newTopicRefs); - article.setSimilarArticles(similarArticles); - - try { - // TODO: using field name here. Hard to refactor - dbArticles.updateSingle(article, "topics", "similarArticles"); - } catch (final DatabaseException e) { - log.error(e); - } - } - } - - } catch (final Exception e) { - throw new AnalyzerException(e); - } - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java index 67f0bcaed40b4de669733e297feac96ac288a0a7..b731886a5e3cbb979612b74a66bb771f2219fe21 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java @@ -10,39 +10,41 @@ import org.bson.types.ObjectId; import org.elasticsearch.client.Client; import de.vipra.util.Config; -import de.vipra.util.ConsoleUtils; import de.vipra.util.ESClient; import de.vipra.util.model.Article; -import de.vipra.util.model.TopicFull; -import de.vipra.util.model.Word; +import de.vipra.util.model.Sequence; +import de.vipra.util.model.Topic; +import de.vipra.util.model.TopicModel; +import de.vipra.util.model.Window; import de.vipra.util.service.MongoService; public class ClearCommand implements Command { public static final Logger log = LogManager.getLogger(ClearCommand.class); - private final boolean defaults; private Config config; private MongoService<Article, ObjectId> dbArticles; - private MongoService<TopicFull, ObjectId> dbTopics; - private MongoService<Word, String> dbWords; + private MongoService<Topic, ObjectId> dbTopics; + private MongoService<Sequence, ObjectId> dbSequences; + private MongoService<Window, Integer> dbWindows; + private MongoService<TopicModel, String> dbTopicModels; private Client elasticClient; - public ClearCommand(final boolean defaults) { - this.defaults = defaults; - } - private void clear() throws Exception { config = Config.getConfig(); dbArticles = MongoService.getDatabaseService(config, Article.class); - dbTopics = MongoService.getDatabaseService(config, TopicFull.class); - dbWords = MongoService.getDatabaseService(config, Word.class); + dbTopics = MongoService.getDatabaseService(config, Topic.class); + dbSequences = MongoService.getDatabaseService(config, Sequence.class); + dbWindows = MongoService.getDatabaseService(config, Window.class); + dbTopicModels = MongoService.getDatabaseService(config, TopicModel.class); elasticClient = ESClient.getClient(config); log.info("clearing database"); dbArticles.drop(); dbTopics.drop(); - dbWords.drop(); + dbSequences.drop(); + dbWindows.drop(); + dbTopicModels.drop(); log.info("clearing index"); elasticClient.admin().indices().prepareDelete("_all").get(); @@ -60,11 +62,7 @@ public class ClearCommand implements Command { @Override public void run() throws Exception { - if (!defaults) - log.info("to confirm clearing, type 'clear' and press enter"); - if (defaults || ConsoleUtils.confirm("clear")) { - clear(); - } + clear(); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ConfigCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ConfigCommand.java deleted file mode 100644 index b93b3a3434999f30ecdb37ba03890a78fbe01bb7..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ConfigCommand.java +++ /dev/null @@ -1,19 +0,0 @@ -package de.vipra.cmd.option; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import de.vipra.util.Config; - -public class ConfigCommand implements Command { - - public static final Logger log = LogManager.getLogger(ConfigCommand.class); - - @Override - public void run() throws Exception { - final Config config = Config.getConfig(); - log.info("Current configuration:"); - config.print(System.out, "", " : ", true, true, ' '); - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ConfigModelCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ConfigModelCommand.java new file mode 100644 index 0000000000000000000000000000000000000000..df0652672cd0e273ecd6e5f88853d9d71a9961eb --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ConfigModelCommand.java @@ -0,0 +1,33 @@ +package de.vipra.cmd.option; + +import java.awt.Desktop; +import java.io.File; + +import de.vipra.util.Config; +import de.vipra.util.ConfigDtm; +import de.vipra.util.ex.ConfigException; + +public class ConfigModelCommand implements Command { + + private final File configFile; + + public ConfigModelCommand(final String model) throws ConfigException { + final Config config = Config.getConfig(); + final ConfigDtm configDtm = config.getDtmConfigurations().get(model); + configFile = configDtm.getConfigFile(config.getDataDirectory()); + } + + @Override + public void run() throws Exception { + if (!configFile.exists()) + throw new Exception("missing model configuration file: " + configFile.getAbsolutePath()); + + final String editor = System.getenv("EDITOR"); + if (editor != null && !editor.isEmpty()) { + Runtime.getRuntime().exec(editor + " " + configFile.getAbsolutePath()); + } else { + Desktop.getDesktop().edit(configFile); + } + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java new file mode 100644 index 0000000000000000000000000000000000000000..a945bf4e022a7046c3071d94d21a66dbc0b05a00 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/CreateModelCommand.java @@ -0,0 +1,61 @@ +package de.vipra.cmd.option; + +import java.io.File; + +import org.apache.commons.io.IOUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; + +import de.vipra.util.Config; +import de.vipra.util.ConfigDtm; +import de.vipra.util.Constants; +import de.vipra.util.FileUtils; + +public class CreateModelCommand implements Command { + + public static final Logger log = LogManager.getLogger(CreateModelCommand.class); + + private final String[] names; + + public CreateModelCommand(final String[] names) { + this.names = names; + } + + @Override + public void run() throws Exception { + if (names.length == 0) + return; + + final Config config = Config.getConfig(); + + final ObjectMapper mapper = new ObjectMapper(); + final String modelConfig; + if (config.getModelConfigTemplate() == null) { + modelConfig = IOUtils.toString(FileUtils.getResource(Constants.MODEL_FILE)); + } else { + modelConfig = mapper.writeValueAsString(config.getModelConfigTemplate()); + } + + mapper.enable(SerializationFeature.INDENT_OUTPUT); + + for (final String name : names) { + if (name.toLowerCase().equals("all")) + throw new Exception("invalid model name: " + name); + final File modelDir = new File(config.getDataDirectory(), name); + if (modelDir.exists()) + throw new Exception("model with that name already exists: " + name); + if (!modelDir.mkdirs()) + throw new Exception("could not create model directory: " + modelDir.getAbsolutePath()); + final File modelConfigFile = new File(modelDir, Constants.MODEL_FILE); + final ConfigDtm configDtm = mapper.readValue(modelConfig, ConfigDtm.class); + configDtm.setName(name); + org.apache.commons.io.FileUtils.write(modelConfigFile, mapper.writeValueAsString(configDtm)); + config.getDtmConfigurations().put(name, configDtm); + log.info("model created: " + name); + } + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteModelCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteModelCommand.java new file mode 100644 index 0000000000000000000000000000000000000000..7d10cf6abafb1c8719ac287962faf12177cdbeec --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteModelCommand.java @@ -0,0 +1,32 @@ +package de.vipra.cmd.option; + +import java.io.File; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import de.vipra.util.Config; + +public class DeleteModelCommand implements Command { + + public static final Logger log = LogManager.getLogger(DeleteModelCommand.class); + + private final String[] names; + + public DeleteModelCommand(final String[] names) { + this.names = names; + } + + @Override + public void run() throws Exception { + final Config config = Config.getConfig(); + for (final String name : names) { + final File modelDir = new File(config.getDataDirectory(), name); + if (modelDir.exists()) { + org.apache.commons.io.FileUtils.deleteDirectory(modelDir); + log.info("model deleted: " + name); + } + } + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 4cc9527af87226834080a2ea29b362e323592286..d24064a9bc5ce1611945603d52bb13edf5a2c937 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -19,12 +19,13 @@ import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import org.json.simple.parser.ParseException; -import de.vipra.cmd.ex.FilebaseException; -import de.vipra.cmd.ex.ProcessorException; import de.vipra.cmd.file.Filebase; +import de.vipra.cmd.file.FilebaseException; import de.vipra.cmd.text.ProcessedText; import de.vipra.cmd.text.Processor; +import de.vipra.cmd.text.ProcessorException; import de.vipra.util.Config; +import de.vipra.util.ConfigDtm; import de.vipra.util.Constants; import de.vipra.util.StringUtils; import de.vipra.util.Timer; @@ -67,8 +68,7 @@ public class ImportCommand implements Command { private final ArticleBuffer buffer; private final Filebase filebase; - public ImportTask(final JSONObject object, final Processor processor, final ArticleBuffer buffer, - final Filebase filebase) { + public ImportTask(final JSONObject object, final Processor processor, final ArticleBuffer buffer, final Filebase filebase) { this.object = object; this.processor = processor; this.buffer = buffer; @@ -96,13 +96,10 @@ public class ImportCommand implements Command { buffer.add(article); // add article to filebase if long enough - boolean imported = false; - if ((imported = processedText.getReducedWordCount() >= Constants.DOCUMENT_MIN_LENGTH)) + if (processedText.getReducedWordCount() >= Constants.DOCUMENT_MIN_LENGTH) filebase.add(article); - log.info("imported \"" + object.get("title") + "\"\r\n â”” text reduction: " - + (processedText.getReductionRatio() * 100) + "%, text length: [" + processedText.getWordCount() - + " -> " + processedText.getReducedWordCount() + "]" + (imported ? "" : " SKIPPED")); + log.info("imported \"" + object.get("title")); } catch (final ProcessorException e) { log.error("could not preprocess text of article '" + article.getTitle() + "'"); } catch (final DatabaseException e) { @@ -130,7 +127,8 @@ public class ImportCommand implements Command { public static final Logger log = LogManager.getLogger(ImportCommand.class); - private final ArrayList<File> files = new ArrayList<>(); + private final ConfigDtm configDtm; + private final List<File> files = new ArrayList<>(); private final JSONParser parser = new JSONParser(); private Config config; private MongoService<ArticleFull, ObjectId> dbArticles; @@ -145,10 +143,11 @@ public class ImportCommand implements Command { * command. * * @param paths - * Paths to all *.json files containing artiles or folders + * Paths to all *.json files containing articles or folders * containing *.json files. Not recursive. */ - public ImportCommand(final String[] paths) { + public ImportCommand(final ConfigDtm configDtm, final String[] paths) { + this.configDtm = configDtm; addPaths(paths); } @@ -228,18 +227,18 @@ public class ImportCommand implements Command { } @Override - public void run() throws IOException, ConfigException, FilebaseException, ParseException, DatabaseException, - InterruptedException { + public void run() throws IOException, ConfigException, FilebaseException, ParseException, DatabaseException, InterruptedException, + java.text.ParseException { final int threadCount = Runtime.getRuntime().availableProcessors() * 10; + config = Config.getConfig(); dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); - filebase = Filebase.getFilebase(config); - processor = Processor.getProcessor(); + processor = new Processor(); buffer = new ArticleBuffer(dbArticles); executor = Executors.newFixedThreadPool(threadCount); + filebase = new Filebase(configDtm, config.getDataDirectory()); log.info("using data directory: " + config.getDataDirectory().getAbsolutePath()); - log.info("using preprocessor: " + processor.getName()); log.info("using " + threadCount + " " + StringUtils.quantity(threadCount, "thread")); final Timer timer = new Timer(); @@ -259,7 +258,7 @@ public class ImportCommand implements Command { * write filebase */ log.info("writing file index"); - filebase.close(); + filebase.sync(); timer.lap("filebase write"); /* diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/IndexingCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/IndexingCommand.java index 6efae559bce826c8c89548955889a11054b9d4a7..f49c9a970c2db951bb7be863584ff2ce84bd5dd5 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/IndexingCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/IndexingCommand.java @@ -1,6 +1,5 @@ package de.vipra.cmd.option; -import java.util.Iterator; import java.util.Map; import org.apache.logging.log4j.LogManager; @@ -8,9 +7,10 @@ import org.apache.logging.log4j.Logger; import org.bson.types.ObjectId; import org.elasticsearch.client.Client; -import de.vipra.cmd.file.Filebase; -import de.vipra.cmd.file.FilebaseIndex; +import de.vipra.cmd.file.FilebaseIDDateIndex; +import de.vipra.cmd.file.FilebaseIDDateIndexEntry; import de.vipra.util.Config; +import de.vipra.util.ConfigDtm; import de.vipra.util.ESClient; import de.vipra.util.ESSerializer; import de.vipra.util.MongoUtils; @@ -21,32 +21,38 @@ public class IndexingCommand implements Command { public static final Logger log = LogManager.getLogger(IndexingCommand.class); + private final ConfigDtm configDtm; + + public IndexingCommand(final ConfigDtm configDtm) { + this.configDtm = configDtm; + } + @Override public void run() throws Exception { + // TODO use configDtm final Config config = Config.getConfig(); - final MongoService<ArticleFull, ObjectId> dbArticles = MongoService.getDatabaseService(config, - ArticleFull.class); - final Filebase filebase = Filebase.getFilebase(config); - final FilebaseIndex index = filebase.getIndex(); + final MongoService<ArticleFull, ObjectId> dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); + final FilebaseIDDateIndex index = new FilebaseIDDateIndex(configDtm.getModelDir(config.getDataDirectory())); final Client elasticClient = ESClient.getClient(config); final ESSerializer<ArticleFull> elasticSerializer = new ESSerializer<>(ArticleFull.class); + final String indexName = configDtm.getName() + "-articles"; + // clear index - elasticClient.admin().indices().prepareDelete("_all").get(); + // elasticClient.admin().indices().prepareDelete("_all").get(); + elasticClient.admin().indices().prepareDelete(indexName).get(); - final Iterator<String> indexIter = index.iterator(); - while (indexIter.hasNext()) { + for (final FilebaseIDDateIndexEntry entry : index) { // get article from database - final String id = indexIter.next(); - final ArticleFull article = dbArticles.getSingle(MongoUtils.objectId(id), true); + final ArticleFull article = dbArticles.getSingle(MongoUtils.objectId(entry.getId()), true); if (article == null) { - log.error("no article found in db for id " + id); + log.error("no article found in db for id " + entry.getId()); continue; } // index article final Map<String, Object> source = elasticSerializer.serialize(article); - elasticClient.prepareIndex("articles", "article", article.getId().toString()).setSource(source).get(); + elasticClient.prepareIndex(indexName, "article", article.getId().toString()).setSource(source).get(); } elasticClient.close(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ListModelsCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ListModelsCommand.java new file mode 100644 index 0000000000000000000000000000000000000000..eeb95c6459ab7127dca33affd6105cc27d247ec0 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ListModelsCommand.java @@ -0,0 +1,23 @@ +package de.vipra.cmd.option; + +import java.util.Map.Entry; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import de.vipra.util.Config; +import de.vipra.util.ConfigDtm; + +public class ListModelsCommand implements Command { + + public static final Logger log = LogManager.getLogger(ListModelsCommand.class); + + @Override + public void run() throws Exception { + log.info("existing models:"); + final Config config = Config.getConfig(); + for (final Entry<String, ConfigDtm> entry : config.getDtmConfigurations().entrySet()) + log.info(" " + entry.getValue().getName()); + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java index 74ae029cba21ccd5be5b9553c24892265fe42c53..4a0f221985378ce1847356840d7dc07e0593dea0 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java @@ -4,7 +4,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import de.vipra.cmd.lda.Analyzer; -import de.vipra.util.Config; +import de.vipra.util.ConfigDtm; import de.vipra.util.StringUtils; import de.vipra.util.Timer; @@ -12,22 +12,20 @@ public class ModelingCommand implements Command { public static final Logger log = LogManager.getLogger(ModelingCommand.class); - private Config config; private Analyzer analyzer; + private ConfigDtm configDtm; private boolean reread; public ModelingCommand() {} - public ModelingCommand(final boolean reread) { + public ModelingCommand(final ConfigDtm configDtm, final boolean reread) { + this.configDtm = configDtm; this.reread = reread; } @Override public void run() throws Exception { - config = Config.getConfig(); - analyzer = Analyzer.getAnalyzer(config); - - log.info("using analyzer: " + analyzer.getName()); + analyzer = new Analyzer(); final Timer timer = new Timer(); timer.restart(); @@ -36,7 +34,7 @@ public class ModelingCommand implements Command { * do topic modeling */ log.info("topic modeling"); - analyzer.analyze(reread); + analyzer.analyze(configDtm, reread); timer.lap("topic modeling"); /* diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java deleted file mode 100644 index 225d13bf5277eddcc52469de73128d4254256004..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java +++ /dev/null @@ -1,38 +0,0 @@ -package de.vipra.cmd.option; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.bson.types.ObjectId; - -import de.vipra.util.Config; -import de.vipra.util.model.Article; -import de.vipra.util.model.Topic; -import de.vipra.util.model.Word; -import de.vipra.util.service.MongoService; - -public class StatsCommand implements Command { - - public static final Logger log = LogManager.getLogger(StatsCommand.class); - - private Config config; - private MongoService<Article, ObjectId> dbArticles; - private MongoService<Topic, ObjectId> dbTopics; - private MongoService<Word, String> dbWords; - - private void stats() { - log.info("# of articles: " + dbArticles.count(null)); - log.info("# of topics : " + dbTopics.count(null)); - log.info("# of words : " + dbWords.count(null)); - } - - @Override - public void run() throws Exception { - config = Config.getConfig(); - dbArticles = MongoService.getDatabaseService(config, Article.class); - dbTopics = MongoService.getDatabaseService(config, Topic.class); - dbWords = MongoService.getDatabaseService(config, Word.class); - - stats(); - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/plugin/ClassNameRegexFilter.java b/vipra-cmd/src/main/java/de/vipra/cmd/plugin/ClassNameRegexFilter.java index b1c618c6787b738fb8892171b499d2f1bd6e0544..76c416a85502e26b38e74bda81f3f6d06c3e0ce0 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/plugin/ClassNameRegexFilter.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/plugin/ClassNameRegexFilter.java @@ -56,8 +56,8 @@ public final class ClassNameRegexFilter extends AbstractFilter { * @return The Log4jRegexFilter. */ @PluginFactory - public static ClassNameRegexFilter createFilter(@PluginAttribute("regex") final String regex, - @PluginAttribute("onMatch") final String match, @PluginAttribute("onMismatch") final String mismatch) { + public static ClassNameRegexFilter createFilter(@PluginAttribute("regex") final String regex, @PluginAttribute("onMatch") final String match, + @PluginAttribute("onMismatch") final String mismatch) { if (regex == null) { LOGGER.error("A regular expression must be provided for RegexFilter"); return null; diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/CoreNLPProcessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/CoreNLPProcessor.java deleted file mode 100644 index 1975adc881bc26d0b787d16b6e1ef8fbfd23bd20..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/CoreNLPProcessor.java +++ /dev/null @@ -1,75 +0,0 @@ -package de.vipra.cmd.text; - -import java.util.List; -import java.util.Properties; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import de.vipra.cmd.ex.ProcessorException; -import de.vipra.util.Constants; -import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.pipeline.Annotation; -import edu.stanford.nlp.pipeline.StanfordCoreNLP; -import edu.stanford.nlp.util.CoreMap; -import edu.stanford.nlp.util.StringUtils; - -public class CoreNLPProcessor extends Processor { - - public static final Logger log = LogManager.getLogger(CoreNLPProcessor.class); - - private final StanfordCoreNLP nlp; - - public CoreNLPProcessor(final List<String> stopWordsList) { - super("Stanford CoreNLP Processor"); - - final Properties props = new Properties(); - props.setProperty("customAnnotatorClass.stopwords", StopwordsAnnotator.class.getCanonicalName()); - props.setProperty("customAnnotatorClass.frequency", FrequencyAnnotator.class.getCanonicalName()); - // tokenize: transform words to tokens - // ssplit: split by and group into sentences - // stopwords: mark stopwords - // frequency: count word frequency - // pos: mark word position - // lemma: lemmatize words - props.setProperty("annotators", "tokenize, ssplit, stopwords, pos, lemma, frequency"); - props.setProperty("stopwords", StringUtils.join(stopWordsList)); - - nlp = new StanfordCoreNLP(props); - } - - @Override - public ProcessedText process(final String input) throws ProcessorException { - final Annotation doc = new Annotation(input.toLowerCase()); - nlp.annotate(doc); - final StringBuilder sb = new StringBuilder(); - long wordCount = 0; - // loop sentences - for (final CoreMap sentence : doc.get(SentencesAnnotation.class)) { - final List<CoreLabel> words = sentence.get(TokensAnnotation.class); - // count words - wordCount += words.size(); - // loop words - for (final CoreLabel word : words) { - // filter out stopwords - final Boolean b = word.get(StopwordsAnnotator.class); - if (b == null || !b) { - // filter out infrequent words - final Long count = word.get(FrequencyAnnotator.class); - if (count != null && count >= Constants.DOCUMENT_MIN_WORD_FREQ) { - final String lemma = word.get(LemmaAnnotation.class); - // collect unique words - sb.append(lemma).append(" "); - } - } - } - } - - final String text = clean(sb.toString()); - return new ProcessedText(text, wordCount); - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java index 78a41473997b01732c3535d855b9e675108cd394..f72b26958421bbf3d7c8ebe231dfcc31b9ea1f0c 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java @@ -20,8 +20,7 @@ public class FrequencyAnnotator implements Annotator, CoreAnnotation<Long> { @Override public void annotate(final Annotation annotation) { final List<CoreLabel> tokens = annotation.get(TokensAnnotation.class); - final Map<String, Long> words = tokens.stream() - .collect(Collectors.groupingBy(p -> p.get(LemmaAnnotation.class), Collectors.counting())); + final Map<String, Long> words = tokens.stream().collect(Collectors.groupingBy(p -> p.get(LemmaAnnotation.class), Collectors.counting())); for (final CoreLabel token : tokens) { token.set(FrequencyAnnotator.class, words.get(token.get(LemmaAnnotation.class))); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java index 54caa942147c2bdf1727c2fd54ce9b70d1e340fc..01e1c45a11ba29f14db50e8373af1f6a7f034e3a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java @@ -1,39 +1,76 @@ package de.vipra.cmd.text; import java.util.List; +import java.util.Properties; -import de.vipra.cmd.ex.ProcessorException; -import de.vipra.util.Constants; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; -public abstract class Processor { +import de.vipra.util.Constants; +import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.StringUtils; - private final String name; +public class Processor { - public Processor(final String name) { - this.name = name; - } + public static final Logger log = LogManager.getLogger(Processor.class); - public String getName() { - return name; - } + private final StanfordCoreNLP nlp; - public abstract ProcessedText process(String input) throws ProcessorException; + public Processor() { + final Properties props = new Properties(); + props.setProperty("customAnnotatorClass.stopwords", StopwordsAnnotator.class.getCanonicalName()); + props.setProperty("customAnnotatorClass.frequency", FrequencyAnnotator.class.getCanonicalName()); + // tokenize: transform words to tokens + // ssplit: split by and group into sentences + // stopwords: mark stopwords + // frequency: count word frequency + // pos: mark word position + // lemma: lemmatize words + props.setProperty("annotators", "tokenize, ssplit, stopwords, pos, lemma, frequency"); + props.setProperty("stopwords", StringUtils.join(Constants.STOPWORDS)); - public static Processor getProcessor() { - final List<String> stopWords = Constants.STOPWORDS; + nlp = new StanfordCoreNLP(props); + } - switch (Constants.PROCESSOR) { - case CORENLP: - return new CoreNLPProcessor(stopWords); - default: - return null; + public ProcessedText process(final String input) throws ProcessorException { + final Annotation doc = new Annotation(input.toLowerCase()); + nlp.annotate(doc); + final StringBuilder sb = new StringBuilder(); + long wordCount = 0; + // loop sentences + for (final CoreMap sentence : doc.get(SentencesAnnotation.class)) { + final List<CoreLabel> words = sentence.get(TokensAnnotation.class); + // count words + wordCount += words.size(); + // loop words + for (final CoreLabel word : words) { + // filter out stopwords + final Boolean b = word.get(StopwordsAnnotator.class); + if (b == null || !b) { + // filter out infrequent words + final Long count = word.get(FrequencyAnnotator.class); + if (count != null && count >= Constants.DOCUMENT_MIN_WORD_FREQ) { + final String lemma = word.get(LemmaAnnotation.class); + // collect unique words + sb.append(lemma).append(" "); + } + } + } } + + final String text = clean(sb.toString()); + return new ProcessedText(text, wordCount); } public static String clean(final String in) { - return in.replaceAll(Constants.REGEX_EMAIL, "").replaceAll(Constants.REGEX_URL, "") - .replaceAll(Constants.REGEX_NUMBER, "").replaceAll(Constants.CHARS_DISALLOWED, "") - .replaceAll(Constants.REGEX_SINGLECHAR, "").replaceAll("\\s+", " ").trim(); + return in.replaceAll(Constants.REGEX_EMAIL, "").replaceAll(Constants.REGEX_URL, "").replaceAll(Constants.REGEX_NUMBER, "") + .replaceAll(Constants.CHARS_DISALLOWED, "").replaceAll(Constants.REGEX_SINGLECHAR, "").replaceAll("\\s+", " ").trim(); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/ex/ProcessorException.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessorException.java similarity index 89% rename from vipra-cmd/src/main/java/de/vipra/cmd/ex/ProcessorException.java rename to vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessorException.java index 13f1be81d6e3301038bf26669ef1ee93c1fe50ff..68ac876e53056fca54d198a16b550d5574d9ce69 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/ex/ProcessorException.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessorException.java @@ -1,4 +1,4 @@ -package de.vipra.cmd.ex; +package de.vipra.cmd.text; public class ProcessorException extends Exception { diff --git a/vipra-cmd/src/main/resources/config.json b/vipra-cmd/src/main/resources/config.json new file mode 100644 index 0000000000000000000000000000000000000000..83de6f0965f4377c44b8c130d9d9ece9440dc29a --- /dev/null +++ b/vipra-cmd/src/main/resources/config.json @@ -0,0 +1,20 @@ +{ + "dataDirectory": "", + "databaseHost": "127.0.0.1", + "databasePort": 27017, + "databaseName": "test", + "elasticSearchHost": "127.0.0.1", + "elasticSearchPort": 9300, + "spotlightUrl": "", + "dtmPath": "", + "defaultAllModels": false, + "modelConfigTemplate": { + "name": "", + "kTopics": 20, + "dynamicMinIterations": 100, + "dynamicMaxIterations": 1000, + "staticIterations": 100, + "windowResolution": "YEAR", + "processorMode": "TEXT" + } +} \ No newline at end of file diff --git a/vipra-cmd/src/main/resources/config.properties b/vipra-cmd/src/main/resources/config.properties deleted file mode 100644 index b7f21bc6797a242a07e4b325351608fc9b55179c..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/resources/config.properties +++ /dev/null @@ -1,6 +0,0 @@ -db.host=localhost -db.port=27017 -db.name=test -es.host=localhost -es.port=9300 -tm.dtmpath=/home/eike/repos/master/ma-impl/dtm_release/dtm/main \ No newline at end of file diff --git a/vipra-cmd/src/main/resources/model.json b/vipra-cmd/src/main/resources/model.json new file mode 100644 index 0000000000000000000000000000000000000000..628e3533c7d28c35d9f53941d70e24f3c3621738 --- /dev/null +++ b/vipra-cmd/src/main/resources/model.json @@ -0,0 +1,9 @@ +{ + "name": "", + "kTopics": 20, + "dynamicMinIterations": 100, + "dynamicMaxIterations": 1000, + "staticIterations": 100, + "windowResolution": "YEAR", + "processorMode": "TEXT" +} \ No newline at end of file diff --git a/vipra-ui/app/html/network.html b/vipra-ui/app/html/network.html index 98fe5c76870cbd90dd58ba3fae7fdf1649c80a82..8f75e4951ecc3977b1620c5316665058d31ec3a5 100644 --- a/vipra-ui/app/html/network.html +++ b/vipra-ui/app/html/network.html @@ -9,10 +9,6 @@ <input type="checkbox" id="showTopics" ng-model="shown.topics" ng-disabled="type == 'topics'"> <label for="showTopics" style="color:{{colors.topics}}">Topics</label> </div> - <div class="checkbox"> - <input type="checkbox" id="showWords" ng-model="shown.words" ng-disabled="type == 'words'"> - <label for="showWords" style="color:{{colors.words}}">Words</label> - </div> </div> <div class="fullsize" id="visgraph"></div> </div> diff --git a/vipra-ui/app/js/controllers.js b/vipra-ui/app/js/controllers.js index 9c460c9da4dc6aba1b2666692516af8551276463..58d8fd9ee96ee7dbaedc613a261f12c69a6acbe5 100644 --- a/vipra-ui/app/js/controllers.js +++ b/vipra-ui/app/js/controllers.js @@ -2,7 +2,7 @@ * Vipra Application * Controllers ******************************************************************************/ -/* globals angular, Vipra, moment, vis, console, $, prompt, randomColor, Highcharts */ +/* globals angular, Vipra, moment, vis, console, prompt, randomColor, Highcharts */ (function() { "use strict"; @@ -19,8 +19,8 @@ /** * Index controller */ - app.controller('IndexController', ['$scope', '$location', 'ArticleFactory', 'TopicFactory', 'WordFactory', 'SearchFactory', - function($scope, $location, ArticleFactory, TopicFactory, WordFactory, SearchFactory) { + app.controller('IndexController', ['$scope', '$location', 'ArticleFactory', 'TopicFactory', 'SearchFactory', + function($scope, $location, ArticleFactory, TopicFactory, SearchFactory) { $scope.search = $location.search().query; @@ -85,8 +85,8 @@ /** * Network controller */ - app.controller('NetworkController', ['$scope', '$state', '$stateParams', '$timeout', 'ArticleFactory', 'TopicFactory', 'WordFactory', - function($scope, $state, $stateParams, $timeout, ArticleFactory, TopicFactory, WordFactory) { + app.controller('NetworkController', ['$scope', '$state', '$stateParams', '$timeout', 'ArticleFactory', 'TopicFactory', + function($scope, $state, $stateParams, $timeout, ArticleFactory, TopicFactory) { var id = 0, ids = {}, @@ -94,8 +94,7 @@ $scope.colors = { articles: '#BBC9D2', - topics: '#DBB234', - words: '#547C65' + topics: '#DBB234' }; $scope.nodes = new vis.DataSet(); $scope.edges = new vis.DataSet(); @@ -125,8 +124,7 @@ }; $scope.shown = { articles: true, - topics: true, - words: true + topics: true }; var factory; @@ -134,8 +132,6 @@ factory = ArticleFactory; else if ($stateParams.type === 'topics') factory = TopicFactory; - else if ($stateParams.type === 'words') - factory = WordFactory; else { console.log('unknown network type'); return; @@ -148,8 +144,6 @@ $scope.nodes.add([articleNode(data)]); else if ($stateParams.type === 'topics') $scope.nodes.add([topicNode(data)]); - else if ($stateParams.type === 'words') - $scope.nodes.add([wordNode(data)]); ids[data.id] = id; // create graph @@ -185,10 +179,6 @@ return newNode(article.title, 'article', 'articles.show', article.id, $scope.colors.articles, 'square'); }; - var wordNode = function(word) { - return newNode(word.id, 'word', 'words.show', word.id, $scope.colors.words); - }; - var edgeExists = function(idA, idB) { if (idB < idA) { var tmp = idA; @@ -253,15 +243,7 @@ $scope.errors = err; }); } else if (node.type === 'topic') { - // node is topic, load topic to get words and articles - if ($scope.shown.words) - TopicFactory.get({ - id: node.dbid - }, function(data) { - constructor(data.words, node, wordNode); - }, function(err) { - $scope.errors = err; - }); + // node is topic, load topic to get articles if ($scope.shown.articles) TopicFactory.articles({ id: node.dbid @@ -270,15 +252,6 @@ }, function(err) { $scope.errors = err; }); - } else if (node.type === 'word' && $scope.shown.topics) { - // node is word, load word to get topics - WordFactory.topics({ - id: node.dbid - }, function(data) { - constructor(data, node, topicNode); - }, function(err) { - $scope.errors = err; - }); } $scope.nodes.update(node); } diff --git a/vipra-ui/app/js/factories.js b/vipra-ui/app/js/factories.js index 55742067640b97be968a89604692dadabe956ccc..b65bb073cffe1274df9eeab01974b3cb3eac119c 100644 --- a/vipra-ui/app/js/factories.js +++ b/vipra-ui/app/js/factories.js @@ -22,12 +22,6 @@ }); }]); - app.factory('WordFactory', ['$resource', function($resource) { - return $resource(Vipra.config.restUrl + '/words/:id', {}, { - topics: { isArray: true, url: Vipra.config.restUrl + '/words/:id/topics' } - }); - }]); - app.factory('SequenceFactory', ['$resource', function($resource) { return $resource(Vipra.config.restUrl + '/sequences/:id'); }]); diff --git a/vipra-util/.settings/org.eclipse.jdt.core.prefs b/vipra-util/.settings/org.eclipse.jdt.core.prefs index 84a81ceba42c0cea2774a52031b93584974a6b42..db79ce67f21727b653c0d5d9d9870e7b2f877f21 100644 --- a/vipra-util/.settings/org.eclipse.jdt.core.prefs +++ b/vipra-util/.settings/org.eclipse.jdt.core.prefs @@ -281,7 +281,7 @@ org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line=false org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line=false org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line=false org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line=false -org.eclipse.jdt.core.formatter.lineSplit=120 +org.eclipse.jdt.core.formatter.lineSplit=150 org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column=false org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column=false org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body=0 diff --git a/vipra-util/src/main/java/de/vipra/util/Config.java b/vipra-util/src/main/java/de/vipra/util/Config.java index 490ded1b5b8f794b44a453704ceeef8aa778edb1..53061973446b459984208f07ed83944a6159cb3c 100644 --- a/vipra-util/src/main/java/de/vipra/util/Config.java +++ b/vipra-util/src/main/java/de/vipra/util/Config.java @@ -3,194 +3,122 @@ package de.vipra.util; import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; -import java.io.PrintWriter; -import java.lang.reflect.Field; -import java.lang.reflect.Modifier; import java.util.HashMap; import java.util.Map; -import java.util.Map.Entry; -import java.util.Properties; -import java.util.Set; +import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import de.vipra.util.an.ConfigKey; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.databind.ObjectMapper; + import de.vipra.util.ex.ConfigException; import de.vipra.util.model.Model; import de.vipra.util.service.MongoService; -/** - * Vipra configuration - */ public class Config { - /* - * Configuration keys - */ + public static final Logger log = LoggerFactory.getLogger(Config.class); - /** - * The database host of the application database. - */ - @ConfigKey("db.host") - public String databaseHost = Constants.DATABASE_HOST; + private static Config instance; - /** - * The database port of the application database. - */ - @ConfigKey("db.port") - public int databasePort = Constants.DATABASE_PORT; + private String dataDirectory = getGenericDataDir().getAbsolutePath(); + private String databaseHost = Constants.DATABASE_HOST; + private int databasePort = Constants.DATABASE_PORT; + private String databaseName = Constants.DATABASE_NAME; + private String elasticSearchHost = Constants.ES_HOST; + private int elasticSearchPort = Constants.ES_PORT; + private boolean defaultAllModels = false; + private ConfigDtm modelConfigTemplate = new ConfigDtm(); + private String spotlightUrl; + private String dtmPath; - /** - * The database name of the application database. Can be an arbitrary value. - */ - @ConfigKey("db.name") - public String databaseName = Constants.DATABASE_NAME; + @JsonIgnore + private Map<String, ConfigDtm> dtmConfigurations; - /** - * ElasticSearch host - */ - @ConfigKey("es.host") - public String elasticsearchHost = Constants.ES_HOST; + public String getDatabaseHost() { + return databaseHost; + } - /** - * ElasticSearch connection port - */ - @ConfigKey("es.port") - public int elasticsearchPort = Constants.ES_PORT; + public void setDatabaseHost(final String databaseHost) { + this.databaseHost = databaseHost; + } - /** - * Path to the dtm executable. If using dtm as the anaylyzer, this path must - * be set to the dtm executable. - */ - @ConfigKey("tm.dtmpath") - public String dtmPath = ""; + public int getDatabasePort() { + return databasePort; + } - /* - * Configuration reader - */ + public void setDatabasePort(final int databasePort) { + this.databasePort = databasePort; + } - public static final Logger log = LoggerFactory.getLogger(Config.class); - private static Config config; - private static Set<Entry<String, Field>> fields; - private static int printMaxFieldNameLength = 0; + public String getDatabaseName() { + return databaseName; + } - static { - final Map<String, Field> foundFields = new HashMap<>(); + public void setDatabaseName(final String databaseName) { + this.databaseName = databaseName; + } - for (final Field field : Config.class.getDeclaredFields()) { - final int modifiers = field.getModifiers(); - if (Modifier.isFinal(modifiers)) - continue; + public String getElasticSearchHost() { + return elasticSearchHost; + } - if (!field.isAccessible()) - field.setAccessible(true); + public void setElasticSearchHost(final String elasticSearchHost) { + this.elasticSearchHost = elasticSearchHost; + } - final ConfigKey ck = field.getDeclaredAnnotation(ConfigKey.class); - if (ck == null) - continue; + public int getElasticSearchPort() { + return elasticSearchPort; + } - String name = ck.value(); - if (name == null || name.isEmpty()) { - name = field.getName(); - } + public void setElasticSearchPort(final int elasticSearchPort) { + this.elasticSearchPort = elasticSearchPort; + } - if (name.length() > printMaxFieldNameLength) - printMaxFieldNameLength = name.length(); + public String getSpotlightUrl() { + return spotlightUrl; + } - foundFields.put(name, field); - } + public void setSpotlightUrl(final String spotlightUrl) { + this.spotlightUrl = spotlightUrl; + } - fields = foundFields.entrySet(); + public String getDtmPath() { + return dtmPath; } - private final Properties props = new Properties(); + public void setDtmPath(final String dtmPath) { + this.dtmPath = dtmPath; + } - @SuppressWarnings({ "unchecked", "rawtypes" }) - private Config() throws IOException, ConfigException { - InputStream in = null; + public Map<String, ConfigDtm> getDtmConfigurations() { + return dtmConfigurations; + } - // config from environment - final String configPath = System.getenv("VIPRA_CONFIG"); - if (configPath != null && configPath.length() > 0) { - final File file = new File(configPath); - if (file.exists() && file.isFile()) { - in = org.apache.commons.io.FileUtils.openInputStream(file); - } - } + public void setDtmConfigurations(final Map<String, ConfigDtm> dtmConfigurations) { + this.dtmConfigurations = dtmConfigurations; + } - // config from generic config dir - final File configDir = getGenericConfigDir(); - if (configDir != null && configDir.exists() && configDir.isDirectory()) { - final File file = new File(configDir, Constants.CONFIG_FILE); - if (file.exists() && file.isFile()) { - in = org.apache.commons.io.FileUtils.openInputStream(file); - } - } + public void setDataDirectory(final String dataDirectory) { + this.dataDirectory = dataDirectory; + } - // config from source - if (in == null) { - in = FileUtils.getResource(Constants.CONFIG_FILE); - } + public boolean isDefaultAllModels() { + return defaultAllModels; + } - // load config - if (in == null) { - log.error("config file input stream is null"); - throw new ConfigException("config file input stream is null"); - } else { - props.load(in); - } + public void setDefaultAllModels(final boolean defaultAllModels) { + this.defaultAllModels = defaultAllModels; + } - // read values - for (final Entry<String, Field> entry : fields) { - final String value = props.getProperty(entry.getKey()); - if (value != null) { - Object parsedValue = null; - try { - final Class<?> clazz = entry.getValue().getType(); - if (clazz == String.class) { - parsedValue = value; - } else if (clazz == Boolean.class || clazz == Boolean.TYPE) { - // boolean - parsedValue = Boolean.parseBoolean(value); - } else if (clazz == Character.class || clazz == Character.TYPE) { - // char - if (value.length() == 1) - parsedValue = value.charAt(0); - } else if (clazz == Byte.class || clazz == Byte.TYPE) { - // byte - parsedValue = Byte.parseByte(value); - } else if (clazz == Short.class || clazz == Short.TYPE) { - // short - parsedValue = Short.parseShort(value); - } else if (clazz == Integer.class || clazz == Integer.TYPE) { - // int - parsedValue = Integer.parseInt(value); - } else if (clazz == Long.class || clazz == Long.TYPE) { - // long - parsedValue = Long.parseLong(value); - } else if (clazz == Float.class || clazz == Float.TYPE) { - // float - parsedValue = Float.parseFloat(value); - } else if (clazz == Double.class || clazz == Double.TYPE) { - // double - parsedValue = Double.parseDouble(value); - } else if (Enum.class.isAssignableFrom(clazz)) { - // enum - parsedValue = EnumTools.searchEnum((Class<Enum>) entry.getValue().getType(), value); - } else { - // something else - log.warn("unrecognized config value type: " + clazz); - continue; - } + public ConfigDtm getModelConfigTemplate() { + return modelConfigTemplate; + } - entry.getValue().set(this, parsedValue); - } catch (final Exception e) { - log.error("could not read config value " + entry.getKey(), e); - } - } - } + public void setModelConfigTemplate(final ConfigDtm configDtmTemplate) { + modelConfigTemplate = configDtmTemplate; } /** @@ -201,7 +129,11 @@ public class Config { * @throws ConfigException */ public File getDataDirectory() throws ConfigException { - final File dataDir = getGenericDataDir(); + final File dataDir; + if (dataDirectory != null && !dataDirectory.isEmpty()) + dataDir = new File(dataDirectory); + else + dataDir = getGenericDataDir(); if (!dataDir.exists()) { if (!dataDir.mkdirs()) { @@ -212,6 +144,26 @@ public class Config { return dataDir; } + /** + * Returns a generic data directory, if none is configured. + * + * @return generic data directory + */ + public static File getGenericDataDir() { + final File base = PathUtils.appDataDir(); + return new File(base, Constants.FILEBASE_DIR); + } + + /** + * Returns a generic config directory, if none is configured. + * + * @return generic config directory + */ + public static File getGenericConfigDir() { + final File base = PathUtils.appConfigDir(); + return new File(base, Constants.FILEBASE_DIR); + } + /** * Returns a representation of the used mongodb connection * @@ -230,84 +182,76 @@ public class Config { * @return the database service * @throws ConfigException */ - public <Type extends Model<IdType>, IdType> MongoService<Type, IdType> getDatabaseService(final Class<Type> clazz) - throws ConfigException { + public <Type extends Model<IdType>, IdType> MongoService<Type, IdType> getDatabaseService(final Class<Type> clazz) throws ConfigException { return MongoService.getDatabaseService(this, clazz); } - /** - * Prints out the current configuration values - * - * @param out - * OutputStream to be used for printing. Usually System.out - * @param prefix - * Line prefix, appended before each printed line - * @param separator - * The separator between the key and the value - * @param pad - * set to true to pad values to alignment. Aligns to the longest - * of keys - * @param padRight - * set to true to pad after the key, false to pad before the key, - * if enabled - * @param padChar - * the pad character to be used for padding, if enabled - */ - public void print(final OutputStream out, final String prefix, final String separator, final boolean pad, - final boolean padRight, final char padChar) { - final PrintWriter pw = new PrintWriter(out); - final String padding = padChar + ""; - for (final Entry<String, Field> e : fields) { + public static Config getConfig() throws ConfigException { + if (instance == null) { try { - String key = e.getKey() + separator; - if (pad) { - final int diff = printMaxFieldNameLength - e.getKey().length(); - if (diff > 0) { - if (padRight) - key = e.getKey() + StringUtils.repeat(padding, diff) + separator; - else - key = StringUtils.repeat(padding, diff) + e.getKey() + separator; + InputStream in = null; + + // config from environment + final String configPath = System.getenv("VIPRA_CONFIG"); + if (configPath != null && configPath.length() > 0) { + final File file = new File(configPath); + if (file.exists() && file.isFile()) { + in = org.apache.commons.io.FileUtils.openInputStream(file); } } - pw.println(prefix + key + e.getValue().get(this)); - } catch (IllegalArgumentException | IllegalAccessException e1) {} - } - pw.flush(); - } - /** - * Returns a generic data directory, if none is configured. - * - * @return generic data directory - */ - public static File getGenericDataDir() { - final File base = PathUtils.appDataDir(); - return new File(base, Constants.FILEBASE_DIR); - } + // config from generic config dir + final File configDir = getGenericConfigDir(); + if (!configDir.exists()) { + configDir.mkdirs(); + } + final File configFile = new File(configDir, Constants.CONFIG_FILE); + if (configDir != null && configDir.exists() && configDir.isDirectory() && configFile.exists() && configFile.isFile()) + in = org.apache.commons.io.FileUtils.openInputStream(configFile); - /** - * Returns a generic config directory, if none is configured. - * - * @return generic config directory - */ - public static File getGenericConfigDir() { - final File base = PathUtils.appConfigDir(); - return new File(base, Constants.FILEBASE_DIR); - } + // config from source + if (in == null) { + in = FileUtils.getResource(Constants.CONFIG_FILE); + } - /** - * Config class is a singleton. This method returns its instantiation. - * - * @return config object instantiation - * @throws IOException - * if reading the config file fails - * @throws ConfigException - */ - public static Config getConfig() throws IOException, ConfigException { - if (config == null) { - config = new Config(); + if (in == null) { + log.error("config file input stream is null"); + throw new ConfigException("config file input stream is null"); + } + + // load config + final String config = IOUtils.toString(in); + if (configDir.exists() && !configFile.exists()) + org.apache.commons.io.FileUtils.write(configFile, config); + + final ObjectMapper mapper = new ObjectMapper(); + instance = mapper.readValue(config, Config.class); + + if (instance == null) + throw new ConfigException("could not read configuration"); + + // read model configurations + final File dataDir = instance.getDataDirectory(); + final Map<String, ConfigDtm> modelConfigs = new HashMap<>(); + for (final File file : dataDir.listFiles()) { + if (file.isDirectory()) { + final File modelConfigFile = new File(file, Constants.MODEL_FILE); + if (!modelConfigFile.exists()) + throw new ConfigException("missing model configuration file: " + modelConfigFile.getAbsolutePath()); + final ConfigDtm configDtm = mapper.readValue(modelConfigFile, ConfigDtm.class); + if (configDtm.getName() == null || configDtm.getName().isEmpty()) + throw new ConfigException("models must have a name: " + modelConfigFile.getAbsolutePath()); + modelConfigs.put(configDtm.getName(), configDtm); + } + } + instance.dtmConfigurations = modelConfigs; + + } catch (final IOException e) { + throw new ConfigException(e); + } } - return config; + + return instance; } } diff --git a/vipra-util/src/main/java/de/vipra/util/ConfigDtm.java b/vipra-util/src/main/java/de/vipra/util/ConfigDtm.java new file mode 100644 index 0000000000000000000000000000000000000000..19b14faf93b527a1872a67460c54221f0ca60666 --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/ConfigDtm.java @@ -0,0 +1,65 @@ +package de.vipra.util; + +import java.io.File; + +import de.vipra.util.Constants.ProcessorMode; +import de.vipra.util.Constants.WindowResolution; + +public class ConfigDtm { + + private String name; + private final int kTopics = Constants.K_TOPICS; + private final int dynamicMinIterations = Constants.DYNAMIC_MIN_ITER; + private final int dynamicMaxIterations = Constants.DYNAMIC_MAX_ITER; + private final int staticIterations = Constants.STATIC_ITER; + private final WindowResolution windowResolution = Constants.WINDOW_RESOLUTION; + private final ProcessorMode processorMode = Constants.PROCESSOR_MODE; + + public String getName() { + return name; + } + + public void setName(final String name) { + this.name = name; + } + + public int getkTopics() { + return kTopics; + } + + public int getDynamicMinIterations() { + return dynamicMinIterations; + } + + public int getDynamicMaxIterations() { + return dynamicMaxIterations; + } + + public int getStaticIterations() { + return staticIterations; + } + + public WindowResolution getWindowResolution() { + return windowResolution; + } + + public ProcessorMode getProcessorMode() { + return processorMode; + } + + public File getModelDir(final File dataDir) { + return new File(dataDir, name); + } + + public File getConfigFile(final File dataDir) { + return new File(getModelDir(dataDir), Constants.MODEL_FILE); + } + + @Override + public String toString() { + return "ConfigDtm [name=" + name + ", kTopics=" + kTopics + ", dynamicMinIterations=" + dynamicMinIterations + ", dynamicMaxIterations=" + + dynamicMaxIterations + ", staticIterations=" + staticIterations + ", windowResolution=" + windowResolution + ", processorMode=" + + processorMode + "]"; + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java b/vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java deleted file mode 100644 index f24878cde607405dbc7a6ebf2f67ec318a374961..0000000000000000000000000000000000000000 --- a/vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java +++ /dev/null @@ -1,125 +0,0 @@ -package de.vipra.util; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.Arrays; -import java.util.LinkedHashSet; -import java.util.Set; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class ConsoleUtils { - - public static final Logger log = LoggerFactory.getLogger(ConsoleUtils.class); - - /** - * Choice enum. Describes choices that can be prompted - */ - public static enum Choice { - ABORT("[a]bort", "a"), - CONTINUE("[c]ontinue", "c"), - RETRY("[r]etry", "r"); - - public final String choice; - public final String shortChoice; - - Choice(final String choice, final String shortChoice) { - this.choice = choice; - this.shortChoice = shortChoice; - } - - public static Choice fromString(final String text) { - if (text != null) { - for (final Choice b : Choice.values()) { - if (text.equalsIgnoreCase(b.choice) || text.equalsIgnoreCase(b.shortChoice)) { - return b; - } - } - } - return null; - } - - @Override - public String toString() { - return choice; - } - } - - /** - * Read a line from the console - * - * @return read line, without newline character - */ - public static String readLine() { - final BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); - try { - return in.readLine(); - } catch (final IOException e) { - log.error("io error while reading line from console: " + e.getMessage()); - return ""; - } - } - - /** - * Request user confirmation - * - * @param confirm - * the confirm string, has to be typed in by the user - * @return wether user input matches confirm string - */ - public static boolean confirm(final String confirm) { - System.out.print("> "); - final String in = readLine().toLowerCase().trim(); - return in.equals(confirm); - } - - /** - * Promt a user choice from multiple options - * - * @param choice - * default choice - * @param acceptDefault - * immediately return with default choice - * @param choices - * all choices that can be selected - * @return the selected choice - */ - public static String prompt(final String choice, final boolean acceptDefault, final String... choices) { - if (acceptDefault && choice != null) - return choice; - final Set<String> set = new LinkedHashSet<>(Arrays.asList(choices)); - if (choice != null) - set.add(choice); - String msg = "(" + StringUtils.join(set, ",") + "): "; - if (choice != null) { - msg = "[" + choice + "] " + msg; - } - System.out.print(msg); - final String in = readLine().toLowerCase().trim(); - if (set.contains(in)) { - return in; - } - if (choice != null && in.length() == 0) - return choice; - return null; - } - - /** - * Promt a user choice from multiple options - * - * @param choice - * default choice - * @param acceptDefault - * immediately return with default choice - * @param choices - * all choices that can be selected - * @return the selected choice - */ - public static Choice prompt(final Choice choice, final boolean acceptDefault, final Choice... choices) { - return Choice.fromString( - prompt(choice != null ? choice.choice : null, acceptDefault, ListUtils.toStringArray(choices))); - } - -} diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 4c03608ca34f485db591b732dd43cd3162bd9329..f526c9788bef0728b739026976e0214732bd3b3b 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -29,13 +29,14 @@ public class Constants { * FILES */ - public static final String CONFIG_FILE = "config.properties"; + public static final String CONFIG_FILE = "config.json"; + public static final String MODEL_FILE = "model.json"; /* * DATABASE */ - public static final String DATABASE_HOST = "localhost"; + public static final String DATABASE_HOST = "127.0.0.1"; public static final int DATABASE_PORT = 27017; public static final String DATABASE_NAME = "test"; @@ -136,18 +137,6 @@ public class Constants { */ public static final int DOCUMENT_MIN_LENGTH = 10; - /** - * The text processor to be used. To find a list of available values, - * {@link de.vipra.util.Constants.Processor}. - */ - public static final Processor PROCESSOR = Processor.CORENLP; - - /** - * The topic modeling analyzer to be used. To find a list of available - * analyzers, {@link de.vipra.util.Constants.Analyzer}. - */ - public static final Analyzer ANALYZER = Analyzer.DTM; - /** * The dynamic topic modeling window resolution to be used. This value is * only used, if the selected analyzer supports dynamic topic modeling. To @@ -156,96 +145,84 @@ public class Constants { */ public static final WindowResolution WINDOW_RESOLUTION = WindowResolution.YEAR; + /** + * The processor mode defines the processed text output. In text mode, the + * text is trimmed down, in entity mode, the text is scanned for entities. + * In mixed mode, the found entities are inserted into the trimmed text. + */ + public static final ProcessorMode PROCESSOR_MODE = ProcessorMode.TEXT; + /** * Stopwords list. Extensive list of stopwords used to clean imported * articles of the most common words before topic modeling is applied. */ - public static final List<String> STOPWORDS = Arrays.asList("'ll", "'ve", "a", "a's", "able", "about", "above", - "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", - "affecting", "affects", "after", "afterwards", "again", "against", "ah", "ain't", "aint", "all", "allow", - "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", - "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", - "anything", "anyway", "anyways", "anywhere", "apart", "apparently", "appear", "appreciate", "appropriate", - "approximately", "are", "area", "areas", "aren", "aren't", "arent", "arise", "around", "as", "aside", "ask", - "asked", "asking", "asks", "associated", "at", "auth", "available", "away", "awfully", "b", "back", - "backed", "backing", "backs", "be", "became", "because", "become", "becomes", "becoming", "been", "before", - "beforehand", "began", "begin", "beginning", "beginnings", "begins", "behind", "being", "beings", "believe", - "below", "beside", "besides", "best", "better", "between", "beyond", "big", "bill", "biol", "both", - "bottom", "brief", "briefly", "but", "by", "c", "c'mon", "c's", "ca", "call", "came", "can", "can't", - "cannot", "cant", "case", "cases", "cause", "causes", "certain", "certainly", "changes", "clear", "clearly", - "cmon", "co", "com", "come", "comes", "computer", "con", "concerning", "consequently", "consider", - "considering", "contain", "containing", "contains", "corresponding", "could", "couldn't", "couldnt", - "course", "cry", "cs", "currently", "d", "date", "de", "definitely", "describe", "described", "despite", - "detail", "did", "didn't", "didnt", "differ", "different", "differently", "do", "does", "doesn't", "doesnt", - "doing", "don't", "done", "dont", "down", "downed", "downing", "downs", "downwards", "due", "during", "e", - "each", "early", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "eleven", "else", "elsewhere", - "empty", "end", "ended", "ending", "ends", "enough", "entirely", "especially", "et", "et-al", "etc", "even", - "evenly", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", - "except", "f", "face", "faces", "fact", "facts", "far", "felt", "few", "ff", "fifteen", "fifth", "fify", - "fill", "find", "finds", "fire", "first", "five", "fix", "followed", "following", "follows", "for", - "former", "formerly", "forth", "forty", "found", "four", "from", "front", "full", "fully", "further", - "furthered", "furthering", "furthermore", "furthers", "g", "gave", "general", "generally", "get", "gets", - "getting", "give", "given", "gives", "giving", "go", "goes", "going", "gone", "good", "goods", "got", - "gotten", "great", "greater", "greatest", "greetings", "group", "grouped", "grouping", "groups", "h", "had", - "hadn't", "hadnt", "happens", "hardly", "has", "hasn't", "hasnt", "have", "haven't", "havent", "having", - "he", "he'd", "he'll", "he's", "hed", "hello", "help", "hence", "her", "here", "here's", "hereafter", - "hereby", "herein", "heres", "hereupon", "hers", "herse", "herself", "hes", "hi", "hid", "high", "higher", - "highest", "him", "himse", "himself", "his", "hither", "home", "hopefully", "how", "how's", "howbeit", - "however", "hundred", "i", "i'd", "i'll", "i'm", "i've", "id", "ie", "if", "ignored", "ill", "im", - "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", - "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", - "interested", "interesting", "interests", "into", "invention", "inward", "is", "isn't", "isnt", "it", - "it'd", "it'll", "it's", "itd", "itll", "its", "itse", "itself", "ive", "j", "just", "k", "keep", "keeps", - "kept", "kg", "kind", "km", "knew", "know", "known", "knows", "l", "large", "largely", "last", "lately", - "later", "latest", "latter", "latterly", "least", "less", "lest", "let", "let's", "lets", "like", "liked", - "likely", "line", "little", "long", "longer", "longest", "look", "looking", "looks", "ltd", "m", "made", - "mainly", "make", "makes", "making", "man", "many", "may", "maybe", "me", "mean", "means", "meantime", - "meanwhile", "member", "members", "men", "merely", "mg", "might", "mill", "million", "mine", "miss", "ml", - "more", "moreover", "most", "mostly", "move", "mr", "mrs", "much", "mug", "must", "mustn't", "my", "myse", - "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", - "needed", "needing", "needs", "neither", "never", "nevertheless", "new", "newer", "newest", "next", "nine", - "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", - "nothing", "novel", "now", "nowhere", "number", "numbers", "o", "obtain", "obtained", "obviously", "of", - "off", "often", "oh", "ok", "okay", "old", "older", "oldest", "omitted", "on", "once", "one", "ones", - "only", "onto", "open", "opened", "opening", "opens", "or", "ord", "order", "ordered", "ordering", "orders", - "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", - "owing", "own", "p", "page", "pages", "part", "parted", "particular", "particularly", "parting", "parts", - "past", "per", "perhaps", "place", "placed", "places", "please", "plus", "point", "pointed", "pointing", - "points", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "presented", - "presenting", "presents", "presumably", "previously", "primarily", "probably", "problem", "problems", - "promptly", "proud", "provides", "put", "puts", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", - "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", - "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", - "results", "right", "room", "rooms", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", - "second", "secondly", "seconds", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", - "sees", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "shan't", - "she", "she'd", "she'll", "she's", "shed", "shes", "should", "shouldn't", "shouldnt", "show", "showed", - "showing", "shown", "showns", "shows", "side", "sides", "significant", "significantly", "similar", - "similarly", "since", "sincere", "six", "sixty", "slightly", "small", "smaller", "smallest", "so", "some", - "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", - "soon", "sorry", "specifically", "specified", "specify", "specifying", "state", "states", "still", "stop", - "strongly", "sub", "substantially", "successfully", "such", "such as", "sufficiently", "suggest", "sup", - "sure", "system", "t", "t's", "take", "taken", "taking", "tell", "ten", "tends", "th", "than", "thank", - "thanks", "thanx", "that", "that'll", "that's", "that've", "thats", "the", "their", "theirs", "them", - "themselves", "then", "thence", "there", "there'll", "there's", "there've", "thereafter", "thereby", - "thered", "therefore", "therein", "thereof", "therere", "theres", "thereto", "thereupon", "these", "they", - "they'd", "they'll", "they're", "they've", "theyd", "theyll", "theyre", "theyve", "thick", "thin", "thing", - "things", "think", "thinks", "third", "this", "thorough", "thoroughly", "those", "thou", "though", - "thoughh", "thought", "thoughts", "thousand", "three", "throug", "through", "throughout", "thru", "thus", - "til", "tip", "to", "today", "together", "too", "took", "top", "toward", "towards", "tried", "tries", - "truly", "try", "trying", "ts", "turn", "turned", "turning", "turns", "twelve", "twenty", "twice", "two", - "u", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", - "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "uucp", "v", "value", - "various", "very", "via", "viz", "vol", "vols", "vs", "w", "want", "wanted", "wanting", "wants", "was", - "wasn't", "wasnt", "way", "ways", "we", "we'd", "we'll", "we're", "we've", "wed", "welcome", "well", - "wells", "went", "were", "weren't", "werent", "weve", "what", "what'll", "what's", "whatever", "whats", - "when", "when's", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", "wherein", - "wheres", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "who'll", "who's", - "whod", "whoever", "whole", "whom", "whomever", "whos", "whose", "why", "why's", "widely", "will", - "willing", "wish", "with", "within", "without", "won't", "wonder", "wont", "words", "work", "worked", - "working", "works", "world", "would", "wouldn't", "wouldnt", "www", "x", "y", "year", "years", "yes", "yet", - "you", "you'd", "you'll", "you're", "you've", "youd", "youll", "young", "younger", "youngest", "your", - "youre", "yours", "yourself", "yourselves", "youve", "z", "zero"); + public static final List<String> STOPWORDS = Arrays.asList("'ll", "'ve", "a", "a's", "able", "about", "above", "abst", "accordance", "according", + "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "after", "afterwards", "again", "against", + "ah", "ain't", "aint", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", + "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", + "anyway", "anyways", "anywhere", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "are", "area", "areas", + "aren", "aren't", "arent", "arise", "around", "as", "aside", "ask", "asked", "asking", "asks", "associated", "at", "auth", "available", + "away", "awfully", "b", "back", "backed", "backing", "backs", "be", "became", "because", "become", "becomes", "becoming", "been", + "before", "beforehand", "began", "begin", "beginning", "beginnings", "begins", "behind", "being", "beings", "believe", "below", "beside", + "besides", "best", "better", "between", "beyond", "big", "bill", "biol", "both", "bottom", "brief", "briefly", "but", "by", "c", "c'mon", + "c's", "ca", "call", "came", "can", "can't", "cannot", "cant", "case", "cases", "cause", "causes", "certain", "certainly", "changes", + "clear", "clearly", "cmon", "co", "com", "come", "comes", "computer", "con", "concerning", "consequently", "consider", "considering", + "contain", "containing", "contains", "corresponding", "could", "couldn't", "couldnt", "course", "cry", "cs", "currently", "d", "date", + "de", "definitely", "describe", "described", "despite", "detail", "did", "didn't", "didnt", "differ", "different", "differently", "do", + "does", "doesn't", "doesnt", "doing", "don't", "done", "dont", "down", "downed", "downing", "downs", "downwards", "due", "during", "e", + "each", "early", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "eleven", "else", "elsewhere", "empty", "end", "ended", + "ending", "ends", "enough", "entirely", "especially", "et", "et-al", "etc", "even", "evenly", "ever", "every", "everybody", "everyone", + "everything", "everywhere", "ex", "exactly", "example", "except", "f", "face", "faces", "fact", "facts", "far", "felt", "few", "ff", + "fifteen", "fifth", "fify", "fill", "find", "finds", "fire", "first", "five", "fix", "followed", "following", "follows", "for", "former", + "formerly", "forth", "forty", "found", "four", "from", "front", "full", "fully", "further", "furthered", "furthering", "furthermore", + "furthers", "g", "gave", "general", "generally", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "going", + "gone", "good", "goods", "got", "gotten", "great", "greater", "greatest", "greetings", "group", "grouped", "grouping", "groups", "h", + "had", "hadn't", "hadnt", "happens", "hardly", "has", "hasn't", "hasnt", "have", "haven't", "havent", "having", "he", "he'd", "he'll", + "he's", "hed", "hello", "help", "hence", "her", "here", "here's", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herse", + "herself", "hes", "hi", "hid", "high", "higher", "highest", "him", "himse", "himself", "his", "hither", "home", "hopefully", "how", + "how's", "howbeit", "however", "hundred", "i", "i'd", "i'll", "i'm", "i've", "id", "ie", "if", "ignored", "ill", "im", "immediate", + "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", + "inner", "insofar", "instead", "interest", "interested", "interesting", "interests", "into", "invention", "inward", "is", "isn't", "isnt", + "it", "it'd", "it'll", "it's", "itd", "itll", "its", "itse", "itself", "ive", "j", "just", "k", "keep", "keeps", "kept", "kg", "kind", + "km", "knew", "know", "known", "knows", "l", "large", "largely", "last", "lately", "later", "latest", "latter", "latterly", "least", + "less", "lest", "let", "let's", "lets", "like", "liked", "likely", "line", "little", "long", "longer", "longest", "look", "looking", + "looks", "ltd", "m", "made", "mainly", "make", "makes", "making", "man", "many", "may", "maybe", "me", "mean", "means", "meantime", + "meanwhile", "member", "members", "men", "merely", "mg", "might", "mill", "million", "mine", "miss", "ml", "more", "moreover", "most", + "mostly", "move", "mr", "mrs", "much", "mug", "must", "mustn't", "my", "myse", "myself", "n", "na", "name", "namely", "nay", "nd", "near", + "nearly", "necessarily", "necessary", "need", "needed", "needing", "needs", "neither", "never", "nevertheless", "new", "newer", "newest", + "next", "nine", "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", + "novel", "now", "nowhere", "number", "numbers", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", + "older", "oldest", "omitted", "on", "once", "one", "ones", "only", "onto", "open", "opened", "opening", "opens", "or", "ord", "order", + "ordered", "ordering", "orders", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", + "owing", "own", "p", "page", "pages", "part", "parted", "particular", "particularly", "parting", "parts", "past", "per", "perhaps", + "place", "placed", "places", "please", "plus", "point", "pointed", "pointing", "points", "poorly", "possible", "possibly", "potentially", + "pp", "predominantly", "present", "presented", "presenting", "presents", "presumably", "previously", "primarily", "probably", "problem", + "problems", "promptly", "proud", "provides", "put", "puts", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", + "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", + "research", "respectively", "resulted", "resulting", "results", "right", "room", "rooms", "run", "s", "said", "same", "saw", "say", + "saying", "says", "sec", "second", "secondly", "seconds", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", + "sees", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "shan't", "she", "she'd", "she'll", + "she's", "shed", "shes", "should", "shouldn't", "shouldnt", "show", "showed", "showing", "shown", "showns", "shows", "side", "sides", + "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "slightly", "small", "smaller", "smallest", + "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", + "sorry", "specifically", "specified", "specify", "specifying", "state", "states", "still", "stop", "strongly", "sub", "substantially", + "successfully", "such", "such as", "sufficiently", "suggest", "sup", "sure", "system", "t", "t's", "take", "taken", "taking", "tell", + "ten", "tends", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "that's", "that've", "thats", "the", "their", "theirs", + "them", "themselves", "then", "thence", "there", "there'll", "there's", "there've", "thereafter", "thereby", "thered", "therefore", + "therein", "thereof", "therere", "theres", "thereto", "thereupon", "these", "they", "they'd", "they'll", "they're", "they've", "theyd", + "theyll", "theyre", "theyve", "thick", "thin", "thing", "things", "think", "thinks", "third", "this", "thorough", "thoroughly", "those", + "thou", "though", "thoughh", "thought", "thoughts", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "til", "tip", + "to", "today", "together", "too", "took", "top", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "turn", "turned", + "turning", "turns", "twelve", "twenty", "twice", "two", "u", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", + "unto", "up", "upon", "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "uucp", "v", "value", + "various", "very", "via", "viz", "vol", "vols", "vs", "w", "want", "wanted", "wanting", "wants", "was", "wasn't", "wasnt", "way", "ways", + "we", "we'd", "we'll", "we're", "we've", "wed", "welcome", "well", "wells", "went", "were", "weren't", "werent", "weve", "what", + "what'll", "what's", "whatever", "whats", "when", "when's", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", + "wherein", "wheres", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "who'll", "who's", "whod", "whoever", + "whole", "whom", "whomever", "whos", "whose", "why", "why's", "widely", "will", "willing", "wish", "with", "within", "without", "won't", + "wonder", "wont", "words", "work", "worked", "working", "works", "world", "would", "wouldn't", "wouldnt", "www", "x", "y", "year", + "years", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "youd", "youll", "young", "younger", "youngest", "your", "youre", + "yours", "yourself", "yourselves", "youve", "z", "zero"); /** * Disallowed chars for words in processed text segments. This regular @@ -294,91 +271,17 @@ public class Constants { */ public static final String DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"; - /** - * The text processors available, including the default text processor - */ - public static enum Processor { - CORENLP("corenlp"); - - public final String name; - - private Processor(final String name) { - this.name = name; - } - - private Processor(final Processor def) { - name = def.name; - } - - public static Processor DEFAULT() { - return CORENLP; - } - - public static Processor fromString(final String text) { - if (text != null) { - for (final Processor b : Processor.values()) { - if (text.equalsIgnoreCase(b.name)) { - return b; - } - } - } - return DEFAULT(); - } - } - - /** - * The topic modeling analyzers available, including the default analyzer. - */ - public static enum Analyzer { - DTM("dtm"); - - public final String name; - - private Analyzer(final String name) { - this.name = name; - } - - private Analyzer(final Analyzer def) { - name = def.name; - } - - public static Analyzer DEFAULT() { - return DTM; - } - - public static Analyzer fromString(final String text) { - if (text != null) { - for (final Analyzer b : Analyzer.values()) { - if (text.equalsIgnoreCase(b.name)) { - return b; - } - } - } - return DEFAULT(); - } - } - /** * Describes the window size, when using dynamic topic modeling */ public static enum WindowResolution { - YEAR("year"), - QUARTER("quarter"), - MONTH("month"), - DAY("day"), - HOUR("hour"), - MINUTE("minute"), - SECOND("second"); - - public final String name; - - private WindowResolution(final String name) { - this.name = name; - } - - private WindowResolution(final WindowResolution def) { - name = def.name; - } + YEAR, + QUARTER, + MONTH, + DAY, + HOUR, + MINUTE, + SECOND; public Date startDate(final Date date) { final Calendar in = new GregorianCalendar(); @@ -431,21 +334,12 @@ public class Constants { out.set(Calendar.MONTH, (CalendarUtils.getQuarter(in) * 3) + 2); return out.getTime(); } + } - public static WindowResolution DEFAULT() { - return YEAR; - } - - public static WindowResolution fromString(final String text) { - if (text != null) { - for (final WindowResolution b : WindowResolution.values()) { - if (text.equalsIgnoreCase(b.name)) { - return b; - } - } - } - return DEFAULT(); - } + public static enum ProcessorMode { + TEXT, + ENTITIES, + TEXT_WITH_ENTITIES } } diff --git a/vipra-util/src/main/java/de/vipra/util/ESClient.java b/vipra-util/src/main/java/de/vipra/util/ESClient.java index e901af4c570f223f7346be6644373887f6cb8ab2..b41b8b07e0c4abbce144b649db3afe2bb242f6ed 100644 --- a/vipra-util/src/main/java/de/vipra/util/ESClient.java +++ b/vipra-util/src/main/java/de/vipra/util/ESClient.java @@ -23,8 +23,8 @@ public abstract class ESClient { */ public static TransportClient getClient(final Config config) throws UnknownHostException { if (client == null) { - client = TransportClient.builder().build().addTransportAddress(new InetSocketTransportAddress( - InetAddress.getByName(config.elasticsearchHost), config.elasticsearchPort)); + client = TransportClient.builder().build().addTransportAddress( + new InetSocketTransportAddress(InetAddress.getByName(config.getElasticSearchHost()), config.getElasticSearchPort())); } return client; } diff --git a/vipra-util/src/main/java/de/vipra/util/EnumTools.java b/vipra-util/src/main/java/de/vipra/util/EnumTools.java deleted file mode 100644 index 0bc760f5b99d9fdb39c7b8d6e411a42f83c453a9..0000000000000000000000000000000000000000 --- a/vipra-util/src/main/java/de/vipra/util/EnumTools.java +++ /dev/null @@ -1,26 +0,0 @@ -package de.vipra.util; - -/** - * Enum tools to work with java enums. - */ -public class EnumTools { - - /** - * Finds an enum value by its name, ignoring case. - * - * @param enumeration - * Enum to be searched - * @param search - * Enum value to be searched - * @return the found enum value, or null - */ - public static <T extends Enum<?>> T searchEnum(final Class<T> enumeration, final String search) { - for (final T each : enumeration.getEnumConstants()) { - if (each.name().compareToIgnoreCase(search) == 0) { - return each; - } - } - return null; - } - -} diff --git a/vipra-util/src/main/java/de/vipra/util/FileUtils.java b/vipra-util/src/main/java/de/vipra/util/FileUtils.java index 8c636498d5c403053e238f038ff88da3ea61c574..6a018fa4f9792b1dd9cdf5b2c0f8ef7f6f5e45f3 100644 --- a/vipra-util/src/main/java/de/vipra/util/FileUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/FileUtils.java @@ -49,8 +49,7 @@ public class FileUtils extends org.apache.commons.io.FileUtils { */ public static File getFile(final String relPath) { try { - final File thisFile = new File( - FileUtils.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()); + final File thisFile = new File(FileUtils.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()); return new File(thisFile.getParent(), relPath); } catch (final URISyntaxException e) { e.printStackTrace(); diff --git a/vipra-util/src/main/java/de/vipra/util/Mongo.java b/vipra-util/src/main/java/de/vipra/util/Mongo.java index cb1b5000d97db61df1749ba5e7d117ccc6abb2a5..cc0c78643769256fd4c7fe4e2994df8c548d0a89 100644 --- a/vipra-util/src/main/java/de/vipra/util/Mongo.java +++ b/vipra-util/src/main/java/de/vipra/util/Mongo.java @@ -27,9 +27,9 @@ public class Mongo { private final Datastore datastore; private Mongo(final Config config) throws ConfigException { - final String host = config.databaseHost; - final Integer port = config.databasePort; - final String databaseName = config.databaseName; + final String host = config.getDatabaseHost(); + final Integer port = config.getDatabasePort(); + final String databaseName = config.getDatabaseName(); if (host == null || port == null || databaseName == null) { log.error("host/port/dbname missing in configuration"); diff --git a/vipra-util/src/main/java/de/vipra/util/PathUtils.java b/vipra-util/src/main/java/de/vipra/util/PathUtils.java index 47bd542aec2279330d6d8a6b8d3c484cbb60d8f7..4275979ab03b3c9952f77d94a8491db0e5db522c 100644 --- a/vipra-util/src/main/java/de/vipra/util/PathUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/PathUtils.java @@ -10,8 +10,7 @@ public class PathUtils { if (os.contains("WIN")) { base = new File(System.getProperty("APPDATA")); } else if (os.contains("MAC")) { - base = new File(System.getProperty("user.home") + File.separator + "Library" + File.separator - + "ApplicationSupport"); + base = new File(System.getProperty("user.home") + File.separator + "Library" + File.separator + "ApplicationSupport"); } else { base = new File(System.getProperty("user.home") + File.separator + ".local" + File.separator + "share"); } @@ -24,8 +23,7 @@ public class PathUtils { if (os.contains("WIN")) { base = new File(System.getProperty("APPDATA")); } else if (os.contains("MAC")) { - base = new File(System.getProperty("user.home") + File.separator + "Library" + File.separator - + "ApplicationSupport"); + base = new File(System.getProperty("user.home") + File.separator + "Library" + File.separator + "ApplicationSupport"); } else { base = new File(System.getProperty("user.home") + File.separator + ".config"); } diff --git a/vipra-util/src/main/java/de/vipra/util/ex/ConfigException.java b/vipra-util/src/main/java/de/vipra/util/ex/ConfigException.java index c9aad0005be219e42a13bc81600f6e5c59e65e85..3abac6b8f0cd07a4d9d81aadb4d61568aa852169 100644 --- a/vipra-util/src/main/java/de/vipra/util/ex/ConfigException.java +++ b/vipra-util/src/main/java/de/vipra/util/ex/ConfigException.java @@ -8,4 +8,8 @@ public class ConfigException extends Exception { super(string); } + public ConfigException(final Throwable t) { + super(t); + } + } \ No newline at end of file diff --git a/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java b/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java index 60eb7dc6cfaf1ea9f34a923f4078b5210cc4f119..d66d4b8bf915af3a09f87b4d55003796853cb1d3 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java @@ -52,7 +52,11 @@ public class ArticleFull implements Model<ObjectId>, Serializable { @Embedded @QueryIgnore(multi = true) - private List<TopicRef> topics; + private TopicModel model; + + @Embedded + @QueryIgnore(multi = true) + private List<TopicShare> topics; @Embedded @QueryIgnore(multi = true) @@ -146,21 +150,34 @@ public class ArticleFull implements Model<ObjectId>, Serializable { } } - public List<TopicRef> getTopics() { + public TopicModel getModel() { + return model; + } + + @ElasticIndex("model") + public String serializeModel() { + return model.getId(); + } + + public void setModel(final TopicModel model) { + this.model = model; + } + + public List<TopicShare> getTopics() { return topics; } - public void setTopics(final List<TopicRef> topics) { + public void setTopics(final List<TopicShare> topics) { this.topics = topics; } @ElasticIndex("topics") public String[] serializeTopics() { - final List<TopicRef> refs = getTopics(); + final List<TopicShare> refs = getTopics(); if (refs == null) return new String[0]; final List<String> topics = new ArrayList<>(refs.size()); - for (final TopicRef ref : refs) { + for (final TopicShare ref : refs) { topics.add(ref.getTopic().getName()); } return topics.toArray(new String[topics.size()]); @@ -248,10 +265,9 @@ public class ArticleFull implements Model<ObjectId>, Serializable { @Override public String toString() { - return "ArticleFull [id=" + id + ", title=" + title + ", text=" + text + ", processedText=" - + Arrays.toString(processedText) + ", url=" + url + ", date=" + date + ", topics=" + topics - + ", similarArticles=" + similarArticles + ", stats=" + stats + ", created=" + created + ", modified=" - + modified + ", meta=" + meta + "]"; + return "ArticleFull [id=" + id + ", title=" + title + ", text=" + text + ", processedText=" + Arrays.toString(processedText) + ", url=" + url + + ", date=" + date + ", topics=" + topics + ", similarArticles=" + similarArticles + ", stats=" + stats + ", created=" + created + + ", modified=" + modified + ", meta=" + meta + "]"; } } \ No newline at end of file diff --git a/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java b/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java index 157890cc98a260e0f4a3a5344b192a4f08c591aa..64c13b17e18ded2617252c7c15d43093e7eb9844 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java +++ b/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java @@ -48,8 +48,8 @@ public class ArticleStats implements Serializable { @Override public String toString() { - return "ArticleStats [wordCount=" + wordCount + ", uniqueWordCount=" + uniqueWordCount + ", processedWordCount=" - + processedWordCount + ", reductionRatio=" + reductionRatio + "]"; + return "ArticleStats [wordCount=" + wordCount + ", uniqueWordCount=" + uniqueWordCount + ", processedWordCount=" + processedWordCount + + ", reductionRatio=" + reductionRatio + "]"; } } \ No newline at end of file diff --git a/vipra-util/src/main/java/de/vipra/util/model/ArticleWord.java b/vipra-util/src/main/java/de/vipra/util/model/ArticleWord.java index 023d3879001d2a63a00115d2e8db27824858e1d7..fe043792dec3887e0cafeec6a102441286b1b7ba 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/ArticleWord.java +++ b/vipra-util/src/main/java/de/vipra/util/model/ArticleWord.java @@ -8,22 +8,22 @@ import org.mongodb.morphia.annotations.Embedded; @Embedded public class ArticleWord implements Comparable<ArticleWord>, Serializable { - private Word word; + private String word; private Integer count; public ArticleWord() {} public ArticleWord(final String word, final int count) { - this.word = new Word(word); + this.word = word; this.count = count; } - public Word getWord() { + public String getWord() { return word; } - public void setWord(final Word word) { + public void setWord(final String word) { this.word = word; } diff --git a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java index 784f0b233d0db496091040525a561439d7200d69..ee2e0c77b397d1822d25dd3cd3ffd15d54bd68aa 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java @@ -7,6 +7,8 @@ import org.mongodb.morphia.annotations.Embedded; import org.mongodb.morphia.annotations.Entity; import org.mongodb.morphia.annotations.Id; +import de.vipra.util.an.QueryIgnore; + @SuppressWarnings("serial") @Entity(value = "sequences", noClassnameStored = true) public class Sequence implements Model<ObjectId>, Comparable<Sequence>, Serializable { @@ -14,6 +16,10 @@ public class Sequence implements Model<ObjectId>, Comparable<Sequence>, Serializ @Id private ObjectId id = new ObjectId(); + @Embedded + @QueryIgnore(multi = true) + private TopicModel model; + @Embedded private Window window; @@ -37,6 +43,14 @@ public class Sequence implements Model<ObjectId>, Comparable<Sequence>, Serializ this.id = id; } + public TopicModel getModel() { + return model; + } + + public void setModel(final TopicModel model) { + this.model = model; + } + public Window getWindow() { return window; } @@ -68,8 +82,7 @@ public class Sequence implements Model<ObjectId>, Comparable<Sequence>, Serializ @Override public String toString() { - return "Sequence [id=" + id + ", window=" + window + ", relevance=" + relevance + ", relevanceChange=" - + relevanceChange + "]"; + return "Sequence [id=" + id + ", window=" + window + ", relevance=" + relevance + ", relevanceChange=" + relevanceChange + "]"; } } diff --git a/vipra-util/src/main/java/de/vipra/util/model/SequenceFull.java b/vipra-util/src/main/java/de/vipra/util/model/SequenceFull.java index 9f43e31d3b53dd95c94c57fe12dc7a7e5a88eaba..91201a0f2eedf0aa9e545dd3b844e464507e5fd9 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/SequenceFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/SequenceFull.java @@ -18,6 +18,10 @@ public class SequenceFull implements Model<ObjectId>, Comparable<SequenceFull>, @Id private ObjectId id = new ObjectId(); + @Embedded + @QueryIgnore(multi = true) + private TopicModel model; + @Embedded private Window window; @@ -43,6 +47,14 @@ public class SequenceFull implements Model<ObjectId>, Comparable<SequenceFull>, this.id = id; } + public TopicModel getModel() { + return model; + } + + public void setModel(final TopicModel model) { + this.model = model; + } + public Window getWindow() { return window; } @@ -90,8 +102,8 @@ public class SequenceFull implements Model<ObjectId>, Comparable<SequenceFull>, @Override public String toString() { - return "SequenceFull [id=" + id + ", window=" + window + ", relevance=" + relevance + ", relevanceChange=" - + relevanceChange + ", topic=" + topic + ", words=" + words + "]"; + return "SequenceFull [id=" + id + ", window=" + window + ", relevance=" + relevance + ", relevanceChange=" + relevanceChange + ", topic=" + + topic + ", words=" + words + "]"; } } diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java index b8046d97eea8cf5d26bc274854b10546a74fd87f..260b998a40e4f918e665042157893bce68748a03 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java @@ -6,6 +6,7 @@ import java.util.Date; import java.util.List; import org.bson.types.ObjectId; +import org.mongodb.morphia.annotations.Embedded; import org.mongodb.morphia.annotations.Entity; import org.mongodb.morphia.annotations.Id; import org.mongodb.morphia.annotations.PrePersist; @@ -23,6 +24,10 @@ public class TopicFull implements Model<ObjectId>, Serializable { @Id private ObjectId id = new ObjectId(); + @Embedded + @QueryIgnore(multi = true) + private TopicModel model; + private String name; @Reference @@ -62,6 +67,14 @@ public class TopicFull implements Model<ObjectId>, Serializable { this.id = MongoUtils.objectId(id); } + public TopicModel getModel() { + return model; + } + + public void setModel(final TopicModel model) { + this.model = model; + } + public String getName() { return name; } @@ -147,7 +160,7 @@ public class TopicFull implements Model<ObjectId>, Serializable { final int size = Math.min(Constants.TOPIC_AUTO_NAMING_WORDS, words.size()); final List<String> topWords = new ArrayList<>(size); for (int i = 0; i < size; i++) { - topWords.add(words.get(i).getWord().getId()); + topWords.add(words.get(i).getWord()); } name = StringUtils.join(topWords); } @@ -175,8 +188,9 @@ public class TopicFull implements Model<ObjectId>, Serializable { @Override public String toString() { - return "TopicFull [id=" + id + ", name=" + name + ", sequences=" + sequences + ", avgRelevance=" + avgRelevance - + ", varRelevance=" + varRelevance + ", created=" + created + ", modified=" + modified + "]"; + return "TopicFull [id=" + id + ", model=" + model + ", name=" + name + ", sequences=" + sequences + ", avgRelevance=" + avgRelevance + + ", varRelevance=" + varRelevance + ", risingRelevance=" + risingRelevance + ", fallingRelevance=" + fallingRelevance + + ", risingDecayRelevance=" + risingDecayRelevance + ", created=" + created + ", modified=" + modified + "]"; } } diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicModel.java b/vipra-util/src/main/java/de/vipra/util/model/TopicModel.java new file mode 100644 index 0000000000000000000000000000000000000000..3cb061df10c53a2a088cce367bf9ab6bbfdb3eab --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicModel.java @@ -0,0 +1,31 @@ +package de.vipra.util.model; + +import java.io.Serializable; + +import org.mongodb.morphia.annotations.Entity; +import org.mongodb.morphia.annotations.Id; + +@SuppressWarnings("serial") +@Entity(noClassnameStored = true) +public class TopicModel implements Model<String>, Serializable { + + @Id + private String id; + + public TopicModel() {} + + public TopicModel(final String id) { + this.id = id; + } + + @Override + public String getId() { + return id; + } + + @Override + public void setId(final String id) { + this.id = id; + }; + +} diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicModelFull.java b/vipra-util/src/main/java/de/vipra/util/model/TopicModelFull.java new file mode 100644 index 0000000000000000000000000000000000000000..b82838c7e199e54dcb7e5a28a5025967dfa1be5d --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicModelFull.java @@ -0,0 +1,29 @@ +package de.vipra.util.model; + +import org.mongodb.morphia.annotations.Entity; +import org.mongodb.morphia.annotations.Id; + +@SuppressWarnings("serial") +@Entity(noClassnameStored = true) +public class TopicModelFull implements Model<String> { + + @Id + private String id; + + public TopicModelFull() {} + + public TopicModelFull(final String id) { + this.id = id; + } + + @Override + public String getId() { + return id; + } + + @Override + public void setId(final String id) { + this.id = id; + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java b/vipra-util/src/main/java/de/vipra/util/model/TopicShare.java similarity index 84% rename from vipra-util/src/main/java/de/vipra/util/model/TopicRef.java rename to vipra-util/src/main/java/de/vipra/util/model/TopicShare.java index 7577b3e6a0b7f65bda2112e2dd22b163763e0a9b..5f604cf9ef373c25a7552ddf24d5256cdb66a3ca 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicShare.java @@ -7,7 +7,7 @@ import org.mongodb.morphia.annotations.Reference; @SuppressWarnings("serial") @Embedded -public class TopicRef implements Comparable<TopicRef>, Serializable { +public class TopicShare implements Comparable<TopicShare>, Serializable { @Reference(ignoreMissing = true) private Topic topic; @@ -31,7 +31,7 @@ public class TopicRef implements Comparable<TopicRef>, Serializable { } @Override - public int compareTo(final TopicRef arg0) { + public int compareTo(final TopicShare arg0) { return (int) (share - arg0.getShare()); } diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java index 3aad49e9bf936cafeccf99976e9067691fae512a..dad8de098b861f4941c90b517a69a6691a985935 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java @@ -14,33 +14,33 @@ public class TopicWord implements Comparable<TopicWord>, Serializable { @Embedded @JsonIgnore - private Word word; + private String word; private Double likeliness; public TopicWord() {} - public TopicWord(final Word word, final Double likeliness) { + public TopicWord(final String word, final Double likeliness) { this.word = word; this.likeliness = likeliness; } - public Word getWord() { + public String getWord() { return word; } - public void setWord(final Word word) { + public void setWord(final String word) { this.word = word; } @JsonGetter("id") public String getWordString() { - return word.getId(); + return word; } @JsonSetter("id") public void setWordString(final String word) { - this.word = new Word(word); + this.word = word; } public Double getLikeliness() { diff --git a/vipra-util/src/main/java/de/vipra/util/model/Window.java b/vipra-util/src/main/java/de/vipra/util/model/Window.java index 7e88315bbde5785f28ce964b1c60e9832d0e5387..3ab0ba5811484c1c69f39f008a79abf96b2a7969 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Window.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Window.java @@ -3,10 +3,12 @@ package de.vipra.util.model; import java.io.Serializable; import java.util.Date; +import org.mongodb.morphia.annotations.Embedded; import org.mongodb.morphia.annotations.Entity; import org.mongodb.morphia.annotations.Id; import de.vipra.util.Constants.WindowResolution; +import de.vipra.util.an.QueryIgnore; @SuppressWarnings("serial") @Entity(value = "windows", noClassnameStored = true) @@ -15,6 +17,10 @@ public class Window implements Model<Integer>, Serializable, Comparable<Window> @Id private Integer id; + @Embedded + @QueryIgnore(multi = true) + private TopicModel model; + private Date startDate; private Date endDate; @@ -31,6 +37,14 @@ public class Window implements Model<Integer>, Serializable, Comparable<Window> this.id = id; } + public TopicModel getModel() { + return model; + } + + public void setModel(final TopicModel model) { + this.model = model; + } + public Date getStartDate() { return startDate; } diff --git a/vipra-util/src/main/java/de/vipra/util/model/Word.java b/vipra-util/src/main/java/de/vipra/util/model/Word.java deleted file mode 100644 index 15b63e17f97c0e7c2863fcb6b894eb58d6c2e189..0000000000000000000000000000000000000000 --- a/vipra-util/src/main/java/de/vipra/util/model/Word.java +++ /dev/null @@ -1,55 +0,0 @@ -package de.vipra.util.model; - -import java.io.Serializable; - -import org.mongodb.morphia.annotations.Entity; -import org.mongodb.morphia.annotations.Id; - -@SuppressWarnings("serial") -@Entity(value = "words", noClassnameStored = true) -public class Word implements Model<String>, Serializable { - - @Id - private String id; - - public Word() {} - - public Word(final String id) { - this.id = id; - } - - @Override - public String getId() { - return id; - } - - @Override - public void setId(final String id) { - this.id = id; - } - - @Override - public boolean equals(final Object o) { - if (o == null) - return false; - if (!(o instanceof Word)) - return false; - final Word w = (Word) o; - if (id == null) - return w.getId() == null; - return id.equals(w.getId()); - } - - @Override - public int hashCode() { - if (id == null) - return super.hashCode(); - return id.hashCode(); - } - - @Override - public String toString() { - return "Word [id=" + id + "]"; - } - -} diff --git a/vipra-util/src/main/java/de/vipra/util/service/MongoService.java b/vipra-util/src/main/java/de/vipra/util/service/MongoService.java index 664382a65b546beb603f79fb6fd58bdee1d9b479..714cbf64938dc693b5990d35b5038a011be725c5 100644 --- a/vipra-util/src/main/java/de/vipra/util/service/MongoService.java +++ b/vipra-util/src/main/java/de/vipra/util/service/MongoService.java @@ -71,16 +71,14 @@ public class MongoService<Type extends Model<IdType>, IdType> implements Service } @Override - public List<Type> getMultiple(final Integer skip, final Integer limit, final String sortBy, - final String... fields) { + public List<Type> getMultiple(final Integer skip, final Integer limit, final String sortBy, final String... fields) { return getMultiple(QueryBuilder.builder().skip(skip).limit(limit).sortBy(sortBy).fields(true, fields)); } @Override - public List<Type> getMultiple(final Integer skip, final Integer limit, final String sortBy, - final Tuple<String, Object> criteria, final String... fields) { - return getMultiple( - QueryBuilder.builder().skip(skip).limit(limit).sortBy(sortBy).fields(true, fields).criteria(criteria)); + public List<Type> getMultiple(final Integer skip, final Integer limit, final String sortBy, final Tuple<String, Object> criteria, + final String... fields) { + return getMultiple(QueryBuilder.builder().skip(skip).limit(limit).sortBy(sortBy).fields(true, fields).criteria(criteria)); } @Override @@ -155,6 +153,19 @@ public class MongoService<Type extends Model<IdType>, IdType> implements Service return deleted; } + @Override + public long deleteMultiple(final QueryBuilder builder) throws DatabaseException { + final Query<Type> query = datastore.createQuery(clazz); + if (builder != null) { + if (builder.getCriteria() != null) + for (final Tuple<String, Object> criteria : builder.getCriteria()) + query.field(criteria.first()).equal(criteria.second()); + } + + final int deleted = datastore.delete(query).getN(); + return deleted; + } + @Override public void replaceSingle(final Type t) throws DatabaseException { if (t == null) @@ -230,8 +241,8 @@ public class MongoService<Type extends Model<IdType>, IdType> implements Service return datastore.getCount(query); } - public static <Type extends Model<IdType>, IdType> MongoService<Type, IdType> getDatabaseService( - final Config config, final Class<Type> clazz) throws ConfigException { + public static <Type extends Model<IdType>, IdType> MongoService<Type, IdType> getDatabaseService(final Config config, final Class<Type> clazz) + throws ConfigException { final Mongo mongo = config.getMongo(); return new MongoService<Type, IdType>(mongo, clazz); } diff --git a/vipra-util/src/main/java/de/vipra/util/service/Service.java b/vipra-util/src/main/java/de/vipra/util/service/Service.java index 416903495ef3297f5fc00146164b3ac6b71b5e52..cc58335c5d652f0ce72f2b60c91529215c365a64 100644 --- a/vipra-util/src/main/java/de/vipra/util/service/Service.java +++ b/vipra-util/src/main/java/de/vipra/util/service/Service.java @@ -39,8 +39,7 @@ public interface Service<Type extends Model<IdType>, IdType, E extends Exception /** * @see {@link Service#getMultiple(QueryBuilder)} */ - List<Type> getMultiple(Integer skip, Integer limit, String sortBy, Tuple<String, Object> criteria, String... fields) - throws E; + List<Type> getMultiple(Integer skip, Integer limit, String sortBy, Tuple<String, Object> criteria, String... fields) throws E; /** * Returns multiple entities from the database. @@ -103,6 +102,8 @@ public interface Service<Type extends Model<IdType>, IdType, E extends Exception */ long deleteMultiple(Iterable<IdType> ids) throws E; + long deleteMultiple(QueryBuilder builder) throws E; + /** * Replaces a single entity in the database *