diff --git a/ma-impl.sublime-workspace b/ma-impl.sublime-workspace index 975b92e6b4c05177da818deb33277708d1e1166a..19022d5e430ba28656ede4fa81491286af9d730d 100644 --- a/ma-impl.sublime-workspace +++ b/ma-impl.sublime-workspace @@ -279,6 +279,14 @@ }, "buffers": [ + { + "contents": "curl -XPOST 'http://localhost:9200/articles/_search' -d '{\"query\":{\"match\":{\"_all\":\"ibm\"}},\"_source\":{\"exclude\":[\"text\"]}}'", + "settings": + { + "buffer_size": 123, + "line_ending": "Unix" + } + } ], "build_system": "", "build_system_choices": @@ -915,8 +923,36 @@ "groups": [ { + "selected": 0, "sheets": [ + { + "buffer": 0, + "semi_transient": false, + "settings": + { + "buffer_size": 123, + "regions": + { + }, + "selection": + [ + [ + 123, + 123 + ] + ], + "settings": + { + "syntax": "Packages/Text/Plain text.tmLanguage" + }, + "translation.x": 0.0, + "translation.y": 0.0, + "zoom_level": 1.0 + }, + "stack_index": 0, + "type": "text" + } ] } ], diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java index 5fe15001f12bb639817c29514329d6914ace1bd9..65a012c8f48861a05dd795db154ee25d73fc6c7f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java @@ -166,7 +166,7 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { List<TopicRef> topicCount = new ArrayList<>(countMap.size()); for (Entry<String, Integer> e : countMap.entrySet()) { TopicRef tc = new TopicRef(); - tc.setTopicId(e.getKey()); + tc.setTopicIndex(e.getKey()); tc.setCount(e.getValue()); topicCount.add(tc); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java index 5a9e7b54c5681b614042af0c8fe5c5a195c4a24c..6625adda49113871a785464f1a7fc741577c2549 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java +++ 
b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java @@ -7,7 +7,9 @@ import org.apache.commons.io.FileUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.bson.types.ObjectId; +import org.elasticsearch.client.Client; +import de.vipra.cmd.es.ESClient; import de.vipra.cmd.model.ProcessedArticle; import de.vipra.util.Config; import de.vipra.util.ConsoleUtils; @@ -27,6 +29,7 @@ public class ClearCommand implements Command { private DatabaseService<TopicFull, ObjectId> dbTopics; private DatabaseService<Word, String> dbWords; private DatabaseService<Import, ObjectId> dbImports; + private Client elasticClient; public ClearCommand(boolean defaults) { this.defaults = defaults; @@ -38,6 +41,7 @@ public class ClearCommand implements Command { dbTopics = DatabaseService.getDatabaseService(config, TopicFull.class); dbWords = DatabaseService.getDatabaseService(config, Word.class); dbImports = DatabaseService.getDatabaseService(config, Import.class); + elasticClient = ESClient.getClient(config); out.info("clearing database"); dbArticles.drop(); @@ -45,6 +49,9 @@ public class ClearCommand implements Command { dbWords.drop(); dbImports.drop(); + out.info("clearing index"); + elasticClient.admin().indices().prepareDelete("_all").get(); + try { out.info("clearing filebase"); File dataDir = config.getDataDirectory(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 382cc4a011fa1a7bd3771cbfa076f0d238c5d10a..0e43bd171100d2a6dae86b77d4ca6a26d4d0eae7 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -7,6 +7,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; +import java.util.ListIterator; import java.util.Map; import org.apache.logging.log4j.LogManager; @@ -25,6 +26,7 @@ import 
de.vipra.cmd.model.ProcessedArticle; import de.vipra.cmd.text.ProcessedText; import de.vipra.cmd.text.Processor; import de.vipra.util.Config; +import de.vipra.util.Constants; import de.vipra.util.ConvertStream; import de.vipra.util.ElasticSerializer; import de.vipra.util.MongoUtils; @@ -108,7 +110,7 @@ public class ImportCommand implements Command { * @throws Exception */ private Article importArticle(JSONObject obj) throws Exception { - out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\""); + out.info("importing \"" + obj.get("title") + "\""); ProcessedArticle article = new ProcessedArticle(); article.fromJSON(obj); @@ -238,28 +240,42 @@ public class ImportCommand implements Command { * save topic refs */ out.info("saving document topics"); - ConvertStream<List<TopicRef>> topics = analyzer.getTopics(); + ConvertStream<List<TopicRef>> topicStream = analyzer.getTopics(); FilebaseIndex index = filebase.getIndex(); Iterator<String> indexIter = index.iterator(); - Iterator<List<TopicRef>> topicIter = topics.iterator(); - while (indexIter.hasNext() && topicIter.hasNext()) { - List<TopicRef> topicCount = topicIter.next(); - for (TopicRef tc : topicCount) { - String oid = topicIndexMap.get(tc.getTopicId()); - tc.setTopicId(oid); - if (oid == null) - log.error("no object id for topic index " + tc.getTopicId()); - } + Iterator<List<TopicRef>> topicRefsListIter = topicStream.iterator(); + while (indexIter.hasNext() && topicRefsListIter.hasNext()) { + // advance both iterators together so a skipped article cannot misalign ids and topic lists, then get article from database String id = indexIter.next(); + List<TopicRef> topicRefs = topicRefsListIter.next(); - ProcessedArticle a = dbArticles.getSingle(MongoUtils.objectId(id)); - if (a != null) - a.setTopics(topicCount); - else + ProcessedArticle article = dbArticles.getSingle(MongoUtils.objectId(id)); + if (article == null) { log.error("no article found in db for id " + id); + continue; + } + + double wordCount = article.getStats().getWordCount(); + + // insert topic references into article, ignoring low refs
+ for (ListIterator<TopicRef> topicRefsIter = topicRefs.listIterator(); topicRefsIter.hasNext();) { + TopicRef topicRef = topicRefsIter.next(); + if ((topicRef.getCount() / wordCount) < Constants.TOPIC_THRESHOLD) { + topicRefsIter.remove(); + continue; + } + String topicObjectId = topicIndexMap.get(topicRef.getTopicIndex()); + if (topicObjectId != null) + topicRef.setTopicId(topicObjectId); + else + log.error("no object id for topic index " + topicRef.getTopicIndex()); + } + + article.setTopics(topicRefs); + try { - dbArticles.updateSingle(a); + dbArticles.updateSingle(article); } catch (DatabaseException e) { - log.error("could not update article: " + a.getTitle() + " (" + a.getId() + ")"); + log.error("could not update article: " + article.getTitle() + " (" + article.getId() + ")"); } } List<Word> importedWords = wordMap.getNewWords(); @@ -289,6 +305,7 @@ public class ImportCommand implements Command { out.info("imported " + newArticlesCount + " new " + StringUtils.quantity(newArticlesCount, "article")); out.info("imported " + newWordsCount + " new " + StringUtils.quantity(newWordsCount, "word")); out.info(timer.toString()); + out.info("done in " + StringUtils.timeString(timer.total())); } } diff --git a/vipra-ui/app/components/pagination-bar.js b/vipra-ui/app/components/pagination-bar.js index d6654db94cfaf900ae1d0ccf51cf37b95321ee96..42d7c324b9653e624ad1c0fff8362bd868258c03 100644 --- a/vipra-ui/app/components/pagination-bar.js +++ b/vipra-ui/app/components/pagination-bar.js @@ -4,28 +4,32 @@ export default Ember.Component.extend({ elements: 2, + currentPage: Ember.computed('page', function() { + return parseInt(this.get('page') || 1); + }), + prev: Ember.computed('page', function() { - return this.page > 1; + return this.get('currentPage') > 1; }), prevPrev: Ember.computed('page', function() { - return this.page > this.elements + 1; + return this.get('currentPage') > this.elements + 1; }), prevPage: Ember.computed('page', function() { 
- return this.page - 1; + return this.get('currentPage') - 1; }), next: Ember.computed('page', function() { - return this.page < Math.ceil(this.total/this.limit*1.0); + return this.get('currentPage') < Math.ceil(this.total/this.limit*1.0); }), nextNext: Ember.computed('page', function() { - return this.page < Math.ceil(this.total/this.limit*1.0) - this.elements; + return this.get('currentPage') < Math.ceil(this.total/this.limit*1.0) - this.elements; }), nextPage: Ember.computed('page', function() { - return this.page + 1; + return this.get('currentPage') + 1; }), lastPage: Ember.computed('page', function() { @@ -34,7 +38,7 @@ export default Ember.Component.extend({ pages: Ember.computed('total', 'page', 'limit', 'elements', function() { let pages = [], - page = parseInt(this.page || 1), + page = this.get('currentPage'), max = Math.ceil(this.total/this.limit*1.0), start = Math.max(page - this.elements, 1), end = Math.min(Math.max(page + this.elements, start + this.elements * 2), max); diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index b4440ea1426fd2a57d236b9a5d582d1143e4d60c..8c3e176f68b89c4de4169613e08f070f54bb6f69 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -64,6 +64,12 @@ public class Constants { */ public static final int LIKELINESS_PRECISION = 6; + /** + * Topics with a share greater or equal to this number are regarded as + * accepted topics to that article. Value range: [0.0, 1.0] + */ + public static final double TOPIC_THRESHOLD = 0.01; + /** * Stopwords list. Extensive list of stopwords used to clean imported * articles of the most common words before topic modeling is applied. 
diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java index 7d84a4d650b374b32d8387f15825cef2f51d0cad..35d8755e11de8eeb90ad970597abc6e3b2a88d62 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java @@ -6,24 +6,28 @@ import org.mongodb.morphia.annotations.Embedded; import org.mongodb.morphia.annotations.Reference; import org.mongodb.morphia.annotations.Transient; +import de.vipra.util.MongoUtils; + @SuppressWarnings("serial") @Embedded public class TopicRef implements Comparable<TopicRef>, Serializable { @Transient - private String topicId; + private String topicIndex; @Reference(ignoreMissing = true) private Topic topic; private int count; - public String getTopicId() { - return topicId; + public String getTopicIndex() { + return topicIndex; + } + + public void setTopicIndex(String index) { + this.topicIndex = index; } public void setTopicId(String id) { - this.topicId = id; - this.topic = new Topic(); - this.topic.setId(id); + this.topic = new Topic(MongoUtils.objectId(id)); } public int getCount() { @@ -49,7 +53,8 @@ public class TopicRef implements Comparable<TopicRef>, Serializable { @Override public String toString() { - return TopicRef.class.getSimpleName() + "[topicId:" + topicId + ",count:" + count + "]"; + return TopicRef.class.getSimpleName() + "[topicIndex:" + topicIndex + ", topic: " + topic + ", count:" + count + + "]"; } }