Commit fa749921 authored by Eike Cochu

added index clearing to clear command

clear command now deletes all elasticsearch indexes
fixed paginator current page computation when the page parameter is not set (defaults to 1)
article topics with low relevance (share below TOPIC_THRESHOLD) are now ignored
parent 605ef106
......@@ -279,6 +279,14 @@
},
"buffers":
[
{
"contents": "curl -XPOST 'http://localhost:9200/articles/_search' -d '{\"query\":{\"match\":{\"_all\":\"ibm\"}},\"_source\":{\"exclude\":[\"text\"]}}'",
"settings":
{
"buffer_size": 123,
"line_ending": "Unix"
}
}
],
"build_system": "",
"build_system_choices":
......@@ -915,8 +923,36 @@
"groups":
[
{
"selected": 0,
"sheets":
[
{
"buffer": 0,
"semi_transient": false,
"settings":
{
"buffer_size": 123,
"regions":
{
},
"selection":
[
[
123,
123
]
],
"settings":
{
"syntax": "Packages/Text/Plain text.tmLanguage"
},
"translation.x": 0.0,
"translation.y": 0.0,
"zoom_level": 1.0
},
"stack_index": 0,
"type": "text"
}
]
}
],
......
......@@ -166,7 +166,7 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer {
List<TopicRef> topicCount = new ArrayList<>(countMap.size());
for (Entry<String, Integer> e : countMap.entrySet()) {
TopicRef tc = new TopicRef();
tc.setTopicId(e.getKey());
tc.setTopicIndex(e.getKey());
tc.setCount(e.getValue());
topicCount.add(tc);
}
......
......@@ -7,7 +7,9 @@ import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.bson.types.ObjectId;
import org.elasticsearch.client.Client;
import de.vipra.cmd.es.ESClient;
import de.vipra.cmd.model.ProcessedArticle;
import de.vipra.util.Config;
import de.vipra.util.ConsoleUtils;
......@@ -27,6 +29,7 @@ public class ClearCommand implements Command {
private DatabaseService<TopicFull, ObjectId> dbTopics;
private DatabaseService<Word, String> dbWords;
private DatabaseService<Import, ObjectId> dbImports;
private Client elasticClient;
public ClearCommand(boolean defaults) {
this.defaults = defaults;
......@@ -38,6 +41,7 @@ public class ClearCommand implements Command {
dbTopics = DatabaseService.getDatabaseService(config, TopicFull.class);
dbWords = DatabaseService.getDatabaseService(config, Word.class);
dbImports = DatabaseService.getDatabaseService(config, Import.class);
elasticClient = ESClient.getClient(config);
out.info("clearing database");
dbArticles.drop();
......@@ -45,6 +49,9 @@ public class ClearCommand implements Command {
dbWords.drop();
dbImports.drop();
out.info("clearing index");
elasticClient.admin().indices().prepareDelete("_all").get();
try {
out.info("clearing filebase");
File dataDir = config.getDataDirectory();
......
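For reference, the index clearing added above goes through the indices admin API of the same org.elasticsearch.client.Client that ESClient.getClient(config) returns; "_all" is the built-in name addressing every index in the cluster. Below is a minimal sketch of that call in isolation, assuming the pre-5.x transport client API imported in this hunk; the helper class and the acknowledgement check are illustrative additions, not part of the commit.

import org.elasticsearch.action.admin.indices.delete.DeleteIndexResponse;
import org.elasticsearch.client.Client;

public class IndexClearingSketch {

	// Deletes every Elasticsearch index reachable through the given client.
	public static void deleteAllIndices(Client client) {
		// "_all" matches all indices; prepareDelete builds the request, get() executes it synchronously.
		DeleteIndexResponse response = client.admin().indices().prepareDelete("_all").get();
		if (!response.isAcknowledged()) {
			throw new IllegalStateException("index deletion was not acknowledged by the cluster");
		}
	}
}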
......@@ -7,6 +7,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
......@@ -25,6 +26,7 @@ import de.vipra.cmd.model.ProcessedArticle;
import de.vipra.cmd.text.ProcessedText;
import de.vipra.cmd.text.Processor;
import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.ConvertStream;
import de.vipra.util.ElasticSerializer;
import de.vipra.util.MongoUtils;
......@@ -108,7 +110,7 @@ public class ImportCommand implements Command {
* @throws Exception
*/
private Article importArticle(JSONObject obj) throws Exception {
out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\"");
out.info("importing \"" + obj.get("title") + "\"");
ProcessedArticle article = new ProcessedArticle();
article.fromJSON(obj);
......@@ -238,28 +240,42 @@ public class ImportCommand implements Command {
* save topic refs
*/
out.info("saving document topics");
ConvertStream<List<TopicRef>> topics = analyzer.getTopics();
ConvertStream<List<TopicRef>> topicStream = analyzer.getTopics();
FilebaseIndex index = filebase.getIndex();
Iterator<String> indexIter = index.iterator();
Iterator<List<TopicRef>> topicIter = topics.iterator();
while (indexIter.hasNext() && topicIter.hasNext()) {
List<TopicRef> topicCount = topicIter.next();
for (TopicRef tc : topicCount) {
String oid = topicIndexMap.get(tc.getTopicId());
tc.setTopicId(oid);
if (oid == null)
log.error("no object id for topic index " + tc.getTopicId());
}
Iterator<List<TopicRef>> topicRefsListIter = topicStream.iterator();
while (indexIter.hasNext() && topicRefsListIter.hasNext()) {
// get article from database
String id = indexIter.next();
ProcessedArticle a = dbArticles.getSingle(MongoUtils.objectId(id));
if (a != null)
a.setTopics(topicCount);
else
ProcessedArticle article = dbArticles.getSingle(MongoUtils.objectId(id));
if (article == null) {
log.error("no article found in db for id " + id);
continue;
}
double wordCount = article.getStats().getWordCount();
// insert topic references into article, ignoring low refs
List<TopicRef> topicRefs = topicRefsListIter.next();
for (ListIterator<TopicRef> topicRefsIter = topicRefs.listIterator(); topicRefsIter.hasNext();) {
TopicRef topicRef = topicRefsIter.next();
if ((topicRef.getCount() / wordCount) < Constants.TOPIC_THRESHOLD) {
topicRefsIter.remove();
continue;
}
String topicObjectId = topicIndexMap.get(topicRef.getTopicIndex());
if (topicObjectId != null)
topicRef.setTopicId(topicObjectId);
else
log.error("no object id for topic index " + topicRef.getTopicIndex());
}
article.setTopics(topicRefs);
try {
dbArticles.updateSingle(a);
dbArticles.updateSingle(article);
} catch (DatabaseException e) {
log.error("could not update article: " + a.getTitle() + " (" + a.getId() + ")");
log.error("could not update article: " + article.getTitle() + " (" + article.getId() + ")");
}
}
List<Word> importedWords = wordMap.getNewWords();
......@@ -289,6 +305,7 @@ public class ImportCommand implements Command {
out.info("imported " + newArticlesCount + " new " + StringUtils.quantity(newArticlesCount, "article"));
out.info("imported " + newWordsCount + " new " + StringUtils.quantity(newWordsCount, "word"));
out.info(timer.toString());
out.info("done in " + StringUtils.timeString(timer.total()));
}
}
......@@ -4,28 +4,32 @@ export default Ember.Component.extend({
elements: 2,
currentPage: Ember.computed('page', function() {
return parseInt(this.get('page') || 1);
}),
prev: Ember.computed('page', function() {
return this.page > 1;
return this.get('currentPage') > 1;
}),
prevPrev: Ember.computed('page', function() {
return this.page > this.elements + 1;
return this.get('currentPage') > this.elements + 1;
}),
prevPage: Ember.computed('page', function() {
return this.page - 1;
return this.get('currentPage') - 1;
}),
next: Ember.computed('page', function() {
return this.page < Math.ceil(this.total/this.limit*1.0);
return this.get('currentPage') < Math.ceil(this.total/this.limit*1.0);
}),
nextNext: Ember.computed('page', function() {
return this.page < Math.ceil(this.total/this.limit*1.0) - this.elements;
return this.get('currentPage') < Math.ceil(this.total/this.limit*1.0) - this.elements;
}),
nextPage: Ember.computed('page', function() {
return this.page + 1;
return this.get('currentPage') + 1;
}),
lastPage: Ember.computed('page', function() {
......@@ -34,7 +38,7 @@ export default Ember.Component.extend({
pages: Ember.computed('total', 'page', 'limit', 'elements', function() {
let pages = [],
page = parseInt(this.page || 1),
page = this.get('currentPage'),
max = Math.ceil(this.total/this.limit*1.0),
start = Math.max(page - this.elements, 1),
end = Math.min(Math.max(page + this.elements, start + this.elements * 2), max);
......
......@@ -64,6 +64,12 @@ public class Constants {
*/
public static final int LIKELINESS_PRECISION = 6;
/**
* Topics with a share greater or equal to this number are regarded as
* accepted topics to that article. Value range: [0.0, 1.0]
*/
public static final double TOPIC_THRESHOLD = 0.01;
/**
* Stopwords list. Extensive list of stopwords used to clean imported
* articles of the most common words before topic modeling is applied.
......
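As a worked illustration of how TOPIC_THRESHOLD is applied in the ImportCommand hunk above: a topic's share of an article is its assignment count divided by the article's word count, and topic references below the threshold are dropped before the article is updated. The sketch below is self-contained and hypothetical (helper name and example numbers are made up); only the 0.01 value comes from the commit.

public class TopicRelevanceSketch {

	// Same value as Constants.TOPIC_THRESHOLD: a topic must cover at least 1% of an article's words.
	static final double TOPIC_THRESHOLD = 0.01;

	// Returns true if a topic with the given assignment count is relevant for an article of the given length.
	static boolean isRelevant(int topicCount, int articleWordCount) {
		// Cast to double first; integer division would round the share down to 0.
		return ((double) topicCount / articleWordCount) >= TOPIC_THRESHOLD;
	}

	public static void main(String[] args) {
		// 12 assignments in a 1000-word article is a 1.2% share, so the topic reference is kept.
		System.out.println(isRelevant(12, 1000)); // true
		// 5 assignments in the same article is a 0.5% share, below the threshold, so it is dropped.
		System.out.println(isRelevant(5, 1000)); // false
	}
}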
......@@ -6,24 +6,28 @@ import org.mongodb.morphia.annotations.Embedded;
import org.mongodb.morphia.annotations.Reference;
import org.mongodb.morphia.annotations.Transient;
import de.vipra.util.MongoUtils;
@SuppressWarnings("serial")
@Embedded
public class TopicRef implements Comparable<TopicRef>, Serializable {
@Transient
private String topicId;
private String topicIndex;
@Reference(ignoreMissing = true)
private Topic topic;
private int count;
public String getTopicId() {
return topicId;
public String getTopicIndex() {
return topicIndex;
}
public void setTopicIndex(String index) {
this.topicIndex = index;
}
public void setTopicId(String id) {
this.topicId = id;
this.topic = new Topic();
this.topic.setId(id);
this.topic = new Topic(MongoUtils.objectId(id));
}
public int getCount() {
......@@ -49,7 +53,8 @@ public class TopicRef implements Comparable<TopicRef>, Serializable {
@Override
public String toString() {
return TopicRef.class.getSimpleName() + "[topicId:" + topicId + ",count:" + count + "]";
return TopicRef.class.getSimpleName() + "[topicIndex:" + topicIndex + ", topic: " + topic + ", count:" + count
+ "]";
}
}