diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java index e26181118401e883093e4a2c0f0919bc580ba2ec..bcfdf9f66f1acd07d554f26af8dbaaa0700bb0f7 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java @@ -3,7 +3,11 @@ package de.vipra.cmd.lda; import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Map.Entry; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,8 +17,8 @@ import de.vipra.util.Config; import de.vipra.util.ConvertStream; import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; -import de.vipra.util.model.TopicDefinition; -import de.vipra.util.model.TopicMap; +import de.vipra.util.model.Topic; +import de.vipra.util.model.TopicCount; import de.vipra.util.model.TopicWord; import jgibblda.Estimator; import jgibblda.Inferencer; @@ -77,13 +81,13 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { } @Override - public ConvertStream<TopicDefinition> getTopicDefinitions() throws LDAAnalyzerException { + public ConvertStream<Topic> getTopicDefinitions() throws LDAAnalyzerException { File twords = new File(modelDir, "jgibb.twords"); try { - return new ConvertStream<TopicDefinition>(twords) { + return new ConvertStream<Topic>(twords) { @Override - public TopicDefinition convert(String line) { - TopicDefinition topicDef = new TopicDefinition(); + public Topic convert(String line) { + Topic topicDef = new Topic(); List<TopicWord> topicWords = new ArrayList<>(); Integer index = StringUtils.getFirstNumber(line); if (index == null) { @@ -115,19 +119,30 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { } @Override - public ConvertStream<TopicMap> getTopics() throws LDAAnalyzerException { + public ConvertStream<List<TopicCount>> getTopics() throws LDAAnalyzerException { File tassign = new File(modelDir, "jgibb.tassign"); try { - return new ConvertStream<TopicMap>(tassign) { + return new ConvertStream<List<TopicCount>>(tassign) { @Override - public TopicMap convert(String line) { - TopicMap map = new TopicMap(); + public List<TopicCount> convert(String line) { + // count topics + Map<String, Integer> countMap = new HashMap<>(); String[] wordList = line.split("\\s+"); for (String word : wordList) { - String[] wordTopic = word.split(":"); - map.put(wordTopic[1]); + String topic = word.split(":")[1]; + Integer count = countMap.get(topic); + countMap.put(topic, count == null ? 1 : count + 1); } - return map; + + // turn into list + List<TopicCount> topicCount = new ArrayList<>(countMap.size()); + for (Entry<String, Integer> e : countMap.entrySet()) { + topicCount.add(new TopicCount(e.getKey(), e.getValue())); + } + + Collections.sort(topicCount, Collections.reverseOrder()); + + return topicCount; } }; } catch (FileNotFoundException e) { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java index 51a92cef369438dea98ab705f61c1c74801afb7f..8cef4fe5b50f15bcfc59f756b976061035b948f4 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java @@ -1,11 +1,13 @@ package de.vipra.cmd.lda; +import java.util.List; + import de.vipra.cmd.ex.LDAAnalyzerException; import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.ConvertStream; -import de.vipra.util.model.TopicDefinition; -import de.vipra.util.model.TopicMap; +import de.vipra.util.model.Topic; +import de.vipra.util.model.TopicCount; import de.vipra.util.Config.Key; public abstract class LDAAnalyzer { @@ -24,9 +26,9 @@ public abstract class LDAAnalyzer { public abstract void analyze() throws LDAAnalyzerException; - public abstract ConvertStream<TopicDefinition> getTopicDefinitions() throws LDAAnalyzerException; + public abstract ConvertStream<Topic> getTopicDefinitions() throws LDAAnalyzerException; - public abstract ConvertStream<TopicMap> getTopics() throws LDAAnalyzerException; + public abstract ConvertStream<List<TopicCount>> getTopics() throws LDAAnalyzerException; public static LDAAnalyzer getAnalyzer(Config config) throws LDAAnalyzerException { LDAAnalyzer analyzer = null; diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java index 653ab04700645cb82ddedd26fbe9ee9b76ae1c1e..9f8ac68ac6cf5249e19b01c2ca6393237daa91b5 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java @@ -14,7 +14,7 @@ import de.vipra.util.Config; import de.vipra.util.ConsoleUtils; import de.vipra.util.Constants; import de.vipra.util.ex.ConfigException; -import de.vipra.util.model.TopicDefinition; +import de.vipra.util.model.Topic; import de.vipra.util.service.DatabaseService; public class ClearCommand implements Command { @@ -25,7 +25,7 @@ public class ClearCommand implements Command { private boolean defaults; private Config config; private DatabaseService<Article> dbArticles; - private DatabaseService<TopicDefinition> dbTopics; + private DatabaseService<Topic> dbTopics; public ClearCommand(boolean defaults) { this.defaults = defaults; @@ -35,7 +35,7 @@ public class ClearCommand implements Command { try { config = Config.getConfig(); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); - dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, TopicDefinition.class); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); } catch (Exception e) { throw new ClearException(e); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 78458237ada19fd22b7dcfe8830d1cbb9076af8f..f95a3b559a8aa37cb93876ed7a01cdfa072e76f9 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -6,8 +6,10 @@ import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.Iterator; import java.util.List; +import java.util.Map; import org.json.simple.JSONArray; import org.json.simple.JSONObject; @@ -32,8 +34,8 @@ import de.vipra.util.StringUtils; import de.vipra.util.Timer; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleStats; -import de.vipra.util.model.TopicDefinition; -import de.vipra.util.model.TopicMap; +import de.vipra.util.model.Topic; +import de.vipra.util.model.TopicCount; import de.vipra.util.service.DatabaseService; public class ImportCommand implements Command { @@ -45,7 +47,7 @@ public class ImportCommand implements Command { private JSONParser parser = new JSONParser(); private Config config; private DatabaseService<Article> dbArticles; - private DatabaseService<TopicDefinition> dbTopics; + private DatabaseService<Topic> dbTopics; private Filebase filebase; private Processor preprocessor; private LDAAnalyzer analyzer; @@ -155,14 +157,18 @@ public class ImportCommand implements Command { * @throws LDAAnalyzerException * @throws DatabaseException */ - private void saveTopicDefinitions() throws LDAAnalyzerException, DatabaseException { - ConvertStream<TopicDefinition> topics = analyzer.getTopicDefinitions(); + private Map<String, String> saveTopicDefinitions() throws LDAAnalyzerException, DatabaseException { + ConvertStream<Topic> topics = analyzer.getTopicDefinitions(); + Map<String, String> topicIndexMap = new HashMap<>(); // recreate topics in database dbTopics.drop(); - for (TopicDefinition topic : topics) { - dbTopics.createSingle(topic); + for (Topic topic : topics) { + Topic newTopic = dbTopics.createSingle(topic); + topicIndexMap.put(Integer.toString(newTopic.getIndex()), newTopic.getId()); } + + return topicIndexMap; } /** @@ -174,18 +180,25 @@ public class ImportCommand implements Command { * * @throws LDAAnalyzerException */ - private void saveTopicsPerDocument() throws LDAAnalyzerException { - ConvertStream<TopicMap> topics = analyzer.getTopics(); + private void saveTopicsPerDocument(Map<String, String> topicIndexMap) throws LDAAnalyzerException { + ConvertStream<List<TopicCount>> topics = analyzer.getTopics(); FilebaseIndex index = filebase.getIndex(); Iterator<String> indexIter = index.iterator(); - Iterator<TopicMap> topicIter = topics.iterator(); + Iterator<List<TopicCount>> topicIter = topics.iterator(); while (indexIter.hasNext() && topicIter.hasNext()) { String id = indexIter.next(); - TopicMap map = topicIter.next(); + List<TopicCount> topicCount = topicIter.next(); + for (TopicCount tc : topicCount) { + String oid = topicIndexMap.get(tc.getId()); + if (oid != null) + tc.setId(topicIndexMap.get(tc.getId())); + else + log.error("no object id for topic index " + tc.getId()); + } Article a = dbArticles.getSingle(id); - a.setTopics(map); + a.setTopics(topicCount); try { dbArticles.updateSingle(a); } catch (DatabaseException e) { @@ -199,7 +212,7 @@ public class ImportCommand implements Command { try { config = Config.getConfig(); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); - dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, TopicDefinition.class); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); filebase = Filebase.getFilebase(config); preprocessor = Processor.getPreprocessor(config); analyzer = LDAAnalyzer.getAnalyzer(config); @@ -226,9 +239,10 @@ public class ImportCommand implements Command { analyzer.analyze(); long durAnalyze = timer.lap(); + // save topic model out.info("saving topic models"); - saveTopicDefinitions(); - saveTopicsPerDocument(); + Map<String, String> topicIndexMap = saveTopicDefinitions(); + saveTopicsPerDocument(topicIndexMap); out.info("imported " + imported + " " + (imported == 1 ? "article" : "articles")); out.info("import: " + StringUtils.timeString(durImport) + ", analyze: " diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java index dd9e9963ba2d2719ae76656e3b322a55d778ac7d..8410d7848d626a094f22cef4252a4ff40b8c8489 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java @@ -13,7 +13,7 @@ import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; -import de.vipra.util.model.TopicDefinition; +import de.vipra.util.model.Topic; import de.vipra.util.service.DatabaseService; public class StatsCommand implements Command { @@ -23,7 +23,7 @@ public class StatsCommand implements Command { private Config config; private Filebase filebase; - private DatabaseService<TopicDefinition> dbTopics; + private DatabaseService<Topic> dbTopics; private void stats() { File modelFile = filebase.getModelFile(); @@ -38,7 +38,7 @@ public class StatsCommand implements Command { try { config = Config.getConfig(); filebase = Filebase.getFilebase(config); - dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, TopicDefinition.class); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); stats(); } catch (IOException | ConfigException | FilebaseException e) { diff --git a/vipra-rest/src/main/java/de/vipra/rest/model/TopicDefinition.java b/vipra-rest/src/main/java/de/vipra/rest/model/TopicDefinition.java index a514a4cd1248a8c00773f73818192ab490115435..0541f4a1625aa9376a895e3a9ddd7e791f92c984 100644 --- a/vipra-rest/src/main/java/de/vipra/rest/model/TopicDefinition.java +++ b/vipra-rest/src/main/java/de/vipra/rest/model/TopicDefinition.java @@ -4,7 +4,7 @@ import java.net.URI; import java.util.HashMap; import java.util.Map; -public class TopicDefinition extends de.vipra.util.model.TopicDefinition implements Linked { +public class TopicDefinition extends de.vipra.util.model.Topic implements Linked { private Map<String, String> links; diff --git a/vipra-util/src/main/java/de/vipra/util/model/Article.java b/vipra-util/src/main/java/de/vipra/util/model/Article.java index 9471bc878bf9da32b4f747634e9fa3330cdb8be9..0c92154dcc743a885777732e46cc0ad419008932 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Article.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Article.java @@ -4,6 +4,7 @@ import java.io.File; import java.io.IOException; import java.text.ParseException; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Date; import java.util.List; @@ -22,7 +23,7 @@ public class Article extends Model { private Date date; private boolean complete; private ArticleStats stats; - private TopicMap topics; + private List<TopicCount> topics; public String getTitle() { return title; @@ -79,11 +80,11 @@ public class Article extends Model { } catch (ParseException e) {} } - public TopicMap getTopics() { + public List<TopicCount> getTopics() { return topics; } - public void setTopics(TopicMap topics) { + public void setTopics(List<TopicCount> topics) { this.topics = topics; } @@ -98,8 +99,12 @@ public class Article extends Model { document.put("date", getDate()); if (getStats() != null) document.put("stats", getStats().toDocument()); - if (getTopics() != null) - document.put("topics", getTopics().toDocument()); + if (getTopics() != null) { + List<Document> topicDocs = new ArrayList<>(topics.size()); + for (TopicCount tc : topics) + topicDocs.add(tc.toDocument()); + document.put("topics", topicDocs); + } return document; } @@ -112,8 +117,13 @@ public class Article extends Model { setDate(document.getDate("date")); if (document.containsKey("stats")) setStats(new ArticleStats((Document) document.get("stats"))); - if (document.containsKey("topics")) - setTopics(new TopicMap((Document) document.get("topics"))); + if (document.containsKey("topics")) { + @SuppressWarnings("unchecked") + List<Document> topicDocs = (List<Document>) document.get("topics"); + topics = new ArrayList<>(topicDocs.size()); + for (Document doc : topicDocs) + topics.add(new TopicCount(doc)); + } } @Override diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicDefinition.java b/vipra-util/src/main/java/de/vipra/util/model/Topic.java similarity index 86% rename from vipra-util/src/main/java/de/vipra/util/model/TopicDefinition.java rename to vipra-util/src/main/java/de/vipra/util/model/Topic.java index dbaaecd6632133fcf3e983cc3abe87f1717eb99a..a55aa98feca3284f1843d03c627fb430fba5d0ba 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicDefinition.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Topic.java @@ -7,17 +7,18 @@ import java.util.List; import org.bson.Document; +import de.vipra.util.MongoUtils; import de.vipra.util.ex.NotImplementedException; -public class TopicDefinition extends Model { +public class Topic extends Model { private int index; private String name; private List<TopicWord> words; - public TopicDefinition() {} + public Topic() {} - public TopicDefinition(List<TopicWord> words) { + public Topic(List<TopicWord> words) { this.words = words; } @@ -48,6 +49,7 @@ public class TopicDefinition extends Model { @SuppressWarnings("unchecked") @Override public void fromDocument(Document document) { + setId(document.getObjectId("_id").toString()); setName(document.getString("name")); setIndex(document.getInteger("index", 0)); if (document.containsKey("words")) { @@ -62,6 +64,8 @@ public class TopicDefinition extends Model { @Override public Document toDocument() { Document document = new Document(); + if (getId() != null) + document.put("_id", MongoUtils.objectId(getId())); document.append("name", getName()); document.append("index", getIndex()); if (getWords() != null) { diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicCount.java b/vipra-util/src/main/java/de/vipra/util/model/TopicCount.java new file mode 100644 index 0000000000000000000000000000000000000000..3c6bc441a575eed25ad9e4f9d4a7f38e74e2277d --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicCount.java @@ -0,0 +1,58 @@ +package de.vipra.util.model; + +import org.bson.Document; + +import de.vipra.util.MongoUtils; + +public class TopicCount implements BsonDocument, Comparable<TopicCount> { + + private String id; + private int count; + + public TopicCount() {} + + public TopicCount(String id, int count) { + this.id = id; + this.count = count; + } + + public TopicCount(Document document) { + fromDocument(document); + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public int getCount() { + return count; + } + + public void setCount(int count) { + this.count = count; + } + + @Override + public Document toDocument() { + Document document = new Document(); + document.put("id", getId()); + document.append("count", count); + return document; + } + + @Override + public void fromDocument(Document document) { + this.id = document.getString("id"); + this.count = document.getInteger("count", 0); + } + + @Override + public int compareTo(TopicCount arg0) { + return count - arg0.getCount(); + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicMap.java b/vipra-util/src/main/java/de/vipra/util/model/TopicMap.java deleted file mode 100644 index c91fd6f76d00cbb186f357d26a647ec350360645..0000000000000000000000000000000000000000 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicMap.java +++ /dev/null @@ -1,39 +0,0 @@ -package de.vipra.util.model; - -import java.util.HashMap; - -import org.bson.Document; - -public class TopicMap extends HashMap<String, Integer> implements BsonDocument { - - private static final long serialVersionUID = 1L; - - public TopicMap() {} - - public TopicMap(Document document) { - fromDocument(document); - } - - public void put(String topic) { - Integer i = this.get(topic); - this.put(topic, i == null ? 1 : i + 1); - } - - @Override - public void fromDocument(Document document) { - clear(); - for (String key : document.keySet()) { - put(key, document.getInteger(key)); - } - } - - @Override - public Document toDocument() { - Document document = new Document(); - for (String key : keySet()) { - document.append(key, get(key)); - } - return document; - } - -}