Skip to content
Snippets Groups Projects
Commit 4fa565cb authored by Eike Cochu's avatar Eike Cochu
Browse files

removed topic map

topic counts are now stored as objectid/count pairs in an array in each article
renamed topicdefinition to topic
parent dfef8a45
No related branches found
No related tags found
No related merge requests found
Showing
with 152 additions and 88 deletions
...@@ -3,7 +3,11 @@ package de.vipra.cmd.lda; ...@@ -3,7 +3,11 @@ package de.vipra.cmd.lda;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -13,8 +17,8 @@ import de.vipra.util.Config; ...@@ -13,8 +17,8 @@ import de.vipra.util.Config;
import de.vipra.util.ConvertStream; import de.vipra.util.ConvertStream;
import de.vipra.util.StringUtils; import de.vipra.util.StringUtils;
import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.ConfigException;
import de.vipra.util.model.TopicDefinition; import de.vipra.util.model.Topic;
import de.vipra.util.model.TopicMap; import de.vipra.util.model.TopicCount;
import de.vipra.util.model.TopicWord; import de.vipra.util.model.TopicWord;
import jgibblda.Estimator; import jgibblda.Estimator;
import jgibblda.Inferencer; import jgibblda.Inferencer;
...@@ -77,13 +81,13 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { ...@@ -77,13 +81,13 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer {
} }
@Override @Override
public ConvertStream<TopicDefinition> getTopicDefinitions() throws LDAAnalyzerException { public ConvertStream<Topic> getTopicDefinitions() throws LDAAnalyzerException {
File twords = new File(modelDir, "jgibb.twords"); File twords = new File(modelDir, "jgibb.twords");
try { try {
return new ConvertStream<TopicDefinition>(twords) { return new ConvertStream<Topic>(twords) {
@Override @Override
public TopicDefinition convert(String line) { public Topic convert(String line) {
TopicDefinition topicDef = new TopicDefinition(); Topic topicDef = new Topic();
List<TopicWord> topicWords = new ArrayList<>(); List<TopicWord> topicWords = new ArrayList<>();
Integer index = StringUtils.getFirstNumber(line); Integer index = StringUtils.getFirstNumber(line);
if (index == null) { if (index == null) {
...@@ -115,19 +119,30 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { ...@@ -115,19 +119,30 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer {
} }
@Override @Override
public ConvertStream<TopicMap> getTopics() throws LDAAnalyzerException { public ConvertStream<List<TopicCount>> getTopics() throws LDAAnalyzerException {
File tassign = new File(modelDir, "jgibb.tassign"); File tassign = new File(modelDir, "jgibb.tassign");
try { try {
return new ConvertStream<TopicMap>(tassign) { return new ConvertStream<List<TopicCount>>(tassign) {
@Override @Override
public TopicMap convert(String line) { public List<TopicCount> convert(String line) {
TopicMap map = new TopicMap(); // count topics
Map<String, Integer> countMap = new HashMap<>();
String[] wordList = line.split("\\s+"); String[] wordList = line.split("\\s+");
for (String word : wordList) { for (String word : wordList) {
String[] wordTopic = word.split(":"); String topic = word.split(":")[1];
map.put(wordTopic[1]); Integer count = countMap.get(topic);
countMap.put(topic, count == null ? 1 : count + 1);
} }
return map;
// turn into list
List<TopicCount> topicCount = new ArrayList<>(countMap.size());
for (Entry<String, Integer> e : countMap.entrySet()) {
topicCount.add(new TopicCount(e.getKey(), e.getValue()));
}
Collections.sort(topicCount, Collections.reverseOrder());
return topicCount;
} }
}; };
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
......
package de.vipra.cmd.lda; package de.vipra.cmd.lda;
import java.util.List;
import de.vipra.cmd.ex.LDAAnalyzerException; import de.vipra.cmd.ex.LDAAnalyzerException;
import de.vipra.util.Config; import de.vipra.util.Config;
import de.vipra.util.Constants; import de.vipra.util.Constants;
import de.vipra.util.ConvertStream; import de.vipra.util.ConvertStream;
import de.vipra.util.model.TopicDefinition; import de.vipra.util.model.Topic;
import de.vipra.util.model.TopicMap; import de.vipra.util.model.TopicCount;
import de.vipra.util.Config.Key; import de.vipra.util.Config.Key;
public abstract class LDAAnalyzer { public abstract class LDAAnalyzer {
...@@ -24,9 +26,9 @@ public abstract class LDAAnalyzer { ...@@ -24,9 +26,9 @@ public abstract class LDAAnalyzer {
public abstract void analyze() throws LDAAnalyzerException; public abstract void analyze() throws LDAAnalyzerException;
public abstract ConvertStream<TopicDefinition> getTopicDefinitions() throws LDAAnalyzerException; public abstract ConvertStream<Topic> getTopicDefinitions() throws LDAAnalyzerException;
public abstract ConvertStream<TopicMap> getTopics() throws LDAAnalyzerException; public abstract ConvertStream<List<TopicCount>> getTopics() throws LDAAnalyzerException;
public static LDAAnalyzer getAnalyzer(Config config) throws LDAAnalyzerException { public static LDAAnalyzer getAnalyzer(Config config) throws LDAAnalyzerException {
LDAAnalyzer analyzer = null; LDAAnalyzer analyzer = null;
......
...@@ -14,7 +14,7 @@ import de.vipra.util.Config; ...@@ -14,7 +14,7 @@ import de.vipra.util.Config;
import de.vipra.util.ConsoleUtils; import de.vipra.util.ConsoleUtils;
import de.vipra.util.Constants; import de.vipra.util.Constants;
import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.ConfigException;
import de.vipra.util.model.TopicDefinition; import de.vipra.util.model.Topic;
import de.vipra.util.service.DatabaseService; import de.vipra.util.service.DatabaseService;
public class ClearCommand implements Command { public class ClearCommand implements Command {
...@@ -25,7 +25,7 @@ public class ClearCommand implements Command { ...@@ -25,7 +25,7 @@ public class ClearCommand implements Command {
private boolean defaults; private boolean defaults;
private Config config; private Config config;
private DatabaseService<Article> dbArticles; private DatabaseService<Article> dbArticles;
private DatabaseService<TopicDefinition> dbTopics; private DatabaseService<Topic> dbTopics;
public ClearCommand(boolean defaults) { public ClearCommand(boolean defaults) {
this.defaults = defaults; this.defaults = defaults;
...@@ -35,7 +35,7 @@ public class ClearCommand implements Command { ...@@ -35,7 +35,7 @@ public class ClearCommand implements Command {
try { try {
config = Config.getConfig(); config = Config.getConfig();
dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class);
dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, TopicDefinition.class); dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class);
} catch (Exception e) { } catch (Exception e) {
throw new ClearException(e); throw new ClearException(e);
} }
......
...@@ -6,8 +6,10 @@ import java.io.FileReader; ...@@ -6,8 +6,10 @@ import java.io.FileReader;
import java.io.FilenameFilter; import java.io.FilenameFilter;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map;
import org.json.simple.JSONArray; import org.json.simple.JSONArray;
import org.json.simple.JSONObject; import org.json.simple.JSONObject;
...@@ -32,8 +34,8 @@ import de.vipra.util.StringUtils; ...@@ -32,8 +34,8 @@ import de.vipra.util.StringUtils;
import de.vipra.util.Timer; import de.vipra.util.Timer;
import de.vipra.util.ex.DatabaseException; import de.vipra.util.ex.DatabaseException;
import de.vipra.util.model.ArticleStats; import de.vipra.util.model.ArticleStats;
import de.vipra.util.model.TopicDefinition; import de.vipra.util.model.Topic;
import de.vipra.util.model.TopicMap; import de.vipra.util.model.TopicCount;
import de.vipra.util.service.DatabaseService; import de.vipra.util.service.DatabaseService;
public class ImportCommand implements Command { public class ImportCommand implements Command {
...@@ -45,7 +47,7 @@ public class ImportCommand implements Command { ...@@ -45,7 +47,7 @@ public class ImportCommand implements Command {
private JSONParser parser = new JSONParser(); private JSONParser parser = new JSONParser();
private Config config; private Config config;
private DatabaseService<Article> dbArticles; private DatabaseService<Article> dbArticles;
private DatabaseService<TopicDefinition> dbTopics; private DatabaseService<Topic> dbTopics;
private Filebase filebase; private Filebase filebase;
private Processor preprocessor; private Processor preprocessor;
private LDAAnalyzer analyzer; private LDAAnalyzer analyzer;
...@@ -155,14 +157,18 @@ public class ImportCommand implements Command { ...@@ -155,14 +157,18 @@ public class ImportCommand implements Command {
* @throws LDAAnalyzerException * @throws LDAAnalyzerException
* @throws DatabaseException * @throws DatabaseException
*/ */
private void saveTopicDefinitions() throws LDAAnalyzerException, DatabaseException { private Map<String, String> saveTopicDefinitions() throws LDAAnalyzerException, DatabaseException {
ConvertStream<TopicDefinition> topics = analyzer.getTopicDefinitions(); ConvertStream<Topic> topics = analyzer.getTopicDefinitions();
Map<String, String> topicIndexMap = new HashMap<>();
// recreate topics in database // recreate topics in database
dbTopics.drop(); dbTopics.drop();
for (TopicDefinition topic : topics) { for (Topic topic : topics) {
dbTopics.createSingle(topic); Topic newTopic = dbTopics.createSingle(topic);
topicIndexMap.put(Integer.toString(newTopic.getIndex()), newTopic.getId());
} }
return topicIndexMap;
} }
/** /**
...@@ -174,18 +180,25 @@ public class ImportCommand implements Command { ...@@ -174,18 +180,25 @@ public class ImportCommand implements Command {
* *
* @throws LDAAnalyzerException * @throws LDAAnalyzerException
*/ */
private void saveTopicsPerDocument() throws LDAAnalyzerException { private void saveTopicsPerDocument(Map<String, String> topicIndexMap) throws LDAAnalyzerException {
ConvertStream<TopicMap> topics = analyzer.getTopics(); ConvertStream<List<TopicCount>> topics = analyzer.getTopics();
FilebaseIndex index = filebase.getIndex(); FilebaseIndex index = filebase.getIndex();
Iterator<String> indexIter = index.iterator(); Iterator<String> indexIter = index.iterator();
Iterator<TopicMap> topicIter = topics.iterator(); Iterator<List<TopicCount>> topicIter = topics.iterator();
while (indexIter.hasNext() && topicIter.hasNext()) { while (indexIter.hasNext() && topicIter.hasNext()) {
String id = indexIter.next(); String id = indexIter.next();
TopicMap map = topicIter.next(); List<TopicCount> topicCount = topicIter.next();
for (TopicCount tc : topicCount) {
String oid = topicIndexMap.get(tc.getId());
if (oid != null)
tc.setId(topicIndexMap.get(tc.getId()));
else
log.error("no object id for topic index " + tc.getId());
}
Article a = dbArticles.getSingle(id); Article a = dbArticles.getSingle(id);
a.setTopics(map); a.setTopics(topicCount);
try { try {
dbArticles.updateSingle(a); dbArticles.updateSingle(a);
} catch (DatabaseException e) { } catch (DatabaseException e) {
...@@ -199,7 +212,7 @@ public class ImportCommand implements Command { ...@@ -199,7 +212,7 @@ public class ImportCommand implements Command {
try { try {
config = Config.getConfig(); config = Config.getConfig();
dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class);
dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, TopicDefinition.class); dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class);
filebase = Filebase.getFilebase(config); filebase = Filebase.getFilebase(config);
preprocessor = Processor.getPreprocessor(config); preprocessor = Processor.getPreprocessor(config);
analyzer = LDAAnalyzer.getAnalyzer(config); analyzer = LDAAnalyzer.getAnalyzer(config);
...@@ -226,9 +239,10 @@ public class ImportCommand implements Command { ...@@ -226,9 +239,10 @@ public class ImportCommand implements Command {
analyzer.analyze(); analyzer.analyze();
long durAnalyze = timer.lap(); long durAnalyze = timer.lap();
// save topic model
out.info("saving topic models"); out.info("saving topic models");
saveTopicDefinitions(); Map<String, String> topicIndexMap = saveTopicDefinitions();
saveTopicsPerDocument(); saveTopicsPerDocument(topicIndexMap);
out.info("imported " + imported + " " + (imported == 1 ? "article" : "articles")); out.info("imported " + imported + " " + (imported == 1 ? "article" : "articles"));
out.info("import: " + StringUtils.timeString(durImport) + ", analyze: " out.info("import: " + StringUtils.timeString(durImport) + ", analyze: "
......
...@@ -13,7 +13,7 @@ import de.vipra.util.Config; ...@@ -13,7 +13,7 @@ import de.vipra.util.Config;
import de.vipra.util.Constants; import de.vipra.util.Constants;
import de.vipra.util.StringUtils; import de.vipra.util.StringUtils;
import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.ConfigException;
import de.vipra.util.model.TopicDefinition; import de.vipra.util.model.Topic;
import de.vipra.util.service.DatabaseService; import de.vipra.util.service.DatabaseService;
public class StatsCommand implements Command { public class StatsCommand implements Command {
...@@ -23,7 +23,7 @@ public class StatsCommand implements Command { ...@@ -23,7 +23,7 @@ public class StatsCommand implements Command {
private Config config; private Config config;
private Filebase filebase; private Filebase filebase;
private DatabaseService<TopicDefinition> dbTopics; private DatabaseService<Topic> dbTopics;
private void stats() { private void stats() {
File modelFile = filebase.getModelFile(); File modelFile = filebase.getModelFile();
...@@ -38,7 +38,7 @@ public class StatsCommand implements Command { ...@@ -38,7 +38,7 @@ public class StatsCommand implements Command {
try { try {
config = Config.getConfig(); config = Config.getConfig();
filebase = Filebase.getFilebase(config); filebase = Filebase.getFilebase(config);
dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, TopicDefinition.class); dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class);
stats(); stats();
} catch (IOException | ConfigException | FilebaseException e) { } catch (IOException | ConfigException | FilebaseException e) {
......
...@@ -4,7 +4,7 @@ import java.net.URI; ...@@ -4,7 +4,7 @@ import java.net.URI;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
public class TopicDefinition extends de.vipra.util.model.TopicDefinition implements Linked { public class TopicDefinition extends de.vipra.util.model.Topic implements Linked {
private Map<String, String> links; private Map<String, String> links;
......
...@@ -4,6 +4,7 @@ import java.io.File; ...@@ -4,6 +4,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
...@@ -22,7 +23,7 @@ public class Article extends Model { ...@@ -22,7 +23,7 @@ public class Article extends Model {
private Date date; private Date date;
private boolean complete; private boolean complete;
private ArticleStats stats; private ArticleStats stats;
private TopicMap topics; private List<TopicCount> topics;
public String getTitle() { public String getTitle() {
return title; return title;
...@@ -79,11 +80,11 @@ public class Article extends Model { ...@@ -79,11 +80,11 @@ public class Article extends Model {
} catch (ParseException e) {} } catch (ParseException e) {}
} }
public TopicMap getTopics() { public List<TopicCount> getTopics() {
return topics; return topics;
} }
public void setTopics(TopicMap topics) { public void setTopics(List<TopicCount> topics) {
this.topics = topics; this.topics = topics;
} }
...@@ -98,8 +99,12 @@ public class Article extends Model { ...@@ -98,8 +99,12 @@ public class Article extends Model {
document.put("date", getDate()); document.put("date", getDate());
if (getStats() != null) if (getStats() != null)
document.put("stats", getStats().toDocument()); document.put("stats", getStats().toDocument());
if (getTopics() != null) if (getTopics() != null) {
document.put("topics", getTopics().toDocument()); List<Document> topicDocs = new ArrayList<>(topics.size());
for (TopicCount tc : topics)
topicDocs.add(tc.toDocument());
document.put("topics", topicDocs);
}
return document; return document;
} }
...@@ -112,8 +117,13 @@ public class Article extends Model { ...@@ -112,8 +117,13 @@ public class Article extends Model {
setDate(document.getDate("date")); setDate(document.getDate("date"));
if (document.containsKey("stats")) if (document.containsKey("stats"))
setStats(new ArticleStats((Document) document.get("stats"))); setStats(new ArticleStats((Document) document.get("stats")));
if (document.containsKey("topics")) if (document.containsKey("topics")) {
setTopics(new TopicMap((Document) document.get("topics"))); @SuppressWarnings("unchecked")
List<Document> topicDocs = (List<Document>) document.get("topics");
topics = new ArrayList<>(topicDocs.size());
for (Document doc : topicDocs)
topics.add(new TopicCount(doc));
}
} }
@Override @Override
......
...@@ -7,17 +7,18 @@ import java.util.List; ...@@ -7,17 +7,18 @@ import java.util.List;
import org.bson.Document; import org.bson.Document;
import de.vipra.util.MongoUtils;
import de.vipra.util.ex.NotImplementedException; import de.vipra.util.ex.NotImplementedException;
public class TopicDefinition extends Model { public class Topic extends Model {
private int index; private int index;
private String name; private String name;
private List<TopicWord> words; private List<TopicWord> words;
public TopicDefinition() {} public Topic() {}
public TopicDefinition(List<TopicWord> words) { public Topic(List<TopicWord> words) {
this.words = words; this.words = words;
} }
...@@ -48,6 +49,7 @@ public class TopicDefinition extends Model { ...@@ -48,6 +49,7 @@ public class TopicDefinition extends Model {
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
@Override @Override
public void fromDocument(Document document) { public void fromDocument(Document document) {
setId(document.getObjectId("_id").toString());
setName(document.getString("name")); setName(document.getString("name"));
setIndex(document.getInteger("index", 0)); setIndex(document.getInteger("index", 0));
if (document.containsKey("words")) { if (document.containsKey("words")) {
...@@ -62,6 +64,8 @@ public class TopicDefinition extends Model { ...@@ -62,6 +64,8 @@ public class TopicDefinition extends Model {
@Override @Override
public Document toDocument() { public Document toDocument() {
Document document = new Document(); Document document = new Document();
if (getId() != null)
document.put("_id", MongoUtils.objectId(getId()));
document.append("name", getName()); document.append("name", getName());
document.append("index", getIndex()); document.append("index", getIndex());
if (getWords() != null) { if (getWords() != null) {
......
package de.vipra.util.model;
import org.bson.Document;
import de.vipra.util.MongoUtils;
/**
 * Pairs a topic identifier with the number of times that topic occurs.
 *
 * Instances convert to and from a BSON {@link Document} with the keys
 * {@code "id"} and {@code "count"}. The natural ordering sorts by count in
 * ascending order; the id does not take part in the comparison, so two
 * instances with different ids but equal counts compare as equal.
 */
public class TopicCount implements BsonDocument, Comparable<TopicCount> {

	/** Identifier of the counted topic. */
	private String id;

	/** Number of occurrences of the topic. */
	private int count;

	/** Creates an empty instance with no id and a count of zero. */
	public TopicCount() {}

	/**
	 * Creates an instance from explicit values.
	 *
	 * @param id
	 *            identifier of the topic
	 * @param count
	 *            number of occurrences of the topic
	 */
	public TopicCount(String id, int count) {
		this.id = id;
		this.count = count;
	}

	/**
	 * Creates an instance from its document representation.
	 *
	 * @param document
	 *            document to read the "id" and "count" values from
	 */
	public TopicCount(Document document) {
		fromDocument(document);
	}

	/** @return identifier of the topic */
	public String getId() {
		return id;
	}

	/**
	 * @param id
	 *            new identifier of the topic
	 */
	public void setId(String id) {
		this.id = id;
	}

	/** @return number of occurrences of the topic */
	public int getCount() {
		return count;
	}

	/**
	 * @param count
	 *            new number of occurrences of the topic
	 */
	public void setCount(int count) {
		this.count = count;
	}

	/**
	 * Serializes this instance into a new document.
	 *
	 * @return document holding the id under "id" and the count under "count"
	 */
	@Override
	public Document toDocument() {
		Document document = new Document();
		document.put("id", getId());
		document.append("count", count);
		return document;
	}

	/**
	 * Overwrites the state of this instance with the values of a document.
	 *
	 * @param document
	 *            document to read from; a missing "count" is read as 0
	 */
	@Override
	public void fromDocument(Document document) {
		this.id = document.getString("id");
		this.count = document.getInteger("count", 0);
	}

	/**
	 * Compares two topic counts by their count, ascending.
	 *
	 * @param arg0
	 *            the topic count to compare against
	 * @return a negative value, zero or a positive value if this count is
	 *         smaller than, equal to or greater than the other count
	 */
	@Override
	public int compareTo(TopicCount arg0) {
		// Integer.compare instead of subtraction: a difference of two ints can
		// overflow and flip the sign of the result.
		return Integer.compare(count, arg0.getCount());
	}

}
package de.vipra.util.model;
import java.util.HashMap;
import org.bson.Document;
/**
 * Hash map from topic name to an occurrence counter that can be converted to
 * and from a BSON {@link Document}. Every map key becomes a document key and
 * the counter becomes its value.
 */
public class TopicMap extends HashMap<String, Integer> implements BsonDocument {

	private static final long serialVersionUID = 1L;

	/** Creates an empty topic map. */
	public TopicMap() {}

	/**
	 * Creates a topic map filled with the entries of a document.
	 *
	 * @param document
	 *            document whose keys and integer values are copied
	 */
	public TopicMap(Document document) {
		fromDocument(document);
	}

	/**
	 * Counts one more occurrence of a topic. A topic without a stored counter
	 * starts at 1.
	 *
	 * @param topic
	 *            topic to count
	 */
	public void put(String topic) {
		Integer previous = this.get(topic);
		if (previous == null) {
			this.put(topic, 1);
		} else {
			this.put(topic, previous + 1);
		}
	}

	/**
	 * Discards the current entries and reads every key of the document,
	 * together with its integer value, into this map.
	 *
	 * @param document
	 *            document to read from
	 */
	@Override
	public void fromDocument(Document document) {
		clear();
		for (String topic : document.keySet()) {
			Integer counter = document.getInteger(topic);
			put(topic, counter);
		}
	}

	/**
	 * Writes all entries of this map into a new document.
	 *
	 * @return document with one key per topic and the counter as value
	 */
	@Override
	public Document toDocument() {
		Document result = new Document();
		for (java.util.Map.Entry<String, Integer> entry : entrySet()) {
			result.append(entry.getKey(), entry.getValue());
		}
		return result;
	}

}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment