diff --git a/ma-impl.sublime-workspace b/ma-impl.sublime-workspace index 15db5673770ad9158b7eae57022adafa9073223d..badf5bebf68c6d84bf2b41dd6044e8c6793028c7 100644 --- a/ma-impl.sublime-workspace +++ b/ma-impl.sublime-workspace @@ -272,10 +272,10 @@ "buffers": [ { - "file": "Vagrantfile", + "file": "/home/eike/.local/share/vipra/jgibb/jgibb.tassign", "settings": { - "buffer_size": 955, + "buffer_size": 17796, "line_ending": "Unix" } } @@ -461,8 +461,8 @@ ], "file_history": [ - "/home/eike/.local/share/vipra/jgibb/jgibb.twords", "/home/eike/.local/share/vipra/jgibb/jgibb.tassign", + "/home/eike/.local/share/vipra/jgibb/jgibb.twords", "/home/eike/Repositories/fu/ss15/ma/impl/TODO", "/home/eike/.local/share/vipra/jgibb/vocab", "/home/eike/.local/share/vipra/jgibb/index", @@ -914,19 +914,19 @@ [ { "buffer": 0, - "file": "Vagrantfile", - "semi_transient": true, + "file": "/home/eike/.local/share/vipra/jgibb/jgibb.tassign", + "semi_transient": false, "settings": { - "buffer_size": 955, + "buffer_size": 17796, "regions": { }, "selection": [ [ - 797, - 797 + 4764, + 4764 ] ], "settings": @@ -934,68 +934,66 @@ "BracketHighlighterBusy": false, "bh_regions": [ + "bh_angle", + "bh_angle_center", + "bh_angle_open", + "bh_angle_close", + "bh_angle_content", "bh_c_define", "bh_c_define_center", "bh_c_define_open", "bh_c_define_close", "bh_c_define_content", - "bh_square", - "bh_square_center", - "bh_square_open", - "bh_square_close", - "bh_square_content", - "bh_single_quote", - "bh_single_quote_center", - "bh_single_quote_open", - "bh_single_quote_close", - "bh_single_quote_content", "bh_curly", "bh_curly_center", "bh_curly_open", "bh_curly_close", "bh_curly_content", + "bh_regex", + "bh_regex_center", + "bh_regex_open", + "bh_regex_close", + "bh_regex_content", + "bh_square", + "bh_square_center", + "bh_square_open", + "bh_square_close", + "bh_square_content", "bh_double_quote", "bh_double_quote_center", "bh_double_quote_open", "bh_double_quote_close", "bh_double_quote_content", - "bh_tag", - "bh_tag_center", - "bh_tag_open", - "bh_tag_close", - "bh_tag_content", - "bh_angle", - "bh_angle_center", - "bh_angle_open", - "bh_angle_close", - "bh_angle_content", "bh_round", "bh_round_center", "bh_round_open", "bh_round_close", "bh_round_content", + "bh_unmatched", + "bh_unmatched_center", + "bh_unmatched_open", + "bh_unmatched_close", + "bh_unmatched_content", + "bh_single_quote", + "bh_single_quote_center", + "bh_single_quote_open", + "bh_single_quote_close", + "bh_single_quote_content", "bh_default", "bh_default_center", "bh_default_open", "bh_default_close", "bh_default_content", - "bh_regex", - "bh_regex_center", - "bh_regex_open", - "bh_regex_close", - "bh_regex_content", - "bh_unmatched", - "bh_unmatched_center", - "bh_unmatched_open", - "bh_unmatched_close", - "bh_unmatched_content" + "bh_tag", + "bh_tag_center", + "bh_tag_open", + "bh_tag_close", + "bh_tag_content" ], "incomplete_sync": null, "remote_loading": false, "synced": false, - "syntax": "Packages/Ruby/Ruby.sublime-syntax", - "tab_size": 2, - "translate_tabs_to_spaces": true + "syntax": "Packages/Text/Plain text.tmLanguage" }, "translation.x": 0.0, "translation.y": 0.0, diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java index 5b9a1887284a0d9bd8daae2e6c4a9a9b758b8aa8..e26181118401e883093e4a2c0f0919bc580ba2ec 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java @@ -1,10 +1,7 @@ package de.vipra.cmd.lda; -import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; +import java.io.FileNotFoundException; import java.util.ArrayList; import java.util.List; @@ -12,15 +9,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import de.vipra.cmd.ex.LDAAnalyzerException; -import de.vipra.cmd.model.Article; import de.vipra.util.Config; -import de.vipra.util.FileUtils; -import de.vipra.util.FrequencyList; +import de.vipra.util.ConvertStream; +import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; -import de.vipra.util.ex.DatabaseException; -import de.vipra.util.model.Topic; +import de.vipra.util.model.TopicDefinition; +import de.vipra.util.model.TopicMap; import de.vipra.util.model.TopicWord; -import de.vipra.util.service.DatabaseService; import jgibblda.Estimator; import jgibblda.Inferencer; import jgibblda.LDACmdOption; @@ -60,14 +55,6 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { options.modelName = "jgibb"; } - @Override - public void analyze() throws LDAAnalyzerException { - if (!modelFile.exists()) { - throw new LDAAnalyzerException("model file does not exist: " + modelFile.getAbsolutePath()); - } - estimate(); - } - private void estimate() { Estimator estimator = new Estimator(); estimator.init(options); @@ -81,35 +68,69 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { Model newModel = inferencer.inference(); } - private List<Topic> readTopics() throws IOException { + @Override + public void analyze() throws LDAAnalyzerException { + if (!modelFile.exists()) { + throw new LDAAnalyzerException("model file does not exist: " + modelFile.getAbsolutePath()); + } + estimate(); + } + + @Override + public ConvertStream<TopicDefinition> getTopicDefinitions() throws LDAAnalyzerException { File twords = new File(modelDir, "jgibb.twords"); - List<String> lines = FileUtils.readFile(twords); - List<Topic> topics = new ArrayList<>(); - List<TopicWord> topicWords = null; - for (String line : lines) { - if (line.startsWith("\t")) { - String[] parts = line.trim().split("\\s+"); - topicWords.add(new TopicWord(parts[0], Double.parseDouble(parts[1]))); - } else { - if (topicWords != null) - topics.add(new Topic(topicWords)); - topicWords = new ArrayList<>(); - } + try { + return new ConvertStream<TopicDefinition>(twords) { + @Override + public TopicDefinition convert(String line) { + TopicDefinition topicDef = new TopicDefinition(); + List<TopicWord> topicWords = new ArrayList<>(); + Integer index = StringUtils.getFirstNumber(line); + if (index == null) { + log.error("could not extract topic index from line: " + line); + } else { + topicDef.setIndex(index); + } + String nextLine; + while ((nextLine = nextLine()) != null) { + if (nextLine.startsWith("\t")) { + String[] parts = nextLine.trim().split("\\s+"); + try { + topicWords.add(new TopicWord(parts[0], Double.parseDouble(parts[1]))); + } catch (NumberFormatException e) { + log.error("could not parse number in line: " + nextLine); + } + } else { + buffer(nextLine); + break; + } + } + topicDef.setWords(topicWords); + return topicDef; + } + }; + } catch (FileNotFoundException e) { + throw new LDAAnalyzerException(e); } - return topics; } @Override - public void save(DatabaseService<Article> dbArticles, DatabaseService<Topic> dbTopics) throws LDAAnalyzerException { + public ConvertStream<TopicMap> getTopics() throws LDAAnalyzerException { + File tassign = new File(modelDir, "jgibb.tassign"); try { - List<Topic> topics = readTopics(); - - // recreate topics in database - dbTopics.drop(); - for (Topic topic : topics) { - dbTopics.createSingle(topic); - } - } catch (IOException | DatabaseException e) { + return new ConvertStream<TopicMap>(tassign) { + @Override + public TopicMap convert(String line) { + TopicMap map = new TopicMap(); + String[] wordList = line.split("\\s+"); + for (String word : wordList) { + String[] wordTopic = word.split(":"); + map.put(wordTopic[1]); + } + return map; + } + }; + } catch (FileNotFoundException e) { throw new LDAAnalyzerException(e); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java index c352eb8958bacab4c5a665249a5f8df451482016..51a92cef369438dea98ab705f61c1c74801afb7f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/LDAAnalyzer.java @@ -1,11 +1,11 @@ package de.vipra.cmd.lda; import de.vipra.cmd.ex.LDAAnalyzerException; -import de.vipra.cmd.model.Article; import de.vipra.util.Config; import de.vipra.util.Constants; -import de.vipra.util.model.Topic; -import de.vipra.util.service.DatabaseService; +import de.vipra.util.ConvertStream; +import de.vipra.util.model.TopicDefinition; +import de.vipra.util.model.TopicMap; import de.vipra.util.Config.Key; public abstract class LDAAnalyzer { @@ -24,8 +24,9 @@ public abstract class LDAAnalyzer { public abstract void analyze() throws LDAAnalyzerException; - public abstract void save(DatabaseService<Article> dbArticles, DatabaseService<Topic> dbTopics) - throws LDAAnalyzerException; + public abstract ConvertStream<TopicDefinition> getTopicDefinitions() throws LDAAnalyzerException; + + public abstract ConvertStream<TopicMap> getTopics() throws LDAAnalyzerException; public static LDAAnalyzer getAnalyzer(Config config) throws LDAAnalyzerException { LDAAnalyzer analyzer = null; diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java index 9f8ac68ac6cf5249e19b01c2ca6393237daa91b5..653ab04700645cb82ddedd26fbe9ee9b76ae1c1e 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java @@ -14,7 +14,7 @@ import de.vipra.util.Config; import de.vipra.util.ConsoleUtils; import de.vipra.util.Constants; import de.vipra.util.ex.ConfigException; -import de.vipra.util.model.Topic; +import de.vipra.util.model.TopicDefinition; import de.vipra.util.service.DatabaseService; public class ClearCommand implements Command { @@ -25,7 +25,7 @@ public class ClearCommand implements Command { private boolean defaults; private Config config; private DatabaseService<Article> dbArticles; - private DatabaseService<Topic> dbTopics; + private DatabaseService<TopicDefinition> dbTopics; public ClearCommand(boolean defaults) { this.defaults = defaults; @@ -35,7 +35,7 @@ public class ClearCommand implements Command { try { config = Config.getConfig(); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); - dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, TopicDefinition.class); } catch (Exception e) { throw new ClearException(e); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 498f6b229c606ad19d69ab811baa2d9b0551a7f1..78458237ada19fd22b7dcfe8830d1cbb9076af8f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -6,6 +6,8 @@ import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; import org.json.simple.JSONArray; import org.json.simple.JSONObject; @@ -16,17 +18,22 @@ import org.slf4j.LoggerFactory; import de.vipra.cmd.ExecutionException; import de.vipra.cmd.ex.ImportException; +import de.vipra.cmd.ex.LDAAnalyzerException; import de.vipra.cmd.file.Filebase; +import de.vipra.cmd.file.FilebaseIndex; import de.vipra.cmd.lda.LDAAnalyzer; import de.vipra.cmd.model.Article; import de.vipra.cmd.text.Processor; import de.vipra.cmd.text.ProcessedText; import de.vipra.util.Config; import de.vipra.util.Constants; +import de.vipra.util.ConvertStream; import de.vipra.util.StringUtils; import de.vipra.util.Timer; +import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleStats; -import de.vipra.util.model.Topic; +import de.vipra.util.model.TopicDefinition; +import de.vipra.util.model.TopicMap; import de.vipra.util.service.DatabaseService; public class ImportCommand implements Command { @@ -38,7 +45,7 @@ public class ImportCommand implements Command { private JSONParser parser = new JSONParser(); private Config config; private DatabaseService<Article> dbArticles; - private DatabaseService<Topic> dbTopics; + private DatabaseService<TopicDefinition> dbTopics; private Filebase filebase; private Processor preprocessor; private LDAAnalyzer analyzer; @@ -131,12 +138,68 @@ public class ImportCommand implements Command { return imported; } + private long importFiles(List<File> files) + throws FileNotFoundException, IOException, ParseException, ImportException { + long imported = 0; + for (File file : files) { + imported += importFile(file); + } + return imported; + } + + /** + * Saves topic definitions into a database collection. Topic definitions + * contain the words assigned to that topic and the likeliness, that a word + * belongs to that topic. + * + * @throws LDAAnalyzerException + * @throws DatabaseException + */ + private void saveTopicDefinitions() throws LDAAnalyzerException, DatabaseException { + ConvertStream<TopicDefinition> topics = analyzer.getTopicDefinitions(); + + // recreate topics in database + dbTopics.drop(); + for (TopicDefinition topic : topics) { + dbTopics.createSingle(topic); + } + } + + /** + * The analyzer saves the topics assigned to document words in the + * "*.tassign" file. This file is read line by line, each line is a single + * document. The line number corresponds to the line number in the index + * file, which holds the object id of that article. The topics are extracted + * and stored in the document. + * + * @throws LDAAnalyzerException + */ + private void saveTopicsPerDocument() throws LDAAnalyzerException { + ConvertStream<TopicMap> topics = analyzer.getTopics(); + FilebaseIndex index = filebase.getIndex(); + + Iterator<String> indexIter = index.iterator(); + Iterator<TopicMap> topicIter = topics.iterator(); + + while (indexIter.hasNext() && topicIter.hasNext()) { + String id = indexIter.next(); + TopicMap map = topicIter.next(); + Article a = dbArticles.getSingle(id); + a.setTopics(map); + try { + dbArticles.updateSingle(a); + } catch (DatabaseException e) { + log.error("could not update article: " + a.getTitle() + " (" + a.getId() + ")"); + } + } + } + @Override public void run() throws ExecutionException { try { config = Config.getConfig(); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); - dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, TopicDefinition.class); filebase = Filebase.getFilebase(config); preprocessor = Processor.getPreprocessor(config); analyzer = LDAAnalyzer.getAnalyzer(config); @@ -150,10 +213,7 @@ public class ImportCommand implements Command { // import files into database and filebase out.info("file import"); - long imported = 0; - for (File file : files) { - imported += importFile(file); - } + long imported = importFiles(files); long durImport = timer.lap(); // write filebase @@ -167,7 +227,8 @@ public class ImportCommand implements Command { long durAnalyze = timer.lap(); out.info("saving topic models"); - analyzer.save(dbArticles, dbTopics); + saveTopicDefinitions(); + saveTopicsPerDocument(); out.info("imported " + imported + " " + (imported == 1 ? "article" : "articles")); out.info("import: " + StringUtils.timeString(durImport) + ", analyze: " diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java index 8410d7848d626a094f22cef4252a4ff40b8c8489..dd9e9963ba2d2719ae76656e3b322a55d778ac7d 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java @@ -13,7 +13,7 @@ import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; -import de.vipra.util.model.Topic; +import de.vipra.util.model.TopicDefinition; import de.vipra.util.service.DatabaseService; public class StatsCommand implements Command { @@ -23,7 +23,7 @@ public class StatsCommand implements Command { private Config config; private Filebase filebase; - private DatabaseService<Topic> dbTopics; + private DatabaseService<TopicDefinition> dbTopics; private void stats() { File modelFile = filebase.getModelFile(); @@ -38,7 +38,7 @@ public class StatsCommand implements Command { try { config = Config.getConfig(); filebase = Filebase.getFilebase(config); - dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, TopicDefinition.class); stats(); } catch (IOException | ConfigException | FilebaseException e) { diff --git a/vipra-rest/src/main/java/de/vipra/rest/model/Topic.java b/vipra-rest/src/main/java/de/vipra/rest/model/Topic.java index 3cd619149cfc914baea052b92aa5fee8ae6f3323..d99d427335272ef88310d4bdb0dd64c7fe1dab97 100644 --- a/vipra-rest/src/main/java/de/vipra/rest/model/Topic.java +++ b/vipra-rest/src/main/java/de/vipra/rest/model/Topic.java @@ -4,7 +4,7 @@ import java.net.URI; import java.util.HashMap; import java.util.Map; -public class Topic extends de.vipra.util.model.Topic { +public class Topic extends de.vipra.util.model.TopicDefinition { private Map<String, String> links; diff --git a/vipra-rest/src/main/java/de/vipra/rest/serializer/ArticleSerializer.java b/vipra-rest/src/main/java/de/vipra/rest/serializer/ArticleSerializer.java index 071fcdef42e6c7df408fc534003df89a59767e61..b1c1e49b98ad28b92fc013d0ca3fd6b0aab0eb8d 100644 --- a/vipra-rest/src/main/java/de/vipra/rest/serializer/ArticleSerializer.java +++ b/vipra-rest/src/main/java/de/vipra/rest/serializer/ArticleSerializer.java @@ -18,7 +18,7 @@ public class ArticleSerializer extends JsonSerializer<Article> { throws IOException, JsonProcessingException { gen.writeStartObject(); gen.writeStringField("id", value.getId()); - gen.writeStringField("type", value.getType()); + gen.writeStringField("type", "article"); if (value.getLinks() != null) gen.writeObjectField("links", value.getLinks()); diff --git a/vipra-util/src/main/java/de/vipra/util/ConvertStream.java b/vipra-util/src/main/java/de/vipra/util/ConvertStream.java new file mode 100644 index 0000000000000000000000000000000000000000..c5dc60580bc56c8ef0290e946fae18e6f78fe196 --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/ConvertStream.java @@ -0,0 +1,79 @@ +package de.vipra.util; + +import java.io.BufferedReader; +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Queue; + +import de.vipra.util.ex.NotImplementedException; + +public abstract class ConvertStream<T> implements Closeable, AutoCloseable, Iterator<T>, Iterable<T> { + + private final BufferedReader reader; + private Queue<String> buffer; + + public ConvertStream(File file) throws FileNotFoundException { + this.reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); + this.buffer = new LinkedList<>(); + } + + @Override + public void close() throws IOException { + reader.close(); + } + + protected String nextLine() { + if (buffer.isEmpty()) { + try { + return reader.readLine(); + } catch (IOException e) {} + } + return buffer.poll(); + } + + protected void buffer(String line) { + buffer.offer(line); + } + + @Override + public boolean hasNext() { + String line = null; + try { + line = reader.readLine(); + } catch (IOException e) {} + if (line != null) { + buffer.offer(line); + return true; + } + return false; + } + + @Override + public T next() { + if (buffer.isEmpty()) { + try { + return convert(reader.readLine()); + } catch (IOException e) {} + } + return convert(buffer.poll()); + } + + @Override + public void remove() { + throw new NotImplementedException(); + } + + @Override + public Iterator<T> iterator() { + return this; + } + + public abstract T convert(String line); + +} diff --git a/vipra-util/src/main/java/de/vipra/util/StringUtils.java b/vipra-util/src/main/java/de/vipra/util/StringUtils.java index 355845f1288122fcf6388f774330c2bb82ce8fe4..566ade74a16f1c2e16a8389c8d8f9ee09ab385b4 100644 --- a/vipra-util/src/main/java/de/vipra/util/StringUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/StringUtils.java @@ -5,6 +5,8 @@ import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class StringUtils { @@ -110,4 +112,16 @@ public class StringUtils { return humanReadableByteCount(bytes, true); } + private static Pattern firstNumberPattern = Pattern.compile("[0-9]+"); + + public static Integer getFirstNumber(String in) { + Matcher m = firstNumberPattern.matcher(in); + if (m.find()) { + try { + return Integer.parseInt(m.group()); + } catch (NumberFormatException e) {} + } + return null; + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/model/Article.java b/vipra-util/src/main/java/de/vipra/util/model/Article.java index 8d1490166e3d5b1cebf70aa16f3f7a76935db726..9471bc878bf9da32b4f747634e9fa3330cdb8be9 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Article.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Article.java @@ -22,6 +22,7 @@ public class Article extends Model { private Date date; private boolean complete; private ArticleStats stats; + private TopicMap topics; public String getTitle() { return title; @@ -78,9 +79,12 @@ public class Article extends Model { } catch (ParseException e) {} } - @Override - public String getType() { - return Article.class.getSimpleName().toLowerCase(); + public TopicMap getTopics() { + return topics; + } + + public void setTopics(TopicMap topics) { + this.topics = topics; } @Override @@ -94,6 +98,8 @@ public class Article extends Model { document.put("date", getDate()); if (getStats() != null) document.put("stats", getStats().toDocument()); + if (getTopics() != null) + document.put("topics", getTopics().toDocument()); return document; } @@ -106,6 +112,8 @@ public class Article extends Model { setDate(document.getDate("date")); if (document.containsKey("stats")) setStats(new ArticleStats((Document) document.get("stats"))); + if (document.containsKey("topics")) + setTopics(new TopicMap((Document) document.get("topics"))); } @Override diff --git a/vipra-util/src/main/java/de/vipra/util/model/Model.java b/vipra-util/src/main/java/de/vipra/util/model/Model.java index 4872539814fede770d79d9e6fb1b31c756264f2a..6a628f6253bc8db22157d283580fd80ae5202a9f 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Model.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Model.java @@ -40,8 +40,6 @@ public abstract class Model implements BsonDocument { FileUtils.writeStringToFile(file, toFileString(), Constants.FB_ENCODING, false); } - public abstract String getType(); - public abstract void fromDocument(Document document); public abstract Document toDocument(); diff --git a/vipra-util/src/main/java/de/vipra/util/model/Topic.java b/vipra-util/src/main/java/de/vipra/util/model/Topic.java index 8ef2730f7d15c08aed0e543bcb017d4e08faf918..a693f41e6ca60af3cb45818900d91b0a58da32a1 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Topic.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Topic.java @@ -2,75 +2,33 @@ package de.vipra.util.model; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import org.bson.Document; -import de.vipra.util.ex.NotImplementedException; - public class Topic extends Model { - private List<String> names; - private List<TopicWord> words; - - public Topic() {} - - public Topic(List<TopicWord> words) { - this.words = words; - } - - public List<String> getNames() { - return names; - } - - public void setNames(List<String> names) { - this.names = names; - } - - public List<TopicWord> getWords() { - return words; - } - - public void setWords(List<TopicWord> words) { - this.words = words; - } - - @Override - public String getType() { - return Topic.class.getSimpleName().toLowerCase(); - } - @Override public void fromDocument(Document document) { - if (document.containsKey("words")) { - List<Document> topicWords = (List<Document>) document.get("words"); - words = new ArrayList<>(topicWords.size()); - for (Document word : topicWords) { - words.add(new TopicWord(word)); - } - } + // TODO Auto-generated method stub + } @Override public Document toDocument() { - Document document = new Document(); - List<Document> topicWords = new ArrayList<>(words.size()); - for (TopicWord word : words) { - topicWords.add(word.toDocument()); - } - document.put("words", topicWords); - return document; + // TODO Auto-generated method stub + return null; } @Override public void fromFile(File file) throws IOException { - throw new NotImplementedException(); + // TODO Auto-generated method stub + } @Override public String toFileString() { - throw new NotImplementedException(); + // TODO Auto-generated method stub + return null; } } diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicDefinition.java b/vipra-util/src/main/java/de/vipra/util/model/TopicDefinition.java new file mode 100644 index 0000000000000000000000000000000000000000..921b17557135c7128ea076da4d6e9325ea1bb3b7 --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicDefinition.java @@ -0,0 +1,83 @@ +package de.vipra.util.model; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.bson.Document; + +import de.vipra.util.ex.NotImplementedException; + +public class TopicDefinition extends Model { + + private int index; + private List<String> names; + private List<TopicWord> words; + + public TopicDefinition() {} + + public TopicDefinition(List<TopicWord> words) { + this.words = words; + } + + public int getIndex() { + return index; + } + + public void setIndex(int index) { + this.index = index; + } + + public List<String> getNames() { + return names; + } + + public void setNames(List<String> names) { + this.names = names; + } + + public List<TopicWord> getWords() { + return words; + } + + public void setWords(List<TopicWord> words) { + this.words = words; + } + + @SuppressWarnings("unchecked") + @Override + public void fromDocument(Document document) { + setIndex(document.getInteger("index", 0)); + if (document.containsKey("words")) { + List<Document> topicWords = (List<Document>) document.get("words"); + words = new ArrayList<>(topicWords.size()); + for (Document word : topicWords) { + words.add(new TopicWord(word)); + } + } + } + + @Override + public Document toDocument() { + Document document = new Document(); + document.append("index", getIndex()); + List<Document> topicWords = new ArrayList<>(words.size()); + for (TopicWord word : words) { + topicWords.add(word.toDocument()); + } + document.put("words", topicWords); + return document; + } + + @Override + public void fromFile(File file) throws IOException { + throw new NotImplementedException(); + } + + @Override + public String toFileString() { + throw new NotImplementedException(); + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicMap.java b/vipra-util/src/main/java/de/vipra/util/model/TopicMap.java new file mode 100644 index 0000000000000000000000000000000000000000..68cdc54d56d6babc3e57b82550180942daf6456e --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicMap.java @@ -0,0 +1,75 @@ +package de.vipra.util.model; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.bson.Document; + +public class TopicMap extends Model { + + private Map<String, Integer> map = new LinkedHashMap<>(); + + public TopicMap() {} + + public TopicMap(Document document) { + fromDocument(document); + } + + public void put(String topic) { + Integer i = map.get(topic); + map.put(topic, i == null ? 1 : i + 1); + } + + public Integer get(String topic) { + return map.get(topic); + } + + public Set<String> keySet() { + return map.keySet(); + } + + @SuppressWarnings("unchecked") + @Override + public void fromDocument(Document document) { + List<Document> list = (List<Document>) document.get("topics"); + for (Document doc : list) { + TopicOccurrence to = new TopicOccurrence(doc); + map.put(to.getTopic(), to.getCount()); + } + } + + @Override + public Document toDocument() { + Document document = new Document(); + List<TopicOccurrence> tos = new ArrayList<>(map.size()); + List<Document> list = new ArrayList<>(map.size()); + for (String key : map.keySet()) { + tos.add(new TopicOccurrence(key, map.get(key))); + } + Collections.sort(tos, Collections.reverseOrder()); + for (TopicOccurrence to : tos) { + list.add(to.toDocument()); + } + document.append("topics", list); + return document; + } + + @Override + public void fromFile(File file) throws IOException { + // TODO Auto-generated method stub + + } + + @Override + public String toFileString() { + // TODO Auto-generated method stub + return null; + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicOccurrence.java b/vipra-util/src/main/java/de/vipra/util/model/TopicOccurrence.java new file mode 100644 index 0000000000000000000000000000000000000000..bba159e88a9ed21ee929b802dc3b45e7ccb20ef1 --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicOccurrence.java @@ -0,0 +1,71 @@ +package de.vipra.util.model; + +import java.io.File; +import java.io.IOException; + +import org.bson.Document; + +import de.vipra.util.ex.NotImplementedException; + +public class TopicOccurrence extends Model implements Comparable<TopicOccurrence> { + + private String topic; + private int count; + + public TopicOccurrence() {} + + public TopicOccurrence(String topic, int count) { + this.topic = topic; + this.count = count; + } + + public TopicOccurrence(Document document) { + fromDocument(document); + } + + public String getTopic() { + return topic; + } + + public void setTopic(String topic) { + this.topic = topic; + } + + public int getCount() { + return count; + } + + public void setCount(int count) { + this.count = count; + } + + @Override + public void fromDocument(Document document) { + setTopic(document.getString("topic")); + setCount(document.getInteger("count", 0)); + } + + @Override + public Document toDocument() { + Document document = new Document(); + document.append("topic", getTopic()); + document.append("count", getCount()); + return document; + } + + @Override + public void fromFile(File file) throws IOException { + throw new NotImplementedException(); + } + + @Override + public String toFileString() { + throw new NotImplementedException(); + } + + @Override + public int compareTo(TopicOccurrence o) { + return Integer.compare(getCount(), o.getCount()); + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java index d79dd8733e04fbef7ff0fa16189b2fa257f9f8ab..1405da85eca93cd6bf3539197fa1d0e5a8d73aca 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java @@ -39,11 +39,6 @@ public class TopicWord extends Model { this.likeliness = likeliness; } - @Override - public String getType() { - return TopicWord.class.getSimpleName().toLowerCase(); - } - @Override public void fromDocument(Document document) { this.word = document.getString("word");