From aa5befbc3c0483045641df45fb16cace434ab46f Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Sat, 2 Jan 2016 20:32:26 +0100 Subject: [PATCH] updated commands added infos to stats command added count function to db services added frequency list for counting --- ma-impl.sublime-workspace | 206 +----------------- .../main/java/de/vipra/cmd/CmdOptions.java | 2 +- .../src/main/java/de/vipra/cmd/Main.java | 4 +- .../java/de/vipra/cmd/file/FilebaseIndex.java | 4 + .../de/vipra/cmd/file/FilebaseVocabulary.java | 14 +- .../de/vipra/cmd/lda/JGibbLDAAnalyzer.java | 10 +- .../de/vipra/cmd/option/ClearCommand.java | 11 +- .../de/vipra/cmd/option/DeleteCommand.java | 2 - .../de/vipra/cmd/option/ImportCommand.java | 2 - .../de/vipra/cmd/option/StatsCommand.java | 38 +++- .../java/de/vipra/util/FrequencyList.java | 27 +++ .../main/java/de/vipra/util/StringUtils.java | 23 ++ .../main/java/de/vipra/util/model/Topic.java | 27 ++- .../java/de/vipra/util/model/TopicWord.java | 42 +++- .../vipra/util/service/DatabaseService.java | 4 + 15 files changed, 186 insertions(+), 230 deletions(-) create mode 100644 vipra-util/src/main/java/de/vipra/util/FrequencyList.java diff --git a/ma-impl.sublime-workspace b/ma-impl.sublime-workspace index 7804747e..04a61dba 100644 --- a/ma-impl.sublime-workspace +++ b/ma-impl.sublime-workspace @@ -271,23 +271,6 @@ }, "buffers": [ - { - "file": "TODO", - "settings": - { - "buffer_size": 134, - "line_ending": "Unix", - "name": "TODO" - } - }, - { - "file": "Vagrantfile", - "settings": - { - "buffer_size": 955, - "line_ending": "Unix" - } - } ], "build_system": "", "build_system_choices": @@ -470,6 +453,13 @@ ], "file_history": [ + "/home/eike/.local/share/vipra/jgibb/jgibb.twords", + "/home/eike/.local/share/vipra/jgibb/jgibb.tassign", + "/home/eike/Repositories/fu/ss15/ma/impl/TODO", + "/home/eike/.local/share/vipra/jgibb/vocab", + "/home/eike/.local/share/vipra/jgibb/index", + "/home/eike/.local/share/vipra/jgibb/jgibb.phi", + "/home/eike/.local/share/vipra/jgibb/jgibb", "/home/eike/Downloads/JGibbLDA-v.1.0/src/jgibblda/Constants.java", "/home/eike/Downloads/JGibbLDA-v.1.0/models/casestudy-en/model-final.others", "/home/eike/Downloads/JGibbLDA-v.1.0/models/casestudy-en/model-final.twords", @@ -911,190 +901,8 @@ "groups": [ { - "selected": 1, "sheets": [ - { - "buffer": 0, - "file": "TODO", - "semi_transient": false, - "settings": - { - "buffer_size": 134, - "regions": - { - }, - "selection": - [ - [ - 0, - 0 - ] - ], - "settings": - { - "BracketHighlighterBusy": false, - "auto_name": "TODO", - "bh_regions": - [ - "bh_default", - "bh_default_center", - "bh_default_open", - "bh_default_close", - "bh_default_content", - "bh_regex", - "bh_regex_center", - "bh_regex_open", - "bh_regex_close", - "bh_regex_content", - "bh_double_quote", - "bh_double_quote_center", - "bh_double_quote_open", - "bh_double_quote_close", - "bh_double_quote_content", - "bh_square", - "bh_square_center", - "bh_square_open", - "bh_square_close", - "bh_square_content", - "bh_angle", - "bh_angle_center", - "bh_angle_open", - "bh_angle_close", - "bh_angle_content", - "bh_curly", - "bh_curly_center", - "bh_curly_open", - "bh_curly_close", - "bh_curly_content", - "bh_unmatched", - "bh_unmatched_center", - "bh_unmatched_open", - "bh_unmatched_close", - "bh_unmatched_content", - "bh_c_define", - "bh_c_define_center", - "bh_c_define_open", - "bh_c_define_close", - "bh_c_define_content", - "bh_single_quote", - "bh_single_quote_center", - "bh_single_quote_open", - "bh_single_quote_close", - "bh_single_quote_content", - "bh_round", - "bh_round_center", - "bh_round_open", - "bh_round_close", - "bh_round_content", - "bh_tag", - "bh_tag_center", - "bh_tag_open", - "bh_tag_close", - "bh_tag_content" - ], - "incomplete_sync": null, - "syntax": "Packages/PlainTasks/PlainTasks.tmLanguage" - }, - "translation.x": 0.0, - "translation.y": 0.0, - "zoom_level": 1.0 - }, - "stack_index": 1, - "type": "text" - }, - { - "buffer": 1, - "file": "Vagrantfile", - "semi_transient": true, - "settings": - { - "buffer_size": 955, - "regions": - { - }, - "selection": - [ - [ - 757, - 757 - ] - ], - "settings": - { - "BracketHighlighterBusy": false, - "bh_regions": - [ - "bh_default", - "bh_default_center", - "bh_default_open", - "bh_default_close", - "bh_default_content", - "bh_regex", - "bh_regex_center", - "bh_regex_open", - "bh_regex_close", - "bh_regex_content", - "bh_double_quote", - "bh_double_quote_center", - "bh_double_quote_open", - "bh_double_quote_close", - "bh_double_quote_content", - "bh_square", - "bh_square_center", - "bh_square_open", - "bh_square_close", - "bh_square_content", - "bh_angle", - "bh_angle_center", - "bh_angle_open", - "bh_angle_close", - "bh_angle_content", - "bh_curly", - "bh_curly_center", - "bh_curly_open", - "bh_curly_close", - "bh_curly_content", - "bh_unmatched", - "bh_unmatched_center", - "bh_unmatched_open", - "bh_unmatched_close", - "bh_unmatched_content", - "bh_c_define", - "bh_c_define_center", - "bh_c_define_open", - "bh_c_define_close", - "bh_c_define_content", - "bh_single_quote", - "bh_single_quote_center", - "bh_single_quote_open", - "bh_single_quote_close", - "bh_single_quote_content", - "bh_round", - "bh_round_center", - "bh_round_open", - "bh_round_close", - "bh_round_content", - "bh_tag", - "bh_tag_center", - "bh_tag_open", - "bh_tag_close", - "bh_tag_content" - ], - "incomplete_sync": null, - "remote_loading": false, - "synced": false, - "syntax": "Packages/Ruby/Ruby.sublime-syntax", - "tab_size": 2, - "translate_tabs_to_spaces": true - }, - "translation.x": 0.0, - "translation.y": 0.0, - "zoom_level": 1.0 - }, - "stack_index": 0, - "type": "text" - } ] } ], diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java b/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java index 3ca745df..a5e6824b 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java @@ -26,7 +26,7 @@ public class CmdOptions extends Options { public static final String OPT_STATS = "p"; public static final String OPT_STATS_LONG = "print-stats"; - public static final String OPT_DEFAULTS = "d"; + public static final String OPT_DEFAULTS = "n"; public static final String OPT_DEFAULTS_LONG = "defaults"; public CmdOptions() { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java index 5352b762..7599e053 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java @@ -53,10 +53,12 @@ public class Main { return; } + boolean defaults = cline.hasOption(OPT_DEFAULTS); + List<Command> commands = new ArrayList<>(); if (cline.hasOption(OPT_CLEAR)) { - commands.add(new ClearCommand()); + commands.add(new ClearCommand(defaults)); } if (cline.hasOption(OPT_IMPORT)) { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java index dae34993..9efba97a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java @@ -49,6 +49,10 @@ public class FilebaseIndex implements Closeable, Iterable<String> { return index.remove(id); } + public int size() { + return index.size(); + } + @Override public void close() throws IOException { write(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java index a910d941..2f0ec931 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java @@ -28,11 +28,6 @@ public class FilebaseVocabulary implements Closeable, Iterable<String> { FileUtils.writeLines(file, Constants.FB_ENCODING.name(), vocables, null, false); } - @Override - public void close() throws IOException { - write(); - } - public void addVocabulary(String text) { addVocabulary(text.split("\\s+")); } @@ -50,6 +45,15 @@ public class FilebaseVocabulary implements Closeable, Iterable<String> { return vocables.indexOf(word); } + public int size() { + return vocables.size(); + } + + @Override + public void close() throws IOException { + write(); + } + @Override public Iterator<String> iterator() { return vocables.iterator(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java index ffebc4d5..5b9a1887 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java @@ -15,6 +15,7 @@ import de.vipra.cmd.ex.LDAAnalyzerException; import de.vipra.cmd.model.Article; import de.vipra.util.Config; import de.vipra.util.FileUtils; +import de.vipra.util.FrequencyList; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.Topic; @@ -108,15 +109,6 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { for (Topic topic : topics) { dbTopics.createSingle(topic); } - - // read document topics - BufferedReader reader = new BufferedReader( - new InputStreamReader(new FileInputStream(new File(modelDir, "jgibb.tassign")))); - String line; - while ((line = reader.readLine()) != null) { - String[] parts = line.trim().split("\\s+"); - - } } catch (IOException | DatabaseException e) { throw new LDAAnalyzerException(e); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java index 47e95b20..9f8ac68a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java @@ -14,6 +14,7 @@ import de.vipra.util.Config; import de.vipra.util.ConsoleUtils; import de.vipra.util.Constants; import de.vipra.util.ex.ConfigException; +import de.vipra.util.model.Topic; import de.vipra.util.service.DatabaseService; public class ClearCommand implements Command { @@ -21,19 +22,27 @@ public class ClearCommand implements Command { public static final Logger log = LoggerFactory.getLogger(ClearCommand.class); public static final Logger out = LoggerFactory.getLogger("shellout"); + private boolean defaults; private Config config; private DatabaseService<Article> dbArticles; + private DatabaseService<Topic> dbTopics; + + public ClearCommand(boolean defaults) { + this.defaults = defaults; + } private void clear() throws ClearException, ConfigException { try { config = Config.getConfig(); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); } catch (Exception e) { throw new ClearException(e); } out.info("clearing database"); dbArticles.drop(); + dbTopics.drop(); out.info("clearing filebase"); File dataDir = config.getDataDirectory(); @@ -50,7 +59,7 @@ public class ClearCommand implements Command { public void run() throws ExecutionException { out.info("to confirm clearing, type 'clear' and press enter"); try { - if (ConsoleUtils.confirm("clear")) { + if (defaults || ConsoleUtils.confirm("clear")) { clear(); } } catch (ClearException | ConfigException e) { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java index aa8c9400..9d3b8c4f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java @@ -28,8 +28,6 @@ public class DeleteCommand implements Command { private DatabaseService<Article> dbArticles; private Filebase filebase; - DeleteCommand() {} - public DeleteCommand(String[] strings) { addIds(strings); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 5b6c8226..498f6b22 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -43,8 +43,6 @@ public class ImportCommand implements Command { private Processor preprocessor; private LDAAnalyzer analyzer; - ImportCommand() {} - public ImportCommand(String[] paths) { addPaths(paths); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java index 53d7a738..8410d784 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java @@ -1,13 +1,49 @@ package de.vipra.cmd.option; +import java.io.File; +import java.io.IOException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import de.vipra.cmd.ExecutionException; +import de.vipra.cmd.ex.FilebaseException; +import de.vipra.cmd.file.Filebase; +import de.vipra.util.Config; +import de.vipra.util.Constants; +import de.vipra.util.StringUtils; +import de.vipra.util.ex.ConfigException; +import de.vipra.util.model.Topic; +import de.vipra.util.service.DatabaseService; public class StatsCommand implements Command { + public static final Logger log = LoggerFactory.getLogger(StatsCommand.class); + public static final Logger out = LoggerFactory.getLogger("shellout"); + + private Config config; + private Filebase filebase; + private DatabaseService<Topic> dbTopics; + + private void stats() { + File modelFile = filebase.getModelFile(); + out.info("filebase size: " + StringUtils.humanReadableByteCount(modelFile.length(), true)); + out.info("# of articles: " + filebase.getIndex().size()); + out.info("# of words : " + filebase.getVocab().size()); + out.info("# of topics : " + dbTopics.count()); + } + @Override public void run() throws ExecutionException { - // TODO Auto-generated method stub + try { + config = Config.getConfig(); + filebase = Filebase.getFilebase(config); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); + stats(); + } catch (IOException | ConfigException | FilebaseException e) { + throw new ExecutionException(e); + } } } diff --git a/vipra-util/src/main/java/de/vipra/util/FrequencyList.java b/vipra-util/src/main/java/de/vipra/util/FrequencyList.java new file mode 100644 index 00000000..f512248d --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/FrequencyList.java @@ -0,0 +1,27 @@ +package de.vipra.util; + +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.Map; + +public class FrequencyList<T> implements Iterable<T> { + + private Map<T, Integer> map = new LinkedHashMap<>(); + + public void add(T t) { + if (map.containsKey(t)) + map.put(t, map.get(t) + 1); + else + map.put(t, 1); + } + + public Integer get(T t) { + return map.get(t); + } + + @Override + public Iterator<T> iterator() { + return map.keySet().iterator(); + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/StringUtils.java b/vipra-util/src/main/java/de/vipra/util/StringUtils.java index 6916fa12..355845f1 100644 --- a/vipra-util/src/main/java/de/vipra/util/StringUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/StringUtils.java @@ -87,4 +87,27 @@ public class StringUtils { return lc; } + /** + * Turns byte counts into human readable strings. Taken from + * https://stackoverflow.com/questions/3758606 + * + * @param bytes + * number of bytes + * @param si + * true to use SI units + * @return formatted string + */ + public static String humanReadableByteCount(long bytes, boolean si) { + int unit = si ? 1000 : 1024; + if (bytes < unit) + return bytes + " B"; + int exp = (int) (Math.log(bytes) / Math.log(unit)); + String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i"); + return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre); + } + + public static String humanReadableByteCount(long bytes) { + return humanReadableByteCount(bytes, true); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/model/Topic.java b/vipra-util/src/main/java/de/vipra/util/model/Topic.java index 168a4c94..8ef2730f 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Topic.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Topic.java @@ -2,10 +2,13 @@ package de.vipra.util.model; import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import org.bson.Document; +import de.vipra.util.ex.NotImplementedException; + public class Topic extends Model { private List<String> names; @@ -40,26 +43,34 @@ public class Topic extends Model { @Override public void fromDocument(Document document) { - // TODO Auto-generated method stub - + if (document.containsKey("words")) { + List<Document> topicWords = (List<Document>) document.get("words"); + words = new ArrayList<>(topicWords.size()); + for (Document word : topicWords) { + words.add(new TopicWord(word)); + } + } } @Override public Document toDocument() { - // TODO Auto-generated method stub - return null; + Document document = new Document(); + List<Document> topicWords = new ArrayList<>(words.size()); + for (TopicWord word : words) { + topicWords.add(word.toDocument()); + } + document.put("words", topicWords); + return document; } @Override public void fromFile(File file) throws IOException { - // TODO Auto-generated method stub - + throw new NotImplementedException(); } @Override public String toFileString() { - // TODO Auto-generated method stub - return null; + throw new NotImplementedException(); } } diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java index 22fe8ab0..d79dd873 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java @@ -1,6 +1,13 @@ package de.vipra.util.model; -public class TopicWord { +import java.io.File; +import java.io.IOException; + +import org.bson.Document; + +import de.vipra.util.ex.NotImplementedException; + +public class TopicWord extends Model { private String word; private double likeliness; @@ -12,6 +19,10 @@ public class TopicWord { this.likeliness = likeliness; } + public TopicWord(Document document) { + fromDocument(document); + } + public String getWord() { return word; } @@ -28,4 +39,33 @@ public class TopicWord { this.likeliness = likeliness; } + @Override + public String getType() { + return TopicWord.class.getSimpleName().toLowerCase(); + } + + @Override + public void fromDocument(Document document) { + this.word = document.getString("word"); + this.likeliness = document.getDouble("likeliness"); + } + + @Override + public Document toDocument() { + Document document = new Document(); + document.put("word", getWord()); + document.put("likeliness", getLikeliness()); + return document; + } + + @Override + public void fromFile(File file) throws IOException { + throw new NotImplementedException(); + } + + @Override + public String toFileString() { + throw new NotImplementedException(); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java b/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java index 940f7e52..b8fe81ed 100644 --- a/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java +++ b/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java @@ -106,6 +106,10 @@ public class DatabaseService<T extends Model> implements Service<T, DatabaseExce collection.drop(); } + public long count() { + return collection.count(); + } + public static <T extends Model> DatabaseService<T> getDatabaseService(Config config, Constants.Collection collection, Class<T> clazz) throws ConfigException { Mongo mongo = Mongo.getInstance(config); -- GitLab