diff --git a/ma-impl.sublime-workspace b/ma-impl.sublime-workspace index 7804747e3da27156af3a6af0c3f69b61db4463ca..04a61dba7becac2e1726b3122814882777fe672a 100644 --- a/ma-impl.sublime-workspace +++ b/ma-impl.sublime-workspace @@ -271,23 +271,6 @@ }, "buffers": [ - { - "file": "TODO", - "settings": - { - "buffer_size": 134, - "line_ending": "Unix", - "name": "TODO" - } - }, - { - "file": "Vagrantfile", - "settings": - { - "buffer_size": 955, - "line_ending": "Unix" - } - } ], "build_system": "", "build_system_choices": @@ -470,6 +453,13 @@ ], "file_history": [ + "/home/eike/.local/share/vipra/jgibb/jgibb.twords", + "/home/eike/.local/share/vipra/jgibb/jgibb.tassign", + "/home/eike/Repositories/fu/ss15/ma/impl/TODO", + "/home/eike/.local/share/vipra/jgibb/vocab", + "/home/eike/.local/share/vipra/jgibb/index", + "/home/eike/.local/share/vipra/jgibb/jgibb.phi", + "/home/eike/.local/share/vipra/jgibb/jgibb", "/home/eike/Downloads/JGibbLDA-v.1.0/src/jgibblda/Constants.java", "/home/eike/Downloads/JGibbLDA-v.1.0/models/casestudy-en/model-final.others", "/home/eike/Downloads/JGibbLDA-v.1.0/models/casestudy-en/model-final.twords", @@ -911,190 +901,8 @@ "groups": [ { - "selected": 1, "sheets": [ - { - "buffer": 0, - "file": "TODO", - "semi_transient": false, - "settings": - { - "buffer_size": 134, - "regions": - { - }, - "selection": - [ - [ - 0, - 0 - ] - ], - "settings": - { - "BracketHighlighterBusy": false, - "auto_name": "TODO", - "bh_regions": - [ - "bh_default", - "bh_default_center", - "bh_default_open", - "bh_default_close", - "bh_default_content", - "bh_regex", - "bh_regex_center", - "bh_regex_open", - "bh_regex_close", - "bh_regex_content", - "bh_double_quote", - "bh_double_quote_center", - "bh_double_quote_open", - "bh_double_quote_close", - "bh_double_quote_content", - "bh_square", - "bh_square_center", - "bh_square_open", - "bh_square_close", - "bh_square_content", - "bh_angle", - "bh_angle_center", - "bh_angle_open", - "bh_angle_close", - "bh_angle_content", - "bh_curly", - "bh_curly_center", - "bh_curly_open", - "bh_curly_close", - "bh_curly_content", - "bh_unmatched", - "bh_unmatched_center", - "bh_unmatched_open", - "bh_unmatched_close", - "bh_unmatched_content", - "bh_c_define", - "bh_c_define_center", - "bh_c_define_open", - "bh_c_define_close", - "bh_c_define_content", - "bh_single_quote", - "bh_single_quote_center", - "bh_single_quote_open", - "bh_single_quote_close", - "bh_single_quote_content", - "bh_round", - "bh_round_center", - "bh_round_open", - "bh_round_close", - "bh_round_content", - "bh_tag", - "bh_tag_center", - "bh_tag_open", - "bh_tag_close", - "bh_tag_content" - ], - "incomplete_sync": null, - "syntax": "Packages/PlainTasks/PlainTasks.tmLanguage" - }, - "translation.x": 0.0, - "translation.y": 0.0, - "zoom_level": 1.0 - }, - "stack_index": 1, - "type": "text" - }, - { - "buffer": 1, - "file": "Vagrantfile", - "semi_transient": true, - "settings": - { - "buffer_size": 955, - "regions": - { - }, - "selection": - [ - [ - 757, - 757 - ] - ], - "settings": - { - "BracketHighlighterBusy": false, - "bh_regions": - [ - "bh_default", - "bh_default_center", - "bh_default_open", - "bh_default_close", - "bh_default_content", - "bh_regex", - "bh_regex_center", - "bh_regex_open", - "bh_regex_close", - "bh_regex_content", - "bh_double_quote", - "bh_double_quote_center", - "bh_double_quote_open", - "bh_double_quote_close", - "bh_double_quote_content", - "bh_square", - "bh_square_center", - "bh_square_open", - "bh_square_close", - "bh_square_content", - "bh_angle", - "bh_angle_center", - "bh_angle_open", - "bh_angle_close", - "bh_angle_content", - "bh_curly", - "bh_curly_center", - "bh_curly_open", - "bh_curly_close", - "bh_curly_content", - "bh_unmatched", - "bh_unmatched_center", - "bh_unmatched_open", - "bh_unmatched_close", - "bh_unmatched_content", - "bh_c_define", - "bh_c_define_center", - "bh_c_define_open", - "bh_c_define_close", - "bh_c_define_content", - "bh_single_quote", - "bh_single_quote_center", - "bh_single_quote_open", - "bh_single_quote_close", - "bh_single_quote_content", - "bh_round", - "bh_round_center", - "bh_round_open", - "bh_round_close", - "bh_round_content", - "bh_tag", - "bh_tag_center", - "bh_tag_open", - "bh_tag_close", - "bh_tag_content" - ], - "incomplete_sync": null, - "remote_loading": false, - "synced": false, - "syntax": "Packages/Ruby/Ruby.sublime-syntax", - "tab_size": 2, - "translate_tabs_to_spaces": true - }, - "translation.x": 0.0, - "translation.y": 0.0, - "zoom_level": 1.0 - }, - "stack_index": 0, - "type": "text" - } ] } ], diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java b/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java index 3ca745df210576b5c88d0b55cf684502db94005f..a5e6824b0ea2d91575c27352c311eb66684b56c4 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/CmdOptions.java @@ -26,7 +26,7 @@ public class CmdOptions extends Options { public static final String OPT_STATS = "p"; public static final String OPT_STATS_LONG = "print-stats"; - public static final String OPT_DEFAULTS = "d"; + public static final String OPT_DEFAULTS = "n"; public static final String OPT_DEFAULTS_LONG = "defaults"; public CmdOptions() { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java index 5352b7621ca88acd60211014f9202a3080f65fdd..7599e053417ef0a66c104d46142dbd3515dca748 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java @@ -53,10 +53,12 @@ public class Main { return; } + boolean defaults = cline.hasOption(OPT_DEFAULTS); + List<Command> commands = new ArrayList<>(); if (cline.hasOption(OPT_CLEAR)) { - commands.add(new ClearCommand()); + commands.add(new ClearCommand(defaults)); } if (cline.hasOption(OPT_IMPORT)) { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java index dae34993e28849e996a328c7ba7e30856098d842..9efba97af7c498589102db454f49d45f9a6730e3 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java @@ -49,6 +49,10 @@ public class FilebaseIndex implements Closeable, Iterable<String> { return index.remove(id); } + public int size() { + return index.size(); + } + @Override public void close() throws IOException { write(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java index a910d941e47c895f88925ff1160fecb8e0de69dd..2f0ec9317e270c07e9dcf7725ad494d29a620676 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseVocabulary.java @@ -28,11 +28,6 @@ public class FilebaseVocabulary implements Closeable, Iterable<String> { FileUtils.writeLines(file, Constants.FB_ENCODING.name(), vocables, null, false); } - @Override - public void close() throws IOException { - write(); - } - public void addVocabulary(String text) { addVocabulary(text.split("\\s+")); } @@ -50,6 +45,15 @@ public class FilebaseVocabulary implements Closeable, Iterable<String> { return vocables.indexOf(word); } + public int size() { + return vocables.size(); + } + + @Override + public void close() throws IOException { + write(); + } + @Override public Iterator<String> iterator() { return vocables.iterator(); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java index ffebc4d5b5f1ee5055d47d679772e9c0189acea7..5b9a1887284a0d9bd8daae2e6c4a9a9b758b8aa8 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbLDAAnalyzer.java @@ -15,6 +15,7 @@ import de.vipra.cmd.ex.LDAAnalyzerException; import de.vipra.cmd.model.Article; import de.vipra.util.Config; import de.vipra.util.FileUtils; +import de.vipra.util.FrequencyList; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.Topic; @@ -108,15 +109,6 @@ public class JGibbLDAAnalyzer extends LDAAnalyzer { for (Topic topic : topics) { dbTopics.createSingle(topic); } - - // read document topics - BufferedReader reader = new BufferedReader( - new InputStreamReader(new FileInputStream(new File(modelDir, "jgibb.tassign")))); - String line; - while ((line = reader.readLine()) != null) { - String[] parts = line.trim().split("\\s+"); - - } } catch (IOException | DatabaseException e) { throw new LDAAnalyzerException(e); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java index 47e95b20dffed43f6a1f0a6a4a8a88f6a3ad348a..9f8ac68ac6cf5249e19b01c2ca6393237daa91b5 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ClearCommand.java @@ -14,6 +14,7 @@ import de.vipra.util.Config; import de.vipra.util.ConsoleUtils; import de.vipra.util.Constants; import de.vipra.util.ex.ConfigException; +import de.vipra.util.model.Topic; import de.vipra.util.service.DatabaseService; public class ClearCommand implements Command { @@ -21,19 +22,27 @@ public class ClearCommand implements Command { public static final Logger log = LoggerFactory.getLogger(ClearCommand.class); public static final Logger out = LoggerFactory.getLogger("shellout"); + private boolean defaults; private Config config; private DatabaseService<Article> dbArticles; + private DatabaseService<Topic> dbTopics; + + public ClearCommand(boolean defaults) { + this.defaults = defaults; + } private void clear() throws ClearException, ConfigException { try { config = Config.getConfig(); dbArticles = DatabaseService.getDatabaseService(config, Constants.Collection.ARTICLES, Article.class); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); } catch (Exception e) { throw new ClearException(e); } out.info("clearing database"); dbArticles.drop(); + dbTopics.drop(); out.info("clearing filebase"); File dataDir = config.getDataDirectory(); @@ -50,7 +59,7 @@ public class ClearCommand implements Command { public void run() throws ExecutionException { out.info("to confirm clearing, type 'clear' and press enter"); try { - if (ConsoleUtils.confirm("clear")) { + if (defaults || ConsoleUtils.confirm("clear")) { clear(); } } catch (ClearException | ConfigException e) { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java index aa8c940024d69cb3f34fd5c5118763f486cc590a..9d3b8c4f2e7f2a69573cf9012d0382fc1050c765 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/DeleteCommand.java @@ -28,8 +28,6 @@ public class DeleteCommand implements Command { private DatabaseService<Article> dbArticles; private Filebase filebase; - DeleteCommand() {} - public DeleteCommand(String[] strings) { addIds(strings); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 5b6c822606761c09573dad4b683731a217319855..498f6b229c606ad19d69ab811baa2d9b0551a7f1 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -43,8 +43,6 @@ public class ImportCommand implements Command { private Processor preprocessor; private LDAAnalyzer analyzer; - ImportCommand() {} - public ImportCommand(String[] paths) { addPaths(paths); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java index 53d7a73872906ab12f136b09d4eff7d0b051d299..8410d7848d626a094f22cef4252a4ff40b8c8489 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/StatsCommand.java @@ -1,13 +1,49 @@ package de.vipra.cmd.option; +import java.io.File; +import java.io.IOException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import de.vipra.cmd.ExecutionException; +import de.vipra.cmd.ex.FilebaseException; +import de.vipra.cmd.file.Filebase; +import de.vipra.util.Config; +import de.vipra.util.Constants; +import de.vipra.util.StringUtils; +import de.vipra.util.ex.ConfigException; +import de.vipra.util.model.Topic; +import de.vipra.util.service.DatabaseService; public class StatsCommand implements Command { + public static final Logger log = LoggerFactory.getLogger(StatsCommand.class); + public static final Logger out = LoggerFactory.getLogger("shellout"); + + private Config config; + private Filebase filebase; + private DatabaseService<Topic> dbTopics; + + private void stats() { + File modelFile = filebase.getModelFile(); + out.info("filebase size: " + StringUtils.humanReadableByteCount(modelFile.length(), true)); + out.info("# of articles: " + filebase.getIndex().size()); + out.info("# of words : " + filebase.getVocab().size()); + out.info("# of topics : " + dbTopics.count()); + } + @Override public void run() throws ExecutionException { - // TODO Auto-generated method stub + try { + config = Config.getConfig(); + filebase = Filebase.getFilebase(config); + dbTopics = DatabaseService.getDatabaseService(config, Constants.Collection.TOPICS, Topic.class); + stats(); + } catch (IOException | ConfigException | FilebaseException e) { + throw new ExecutionException(e); + } } } diff --git a/vipra-util/src/main/java/de/vipra/util/FrequencyList.java b/vipra-util/src/main/java/de/vipra/util/FrequencyList.java new file mode 100644 index 0000000000000000000000000000000000000000..f512248d66c7d35192eb9e7f668749ae6d8b1632 --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/FrequencyList.java @@ -0,0 +1,27 @@ +package de.vipra.util; + +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.Map; + +public class FrequencyList<T> implements Iterable<T> { + + private Map<T, Integer> map = new LinkedHashMap<>(); + + public void add(T t) { + if (map.containsKey(t)) + map.put(t, map.get(t) + 1); + else + map.put(t, 1); + } + + public Integer get(T t) { + return map.get(t); + } + + @Override + public Iterator<T> iterator() { + return map.keySet().iterator(); + } + +} diff --git a/vipra-util/src/main/java/de/vipra/util/StringUtils.java b/vipra-util/src/main/java/de/vipra/util/StringUtils.java index 6916fa12f2f897708d29de6dcf2840eac8507294..355845f1288122fcf6388f774330c2bb82ce8fe4 100644 --- a/vipra-util/src/main/java/de/vipra/util/StringUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/StringUtils.java @@ -87,4 +87,27 @@ public class StringUtils { return lc; } + /** + * Turns byte counts into human readable strings. Taken from + * https://stackoverflow.com/questions/3758606 + * + * @param bytes + * number of bytes + * @param si + * true to use SI units + * @return formatted string + */ + public static String humanReadableByteCount(long bytes, boolean si) { + int unit = si ? 1000 : 1024; + if (bytes < unit) + return bytes + " B"; + int exp = (int) (Math.log(bytes) / Math.log(unit)); + String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i"); + return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre); + } + + public static String humanReadableByteCount(long bytes) { + return humanReadableByteCount(bytes, true); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/model/Topic.java b/vipra-util/src/main/java/de/vipra/util/model/Topic.java index 168a4c944dd43314129e9dbe77013668e2517a7f..8ef2730f7d15c08aed0e543bcb017d4e08faf918 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Topic.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Topic.java @@ -2,10 +2,13 @@ package de.vipra.util.model; import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import org.bson.Document; +import de.vipra.util.ex.NotImplementedException; + public class Topic extends Model { private List<String> names; @@ -40,26 +43,34 @@ public class Topic extends Model { @Override public void fromDocument(Document document) { - // TODO Auto-generated method stub - + if (document.containsKey("words")) { + List<Document> topicWords = (List<Document>) document.get("words"); + words = new ArrayList<>(topicWords.size()); + for (Document word : topicWords) { + words.add(new TopicWord(word)); + } + } } @Override public Document toDocument() { - // TODO Auto-generated method stub - return null; + Document document = new Document(); + List<Document> topicWords = new ArrayList<>(words.size()); + for (TopicWord word : words) { + topicWords.add(word.toDocument()); + } + document.put("words", topicWords); + return document; } @Override public void fromFile(File file) throws IOException { - // TODO Auto-generated method stub - + throw new NotImplementedException(); } @Override public String toFileString() { - // TODO Auto-generated method stub - return null; + throw new NotImplementedException(); } } diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java index 22fe8ab0f99190ea044a446e174c8c5ef8138bdc..d79dd8733e04fbef7ff0fa16189b2fa257f9f8ab 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java @@ -1,6 +1,13 @@ package de.vipra.util.model; -public class TopicWord { +import java.io.File; +import java.io.IOException; + +import org.bson.Document; + +import de.vipra.util.ex.NotImplementedException; + +public class TopicWord extends Model { private String word; private double likeliness; @@ -12,6 +19,10 @@ public class TopicWord { this.likeliness = likeliness; } + public TopicWord(Document document) { + fromDocument(document); + } + public String getWord() { return word; } @@ -28,4 +39,33 @@ public class TopicWord { this.likeliness = likeliness; } + @Override + public String getType() { + return TopicWord.class.getSimpleName().toLowerCase(); + } + + @Override + public void fromDocument(Document document) { + this.word = document.getString("word"); + this.likeliness = document.getDouble("likeliness"); + } + + @Override + public Document toDocument() { + Document document = new Document(); + document.put("word", getWord()); + document.put("likeliness", getLikeliness()); + return document; + } + + @Override + public void fromFile(File file) throws IOException { + throw new NotImplementedException(); + } + + @Override + public String toFileString() { + throw new NotImplementedException(); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java b/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java index 940f7e5287ab6c9f38eafb1fc13a02088fd9e528..b8fe81eddfebdd64dd5a022626025b428865c99d 100644 --- a/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java +++ b/vipra-util/src/main/java/de/vipra/util/service/DatabaseService.java @@ -106,6 +106,10 @@ public class DatabaseService<T extends Model> implements Service<T, DatabaseExce collection.drop(); } + public long count() { + return collection.count(); + } + public static <T extends Model> DatabaseService<T> getDatabaseService(Config config, Constants.Collection collection, Class<T> clazz) throws ConfigException { Mongo mongo = Mongo.getInstance(config);