diff --git a/ma-impl.sublime-project b/ma-impl.sublime-project index e5bab69b26eee760a8bb31f588ab9a63c2fae03a..24db30311b340c8d78001f0fb705810ab77a8c38 100644 --- a/ma-impl.sublime-project +++ b/ma-impl.sublime-project @@ -3,9 +3,6 @@ [ { "path": "." - }, - { - "path": "/home/eike/Downloads/dtm_release/dtm/example/model_run2" } ] } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java index 7b010b73f6279f179ac3a14496574ee5d1f40cf4..c9a47040e764ab823ad20925a818e00735ecb259 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java @@ -45,27 +45,29 @@ public class DTMDateIndex implements Closeable, Iterable<DTMDateIndex.DTMDateInd } } - private final File file; - private final WindowResolution windowResolution; - private final List<DTMDateIndexEntry> entries; - private final SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); - - public DTMDateIndex(File file, WindowResolution windowResolution) throws IOException, ParseException { - this.file = file; - this.windowResolution = windowResolution; + private File file; + private static WindowResolution windowResolution; + private static List<DTMDateIndexEntry> entries; + private static SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); + + public DTMDateIndex(File modelDir, WindowResolution wr, boolean reread) throws IOException, ParseException { + this.file = new File(modelDir, "dates"); + windowResolution = wr; if (file.exists()) { - List<String> dates = FileUtils.readFile(file); - this.entries = new ArrayList<>(dates.size()); - for (String date : dates) { - this.entries.add(new DTMDateIndexEntry(df.parse(date), true, null)); + if (entries == null || reread) { + List<String> dates = FileUtils.readFile(file); + entries = new ArrayList<>(dates.size()); + for (String date : dates) { + entries.add(new 
DTMDateIndexEntry(df.parse(date), true, null)); + } } - } else { - this.entries = new ArrayList<>(); + } else if (entries == null || reread) { + entries = new ArrayList<>(); } } public void add(Date date, String line) { - this.entries.add(new DTMDateIndexEntry(date, false, line)); + entries.add(new DTMDateIndexEntry(date, false, line)); } @Override diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java index 26ada4dc8f2b03ce555a8c385be85b9ea0a85837..8d16922892460d035590df23777c543d7fc1ddb0 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java @@ -20,7 +20,6 @@ import de.vipra.util.model.ArticleFull; public class DTMFilebase extends Filebase { public static final String FILE_MODEL = "dtm-mult.dat"; - public static final String FILE_DATES = "dtm-dates.dat"; public static final String FILE_VOCAB = "vocab"; private final DTMDateIndex dateindex; @@ -35,13 +34,16 @@ public class DTMFilebase extends Filebase { } catch (IOException | ConfigException e) { throw new FilebaseException(e); } + + File modelDir = getModelDir(); try { - this.dateindex = new DTMDateIndex(getModelFile(FILE_DATES), config.windowResolution); + this.dateindex = new DTMDateIndex(modelDir, config.windowResolution, false); } catch (IOException | ParseException e) { throw new FilebaseException("could not read date index file", e); } + try { - this.vocab = new DTMVocabulary(getModelFile(FILE_VOCAB)); + this.vocab = new DTMVocabulary(modelDir, false); } catch (IOException e) { throw new FilebaseException("could not read vocabulary file", e); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java index 9994836637ce2491b00ee688c1171b20cff8930f..72399624df6ff722b724d5fd9c209375c82cf44f 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java +++ 
b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java @@ -17,20 +17,24 @@ import de.vipra.util.FileUtils; public class DTMVocabulary implements Closeable, Iterable<String> { private File file; - private List<String> vocables; - private Map<String, Integer> vocablesMap; - private int nextIndex = 1; + private static List<String> vocables; + private static Map<String, Integer> vocablesMap; + private static int nextIndex = 1; - public DTMVocabulary(File file) throws IOException { - this.file = file; + public DTMVocabulary(File modelDir, boolean reread) throws IOException { + this.file = new File(modelDir, "vocab"); if (file.exists()) { - vocables = new ArrayList<>(FileUtils.readFile(file)); - vocablesMap = new HashMap<>(vocables.size() + 200); + if (vocables == null || reread) + vocables = new ArrayList<>(FileUtils.readFile(file)); + if (vocablesMap == null || reread) + vocablesMap = new HashMap<>(vocables.size() + 200); for (String vocable : vocables) vocablesMap.put(vocable, nextIndex++); } else { - vocables = new ArrayList<>(500); - vocablesMap = new HashMap<>(500); + if (vocables == null || reread) + vocables = new ArrayList<>(500); + if (vocablesMap == null || reread) + vocablesMap = new HashMap<>(500); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index ba28fceb26b0b3180a563c6df8e697cd705161f7..5c49c9286557e7c1bf2ee1bcc69e927d70548635 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -83,8 +83,10 @@ public class ImportCommand implements Command { ArticleFull article = articleFromJSON(object); try { - // preprocess text and generate text statistics + // preprocess text ProcessedText processedText = processor.process(article.getText()); + + // generate text stats ArticleStats articleStats = ArticleStats.generateFromText(processedText.getText()); // add article to 
mongodb @@ -92,8 +94,9 @@ public class ImportCommand implements Command { article.setStats(articleStats); buffer.add(article); - // add article to filebase - filebase.add(article); + // add article to filebase if long enough + if (processedText.getWords().length >= Constants.DOCUMENT_MIN_LENGTH) + filebase.add(article); } catch (ProcessorException e) { log.error("could not preprocess text of article '" + article.getTitle() + "'"); } catch (DatabaseException e) { diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 2d2d47c99cd5b056426747e55df05fcd11ef7973..9e9fdf9ab89de21a326e271a39e6daf083301bd7 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -88,6 +88,11 @@ public class Constants { */ public static final double TOPIC_THRESHOLD = 0.01; + /** + * Minimum number of words per document. + */ + public static final int DOCUMENT_MIN_LENGTH = 10; + /** * Set this to true to save all found words in the database. If false, will * save only topic related words found by topic modeling. diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java index 99267ff4caaea60305efaf4b1289605be326ad04..ba27f1753231f123869b9a66cb0f5e68aa72da35 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java @@ -23,9 +23,7 @@ public class TopicFull implements Model<ObjectId>, Serializable { @Id private ObjectId id; - private String name; - private Integer index; @Embedded @@ -36,7 +34,6 @@ public class TopicFull implements Model<ObjectId>, Serializable { private List<ArticleFull> articles; private Date created; - private Date modified; @Override