From dbc61a663b78ccce199b67b33d077de864e0c0c7 Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Sat, 13 Feb 2016 15:01:47 +0100 Subject: [PATCH] updated dateindex and vocab to use static variables static variables to share data between instances --- ma-impl.sublime-project | 3 -- .../java/de/vipra/cmd/file/DTMDateIndex.java | 32 ++++++++++--------- .../java/de/vipra/cmd/file/DTMFilebase.java | 8 +++-- .../java/de/vipra/cmd/file/DTMVocabulary.java | 22 +++++++------ .../de/vipra/cmd/option/ImportCommand.java | 9 ++++-- .../main/java/de/vipra/util/Constants.java | 5 +++ .../java/de/vipra/util/model/TopicFull.java | 3 -- 7 files changed, 46 insertions(+), 36 deletions(-) diff --git a/ma-impl.sublime-project b/ma-impl.sublime-project index e5bab69b..24db3031 100644 --- a/ma-impl.sublime-project +++ b/ma-impl.sublime-project @@ -3,9 +3,6 @@ [ { "path": "." - }, - { - "path": "/home/eike/Downloads/dtm_release/dtm/example/model_run2" } ] } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java index 7b010b73..c9a47040 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java @@ -45,27 +45,29 @@ public class DTMDateIndex implements Closeable, Iterable<DTMDateIndex.DTMDateInd } } - private final File file; - private final WindowResolution windowResolution; - private final List<DTMDateIndexEntry> entries; - private final SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); - - public DTMDateIndex(File file, WindowResolution windowResolution) throws IOException, ParseException { - this.file = file; - this.windowResolution = windowResolution; + private File file; + private static WindowResolution windowResolution; + private static List<DTMDateIndexEntry> entries; + private static SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); + + public DTMDateIndex(File 
modelDir, WindowResolution wr, boolean reread) throws IOException, ParseException { + this.file = new File(modelDir, "dates"); + windowResolution = wr; if (file.exists()) { - List<String> dates = FileUtils.readFile(file); - this.entries = new ArrayList<>(dates.size()); - for (String date : dates) { - this.entries.add(new DTMDateIndexEntry(df.parse(date), true, null)); + if (entries == null || reread) { + List<String> dates = FileUtils.readFile(file); + entries = new ArrayList<>(dates.size()); + for (String date : dates) { + entries.add(new DTMDateIndexEntry(df.parse(date), true, null)); + } } - } else { - this.entries = new ArrayList<>(); + } else if (entries == null || reread) { + entries = new ArrayList<>(); } } public void add(Date date, String line) { - this.entries.add(new DTMDateIndexEntry(date, false, line)); + entries.add(new DTMDateIndexEntry(date, false, line)); } @Override diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java index 26ada4dc..8d169228 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java @@ -20,7 +20,6 @@ import de.vipra.util.model.ArticleFull; public class DTMFilebase extends Filebase { public static final String FILE_MODEL = "dtm-mult.dat"; - public static final String FILE_DATES = "dtm-dates.dat"; public static final String FILE_VOCAB = "vocab"; private final DTMDateIndex dateindex; @@ -35,13 +34,16 @@ public class DTMFilebase extends Filebase { } catch (IOException | ConfigException e) { throw new FilebaseException(e); } + + File modelDir = getModelDir(); try { - this.dateindex = new DTMDateIndex(getModelFile(FILE_DATES), config.windowResolution); + this.dateindex = new DTMDateIndex(modelDir, config.windowResolution, false); } catch (IOException | ParseException e) { throw new FilebaseException("could not read date index file", e); } + try { - this.vocab = new 
DTMVocabulary(getModelFile(FILE_VOCAB)); + this.vocab = new DTMVocabulary(modelDir, false); } catch (IOException e) { throw new FilebaseException("could not read vocabulary file", e); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java index 99948366..72399624 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java @@ -17,20 +17,24 @@ import de.vipra.util.FileUtils; public class DTMVocabulary implements Closeable, Iterable<String> { private File file; - private List<String> vocables; - private Map<String, Integer> vocablesMap; - private int nextIndex = 1; + private static List<String> vocables; + private static Map<String, Integer> vocablesMap; + private static int nextIndex = 1; - public DTMVocabulary(File file) throws IOException { - this.file = file; + public DTMVocabulary(File modelDir, boolean reread) throws IOException { + this.file = new File(modelDir, "vocab"); if (file.exists()) { - vocables = new ArrayList<>(FileUtils.readFile(file)); - vocablesMap = new HashMap<>(vocables.size() + 200); + if (vocables == null || reread) + vocables = new ArrayList<>(FileUtils.readFile(file)); + if (vocablesMap == null || reread) + vocablesMap = new HashMap<>(vocables.size() + 200); for (String vocable : vocables) vocablesMap.put(vocable, nextIndex++); } else { - vocables = new ArrayList<>(500); - vocablesMap = new HashMap<>(500); + if (vocables == null || reread) + vocables = new ArrayList<>(500); + if (vocablesMap == null || reread) + vocablesMap = new HashMap<>(500); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index ba28fceb..5c49c928 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -83,8 +83,10 @@ public 
class ImportCommand implements Command { ArticleFull article = articleFromJSON(object); try { - // preprocess text and generate text statistics + // preprocess text ProcessedText processedText = processor.process(article.getText()); + + // generate text stats ArticleStats articleStats = ArticleStats.generateFromText(processedText.getText()); // add article to mongodb @@ -92,8 +94,9 @@ public class ImportCommand implements Command { article.setStats(articleStats); buffer.add(article); - // add article to filebase - filebase.add(article); + // add article to filebase if long enough + if (processedText.getWords().length >= Constants.DOCUMENT_MIN_LENGTH) + filebase.add(article); } catch (ProcessorException e) { log.error("could not preprocess text of article '" + article.getTitle() + "'"); } catch (DatabaseException e) { diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 2d2d47c9..9e9fdf9a 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -88,6 +88,11 @@ public class Constants { */ public static final double TOPIC_THRESHOLD = 0.01; + /** + * Minimum number of words per document. + */ + public static final int DOCUMENT_MIN_LENGTH = 10; + /** * Set this to true to save all found words in the database. If false, will * save only topic related words found by topic modeling. 
diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java index 99267ff4..ba27f175 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicFull.java @@ -23,9 +23,7 @@ public class TopicFull implements Model<ObjectId>, Serializable { @Id private ObjectId id; - private String name; - private Integer index; @Embedded @@ -36,7 +34,6 @@ public class TopicFull implements Model<ObjectId>, Serializable { private List<ArticleFull> articles; private Date created; - private Date modified; @Override -- GitLab