diff --git a/build.sh b/build.sh index 3dd471204b0b1aff88fca5b54ba705f8153afa00..301135706f3a6999cd5cbbd955521735a76bb20a 100755 --- a/build.sh +++ b/build.sh @@ -130,10 +130,10 @@ else echo "error" exit 1 fi - rm -rf ../vipra-backend/src/main/public cd .. + rm -rf ./vipra-backend/src/main/public if [ $INCLUDE_FRONTEND_IN_WAR -eq 1 ]; then - cp -r ./public vipra-backend/src/main/public + cp -r ./vipra-ui/public ./vipra-backend/src/main/public else mkdir ./vipra-backend/src/main/public fi diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java index 4aba609f08fd06a70ac5402ba655e0254f950c20..c999d85f56f6c87d5ad3e2c6a884f985d5dae71c 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java @@ -69,12 +69,10 @@ public class InfoResource { info.put("db.topics", dbTopics.count(null)); info.put("db.words", dbWords.count(null)); - // configuration - info.put("config.analyzer", config.analyzer); - info.put("config.processor", config.processor); - info.put("config.windowres", config.windowResolution); - // constants + info.put("const.analyzer", Constants.ANALYZER); + info.put("const.processor", Constants.PROCESSOR); + info.put("const.windowres", Constants.WINDOW_RESOLUTION); info.put("const.importbuf", Constants.IMPORT_BUFFER_MAX); info.put("const.esboosttopics", Constants.ES_BOOST_TOPICS); info.put("const.esboosttitles", Constants.ES_BOOST_TITLES); diff --git a/vipra-backend/src/main/resources/config.properties b/vipra-backend/src/main/resources/config.properties index 61bfe4a9fb5a057b497c5947af9db341ff191c81..0ca6de7ca9d55514c174287484d9da96e410addb 100644 --- a/vipra-backend/src/main/resources/config.properties +++ b/vipra-backend/src/main/resources/config.properties @@ -3,6 +3,4 @@ db.port=27017 db.name=test es.host=localhost es.port=9300 -tm.processor=corenlp -tm.analyzer=jgibb 
tm.dtmpath=/home/eike/repos/master/dtm_release/dtm/main \ No newline at end of file diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java index ff1c200c6a38005827047c6561d4d0687590dc3a..d775be2669ce3ba17a382ad02e94fc7ff64cea8d 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java @@ -6,21 +6,21 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.text.ParseException; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; import de.vipra.cmd.ex.FilebaseException; -import de.vipra.cmd.file.DTMSequenceIndex.DTMDateIndexEntry; +import de.vipra.cmd.file.DTMIndex.ArticleDate; import de.vipra.util.Constants; import de.vipra.util.FileUtils; -import de.vipra.util.ex.ConfigException; import de.vipra.util.model.ArticleFull; public class DTMFilebase extends Filebase { public static final String FILE_NAME = "dtm-mult.dat"; - private final DTMSequenceIndex seqindex; + private final DTMIndex seqindex; private final DTMVocabulary vocab; private final File modelFile; @@ -29,8 +29,8 @@ public class DTMFilebase extends Filebase { final File modelDir = getModelDir(); try { - this.seqindex = new DTMSequenceIndex(modelDir); - } catch (IOException | ConfigException | ParseException e) { + this.seqindex = new DTMIndex(modelDir); + } catch (IOException | ParseException e) { throw new FilebaseException("could not read date index file", e); } @@ -45,8 +45,8 @@ public class DTMFilebase extends Filebase { @Override public synchronized void write(final List<ArticleFull> articles) throws IOException { if (!articles.isEmpty()) { - for (final ArticleFull article : articles) - seqindex.add(article.getDate(), vocab.transform(article.getProcessedText())); + for (int i = 0; i < articles.size(); i++) + seqindex.add(articles.get(i).getDate(), i); // use temp file 
final File modelFileTmp = getModelFile(FILE_NAME + ".tmp"); @@ -55,24 +55,35 @@ public class DTMFilebase extends Filebase { lines = FileUtils.iterateFileLines(modelFile); // concatenates the existing model file with new article entries in - // the temp file + // the temp file. The existing model is expected to be sorted + // correctly, therefore the file can be iterated sequentially. + // Because the database id index is created by the abstract + // filebase, it needs to be recreated with the new order final BufferedWriter writer = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(modelFileTmp))); - for (final DTMDateIndexEntry e : seqindex) { - if (e.exists) { + final FilebaseIndex index = getIndex(); + final Iterator<String> currIndex = index.iterator(); + final List<String> newIndex = new ArrayList<>(); + for (final ArticleDate articleDate : seqindex.getArticleDates()) { + if (articleDate.isNew()) { + final ArticleFull article = articles.get(articleDate.index); + newIndex.add(article.getId().toString()); + writer.write(vocab.transform(article.getProcessedText())); + } else { if (lines == null) { writer.close(); throw new IOException("index inconsistency: missing article file"); } writer.write(lines.next()); - } else { - e.exists = true; - writer.write(e.line); + newIndex.add(currIndex.next()); } writer.write(Constants.LINE_SEP); } writer.close(); + // reset index to new order imposed by article dates + index.set(newIndex); + // replace model file by temp file if (modelFile.exists() && !modelFile.delete()) throw new IOException("could not delete file " + modelFile.getAbsolutePath()); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMIndex.java new file mode 100644 index 0000000000000000000000000000000000000000..64186b1107362e3301b7b1d81829bcf2fc1a7bdb --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMIndex.java @@ -0,0 +1,148 @@ +package de.vipra.cmd.file; + +import 
java.io.BufferedWriter; +import java.io.Closeable; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import de.vipra.util.Constants; +import de.vipra.util.FileUtils; + +public class DTMIndex implements Closeable { + + public static final class SequenceCount implements Comparable<SequenceCount> { + + public Date startDate; + public Date endDate; + public int count = 1; + + @Override + public int compareTo(final SequenceCount o) { + return startDate.compareTo(o.startDate); + } + } + + public static final class ArticleDate implements Comparable<ArticleDate> { + + public final Date date; + public final int index; + + public ArticleDate(final Date date, final int index) { + this.date = date; + this.index = index; + } + + @Override + public int compareTo(final ArticleDate o) { + return date.compareTo(o.date); + } + + public boolean isNew() { + return index != -1; + } + } + + public static final String DATE_FILE_NAME = "dates"; + public static final String SEQ_FILE_NAME = "dtm-seq.dat"; + + private static final SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); + + private final File inFile; + private final File outFile; + private final Map<Date, SequenceCount> sequenceMap = new HashMap<>(); + private final List<SequenceCount> sequenceList = new ArrayList<>(); + private final List<ArticleDate> articleDates = new ArrayList<>(); + + public DTMIndex(final File modelDir) throws IOException, ParseException { + this.inFile = new File(modelDir, DATE_FILE_NAME); + this.outFile = new File(modelDir, SEQ_FILE_NAME); + + if (inFile.exists()) { + final List<String> lines = FileUtils.readFile(inFile); + for (final String line : lines) { + add(df.parse(line)); + } + } + 
} + + private void add(final Date date) { + add(date, -1); + } + + public void add(final Date date, final int newArticleIndex) { + final Date startDate = Constants.WINDOW_RESOLUTION.startDate(date); + SequenceCount sequence = sequenceMap.get(startDate); + if (sequence == null) { + sequence = new SequenceCount(); + sequence.startDate = startDate; + sequence.endDate = Constants.WINDOW_RESOLUTION.endDate(date); + sequenceMap.put(startDate, sequence); + sequenceList.add(sequence); + Collections.sort(sequenceList); + } else { + sequence.count++; + } + final ArticleDate sequenceDate = new ArticleDate(date, newArticleIndex); + articleDates.add(sequenceDate); + } + + public SequenceCount getSequence(final int index) { + return sequenceList.get(index); + } + + public Date getStartDate(final int index) { + return getSequence(index).startDate; + } + + public Date getEndDate(final int index) { + return getSequence(index).endDate; + } + + public int sequenceCount() { + return sequenceMap.size(); + } + + public int entryCount() { + return articleDates.size(); + } + + public List<ArticleDate> getArticleDates() { + Collections.sort(articleDates); + return articleDates; + } + + @Override + public void close() throws IOException { + // write date index + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(inFile, false))); + Collections.sort(articleDates); + for (final ArticleDate entry : articleDates) { + writer.write(df.format(entry.date)); + writer.write(Constants.LINE_SEP); + } + writer.close(); + + // write window index + writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile, false))); + writer.write(Integer.toString(sequenceMap.size())); + writer.write(Constants.LINE_SEP); + + // write window sizes + for (final SequenceCount sequence : sequenceList) { + writer.write(Integer.toString(sequence.count)); + writer.write(Constants.LINE_SEP); + } + writer.close(); + } + +} diff --git 
a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java deleted file mode 100644 index bd6b19eb5c8b75b99019fcc46df7c64d45dc8b3d..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java +++ /dev/null @@ -1,137 +0,0 @@ -package de.vipra.cmd.file; - -import java.io.BufferedWriter; -import java.io.Closeable; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Date; -import java.util.Iterator; -import java.util.List; - -import de.vipra.util.Config; -import de.vipra.util.Constants; -import de.vipra.util.Constants.WindowResolution; -import de.vipra.util.CountMap; -import de.vipra.util.FileUtils; -import de.vipra.util.ex.ConfigException; - -public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> { - - public static final String FILE_NAME = "dtm-seq.dat"; - - public static class DTMDateIndexEntry implements Comparable<DTMDateIndexEntry> { - public Date date; - public boolean exists; - public String line; - - public DTMDateIndexEntry(final Date date, final boolean exists, final String line) { - this.date = date; - this.exists = exists; - this.line = line; - } - - @Override - public int compareTo(final DTMDateIndexEntry o) { - if (o == null) - return 1; - if (date == null) - return -1; - return this.date.compareTo(o.date); - } - } - - private final File file; - private final boolean readonly = false; - private final WindowResolution windowResolution; - private static List<DTMDateIndexEntry> entries; - private static SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); - private static CountMap<String> windowSizes = new CountMap<>(); - - public 
DTMSequenceIndex(final File modelDir) throws IOException, ParseException, ConfigException { - this(modelDir, false); - } - - public DTMSequenceIndex(final File modelDir, final boolean reread) - throws IOException, ParseException, ConfigException { - this.file = new File(modelDir, "dates"); - final Config config = Config.getConfig(); - this.windowResolution = config.windowResolution; - if (file.exists()) { - if (entries == null || reread) { - final List<String> dates = FileUtils.readFile(file); - entries = new ArrayList<>(dates.size()); - for (final String date : dates) - add(df.parse(date)); - } - } else if (entries == null || reread) { - entries = new ArrayList<>(); - } - } - - private void add(final Date date) { - add(date, null); - } - - public void add(final Date date, final String line) { - entries.add(new DTMDateIndexEntry(date, line == null, line)); - windowSizes.count(windowResolution.fromDate(date)); - } - - public int size() { - return windowSizes.size(); - } - - public Date getStartDate(int index) { - // TODO implement - return null; - } - - public Date getEndDate(int index) { - // TODO implement - return null; - } - - @Override - public Iterator<DTMDateIndexEntry> iterator() { - Collections.sort(entries); - return entries.iterator(); - } - - @Override - public void close() throws IOException { - if (readonly) - return; - - // write date index - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, false))); - for (final DTMDateIndexEntry entry : entries) { - writer.write(df.format(entry.date)); - writer.write(Constants.LINE_SEP); - } - writer.close(); - - // write window index - final File seqFile = new File(file.getParentFile(), FILE_NAME); - writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(seqFile, false))); - writer.write(Integer.toString(windowSizes.size())); - writer.write(Constants.LINE_SEP); - - // write window sizes - final String[] windows = windowSizes.keySet().toArray(new 
String[windowSizes.size()]); - Arrays.sort(windows); - for (final String window : windows) { - writer.write(Integer.toString(windowSizes.get(window))); - writer.write(Constants.LINE_SEP); - } - - writer.close(); - } - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java index 97507171d5039b162072c05f22c4ffb28148c5d8..639a728c07e82fcb54ef1303094f20863ee609b1 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/Filebase.java @@ -83,7 +83,7 @@ public abstract class Filebase implements Closeable { public static Filebase getFilebase(final Config config) throws FilebaseException, ConfigException { final File dataDir = config.getDataDirectory(); - switch (config.analyzer) { + switch (Constants.ANALYZER) { case DTM: return new DTMFilebase(dataDir); case JGIBB: diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java index d3ed722056133d5cbdd4448c3bdc85c9f8cc6c4b..beb2ee94c2b84ec8e7e42f13ea61fc3b4a9ad1aa 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseIndex.java @@ -15,7 +15,7 @@ public class FilebaseIndex implements Closeable, Iterable<String> { public static final String FILE_NAME = "index"; private final File file; - private final List<String> index; + private List<String> index; public FilebaseIndex(final File modelDir) throws IOException { this.file = new File(modelDir, FILE_NAME); @@ -39,6 +39,10 @@ public class FilebaseIndex implements Closeable, Iterable<String> { return i; } + public void set(final List<String> index) { + this.index = index; + } + public int indexOf(final String id) { return index.indexOf(id); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java index 
15a3765acf0afc580a0e8d362b8e1cddf98dc068..8f9ce5bd5ca6a8e7838c40d9bb3d8de08681d9a0 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/Analyzer.java @@ -2,6 +2,7 @@ package de.vipra.cmd.lda; import de.vipra.cmd.ex.AnalyzerException; import de.vipra.util.Config; +import de.vipra.util.Constants; public abstract class Analyzer { @@ -21,7 +22,7 @@ public abstract class Analyzer { public static Analyzer getAnalyzer(final Config config) throws AnalyzerException { Analyzer analyzer = null; - switch (config.analyzer) { + switch (Constants.ANALYZER) { case DTM: analyzer = new DTMAnalyzer(); break; diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java index 188d49f3467374e49995b9742d4e382bdc992c4c..ecfc02ad7ef90655594efba8a4d61c8f7b0d7823 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java @@ -18,7 +18,7 @@ import org.apache.logging.log4j.Logger; import org.bson.types.ObjectId; import de.vipra.cmd.ex.AnalyzerException; -import de.vipra.cmd.file.DTMSequenceIndex; +import de.vipra.cmd.file.DTMIndex; import de.vipra.cmd.file.DTMVocabulary; import de.vipra.cmd.file.FilebaseIndex; import de.vipra.util.ArrayUtils; @@ -47,7 +47,7 @@ public class DTMAnalyzer extends Analyzer { private File outDir; private File outDirSeq; private DTMVocabulary vocab; - private DTMSequenceIndex sequences; + private DTMIndex seqindex; private FilebaseIndex index; private MongoService<ArticleFull, ObjectId> dbArticles; private MongoService<TopicFull, ObjectId> dbTopics; @@ -65,8 +65,8 @@ public class DTMAnalyzer extends Analyzer { this.outDir = new File(modelDir, "out"); this.outDirSeq = new File(outDir, "lda-seq"); this.vocab = new DTMVocabulary(modelDir); - this.sequences = new DTMSequenceIndex(modelDir); this.index = new FilebaseIndex(modelDir); + this.seqindex = new 
DTMIndex(modelDir); this.dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); this.dbTopics = MongoService.getDatabaseService(config, TopicFull.class); this.dbWords = MongoService.getDatabaseService(config, Word.class); @@ -136,7 +136,7 @@ public class DTMAnalyzer extends Analyzer { // read topic definition files and create topics final int wordCount = vocab.size(); - final int sequencesCount = sequences.size(); + final int sequencesCount = seqindex.sequenceCount(); // collects created topics final List<TopicFull> newTopics = new ArrayList<>(Constants.K_TOPICS); // collects created words @@ -146,6 +146,9 @@ public class DTMAnalyzer extends Analyzer { log.info("sequences: " + sequencesCount); log.info("topics: " + Constants.K_TOPICS); + final boolean seqRelativeCutoff = Constants.MINIMUM_RELATIVE_PROB > 0; + final boolean seqPercentCutoff = Constants.PERCENT_PROB < 1; + // for each topic file for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { final File seqFile = new File(outDirSeq, @@ -155,6 +158,7 @@ public class DTMAnalyzer extends Analyzer { final TopicFull newTopic = new TopicFull(); final List<Sequence> newSequences = new ArrayList<>(sequencesCount); newTopic.setSequences(newSequences); + newTopics.add(newTopic); in = new BufferedReader(new InputStreamReader(new FileInputStream(seqFile))); @@ -172,39 +176,59 @@ public class DTMAnalyzer extends Analyzer { // find maximum final double[] maxSeqLikelinesses = ArrayUtils.findColMaximum(likelinesses); + // collect top words in each sequence for topic name + final Set<TopicWord> topTopicWords = new HashSet<>(); + // go through each sequence and gather all words that are above // the minimum relative word likeliness for (int idxSeq = 0; idxSeq < sequencesCount; idxSeq++) { - final double maxSeqLikeliness = maxSeqLikelinesses[idxSeq]; - final double minAcceptableSeqLikeliness = (maxSeqLikeliness >= 0 ? 
1 - : 2 - Constants.MINIMUM_RELATIVE_PROB) * maxSeqLikeliness; + // calculate relative cutoff probability + final double minAcceptableSeqLikeliness; + if (Constants.MINIMUM_RELATIVE_PROB > 0) { + final double maxSeqLikeliness = maxSeqLikelinesses[idxSeq]; + minAcceptableSeqLikeliness = (maxSeqLikeliness >= 0 ? 1 : 2 - Constants.MINIMUM_RELATIVE_PROB) + * maxSeqLikeliness; + } + + // collect words final List<TopicWord> newSeqTopicWords = new ArrayList<>(wordCount); for (int idxWord = 0; idxWord < wordCount; idxWord++) { final double likeliness = likelinesses[idxWord][idxSeq]; - if (likeliness >= minAcceptableSeqLikeliness) { + if (!seqRelativeCutoff || likeliness >= minAcceptableSeqLikeliness) { final Word word = vocab.getWord(idxWord); newWords.add(word); final TopicWord topicWord = new TopicWord(word, likeliness); newSeqTopicWords.add(topicWord); } } - Collections.sort(newSeqTopicWords, Comparator.reverseOrder()); + // collect top n words + if (!newSeqTopicWords.isEmpty()) { + Collections.sort(newSeqTopicWords, Comparator.reverseOrder()); + + // top n percent cutoff: clearing the sub-list view removes the tail from the list itself + if (seqPercentCutoff) { + final int fromIndex = (int) Math.round(newSeqTopicWords.size() * Constants.PERCENT_PROB); + newSeqTopicWords.subList(fromIndex, newSeqTopicWords.size()).clear(); + } + + topTopicWords.addAll(newSeqTopicWords.subList(0, + Math.min(newSeqTopicWords.size(), Constants.TOPIC_AUTO_NAMING_WORDS))); + } + + // create sequence final Sequence newSequence = new Sequence(); newSequence.setNumber(idxSeq); newSequence.setWords(newSeqTopicWords); - newSequence.setStartDate(sequences.getStartDate(idxSeq)); - newSequence.setEndDate(sequences.getEndDate(idxSeq)); + newSequence.setStartDate(seqindex.getStartDate(idxSeq)); + newSequence.setEndDate(seqindex.getEndDate(idxSeq)); newSequences.add(newSequence); } - newTopics.add(newTopic); - } - - // sort topic words and generate topic name - for (final TopicFull topic : newTopics) { - Collections.sort(topic.getWords(), Collections.reverseOrder()); - 
topic.setName(TopicFull.getNameFromWords(topic.getWords())); + // sort topic words and generate topic name + final List<TopicWord> topTopicWordsList = new ArrayList<>(topTopicWords); + Collections.sort(topTopicWordsList); + newTopic.setName(TopicFull.getNameFromWords(topTopicWordsList)); } log.info("creating " + newTopics.size() + " " + StringUtils.quantity(newTopics.size(), "topic")); @@ -225,12 +249,12 @@ public class DTMAnalyzer extends Analyzer { final File gamFile = new File(outDirSeq, "gam.dat"); in = new BufferedReader(new InputStreamReader(new FileInputStream(gamFile))); - for (String articleId : index) { + for (final String articleId : index) { // normalize topic proportions double totalTopicProportions = 0; - double[] topicProportions = new double[Constants.K_TOPICS]; + final double[] topicProportions = new double[Constants.K_TOPICS]; for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { - double topicProportion = Double.parseDouble(in.readLine()); + final double topicProportion = Double.parseDouble(in.readLine()); topicProportions[idxTopic] = topicProportion; totalTopicProportions += topicProportion; } @@ -238,8 +262,8 @@ public class DTMAnalyzer extends Analyzer { // create topic references final List<TopicRef> newTopicRefs = new ArrayList<>(Constants.K_TOPICS); for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { - TopicRef newTopicRef = new TopicRef(); - TopicFull topicFull = newTopics.get(idxTopic); + final TopicRef newTopicRef = new TopicRef(); + final TopicFull topicFull = newTopics.get(idxTopic); newTopicRef.setTopic(new Topic(topicFull.getId())); newTopicRef.setShare(topicProportions[idxTopic] / totalTopicProportions); newTopicRefs.add(newTopicRef); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 8a75ca2357b0871ce9e3c32e994b03c5a92738c5..8e17e0e60defe916eb5cf689fbce5f0eb2b5b875 100644 --- 
a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -233,7 +233,7 @@ public class ImportCommand implements Command { config = Config.getConfig(); dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); filebase = Filebase.getFilebase(config); - processor = Processor.getProcessor(config); + processor = Processor.getProcessor(); buffer = new ArticleBuffer(dbArticles); executor = Executors.newFixedThreadPool(threadCount); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java index db4b9ca4dbde696c7be763ad45d2a1b0801a7056..54caa942147c2bdf1727c2fd54ce9b70d1e340fc 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/Processor.java @@ -3,7 +3,6 @@ package de.vipra.cmd.text; import java.util.List; import de.vipra.cmd.ex.ProcessorException; -import de.vipra.util.Config; import de.vipra.util.Constants; public abstract class Processor { @@ -20,10 +19,10 @@ public abstract class Processor { public abstract ProcessedText process(String input) throws ProcessorException; - public static Processor getProcessor(final Config config) { + public static Processor getProcessor() { final List<String> stopWords = Constants.STOPWORDS; - switch (config.processor) { + switch (Constants.PROCESSOR) { case CORENLP: return new CoreNLPProcessor(stopWords); default: diff --git a/vipra-cmd/src/main/resources/config.properties b/vipra-cmd/src/main/resources/config.properties index 6f38f34439e93fdde6eba975c5f46b7acddeee69..b7f21bc6797a242a07e4b325351608fc9b55179c 100644 --- a/vipra-cmd/src/main/resources/config.properties +++ b/vipra-cmd/src/main/resources/config.properties @@ -3,7 +3,4 @@ db.port=27017 db.name=test es.host=localhost es.port=9300 -tm.processor=corenlp -tm.analyzer=dtm -tm.dtmpath=/home/eike/repos/master/ma-impl/dtm_release/dtm/main 
-tm.windowresolution=yearly \ No newline at end of file +tm.dtmpath=/home/eike/repos/master/ma-impl/dtm_release/dtm/main \ No newline at end of file diff --git a/vipra-ui/app/html/about.html b/vipra-ui/app/html/about.html index 499be08ae5788ceb6f21dedb48e06aca0351e70d..bad0047e78a1b784c4cc4d9f97fa3ef1cf415062 100644 --- a/vipra-ui/app/html/about.html +++ b/vipra-ui/app/html/about.html @@ -78,28 +78,28 @@ </tr> </tbody> </table> - <h3>Configuration</h3> + <h3>Constants</h3> <table class="table table-bordered table-fixed"> <tbody> <tr> <th style="width:33%">Analyzer</th> - <td ng-bind-template="{{::info.config.analyzer}}"></td> + <td ng-bind-template="{{::info.const.analyzer}}"></td> </tr> <tr> <th>Processor</th> - <td ng-bind-template="{{::info.config.processor}}"></td> + <td ng-bind-template="{{::info.const.processor}}"></td> </tr> <tr> <th>Window resolution</th> - <td ng-bind-template="{{::info.config.windowres}}"></td> + <td ng-bind-template="{{::info.const.windowres}}"></td> + </tr> + <tr class="well"> + <td colspan="2"> + Analyzer, text processor and dynamic window resolution. 
+ </td> </tr> - </tbody> - </table> - <h3>Constants</h3> - <table class="table table-bordered table-fixed"> - <tbody> <tr> - <th style="width:33%">Import buffer</th> + <th>Import buffer</th> <td ng-bind-template="{{::info.const.importbuf}}"></td> </tr> <tr class="well"> diff --git a/vipra-util/src/main/java/de/vipra/util/CalendarUtils.java b/vipra-util/src/main/java/de/vipra/util/CalendarUtils.java index bab86af0e9777f7dabf6c3b87fb1135e3d263e66..1d8df0089ec08004256604dbf8fc4fda4032a80a 100644 --- a/vipra-util/src/main/java/de/vipra/util/CalendarUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/CalendarUtils.java @@ -19,4 +19,48 @@ public class CalendarUtils { return (int) Math.ceil(c.get(Calendar.MONTH) / 3.0); } + public static final int getQuarterStart(final Calendar c) { + switch (c.get(Calendar.MONTH)) { + case 0: + case 1: + case 2: + return 0; + case 3: + case 4: + case 5: + return 3; + case 6: + case 7: + case 8: + return 6; + case 9: + case 10: + case 11: + return 9; + } + return 0; + } + + public static final int getQuarterEnd(final Calendar c) { + switch (c.get(Calendar.MONTH)) { + case 0: + case 1: + case 2: + return 2; + case 3: + case 4: + case 5: + return 5; + case 6: + case 7: + case 8: + return 8; + case 9: + case 10: + case 11: + return 11; + } + return 0; + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/Config.java b/vipra-util/src/main/java/de/vipra/util/Config.java index 849819b8a6d0adc7b787e38d2bf73c1f0e77a14b..490ded1b5b8f794b44a453704ceeef8aa778edb1 100644 --- a/vipra-util/src/main/java/de/vipra/util/Config.java +++ b/vipra-util/src/main/java/de/vipra/util/Config.java @@ -16,9 +16,6 @@ import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import de.vipra.util.Constants.Analyzer; -import de.vipra.util.Constants.Processor; -import de.vipra.util.Constants.WindowResolution; import de.vipra.util.an.ConfigKey; import de.vipra.util.ex.ConfigException; import de.vipra.util.model.Model; @@ -63,29 +60,6 @@ 
public class Config { @ConfigKey("es.port") public int elasticsearchPort = Constants.ES_PORT; - /** - * The text processor to be used. To find a list of available values, - * {@link de.vipra.util.Constants.Processor}. - */ - @ConfigKey("tm.processor") - public Processor processor = Constants.Processor.DEFAULT(); - - /** - * The topic modeling analyzer to be used. To find a list of available - * analyzers, {@link de.vipra.util.Constants.Analyzer}. - */ - @ConfigKey("tm.analyzer") - public Analyzer analyzer = Constants.Analyzer.DEFAULT(); - - /** - * The dynamic topic modeling window resolution to be used. This value is - * only used, if the selected analyzer supports dynamic topic modeling. To - * find a list of available analyzers, - * {@link de.vipra.util.Constants.WindowResolution}. - */ - @ConfigKey("tm.windowresolution") - public WindowResolution windowResolution = Constants.WindowResolution.DEFAULT(); - /** * Path to the dtm executable. If using dtm as the anaylyzer, this path must * be set to the dtm executable. diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 2466791f942bfdf3f6a577b80c4920bf247d2e68..6232b169d143986431ffa9a3e303dfed0fa2fd1a 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -81,6 +81,11 @@ public class Constants { */ public static final double MINIMUM_RELATIVE_PROB = 0.01; + /** + * + */ + public static final double PERCENT_PROB = 0.9; + /** * Topics with a share greater or equal to this number are regarded as * accepted topics to that article. Value range: [0.0, 1.0] @@ -114,6 +119,26 @@ public class Constants { */ public static final int DOCUMENT_MIN_LENGTH = 10; + /** + * The text processor to be used. To find a list of available values, + * {@link de.vipra.util.Constants.Processor}. 
+ */ + public static final Processor PROCESSOR = Processor.CORENLP; + + /** + * The topic modeling analyzer to be used. To find a list of available + * analyzers, {@link de.vipra.util.Constants.Analyzer}. + */ + public static final Analyzer ANALYZER = Analyzer.DTM; + + /** + * The dynamic topic modeling window resolution to be used. This value is + * only used, if the selected analyzer supports dynamic topic modeling. To + * find a list of available analyzers, + * {@link de.vipra.util.Constants.WindowResolution}. + */ + public static final WindowResolution WINDOW_RESOLUTION = WindowResolution.YEARLY; + /** * Stopwords list. Extensive list of stopwords used to clean imported * articles of the most common words before topic modeling is applied. @@ -322,7 +347,7 @@ public class Constants { this.name = def.name; } - public String fromDate(final Date date) { + public String sequenceLabel(final Date date) { final Calendar c = new GregorianCalendar(); c.setTime(date); String str = c.get(Calendar.YEAR) + ""; @@ -340,6 +365,59 @@ public class Constants { return str; } + public Date startDate(final Date date) { + final Calendar in = new GregorianCalendar(); + in.setTime(date); + final Calendar out = new GregorianCalendar(); + int month = 0; + switch (this) { + case QUARTERLY: + month = CalendarUtils.getQuarterStart(in); + break; + case MONTHLY: + month = in.get(Calendar.MONTH); + break; + default: + break; + } + out.set(Calendar.YEAR, in.get(Calendar.YEAR)); + out.set(Calendar.MONTH, month); + out.set(Calendar.DAY_OF_MONTH, out.getActualMaximum(Calendar.DAY_OF_MONTH)); + out.set(Calendar.HOUR, 0); + out.set(Calendar.MINUTE, 0); + out.set(Calendar.SECOND, 0); + out.set(Calendar.MILLISECOND, 0); + return out.getTime(); + } + + public Date endDate(final Date date) { + final Calendar in = new GregorianCalendar(); + in.setTime(date); + final Calendar out = new GregorianCalendar(); + int month = 0; + switch (this) { + case YEARLY: + month = 11; + break; + case QUARTERLY: + month = 
CalendarUtils.getQuarterEnd(in); + break; + case MONTHLY: + month = in.get(Calendar.MONTH); + break; + default: + break; + } + out.set(Calendar.YEAR, in.get(Calendar.YEAR)); + out.set(Calendar.MONTH, month); + out.set(Calendar.DAY_OF_MONTH, out.getActualMaximum(Calendar.DAY_OF_MONTH)); + out.set(Calendar.HOUR, 0); + out.set(Calendar.MINUTE, 0); + out.set(Calendar.SECOND, 0); + out.set(Calendar.MILLISECOND, 0); + return out.getTime(); + } + public static WindowResolution DEFAULT() { return YEARLY; } diff --git a/vipra-util/src/main/java/de/vipra/util/CountMap.java b/vipra-util/src/main/java/de/vipra/util/CountMap.java index bd66f6ec38e358bfa3710ce22916058591528807..0285cd3912b06acd0f67dc0d0460b118cb1edebf 100644 --- a/vipra-util/src/main/java/de/vipra/util/CountMap.java +++ b/vipra-util/src/main/java/de/vipra/util/CountMap.java @@ -45,4 +45,8 @@ public class CountMap<T> { return map.keySet(); } + public boolean contains(final Object key) { + return map.containsKey(key); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java index 36be515e2596f09e06f869df9ba0ffa593d9733d..2a0050aec6bf8e183ad84be2f0efec4776e69c0f 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/Sequence.java +++ b/vipra-util/src/main/java/de/vipra/util/model/Sequence.java @@ -22,7 +22,7 @@ public class Sequence implements Comparable<Sequence>, Serializable { return startDate; } - public void setStartDate(Date startDate) { + public void setStartDate(final Date startDate) { this.startDate = startDate; } @@ -30,7 +30,7 @@ public class Sequence implements Comparable<Sequence>, Serializable { return endDate; } - public void setEndDate(Date endDate) { + public void setEndDate(final Date endDate) { this.endDate = endDate; } diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java index 
f6d2bba8c70080c8f58c3345211548d4e9445bfd..b0d26d8c437cc82aec81695fbfd2a92e32c85fca 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java @@ -30,7 +30,7 @@ public class TopicRef implements Comparable<TopicRef>, Serializable { return count; } - public void setCount(Integer count) { + public void setCount(final Integer count) { this.count = count; } diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java index 84c2f8301e22c3ac0fbae05431ba0cddbc85f6c9..81f453c0c2e3b453abbe921d9b9c8e4eae965ce1 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicWord.java @@ -67,4 +67,29 @@ public class TopicWord implements Comparable<TopicWord>, Serializable { return "TopicWord [word=" + word + ", likeliness=" + likeliness + "]"; } + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((word == null) ? 0 : word.hashCode()); + return result; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final TopicWord other = (TopicWord) obj; + if (word == null) { + if (other.word != null) + return false; + } else if (!word.equals(other.word)) + return false; + return true; + } + }