diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java index 97182d3ed141443e085d8360d69ba8e8da7cbfb7..b00357b72ab94e51e70672897c434ab05ab0c87c 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java @@ -77,6 +77,8 @@ public class InfoResource { info.put("const.decaylambda", Constants.RISING_DECAY_LAMBDA); info.put("const.minrelprob", Constants.MINIMUM_RELATIVE_PROB); info.put("const.minshare", Constants.MINIMUM_SHARE); + info.put("const.maxsimdocs", Constants.MAX_SIMILAR_DOCUMENTS); + info.put("const.maxdiv", Constants.MAX_DIVERGENCE); info.put("const.dynminiter", Constants.DYNAMIC_MIN_ITER); info.put("const.dynmaxiter", Constants.DYNAMIC_MAX_ITER); info.put("const.statiter", Constants.STATIC_ITER); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java index 20ca5d260dd73809c275033cbb4723d10fcbf9aa..d1060a8f3d6d9a65dcf5348f6b8866583038b042 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/DTMAnalyzer.java @@ -24,12 +24,15 @@ import de.vipra.cmd.file.FilebaseIndex; import de.vipra.util.ArrayUtils; import de.vipra.util.Config; import de.vipra.util.Constants; +import de.vipra.util.MongoUtils; import de.vipra.util.StringUtils; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; +import de.vipra.util.model.Article; import de.vipra.util.model.ArticleFull; import de.vipra.util.model.Sequence; import de.vipra.util.model.SequenceFull; +import de.vipra.util.model.SimilarArticle; import de.vipra.util.model.Topic; import de.vipra.util.model.TopicFull; import de.vipra.util.model.TopicRef; @@ -337,13 +340,16 @@ public class DTMAnalyzer extends Analyzer { throw new AnalyzerException(e); } - // create topic 
references + // create topic references and store document similarities - int idxArticle = 0; + int idxArticle = -1; for (final String articleId : idindex) { - final double[] topicDistribution = topicDistributions[idxArticle++]; + idxArticle++; + + final double[] topicDistribution = topicDistributions[idxArticle]; // create topic references + double reducedShare = 0; final List<TopicRef> newTopicRefs = new ArrayList<>(Constants.K_TOPICS); for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) { @@ -357,7 +363,32 @@ public class DTMAnalyzer extends Analyzer { } } + // calculate divergences + + List<SimilarArticle> similarArticles = new ArrayList<>(articlesCount - 1); + + for (int idxArticle2 = 0; idxArticle2 < articlesCount; idxArticle2++) { + if (idxArticle == idxArticle2) + continue; + + double divergence = ArrayUtils.jsDivergence(topicDistributions[idxArticle], + topicDistributions[idxArticle2]); + if (divergence > Constants.MAX_DIVERGENCE) + continue; + + SimilarArticle similarArticle = new SimilarArticle(); + similarArticle.setArticle(new Article(MongoUtils.objectId(idindex.get(idxArticle2)))); + similarArticle.setDivergence(divergence); + similarArticles.add(similarArticle); + } + + Collections.sort(similarArticles); + + if (similarArticles.size() > Constants.MAX_SIMILAR_DOCUMENTS) + similarArticles.subList(Constants.MAX_SIMILAR_DOCUMENTS, similarArticles.size()).clear(); + // update article + if (!newTopicRefs.isEmpty()) { // renormalize share for (final TopicRef newTopicRef : newTopicRefs) @@ -369,10 +400,11 @@ public class DTMAnalyzer extends Analyzer { final ArticleFull article = new ArticleFull(); article.setId(articleId); article.setTopics(newTopicRefs); + article.setSimilarArticles(similarArticles); try { // TODO: using field name here. 
Hard to refactor - dbArticles.updateSingle(article, "topics"); + dbArticles.updateSingle(article, "topics", "similarArticles"); } catch (final DatabaseException e) { log.error(e); } diff --git a/vipra-ui/app/html/about.html b/vipra-ui/app/html/about.html index c1849104efce06730ac1e435ce058ba21eefd188..83bd3d4591f51b9ac75d5a45a824be4a2b009523 100644 --- a/vipra-ui/app/html/about.html +++ b/vipra-ui/app/html/about.html @@ -175,6 +175,24 @@ The minimum share of a topic to be accepted for an article. Topic shares are renormalized after rejecting topics below this threshold. </td> </tr> + <tr> + <th>Maximum similar documents</th> + <td ng-bind-template="{{::info.const.maxsimdocs}}"></td> + </tr> + <tr class="well"> + <td colspan="2"> + Maximum number of similar documents for each document. + </td> + </tr> + <tr> + <th>Maximum divergence</th> + <td ng-bind-template="{{::info.const.maxdiv}}"></td> + </tr> + <tr class="well"> + <td colspan="2"> + Maximum divergence between a document and similar documents. Lower values mean more similar documents (less divergence). + </td> + </tr> <tr> <th>Dynamic minimum iterations</th> <td ng-bind-template="{{::info.const.dynminiter}}"></td> diff --git a/vipra-util/src/main/java/de/vipra/util/ArrayUtils.java b/vipra-util/src/main/java/de/vipra/util/ArrayUtils.java index cdf232ae2e6b456ab0b87f8a06791832422c785f..8f2db914d6260d56259561bb332624da5c8c5f26 100644 --- a/vipra-util/src/main/java/de/vipra/util/ArrayUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/ArrayUtils.java @@ -16,4 +16,43 @@ public class ArrayUtils { return maximum; } + /** + * Jensen Shannon Divergence to measure similarity between two probability + * distributions. 
+ * + * @param p1 + * left distribution + * @param p2 + * right distribution + * @return divergence + */ + public static double jsDivergence(double[] p1, double[] p2) { + assert p1.length == p2.length; + double[] avg = new double[p1.length]; + for (int i = 0; i < p1.length; i++) + avg[i] = (p1[i] + p2[i]) / 2.0; + return (klDivergence(p1, avg) + klDivergence(p2, avg)) / 2.0; + } + + /** + * Kullback-Leibler divergence (in bits) to measure similarity between two + * probability distributions. Entries where either value is zero are skipped. + * + * @param p1 + * left distribution + * @param p2 + * right distribution + * @return divergence + */ + public static double klDivergence(double[] p1, double[] p2) { + assert p1.length == p2.length; + double result = 0; + for (int i = 0; i < p1.length; i++) { + if (p1[i] == 0 || p2[i] == 0) + continue; + result += p1[i] * Math.log(p1[i] / p2[i]); + } + return result / Math.log(2); + } + } diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 9f0a17a5bd5cc1e2b1dffa7aab9e034d5ee03af5..d479aca51ff904e98abac1704118125a993c60d3 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -97,6 +97,17 @@ public class Constants { */ public static final double MINIMUM_SHARE = 0.01; + /** + * Maximum number of similar documents for each document. Default 20. + */ + public static final int MAX_SIMILAR_DOCUMENTS = 20; + + /** + * Maximum divergence between a document and similar documents. Lower values + * mean more similar documents (less divergence). Default 1.0. + */ + public static final double MAX_DIVERGENCE = 1.0; + /** * Dynamic minimum iterations. Used for dynamic topic modeling. Default 100.
*/ diff --git a/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java b/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java index f9e3ac4491b703698915a418ba03aeaef58fa3b8..cac031097f319cf9a58ff429ad997d2a830aaf85 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java @@ -57,6 +57,10 @@ public class ArticleFull extends FileModel<ObjectId> implements Serializable { @QueryIgnore(multi = true) private List<TopicRef> topics; + @Embedded + @QueryIgnore(multi = true) + private List<SimilarArticle> similarArticles; + @Embedded @QueryIgnore(multi = true) private ArticleStats stats; @@ -161,6 +165,14 @@ public class ArticleFull extends FileModel<ObjectId> implements Serializable { return topics.toArray(new String[topics.size()]); } + public List<SimilarArticle> getSimilarArticles() { + return similarArticles; + } + + public void setSimilarArticles(List<SimilarArticle> similarArticles) { + this.similarArticles = similarArticles; + } + public ArticleStats getStats() { return stats; } @@ -240,8 +252,9 @@ public class ArticleFull extends FileModel<ObjectId> implements Serializable { @Override public String toString() { return "ArticleFull [id=" + id + ", title=" + title + ", text=" + text + ", processedText=" - + Arrays.toString(processedText) + ", url=" + url + ", date=" + date + ", topics=" + topics + ", stats=" - + stats + ", created=" + created + ", modified=" + modified + ", meta=" + meta + "]"; + + Arrays.toString(processedText) + ", url=" + url + ", date=" + date + ", topics=" + topics + + ", similarArticles=" + similarArticles + ", stats=" + stats + ", created=" + created + ", modified=" + + modified + ", meta=" + meta + "]"; } } \ No newline at end of file diff --git a/vipra-util/src/main/java/de/vipra/util/model/SimilarArticle.java b/vipra-util/src/main/java/de/vipra/util/model/SimilarArticle.java new file mode 100644 index 
0000000000000000000000000000000000000000..86fc4560553ad88b37cad0389ce654728736d904 --- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/model/SimilarArticle.java @@ -0,0 +1,43 @@ +package de.vipra.util.model; + +import java.io.Serializable; + +import org.mongodb.morphia.annotations.Embedded; +import org.mongodb.morphia.annotations.Reference; + +@SuppressWarnings("serial") +@Embedded +public class SimilarArticle implements Comparable<SimilarArticle>, Serializable { + + @Reference(ignoreMissing = true) + private Article article; + + private double divergence; + + public Article getArticle() { + return article; + } + + public void setArticle(Article article) { + this.article = article; + } + + public double getDivergence() { + return divergence; + } + + public void setDivergence(double divergence) { + this.divergence = divergence; + } + + @Override + public int compareTo(SimilarArticle o) { + return Double.compare(divergence, o.getDivergence()); + } + + @Override + public String toString() { + return "SimilarArticle [article=" + article + ", divergence=" + divergence + "]"; + } + +}