Skip to content
Snippets Groups Projects
Commit d3685ee0 authored by Eike Cochu's avatar Eike Cochu
Browse files

added divergence calculation

added jsd and kld divergence formulas
added SimilarArticle embedded reference for similar articles
added constants to limit divergence and article count
updated about page with new constants
parent 31f6129b
Branches
No related tags found
No related merge requests found
......@@ -77,6 +77,8 @@ public class InfoResource {
info.put("const.decaylambda", Constants.RISING_DECAY_LAMBDA);
info.put("const.minrelprob", Constants.MINIMUM_RELATIVE_PROB);
info.put("const.minshare", Constants.MINIMUM_SHARE);
info.put("const.maxsimdocs", Constants.MAX_SIMILAR_DOCUMENTS);
info.put("const.maxdiv", Constants.MAX_DIVERGENCE);
info.put("const.dynminiter", Constants.DYNAMIC_MIN_ITER);
info.put("const.dynmaxiter", Constants.DYNAMIC_MAX_ITER);
info.put("const.statiter", Constants.STATIC_ITER);
......
......@@ -24,12 +24,15 @@ import de.vipra.cmd.file.FilebaseIndex;
import de.vipra.util.ArrayUtils;
import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.MongoUtils;
import de.vipra.util.StringUtils;
import de.vipra.util.ex.ConfigException;
import de.vipra.util.ex.DatabaseException;
import de.vipra.util.model.Article;
import de.vipra.util.model.ArticleFull;
import de.vipra.util.model.Sequence;
import de.vipra.util.model.SequenceFull;
import de.vipra.util.model.SimilarArticle;
import de.vipra.util.model.Topic;
import de.vipra.util.model.TopicFull;
import de.vipra.util.model.TopicRef;
......@@ -337,13 +340,16 @@ public class DTMAnalyzer extends Analyzer {
throw new AnalyzerException(e);
}
// create topic references
// create topic references and store document similarities
int idxArticle = 0;
int idxArticle = -1;
for (final String articleId : idindex) {
final double[] topicDistribution = topicDistributions[idxArticle++];
idxArticle++;
final double[] topicDistribution = topicDistributions[idxArticle];
// create topic references
double reducedShare = 0;
final List<TopicRef> newTopicRefs = new ArrayList<>(Constants.K_TOPICS);
for (int idxTopic = 0; idxTopic < Constants.K_TOPICS; idxTopic++) {
......@@ -357,7 +363,32 @@ public class DTMAnalyzer extends Analyzer {
}
}
// calculate divergences
List<SimilarArticle> similarArticles = new ArrayList<>(articlesCount - 1);
for (int idxArticle2 = 0; idxArticle2 < articlesCount; idxArticle2++) {
if (idxArticle == idxArticle2)
continue;
double divergence = ArrayUtils.jsDivergence(topicDistributions[idxArticle],
topicDistributions[idxArticle2]);
if (divergence > Constants.MAX_DIVERGENCE)
continue;
SimilarArticle similarArticle = new SimilarArticle();
similarArticle.setArticle(new Article(MongoUtils.objectId(idindex.get(idxArticle2))));
similarArticle.setDivergence(divergence);
similarArticles.add(similarArticle);
}
Collections.sort(similarArticles);
if (similarArticles.size() > Constants.MAX_SIMILAR_DOCUMENTS)
similarArticles.subList(Constants.MAX_SIMILAR_DOCUMENTS, similarArticles.size()).clear();
// update article
if (!newTopicRefs.isEmpty()) {
// renormalize share
for (final TopicRef newTopicRef : newTopicRefs)
......@@ -369,10 +400,11 @@ public class DTMAnalyzer extends Analyzer {
final ArticleFull article = new ArticleFull();
article.setId(articleId);
article.setTopics(newTopicRefs);
article.setSimilarArticles(similarArticles);
try {
// TODO: using field name here. Hard to refactor
dbArticles.updateSingle(article, "topics");
dbArticles.updateSingle(article, "topics", "similarArticles");
} catch (final DatabaseException e) {
log.error(e);
}
......
......@@ -175,6 +175,24 @@
The minimum share of a topic to be accepted for an article. Topic shares are renormalized after rejecting topics below this threshold.
</td>
</tr>
<tr>
<th>Maximum similar documents</th>
<td ng-bind-template="{{::info.const.maxsimdocs}}"></td>
</tr>
<tr class="well">
<td colspan="2">
Maximum number of similar documents for each document.
</td>
</tr>
<tr>
<th>Maximum divergence</th>
<td ng-bind-template="{{::info.const.maxdiv}}"></td>
</tr>
<tr class="well">
<td colspan="2">
Maximum divergence between a document and similar documents. Lower values mean more similar documents (less divergence).
</td>
</tr>
<tr>
<th>Dynamic minimum iterations</th>
<td ng-bind-template="{{::info.const.dynminiter}}"></td>
......
......@@ -16,4 +16,43 @@ public class ArrayUtils {
return maximum;
}
/**
 * Jensen Shannon Divergence to measure similarity between two probability
 * distributions. It is computed as the mean of the Kullback Leibler
 * divergences of both distributions from their element-wise average.
 *
 * @param p1
 *            left distribution
 * @param p2
 *            right distribution, must have the same length as p1
 * @return divergence
 * @throws IllegalArgumentException
 *             if both distributions do not have the same length
 */
public static double jsDivergence(double[] p1, double[] p2) {
    // explicit check instead of an assert: asserts are disabled unless the
    // JVM runs with -ea, which would let mismatched arrays slip through
    if (p1.length != p2.length) {
        throw new IllegalArgumentException(
                "Distributions must have the same length: " + p1.length + " != " + p2.length);
    }
    // element-wise average of both distributions
    final double[] avg = new double[p1.length];
    for (int i = 0; i < p1.length; i++) {
        avg[i] = (p1[i] + p2[i]) / 2.0;
    }
    return (klDivergence(p1, avg) + klDivergence(p2, avg)) / 2.0;
}
/**
 * Kullback Leibler Divergence to measure similarity between two probability
 * distributions. The result is expressed with base 2 logarithms.
 *
 * Positions where either distribution is exactly zero are skipped, so the
 * result stays finite even where the divergence would otherwise be
 * unbounded (p1[i] &gt; 0 and p2[i] == 0).
 *
 * @param p1
 *            left distribution
 * @param p2
 *            right distribution, must have the same length as p1
 * @return divergence
 * @throws IllegalArgumentException
 *             if both distributions do not have the same length
 */
public static double klDivergence(double[] p1, double[] p2) {
    // explicit check instead of an assert: asserts are disabled unless the
    // JVM runs with -ea, which would let mismatched arrays slip through
    if (p1.length != p2.length) {
        throw new IllegalArgumentException(
                "Distributions must have the same length: " + p1.length + " != " + p2.length);
    }
    double result = 0;
    for (int i = 0; i < p1.length; i++) {
        // zero entries contribute nothing and would break the logarithm
        if (p1[i] == 0 || p2[i] == 0) {
            continue;
        }
        result += p1[i] * Math.log(p1[i] / p2[i]);
    }
    // convert from natural logarithm to base 2
    return result / Math.log(2);
}
}
......@@ -97,6 +97,17 @@ public class Constants {
*/
public static final double MINIMUM_SHARE = 0.01;
/**
 * Maximum number of similar documents for each document. When more
 * candidates are found, only those with the lowest divergence are kept.
 * Default 20.
 */
public static final int MAX_SIMILAR_DOCUMENTS = 20;
/**
 * Maximum divergence between a document and similar documents. Lower values
 * mean more similar documents (less divergence). Documents whose divergence
 * exceeds this value are not stored as similar documents. Default 1.0.
 *
 * NOTE(review): the divergence is computed with base 2 logarithms, for which
 * the Jensen Shannon divergence of two probability distributions should not
 * exceed 1.0, so the default presumably excludes nothing - confirm this is
 * intended.
 */
public static final double MAX_DIVERGENCE = 1.0;
/**
* Dynamic minimum iterations. Used for dynamic topic modeling. Default 100.
*/
......
......@@ -57,6 +57,10 @@ public class ArticleFull extends FileModel<ObjectId> implements Serializable {
@QueryIgnore(multi = true)
private List<TopicRef> topics;
@Embedded
@QueryIgnore(multi = true)
private List<SimilarArticle> similarArticles;
@Embedded
@QueryIgnore(multi = true)
private ArticleStats stats;
......@@ -161,6 +165,14 @@ public class ArticleFull extends FileModel<ObjectId> implements Serializable {
return topics.toArray(new String[topics.size()]);
}
/**
 * Returns the similar articles embedded in this article.
 *
 * @return the stored list of similar articles, as set on this instance
 */
public List<SimilarArticle> getSimilarArticles() {
    return this.similarArticles;
}
/**
 * Replaces the similar articles embedded in this article. The given list
 * is stored as is, without copying.
 *
 * @param similarArticles
 *            the new list of similar articles
 */
public void setSimilarArticles(final List<SimilarArticle> similarArticles) {
    this.similarArticles = similarArticles;
}
/**
 * Returns the statistics embedded in this article.
 *
 * @return the stored article statistics
 */
public ArticleStats getStats() {
    return this.stats;
}
......@@ -240,8 +252,9 @@ public class ArticleFull extends FileModel<ObjectId> implements Serializable {
@Override
public String toString() {
return "ArticleFull [id=" + id + ", title=" + title + ", text=" + text + ", processedText="
+ Arrays.toString(processedText) + ", url=" + url + ", date=" + date + ", topics=" + topics + ", stats="
+ stats + ", created=" + created + ", modified=" + modified + ", meta=" + meta + "]";
+ Arrays.toString(processedText) + ", url=" + url + ", date=" + date + ", topics=" + topics
+ ", similarArticles=" + similarArticles + ", stats=" + stats + ", created=" + created + ", modified="
+ modified + ", meta=" + meta + "]";
}
}
\ No newline at end of file
package de.vipra.util.model;
import java.io.Serializable;
import org.mongodb.morphia.annotations.Embedded;
import org.mongodb.morphia.annotations.Reference;
/**
 * Embedded entry that links to another article and records the divergence
 * to it. Instances order naturally by ascending divergence, so sorting a
 * list puts the entries with the lowest divergence first.
 */
@SuppressWarnings("serial")
@Embedded
public class SimilarArticle implements Comparable<SimilarArticle>, Serializable {

    /** The linked article, stored as a reference. */
    @Reference(ignoreMissing = true)
    private Article article;

    /** Divergence to the linked article. */
    private double divergence;

    /**
     * @return the linked article
     */
    public Article getArticle() {
        return this.article;
    }

    /**
     * @param article
     *            the article to link to
     */
    public void setArticle(final Article article) {
        this.article = article;
    }

    /**
     * @return the divergence to the linked article
     */
    public double getDivergence() {
        return this.divergence;
    }

    /**
     * @param divergence
     *            the divergence to the linked article
     */
    public void setDivergence(final double divergence) {
        this.divergence = divergence;
    }

    /**
     * Orders entries by ascending divergence.
     */
    @Override
    public int compareTo(final SimilarArticle o) {
        final double otherDivergence = o.getDivergence();
        return Double.compare(this.divergence, otherDivergence);
    }

    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("SimilarArticle [article=");
        text.append(this.article);
        text.append(", divergence=");
        text.append(this.divergence);
        text.append("]");
        return text.toString();
    }

}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment