Skip to content
Snippets Groups Projects
Commit fd5bc11d authored by Eike Cochu's avatar Eike Cochu
Browse files

removed topic similarity, too complex, meh

parent 08e1d67b
Branches
No related tags found
No related merge requests found
Showing
with 92 additions and 141 deletions
......@@ -2,7 +2,6 @@ package de.vipra.cmd.lda;
import de.vipra.cmd.ex.AnalyzerException;
import de.vipra.util.Config;
import de.vipra.util.WordMap;
public abstract class Analyzer {
......@@ -16,11 +15,11 @@ public abstract class Analyzer {
return name;
}
public abstract void init(Config config, WordMap wordMap) throws AnalyzerException;
public abstract void init(Config config) throws AnalyzerException;
public abstract void analyze() throws AnalyzerException;
public static Analyzer getAnalyzer(Config config, WordMap wordMap) throws AnalyzerException {
public static Analyzer getAnalyzer(Config config) throws AnalyzerException {
Analyzer analyzer = null;
switch (config.analyzer) {
case DTM:
......@@ -32,7 +31,7 @@ public abstract class Analyzer {
default:
return null;
}
analyzer.init(config, wordMap);
analyzer.init(config);
return analyzer;
}
......
......@@ -12,7 +12,6 @@ import de.vipra.cmd.ex.AnalyzerException;
import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.StringUtils;
import de.vipra.util.WordMap;
import de.vipra.util.ex.ConfigException;
public class DTMAnalyzer extends Analyzer {
......@@ -33,7 +32,7 @@ public class DTMAnalyzer extends Analyzer {
}
@Override
public void init(Config config, WordMap wordMap) throws AnalyzerException {
public void init(Config config) throws AnalyzerException {
try {
File dataDir = config.getDataDirectory();
this.modelDir = new File(dataDir, NAME);
......
......@@ -26,7 +26,6 @@ import de.vipra.util.Config;
import de.vipra.util.Constants;
import de.vipra.util.CountMap;
import de.vipra.util.FileUtils;
import de.vipra.util.WordMap;
import de.vipra.util.ex.ConfigException;
import de.vipra.util.ex.DatabaseException;
import de.vipra.util.model.ArticleFull;
......@@ -58,7 +57,7 @@ public class JGibbAnalyzer extends Analyzer {
}
@Override
public void init(Config config, WordMap wordMap) throws AnalyzerException {
public void init(Config config) throws AnalyzerException {
options = new LDACmdOption();
try {
......@@ -109,8 +108,11 @@ public class JGibbAnalyzer extends Analyzer {
throw new AnalyzerException(e);
}
// the list of new topics
List<TopicFull> newTopics = new ArrayList<>(options.K);
// a map of topic index -> topic. resolves topic ids from tassign file
Map<Integer, Topic> newTopicsMap = new HashMap<>(options.K);
// set of new words
Set<Word> newWords = new HashSet<>();
TopicFull newTopic = null;
......@@ -182,14 +184,15 @@ public class JGibbAnalyzer extends Analyzer {
// create list of topics refs referencing topics with counted
// occurrences, sum accepted topic word count
long reducedCount = 0;
List<TopicRef> newTopicRefs = new ArrayList<>();
List<TopicRef> newTopicRefs = new ArrayList<>(countMap.size());
for (Entry<String, Integer> entry : countMap.entrySet()) {
// check if topic above threshold
if ((entry.getValue() / totalCount) >= Constants.TOPIC_THRESHOLD) {
reducedCount += entry.getValue();
Topic topic = newTopicsMap.get(Integer.parseInt(entry.getKey()));
TopicRef ref = new TopicRef();
ref.setCount(entry.getValue());
ref.setTopic(newTopicsMap.get(Integer.parseInt(entry.getKey())));
ref.setTopic(topic);
newTopicRefs.add(ref);
}
}
......@@ -203,6 +206,7 @@ public class JGibbAnalyzer extends Analyzer {
ArticleFull article = new ArticleFull();
article.setId(index.get(articleIndex++));
article.setTopics(newTopicRefs);
try {
// TODO: using field name here. Hard to refactor
dbArticles.updateSingle(article, "topics");
......
......@@ -7,25 +7,18 @@ import de.vipra.cmd.lda.Analyzer;
import de.vipra.util.Config;
import de.vipra.util.StringUtils;
import de.vipra.util.Timer;
import de.vipra.util.WordMap;
import de.vipra.util.model.Word;
import de.vipra.util.service.MongoService;
public class ModelingCommand implements Command {
public static final Logger log = LogManager.getLogger(ModelingCommand.class);
private Config config;
private MongoService<Word, String> dbWords;
private WordMap wordMap;
private Analyzer analyzer;
@Override
public void run() throws Exception {
config = Config.getConfig();
dbWords = MongoService.getDatabaseService(config, Word.class);
wordMap = new WordMap(dbWords);
analyzer = Analyzer.getAnalyzer(config, wordMap);
analyzer = Analyzer.getAnalyzer(config);
log.info("using analyzer: " + analyzer.getName());
......
......@@ -32,12 +32,6 @@
<td>
<a class="btn btn-default" ui-sref="topics.show.articles({id:topic.id})">Articles</a>
</td>
<td>
<bs-dropdown label="Similar Topics">
<li><a ui-sref="topics.show.similar({id:topic.id, type:'by-words'})">By word share</a></li>
<li><a ui-sref="topics.show.similar({id:topic.id, type:'by-articles'})">By article share</a></li>
</bs-dropdown>
</td>
</tr>
</table>
</div>
......@@ -52,10 +46,6 @@
<th>ID</th>
<td ng-bind="::topic.id"></td>
</tr>
<tr>
<th>Index</th>
<td ng-bind="::topic.index"></td>
</tr>
<tr>
<th>Created</th>
<td ng-bind="::topicCreated"></td>
......@@ -87,7 +77,7 @@
<tbody>
<tr ng-repeat="word in topic.words | orderBy:wordSort:wordSortRev">
<td><a ui-sref="words.show({id:word.id})" ng-bind="word.id"></a></td>
<td ng-bind="word.likeliness"></td>
<td ng-bind-template="{{word.likeliness.toFixed(6)}}"></td>
</tr>
</tbody>
</table>
......
<div ng-cloak ng-hide="$state.current.name !== 'topics.show.similar'">
</div>
<div ng-cloak ui-view></div>
\ No newline at end of file
......@@ -96,16 +96,7 @@
templateUrl: 'html/topics/articles.html',
controller: 'TopicsArticlesController',
ncyBreadcrumb: {
label: 'Topic Articles'
}
});
$stateProvider.state('topics.show.similar', {
url: '/similar/:type',
templateUrl: 'html/topics/similar.html',
controller: 'TopicsSimilarController',
ncyBreadcrumb: {
label: 'Similar Topics (by {{typeLabel}})'
label: 'Articles'
}
});
......
......@@ -29,4 +29,8 @@ public class CountMap<T> {
return map.entrySet();
}
/**
 * Reports how many entries the underlying map currently holds.
 *
 * @return the value of {@code map.size()}
 */
public int size() {
    final int entryCount = map.size();
    return entryCount;
}
}
package de.vipra.util;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
/**
 * A map that associates each key with a set of distinct values.
 *
 * <p>
 * Backed by a {@link HashMap} whose values are {@link HashSet}s. A key's set
 * is created the first time something is put under that key.
 *
 * @param <K>
 *            key type
 * @param <V>
 *            value type
 */
public class MultiMap<K, V> {

    private final Map<K, Set<V>> map;

    /**
     * Creates an empty multi map.
     */
    public MultiMap() {
        this.map = new HashMap<>();
    }

    /**
     * Adds a single value to the set stored under the given key.
     *
     * @param key
     *            key to store the value under
     * @param value
     *            value to add; adding an already present value has no effect
     */
    public void put(K key, V value) {
        valuesFor(key).add(value);
    }

    /**
     * Adds all given values to the set stored under the given key. An empty
     * collection still registers the key with an empty set.
     *
     * @param key
     *            key to store the values under
     * @param values
     *            values to add; may hold any subtype of {@code V}
     * @throws NullPointerException
     *             if {@code values} is null; the map is left unchanged
     */
    public void put(K key, Collection<? extends V> values) {
        // reject null before touching the map so a failed call cannot
        // leave an empty set registered for the key
        if (values == null)
            throw new NullPointerException("values");
        valuesFor(key).addAll(values);
    }

    /**
     * Returns the set stored under the given key.
     *
     * @param key
     *            key to look up
     * @return the internal (live, mutable) set for the key, or {@code null}
     *         if nothing was ever put under that key
     */
    public Set<V> get(K key) {
        return map.get(key);
    }

    /**
     * Returns the entry set of the backing map.
     *
     * @return key to value-set entries
     */
    public Set<Entry<K, Set<V>>> entrySet() {
        return map.entrySet();
    }

    /**
     * Returns the number of keys in this map, not the number of values.
     *
     * @return number of keys
     */
    public int size() {
        return map.size();
    }

    /**
     * Returns the set for the key, creating and registering it on first use.
     */
    private Set<V> valuesFor(K key) {
        Set<V> set = map.get(key);
        if (set == null) {
            set = new HashSet<>();
            map.put(key, set);
        }
        return set;
    }

}
package de.vipra.util;
import de.vipra.util.model.Topic;
/**
 * Simple value holder that pairs a {@link Topic} with a share count.
 *
 * NOTE(review): the share count is presumably the number of items (words or
 * articles) the topic has in common with another topic - confirm against the
 * code that fills this object.
 */
public class TopicSimilarity {

    private int shareCount;
    private Topic topic;

    /**
     * @return the stored share count
     */
    public int getShareCount() {
        return this.shareCount;
    }

    /**
     * @param shareCount
     *            the share count to store
     */
    public void setShareCount(final int shareCount) {
        this.shareCount = shareCount;
    }

    /**
     * @return the stored topic, {@code null} until one has been set
     */
    public Topic getTopic() {
        return this.topic;
    }

    /**
     * @param topic
     *            the topic to store
     */
    public void setTopic(final Topic topic) {
        this.topic = topic;
    }

}
package de.vipra.util;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.vipra.util.ex.DatabaseException;
import de.vipra.util.model.Word;
import de.vipra.util.service.MongoService;
/**
 * In-memory lookup of {@link Word}s keyed by their lower-cased id.
 *
 * <p>
 * The map is filled from the word database on construction. Words requested
 * through {@link #get(Object)} that are not yet known are added to the map
 * and, depending on {@link #isCreateNow()}, written to the database
 * immediately or later via {@link #create()}.
 */
public class WordMap {

    public static final Logger log = LoggerFactory.getLogger(WordMap.class);

    private final MongoService<Word, String> dbWords;
    private final Map<String, Word> wordMap;
    private final Set<Word> newWords;
    private boolean createNow = false;

    /**
     * Loads all words from the given service into the map.
     *
     * @param dbWords
     *            word database service, also used for later word creation
     */
    public WordMap(MongoService<Word, String> dbWords) {
        this.dbWords = dbWords;
        this.wordMap = new HashMap<>();
        this.newWords = new HashSet<>();
        for (Word word : dbWords.getAll())
            wordMap.put(normalize(word.getId()), word);
    }

    /**
     * Returns the word for the given object, creating it if it is unknown.
     *
     * @param w
     *            object whose {@code toString()} value, lower-cased, is the
     *            word id; must not be null
     * @return the mapped word, never null
     * @throws RuntimeException
     *             if create-now is enabled and the database insert fails
     */
    public Word get(Object w) {
        String strWord = normalize(w.toString());
        Word word = wordMap.get(strWord);
        if (word == null) {
            word = new Word(strWord);
            createWord(word);
            wordMap.put(strWord, word);
        }
        return word;
    }

    /**
     * Makes sure a word for the given object is in the map. Same as
     * {@link #get(Object)} with the result discarded.
     *
     * @param w
     *            object to register as a word
     */
    public void add(Object w) {
        get(w);
    }

    /**
     * Writes the word to the database and records it as new, but only when
     * create-now is enabled. Otherwise the word is returned untouched.
     * A database failure is logged and rethrown unchecked.
     */
    private Word createWord(Word word) {
        if (createNow) {
            try {
                dbWords.createSingle(word);
                newWords.add(word);
            } catch (DatabaseException e) {
                log.error("could not create word in database", e);
                throw new RuntimeException(e);
            }
        }
        return word;
    }

    /**
     * Writes every mapped word that reports {@code isCreated() == false} to
     * the database in one call and records those words as new.
     *
     * @throws DatabaseException
     *             if the bulk insert fails; no word is recorded as new then
     */
    public void create() throws DatabaseException {
        // NOTE(review): relies on Word.isCreated() to skip words that are
        // already persisted - confirm it is updated by the create calls
        List<Word> pending = new ArrayList<>();
        for (Word word : wordMap.values())
            if (!word.isCreated())
                pending.add(word);
        dbWords.createMultiple(pending);
        newWords.addAll(pending);
    }

    /**
     * @return true if unknown words are written to the database as soon as
     *         they are requested
     */
    public boolean isCreateNow() {
        return createNow;
    }

    /**
     * @param createNow
     *            true to write unknown words to the database immediately,
     *            false to defer until {@link #create()}
     */
    public void setCreateNow(boolean createNow) {
        this.createNow = createNow;
    }

    /**
     * @return the internal (live, mutable) set of words this instance wrote
     *         to the database
     */
    public Set<Word> getNewWords() {
        return newWords;
    }

    /**
     * Builds the map key for a word id. Uses {@link Locale#ROOT} so keys do
     * not depend on the JVM default locale.
     */
    private static String normalize(String id) {
        return id.toLowerCase(Locale.ROOT);
    }

}
......@@ -10,7 +10,6 @@ import org.mongodb.morphia.annotations.Embedded;
import org.mongodb.morphia.annotations.Entity;
import org.mongodb.morphia.annotations.Id;
import org.mongodb.morphia.annotations.PrePersist;
import org.mongodb.morphia.annotations.Transient;
import de.vipra.util.Constants;
import de.vipra.util.MongoUtils;
......@@ -32,9 +31,6 @@ public class TopicFull implements Model<ObjectId>, Serializable {
@QueryIgnore(multi = true)
private List<TopicWord> words;
@Transient
private List<ArticleFull> articles;
private Date created;
private Date modified;
......@@ -77,14 +73,6 @@ public class TopicFull implements Model<ObjectId>, Serializable {
this.words = topicWords;
}
public List<ArticleFull> getArticles() {
return articles;
}
public void setArticles(List<ArticleFull> articles) {
this.articles = articles;
}
public Date getCreated() {
return created;
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment