From 42db9bc3c8517550611e1baea23a74f399962015 Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Wed, 24 Feb 2016 20:29:49 +0100 Subject: [PATCH] updated build script, fixed jgibb modeling fixed topic name generation fixed topic word saving public subdirectory added to gitignore calculating topic share when modeling now --- build.sh | 2 +- vipra-backend/.gitignore | 1 + .../de/vipra/rest/resource/InfoResource.java | 1 - vipra-backend/src/main/webapp/WEB-INF/web.xml | 2 +- .../java/de/vipra/cmd/lda/JGibbAnalyzer.java | 60 ++++++++++++++----- vipra-ui/app/js/controllers.js | 7 +-- .../main/java/de/vipra/util/Constants.java | 6 -- .../java/de/vipra/util/model/TopicRef.java | 11 ++++ 8 files changed, 60 insertions(+), 30 deletions(-) diff --git a/build.sh b/build.sh index 63f31a0d..3c49ceb7 100755 --- a/build.sh +++ b/build.sh @@ -57,7 +57,7 @@ echo "-------------------------------" >> $LOG cd ./vipra-ui ./build.sh >> $LOG 2>&1 cd .. -cp -r ./vipra-ui/public/* ./vipra-backend/src/main/webapp +cp -r ./vipra-ui/public ./vipra-backend/src/main/webapp/ if [ $? 
-ne 0 ]; then echo "error" exit 1 diff --git a/vipra-backend/.gitignore b/vipra-backend/.gitignore index 988b884b..04c8f6de 100644 --- a/vipra-backend/.gitignore +++ b/vipra-backend/.gitignore @@ -1,2 +1,3 @@ *.class /target/ +/src/main/webapp/public/ diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java index fbcd8576..9bd18963 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java @@ -81,7 +81,6 @@ public class InfoResource { info.put("const.topicautoname", Constants.TOPIC_AUTO_NAMING_WORDS); info.put("const.ktopics", Constants.K_TOPICS); info.put("const.ktopicwords", Constants.K_TOPIC_WORDS); - info.put("const.likeprecision", Constants.LIKELINESS_PRECISION); info.put("const.minimumlike", Constants.MINIMUM_LIKELINESS); info.put("const.topicthresh", Constants.TOPIC_THRESHOLD); info.put("const.docminfreq", Constants.DOCUMENT_MIN_WORD_FREQ); diff --git a/vipra-backend/src/main/webapp/WEB-INF/web.xml b/vipra-backend/src/main/webapp/WEB-INF/web.xml index d28582b4..90ba036b 100644 --- a/vipra-backend/src/main/webapp/WEB-INF/web.xml +++ b/vipra-backend/src/main/webapp/WEB-INF/web.xml @@ -4,7 +4,7 @@ xsi:schemaLocation="http://xmlns.jcp.org/xml/ns/javaee http://xmlns.jcp.org/xml/ns/javaee/web-app_3_1.xsd" version="3.1"> <welcome-file-list> - <welcome-file>index.html</welcome-file> + <welcome-file>public/index.html</welcome-file> </welcome-file-list> <servlet> <servlet-name>jersey</servlet-name> diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java index fb8a117e..c2d35fb7 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java @@ -6,10 +6,13 @@ import java.io.FileInputStream; import java.io.IOException; import 
java.io.InputStreamReader; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -47,6 +50,7 @@ public class JGibbAnalyzer extends Analyzer { private LDACmdOption options; private MongoService<ArticleFull, ObjectId> dbArticles; private MongoService<TopicFull, ObjectId> dbTopics; + private MongoService<Word, String> dbWords; private FilebaseIndex index; protected JGibbAnalyzer() { @@ -79,6 +83,7 @@ public class JGibbAnalyzer extends Analyzer { config = Config.getConfig(); dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); dbTopics = MongoService.getDatabaseService(config, TopicFull.class); + dbWords = MongoService.getDatabaseService(config, Word.class); index = new FilebaseIndex(new File(modelDir, "index")); } catch (Exception e) { throw new AnalyzerException(e); @@ -106,6 +111,7 @@ public class JGibbAnalyzer extends Analyzer { List<TopicFull> newTopics = new ArrayList<>(options.K); Map<Integer, Topic> newTopicsMap = new HashMap<>(options.K); + Set<Word> newWords = new HashSet<>(); TopicFull newTopic = null; List<TopicWord> topicWords = null; @@ -123,13 +129,29 @@ public class JGibbAnalyzer extends Analyzer { } String[] parts = line.trim().split("\\s+"); - TopicWord topicWord = new TopicWord(new Word(parts[0]), Double.parseDouble(parts[1])); - topicWords.add(topicWord); + double likeliness = Double.parseDouble(parts[1]); + + // check word likeliness + if (likeliness >= Constants.MINIMUM_LIKELINESS) { + Word newWord = new Word(parts[0]); + TopicWord topicWord = new TopicWord(newWord, likeliness); + topicWords.add(topicWord); + newWords.add(newWord); + } + } + + // sort topic words and generate topic name + for (TopicFull topic : newTopics) { + Collections.sort(topic.getWords(), Collections.reverseOrder()); + 
topic.setName(TopicFull.getNameFromWords(topic.getWords())); } + // recreate topics and words dbTopics.drop(); + dbWords.drop(); try { dbTopics.createMultiple(newTopics); + dbWords.createMultiple(newWords); } catch (DatabaseException e) { throw new AnalyzerException(e); } @@ -151,29 +173,40 @@ public class JGibbAnalyzer extends Analyzer { // extract topic ids and count them CountMap<String> countMap = new CountMap<>(); Matcher matcher = topicIndexPattern.matcher(line); + double totalCount = 0; while (matcher.find()) { countMap.count(matcher.group(1)); + totalCount++; } // create list of topics refs referencing topics with counted // occurrences List<TopicRef> newTopicRefs = new ArrayList<>(); for (Entry<String, Integer> entry : countMap.entrySet()) { - TopicRef ref = new TopicRef(); - ref.setCount(entry.getValue()); - ref.setTopic(newTopicsMap.get(Integer.parseInt(entry.getKey()))); - newTopicRefs.add(ref); + // check if topic above threshold + double topicShare = entry.getValue() / totalCount; + if (topicShare >= Constants.TOPIC_THRESHOLD) { + TopicRef ref = new TopicRef(); + ref.setCount(entry.getValue()); + ref.setShare(topicShare); + ref.setTopic(newTopicsMap.get(Integer.parseInt(entry.getKey()))); + newTopicRefs.add(ref); + } } - // update article with topic references (partial update) - ArticleFull article = new ArticleFull(); - article.setId(index.get(articleIndex++)); - article.setTopics(newTopicRefs); - try { - // TODO: using field name here. Hard to refactor - dbArticles.updateSingle(article, "topics"); - } catch (DatabaseException e) { - log.error(e); + if (!newTopicRefs.isEmpty()) { + // update article with topic references (partial update) + ArticleFull article = new ArticleFull(); + article.setId(index.get(articleIndex)); + article.setTopics(newTopicRefs); + try { + // TODO: using field name here. 
Hard to refactor + dbArticles.updateSingle(article, "topics"); + } catch (DatabaseException e) { + log.error(e); + } } + // advance for every input line, even when nothing was saved, so later lines map to the right article + articleIndex++; } in.close(); diff --git a/vipra-ui/app/js/controllers.js b/vipra-ui/app/js/controllers.js index daa5d471..71e13c62 100644 --- a/vipra-ui/app/js/controllers.js +++ b/vipra-ui/app/js/controllers.js @@ -302,13 +302,8 @@ if($scope.article.topics) { var topicShareSeries = [], topics = $scope.article.topics; - topicsCount = 0; - for(var i = 0; i < topics.length; i++) - topicsCount += topics[i].count; for(var i = 0; i < topics.length; i++) { - var share = Vipra.toPercent(topics[i].count / topicsCount); - topics[i].share = share; - topicShareSeries.push({name: topics[i].topic.name.ellipsize(20), y: share}); + topicShareSeries.push({name: topics[i].topic.name.ellipsize(20), y: topics[i].share}); } // highcharts data diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 32d35b3e..c5bd5390 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -75,12 +75,6 @@ public class Constants { * library supports this parameter. */ public static final int K_TOPIC_WORDS = 50; - - /** - * Precision of likeliness numbers. Likeliness is calculated for words to - * belong to topics. - */ - public static final int LIKELINESS_PRECISION = 6; /** * Minimum likeliness of words. 
Words with lower likeliness are ignored diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java index 00837669..3277e56e 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java @@ -11,8 +11,11 @@ public class TopicRef implements Comparable<TopicRef>, Serializable { @Reference(ignoreMissing = true) private Topic topic; + private Integer count; + private Double share; + public Integer getCount() { return count; } @@ -29,6 +32,14 @@ public class TopicRef implements Comparable<TopicRef>, Serializable { this.topic = topic; } + public Double getShare() { + return share; + } + + public void setShare(Double share) { + this.share = share; + } + @Override public int compareTo(TopicRef arg0) { return count - arg0.getCount(); -- GitLab