diff --git a/build.sh b/build.sh index 63f31a0df005942ec80000b28b82125853735837..3c49ceb7dfedc8c8b081fda04336b1f76cf6fb82 100755 --- a/build.sh +++ b/build.sh @@ -57,7 +57,7 @@ echo "-------------------------------" >> $LOG cd ./vipra-ui ./build.sh >> $LOG 2>&1 cd .. -cp -r ./vipra-ui/public/* ./vipra-backend/src/main/webapp +cp -r ./vipra-ui/public ./vipra-backend/src/main/webapp/public if [ $? -ne 0 ]; then echo "error" exit 1 diff --git a/vipra-backend/.gitignore b/vipra-backend/.gitignore index 988b884ba04555e3e7f3b63b683693287e6a0d17..04c8f6dec6c4af1de6e79f5da9061d5abb293376 100644 --- a/vipra-backend/.gitignore +++ b/vipra-backend/.gitignore @@ -1,2 +1,3 @@ *.class /target/ +/src/main/webapp/public/ diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java index fbcd85761343b6f25ab1f93640a3d884326b3f61..9bd18963a1a7dbef12767cd988119f16bde5c307 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/InfoResource.java @@ -81,7 +81,6 @@ public class InfoResource { info.put("const.topicautoname", Constants.TOPIC_AUTO_NAMING_WORDS); info.put("const.ktopics", Constants.K_TOPICS); info.put("const.ktopicwords", Constants.K_TOPIC_WORDS); - info.put("const.likeprecision", Constants.LIKELINESS_PRECISION); info.put("const.minimumlike", Constants.MINIMUM_LIKELINESS); info.put("const.topicthresh", Constants.TOPIC_THRESHOLD); info.put("const.docminfreq", Constants.DOCUMENT_MIN_WORD_FREQ); diff --git a/vipra-backend/src/main/webapp/WEB-INF/web.xml b/vipra-backend/src/main/webapp/WEB-INF/web.xml index d28582b4638dc6242b05aa7ad3d72027a8865922..90ba036b82b59a102580eca5ef485dd6858e74c4 100644 --- a/vipra-backend/src/main/webapp/WEB-INF/web.xml +++ b/vipra-backend/src/main/webapp/WEB-INF/web.xml @@ -4,7 +4,7 @@ xsi:schemaLocation="http://xmlns.jcp.org/xml/ns/javaee http://xmlns.jcp.org/xml/ns/javaee/web-app_3_1.xsd" version="3.1"> <welcome-file-list> - <welcome-file>index.html</welcome-file> + <welcome-file>public/index.html</welcome-file> </welcome-file-list> <servlet> <servlet-name>jersey</servlet-name> diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java index fb8a117e8cc64af114d541694d45792ebfd5baaf..c2d35fb7e01ecb6bdb641a202ef89352b7eba266 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/lda/JGibbAnalyzer.java @@ -6,10 +6,13 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -47,6 +50,7 @@ public class JGibbAnalyzer extends Analyzer { private LDACmdOption options; private MongoService<ArticleFull, ObjectId> dbArticles; private MongoService<TopicFull, ObjectId> dbTopics; + private MongoService<Word, String> dbWords; private FilebaseIndex index; protected JGibbAnalyzer() { @@ -79,6 +83,7 @@ public class JGibbAnalyzer extends Analyzer { config = Config.getConfig(); dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); dbTopics = MongoService.getDatabaseService(config, TopicFull.class); + dbWords = MongoService.getDatabaseService(config, Word.class); index = new FilebaseIndex(new File(modelDir, "index")); } catch (Exception e) { throw new AnalyzerException(e); @@ -106,6 +111,7 @@ public class JGibbAnalyzer extends Analyzer { List<TopicFull> newTopics = new ArrayList<>(options.K); Map<Integer, Topic> newTopicsMap = new HashMap<>(options.K); + Set<Word> newWords = new HashSet<>(); TopicFull newTopic = null; List<TopicWord> topicWords = null; @@ -123,13 +129,29 @@ public class JGibbAnalyzer extends Analyzer { } String[] parts = line.trim().split("\\s+"); - TopicWord topicWord = new TopicWord(new Word(parts[0]), Double.parseDouble(parts[1])); - topicWords.add(topicWord); + double likeliness = Double.parseDouble(parts[1]); + + // check word likeliness + if (likeliness >= Constants.MINIMUM_LIKELINESS) { + Word newWord = new Word(parts[0]); + TopicWord topicWord = new TopicWord(newWord, likeliness); + topicWords.add(topicWord); + newWords.add(newWord); + } + } + + // sort topic words and generate topic name + for (TopicFull topic : newTopics) { + Collections.sort(topic.getWords(), Collections.reverseOrder()); + topic.setName(TopicFull.getNameFromWords(topic.getWords())); } + // recreate topics and words dbTopics.drop(); + dbWords.drop(); try { dbTopics.createMultiple(newTopics); + dbWords.createMultiple(newWords); } catch (DatabaseException e) { throw new AnalyzerException(e); } @@ -151,29 +173,37 @@ public class JGibbAnalyzer extends Analyzer { // extract topic ids and count them CountMap<String> countMap = new CountMap<>(); Matcher matcher = topicIndexPattern.matcher(line); + double totalCount = 0; while (matcher.find()) { countMap.count(matcher.group(1)); + totalCount++; } // create list of topics refs referencing topics with counted // occurrences List<TopicRef> newTopicRefs = new ArrayList<>(); for (Entry<String, Integer> entry : countMap.entrySet()) { - TopicRef ref = new TopicRef(); - ref.setCount(entry.getValue()); - ref.setTopic(newTopicsMap.get(Integer.parseInt(entry.getKey()))); - newTopicRefs.add(ref); + // check if topic above threshold + double topicShare = entry.getValue() / totalCount; + if (topicShare >= Constants.TOPIC_THRESHOLD) { + TopicRef ref = new TopicRef(); + ref.setCount(entry.getValue()); + ref.setTopic(newTopicsMap.get(Integer.parseInt(entry.getKey()))); + newTopicRefs.add(ref); + } } - // update article with topic references (partial update) - ArticleFull article = new ArticleFull(); - article.setId(index.get(articleIndex++)); - article.setTopics(newTopicRefs); - try { - // TODO: using field name here. Hard to refactor - dbArticles.updateSingle(article, "topics"); - } catch (DatabaseException e) { - log.error(e); + if (!newTopicRefs.isEmpty()) { + // update article with topic references (partial update) + ArticleFull article = new ArticleFull(); + article.setId(index.get(articleIndex++)); + article.setTopics(newTopicRefs); + try { + // TODO: using field name here. Hard to refactor + dbArticles.updateSingle(article, "topics"); + } catch (DatabaseException e) { + log.error(e); + } } } in.close(); diff --git a/vipra-ui/app/js/controllers.js b/vipra-ui/app/js/controllers.js index daa5d471b04c7e2f80328aa6d625323760ea07d1..71e13c62425ed5861ce6f06a8e2df279d0d136c5 100644 --- a/vipra-ui/app/js/controllers.js +++ b/vipra-ui/app/js/controllers.js @@ -302,13 +302,8 @@ if($scope.article.topics) { var topicShareSeries = [], topics = $scope.article.topics; - topicsCount = 0; - for(var i = 0; i < topics.length; i++) - topicsCount += topics[i].count; for(var i = 0; i < topics.length; i++) { - var share = Vipra.toPercent(topics[i].count / topicsCount); - topics[i].share = share; - topicShareSeries.push({name: topics[i].topic.name.ellipsize(20), y: share}); + topicShareSeries.push({name: topics[i].topic.name.ellipsize(20), y: topics[i].share}); } // highcharts data diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 32d35b3e159322c298d6de318442d47ec8d4e139..c5bd5390ece80923acb9022e4d024e0f1a2103e9 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -75,12 +75,6 @@ public class Constants { * library supports this parameter. */ public static final int K_TOPIC_WORDS = 50; - - /** - * Precision of likeliness numbers. Likeliness is calculated for words to - * belong to topics. - */ - public static final int LIKELINESS_PRECISION = 6; /** * Minimum likeliness of words. Words with lower likeliness are ignored diff --git a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java index 00837669c2ddee92726a0e63f0f557b418206754..3277e56e2c553bc67d4bfe73eb2cd39d089faf18 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TopicRef.java @@ -11,8 +11,11 @@ public class TopicRef implements Comparable<TopicRef>, Serializable { @Reference(ignoreMissing = true) private Topic topic; + private Integer count; + private Double share; + public Integer getCount() { return count; } @@ -29,6 +32,14 @@ public class TopicRef implements Comparable<TopicRef>, Serializable { this.topic = topic; } + public Double getShare() { + return share; + } + + public void setShare(Double share) { + this.share = share; + } + @Override public int compareTo(TopicRef arg0) { return count - arg0.getCount();