From 610c1d3e858e22117c7a91983f64f732c607f968 Mon Sep 17 00:00:00 2001 From: Eike Cochu <eike@cochu.com> Date: Mon, 4 Apr 2016 18:35:32 +0200 Subject: [PATCH] words to lowercase and trim --- vipra-cmd/runcfg/CMD.launch | 18 ++++++++++++++++++ vipra-cmd/src/main/java/de/vipra/cmd/Main.java | 2 ++ .../de/vipra/cmd/file/FilebaseWordIndex.java | 5 +++-- .../java/de/vipra/cmd/text/ProcessedText.java | 2 +- .../de/vipra/cmd/text/SpotlightResponse.java | 11 +++++++++++ vipra-ui/app/html/articles/show.html | 6 ++++-- vipra-ui/app/js/controllers.js | 2 +- .../java/de/vipra/util/model/TextEntity.java | 12 +++++++++++- 8 files changed, 51 insertions(+), 7 deletions(-) create mode 100644 vipra-cmd/runcfg/CMD.launch diff --git a/vipra-cmd/runcfg/CMD.launch b/vipra-cmd/runcfg/CMD.launch new file mode 100644 index 00000000..d65397a0 --- /dev/null +++ b/vipra-cmd/runcfg/CMD.launch @@ -0,0 +1,18 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<launchConfiguration type="org.eclipse.jdt.launching.localJavaApplication"> +<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS"> +<listEntry value="/vipra-cmd/src/main/java/de/vipra/cmd/Main.java"/> +</listAttribute> +<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES"> +<listEntry value="1"/> +</listAttribute> +<listAttribute key="org.eclipse.debug.ui.favoriteGroups"> +<listEntry value="org.eclipse.debug.ui.launchGroup.run"/> +</listAttribute> +<stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> +<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-dcC yearly -AI /home/eike/repos/master/ma-impl/vm/data/data.json"/> +<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> +<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/> +</launchConfiguration> diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java index 096f7612..145b7965 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java @@ -93,6 +93,8 @@ public class Main { ConsoleUtils.error(cause.getMessage()); else ConsoleUtils.error(e.getMessage()); + if (opts.isDebug() && !opts.isSilent()) + e.printStackTrace(System.out); } } } else { diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java index c939f635..42d0f02a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java @@ -68,7 +68,7 @@ public class FilebaseWordIndex implements Iterable<String> { public void countWords(final List<ArticleWord> articleWords) { for (final ArticleWord articleWord : articleWords) - wordDocumentCount.count(articleWord.getId()); + wordDocumentCount.count(articleWord.getId().toLowerCase()); } public String transform(final String[] words) { @@ -86,7 +86,8 @@ public class FilebaseWordIndex implements Iterable<String> { return sb.toString(); } - public int index(final String word) { + public int index(String word) { + word = word.toLowerCase().trim(); Integer index = wordIndex.get(word); if (index == null) { index = nextIndex++; diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java index 57f2b7c3..3564c6c2 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java @@ -18,7 +18,7 @@ public class ProcessedText { private final List<ArticleWord> articleWords; public ProcessedText(final String text, final long wordCount) { - words = text.split("\\s+"); + words = text.toLowerCase().trim().split("\\s+"); originalWordCount = wordCount; reducedWordCount = words.length; reductionRatio = 1 - ((double) reducedWordCount / wordCount); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java index 30832f44..d2cd9b82 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java @@ -9,6 +9,7 @@ import java.util.Set; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonProperty; +import de.vipra.util.CountMap; import de.vipra.util.model.TextEntity; @JsonIgnoreProperties(ignoreUnknown = true) @@ -26,13 +27,23 @@ public class SpotlightResponse { } public List<TextEntity> getEntities() { + final CountMap<String> textEntitiesCount = new CountMap<>(resources.size()); final Set<TextEntity> textEntities = new HashSet<>(resources.size()); + // get entities and count for (SpotlightResource resource : resources) { textEntities.add(new TextEntity(resource.getSurfaceForm(), resource.getUri())); + textEntitiesCount.count(resource.getSurfaceForm()); // TODO add types to entities? } + + // insert count + for (TextEntity textEntity : textEntities) + textEntity.setCount(textEntitiesCount.get(textEntity.getEntity())); + + // to list and sort final List<TextEntity> textEntitiesList = new ArrayList<>(textEntities); Collections.sort(textEntitiesList); + return textEntitiesList; } diff --git a/vipra-ui/app/html/articles/show.html b/vipra-ui/app/html/articles/show.html index 573b548c..2af0c21f 100644 --- a/vipra-ui/app/html/articles/show.html +++ b/vipra-ui/app/html/articles/show.html @@ -110,6 +110,7 @@ <thead> <tr> <th ng-model="articlesShowModels.entitiesSort" sort-by="entity">Entity</th> + <th ng-model="articlesShowModels.entitiesSort" sort-by="count">Count</th> </tr> </thead> <tbody> @@ -117,6 +118,7 @@ <td> <entity-link entity="::entity" /> </td> + <td ng-bind="::entity.count"></td> </tr> </tbody> </table> @@ -135,12 +137,12 @@ <div class="panel panel-default"> <div class="panel-heading"> Found <ng-pluralize count="allWords.length||0" when="{0:'no words',1:'1 word',other:'{} unique words'}"></ng-pluralize> for this article.<br> - Article has <ng-pluralize count="article.stats.wordCount||0" when="{0:'no words',1:'1 word',other:'{} words'}"></ng-pluralize>, <span ng-bind-template="{{::article.stats.processedWordCount}} after cleaning ({{::Vipra.toPercent(article.stats.reductionRatio)}}% reduction)"></span>. + Article has <ng-pluralize count="article.stats.wordCount||0" when="{0:'no words',1:'1 word',other:'{} words'}"></ng-pluralize>, <span ng-bind-template="{{::article.stats.processedWordCount}} after cleaning ({{::Vipra.toPercent(article.stats.reductionRatio)}}% reduction)" ng-show="article.stats.wordCount>0"></span>. </div> <table class="table table-bordered table-condensed table-fixed"> <thead> <tr> - <th ng-model="articlesShowModels.wordsSort" sort-by="word">Word</th> + <th ng-model="articlesShowModels.wordsSort" sort-by="id">Word</th> <th ng-model="articlesShowModels.wordsSort" sort-by="count">Count</th> <th>Share</th> <th>Reduced share</th> diff --git a/vipra-ui/app/js/controllers.js b/vipra-ui/app/js/controllers.js index 1fc17d66..d7ca1504 100644 --- a/vipra-ui/app/js/controllers.js +++ b/vipra-ui/app/js/controllers.js @@ -638,7 +638,7 @@ topicsSort: '-share', similarSort: '-share', wordsSort: '-count', - entitiesSort: 'entity' + entitiesSort: '-count' }; ArticleFactory.get({ diff --git a/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java b/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java index e66fa89f..d0bb6945 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java +++ b/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java @@ -15,6 +15,8 @@ public class TextEntity implements Comparable<TextEntity>, Serializable { private String url; + private Integer count; + public TextEntity() {} public TextEntity(final String entity, final String url) { @@ -38,6 +40,14 @@ public class TextEntity implements Comparable<TextEntity>, Serializable { this.url = url; } + public Integer getCount() { + return count; + } + + public void setCount(Integer count) { + this.count = count; + } + @Override public int hashCode() { final int prime = 31; @@ -65,7 +75,7 @@ public class TextEntity implements Comparable<TextEntity>, Serializable { @Override public int compareTo(TextEntity o) { - return entity.compareTo(o.getEntity()); + return count.compareTo(o.getCount()); } } -- GitLab