From 610c1d3e858e22117c7a91983f64f732c607f968 Mon Sep 17 00:00:00 2001
From: Eike Cochu <eike@cochu.com>
Date: Mon, 4 Apr 2016 18:35:32 +0200
Subject: [PATCH] words to lowercase and trim

---
 vipra-cmd/runcfg/CMD.launch                    | 18 ++++++++++++++++++
 vipra-cmd/src/main/java/de/vipra/cmd/Main.java |  2 ++
 .../de/vipra/cmd/file/FilebaseWordIndex.java   |  5 +++--
 .../java/de/vipra/cmd/text/ProcessedText.java  |  2 +-
 .../de/vipra/cmd/text/SpotlightResponse.java   | 11 +++++++++++
 vipra-ui/app/html/articles/show.html           |  6 ++++--
 vipra-ui/app/js/controllers.js                 |  2 +-
 .../java/de/vipra/util/model/TextEntity.java   | 12 +++++++++++-
 8 files changed, 51 insertions(+), 7 deletions(-)
 create mode 100644 vipra-cmd/runcfg/CMD.launch

diff --git a/vipra-cmd/runcfg/CMD.launch b/vipra-cmd/runcfg/CMD.launch
new file mode 100644
index 00000000..d65397a0
--- /dev/null
+++ b/vipra-cmd/runcfg/CMD.launch
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<launchConfiguration type="org.eclipse.jdt.launching.localJavaApplication">
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
+<listEntry value="/vipra-cmd/src/main/java/de/vipra/cmd/Main.java"/>
+</listAttribute>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
+<listEntry value="1"/>
+</listAttribute>
+<listAttribute key="org.eclipse.debug.ui.favoriteGroups">
+<listEntry value="org.eclipse.debug.ui.launchGroup.run"/>
+</listAttribute>
+<stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/>
+<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/>
+<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-dcC yearly -AI /home/eike/repos/master/ma-impl/vm/data/data.json"/>
+<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/>
+<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/>
+<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-ea"/>
+</launchConfiguration>
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java
index 096f7612..145b7965 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/Main.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/Main.java
@@ -93,6 +93,8 @@ public class Main {
 						ConsoleUtils.error(cause.getMessage());
 					else
 						ConsoleUtils.error(e.getMessage());
+					if (opts.isDebug() && !opts.isSilent())
+						e.printStackTrace(System.out);
 				}
 			}
 		} else {
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java
index c939f635..42d0f02a 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/FilebaseWordIndex.java
@@ -68,7 +68,7 @@ public class FilebaseWordIndex implements Iterable<String> {
 
 	public void countWords(final List<ArticleWord> articleWords) {
 		for (final ArticleWord articleWord : articleWords)
-			wordDocumentCount.count(articleWord.getId());
+			wordDocumentCount.count(articleWord.getId().toLowerCase().trim()); // same key normalization as index(): lower-case, then trim
 	}
 
 	public String transform(final String[] words) {
@@ -86,7 +86,8 @@ public class FilebaseWordIndex implements Iterable<String> {
 		return sb.toString();
 	}
 
-	public int index(final String word) {
+	public int index(String word) {
+		word = word.toLowerCase().trim();
 		Integer index = wordIndex.get(word);
 		if (index == null) {
 			index = nextIndex++;
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java
index 57f2b7c3..3564c6c2 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java
@@ -18,7 +18,7 @@ public class ProcessedText {
 	private final List<ArticleWord> articleWords;
 
 	public ProcessedText(final String text, final long wordCount) {
-		words = text.split("\\s+");
+		words = text.toLowerCase().trim().split("\\s+");
 		originalWordCount = wordCount;
 		reducedWordCount = words.length;
 		reductionRatio = 1 - ((double) reducedWordCount / wordCount);
diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java
index 30832f44..d2cd9b82 100644
--- a/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java
+++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/SpotlightResponse.java
@@ -9,6 +9,7 @@ import java.util.Set;
 import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
 import com.fasterxml.jackson.annotation.JsonProperty;
 
+import de.vipra.util.CountMap;
 import de.vipra.util.model.TextEntity;
 
 @JsonIgnoreProperties(ignoreUnknown = true)
@@ -26,13 +27,23 @@ public class SpotlightResponse {
 	}
 
 	public List<TextEntity> getEntities() {
+		final CountMap<String> textEntitiesCount = new CountMap<>(resources.size());
 		final Set<TextEntity> textEntities = new HashSet<>(resources.size());
+		// collect entities into a set (elements the set considers equal collapse) and count every surface form occurrence
 		for (SpotlightResource resource : resources) {
 			textEntities.add(new TextEntity(resource.getSurfaceForm(), resource.getUri()));
+			textEntitiesCount.count(resource.getSurfaceForm());
 			// TODO add types to entities?
 		}
+
+		// copy the counts onto the entities; NOTE(review): counts are keyed by surface form alone and this assumes getEntity() returns that surface form - TODO confirm
+		for (TextEntity textEntity : textEntities)
+			textEntity.setCount(textEntitiesCount.get(textEntity.getEntity()));
+
+		// copy into a list and sort by TextEntity's natural ordering (compareTo)
 		final List<TextEntity> textEntitiesList = new ArrayList<>(textEntities);
 		Collections.sort(textEntitiesList);
+
 		return textEntitiesList;
 	}
 
diff --git a/vipra-ui/app/html/articles/show.html b/vipra-ui/app/html/articles/show.html
index 573b548c..2af0c21f 100644
--- a/vipra-ui/app/html/articles/show.html
+++ b/vipra-ui/app/html/articles/show.html
@@ -110,6 +110,7 @@
                 <thead>
                   <tr>
                     <th ng-model="articlesShowModels.entitiesSort" sort-by="entity">Entity</th>
+                    <th ng-model="articlesShowModels.entitiesSort" sort-by="count">Count</th>
                   </tr>
                 </thead>
                 <tbody>
@@ -117,6 +118,7 @@
                     <td>
                       <entity-link entity="::entity" />
                     </td>
+                    <td ng-bind="::entity.count"></td>
                   </tr>
                 </tbody>
               </table>
@@ -135,12 +137,12 @@
             <div class="panel panel-default">
               <div class="panel-heading">
                 Found <ng-pluralize count="allWords.length||0" when="{0:'no words',1:'1 word',other:'{} unique words'}"></ng-pluralize> for this article.<br>
-                Article has <ng-pluralize count="article.stats.wordCount||0" when="{0:'no words',1:'1 word',other:'{} words'}"></ng-pluralize>, <span ng-bind-template="{{::article.stats.processedWordCount}} after cleaning ({{::Vipra.toPercent(article.stats.reductionRatio)}}% reduction)"></span>.
+                Article has <ng-pluralize count="article.stats.wordCount||0" when="{0:'no words',1:'1 word',other:'{} words'}"></ng-pluralize><span ng-show="article.stats.wordCount>0">, <span ng-bind-template="{{::article.stats.processedWordCount}} after cleaning ({{::Vipra.toPercent(article.stats.reductionRatio)}}% reduction)"></span></span>.<!-- comma is hidden together with the stats so a zero-word article does not render "no words, ." -->
               </div>
               <table class="table table-bordered table-condensed table-fixed">
                 <thead>
                   <tr>
-                    <th ng-model="articlesShowModels.wordsSort" sort-by="word">Word</th>
+                    <th ng-model="articlesShowModels.wordsSort" sort-by="id">Word</th>
                     <th ng-model="articlesShowModels.wordsSort" sort-by="count">Count</th>
                     <th>Share</th>
                     <th>Reduced share</th>
diff --git a/vipra-ui/app/js/controllers.js b/vipra-ui/app/js/controllers.js
index 1fc17d66..d7ca1504 100644
--- a/vipra-ui/app/js/controllers.js
+++ b/vipra-ui/app/js/controllers.js
@@ -638,7 +638,7 @@
         topicsSort: '-share',
         similarSort: '-share',
         wordsSort: '-count',
-        entitiesSort: 'entity'
+        entitiesSort: '-count'
       };
 
       ArticleFactory.get({
diff --git a/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java b/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java
index e66fa89f..d0bb6945 100644
--- a/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java
+++ b/vipra-util/src/main/java/de/vipra/util/model/TextEntity.java
@@ -15,6 +15,8 @@ public class TextEntity implements Comparable<TextEntity>, Serializable {
 
 	private String url;
 
+	private Integer count;
+
 	public TextEntity() {}
 
 	public TextEntity(final String entity, final String url) {
@@ -38,6 +40,14 @@ public class TextEntity implements Comparable<TextEntity>, Serializable {
 		this.url = url;
 	}
 
+	public Integer getCount() { // may be null: the field has no initializer and the no-arg constructor leaves it unset
+		return count;
+	}
+
+	public void setCount(Integer count) { // stores the value as given, without any null check
+		this.count = count;
+	}
+
 	@Override
 	public int hashCode() {
 		final int prime = 31;
@@ -65,7 +75,13 @@ public class TextEntity implements Comparable<TextEntity>, Serializable {
 
 	@Override
 	public int compareTo(TextEntity o) {
-		return entity.compareTo(o.getEntity());
+		// count is a nullable Integer (unset by the no-arg constructor); treat a
+		// missing count as 0 instead of throwing a NullPointerException
+		final int thisCount = count == null ? 0 : count;
+		final int otherCount = o.getCount() == null ? 0 : o.getCount();
+		final int byCount = Integer.compare(thisCount, otherCount);
+		// equal counts fall back to the previous ordering by entity name
+		return byCount != 0 ? byCount : entity.compareTo(o.getEntity());
 	}
 
 }
-- 
GitLab