diff --git a/ma-impl.sublime-workspace b/ma-impl.sublime-workspace index 42dc46ba506d730fa0c0131de47e421dc541d865..6520e543d140111a3333797697328921b5ae677b 100644 --- a/ma-impl.sublime-workspace +++ b/ma-impl.sublime-workspace @@ -278,6 +278,14 @@ "buffer_size": 955, "line_ending": "Unix" } + }, + { + "contents": "", + "settings": + { + "buffer_size": 0, + "line_ending": "Unix" + } } ], "build_system": "", @@ -899,7 +907,7 @@ "groups": [ { - "selected": 0, + "selected": 1, "sheets": [ { @@ -924,51 +932,108 @@ "BracketHighlighterBusy": false, "bh_regions": [ - "bh_default", - "bh_default_center", - "bh_default_open", - "bh_default_close", - "bh_default_content", - "bh_c_define", - "bh_c_define_center", - "bh_c_define_open", - "bh_c_define_close", - "bh_c_define_content", "bh_square", "bh_square_center", "bh_square_open", "bh_square_close", "bh_square_content", + "bh_default", + "bh_default_center", + "bh_default_open", + "bh_default_close", + "bh_default_content", + "bh_single_quote", + "bh_single_quote_center", + "bh_single_quote_open", + "bh_single_quote_close", + "bh_single_quote_content", "bh_round", "bh_round_center", "bh_round_open", "bh_round_close", "bh_round_content", - "bh_unmatched", - "bh_unmatched_center", - "bh_unmatched_open", - "bh_unmatched_close", - "bh_unmatched_content", - "bh_regex", - "bh_regex_center", - "bh_regex_open", - "bh_regex_close", - "bh_regex_content", - "bh_double_quote", - "bh_double_quote_center", - "bh_double_quote_open", - "bh_double_quote_close", - "bh_double_quote_content", "bh_tag", "bh_tag_center", "bh_tag_open", "bh_tag_close", "bh_tag_content", + "bh_double_quote", + "bh_double_quote_center", + "bh_double_quote_open", + "bh_double_quote_close", + "bh_double_quote_content", + "bh_regex", + "bh_regex_center", + "bh_regex_open", + "bh_regex_close", + "bh_regex_content", + "bh_c_define", + "bh_c_define_center", + "bh_c_define_open", + "bh_c_define_close", + "bh_c_define_content", + "bh_curly", + "bh_curly_center", + "bh_curly_open", + "bh_curly_close", + "bh_curly_content", "bh_angle", "bh_angle_center", "bh_angle_open", "bh_angle_close", "bh_angle_content", + "bh_unmatched", + "bh_unmatched_center", + "bh_unmatched_open", + "bh_unmatched_close", + "bh_unmatched_content" + ], + "incomplete_sync": null, + "remote_loading": false, + "synced": false, + "syntax": "Packages/Ruby/Ruby.sublime-syntax", + "tab_size": 2, + "translate_tabs_to_spaces": true + }, + "translation.x": 0.0, + "translation.y": 0.0, + "zoom_level": 1.0 + }, + "stack_index": 1, + "type": "text" + }, + { + "buffer": 1, + "semi_transient": false, + "settings": + { + "buffer_size": 0, + "regions": + { + }, + "selection": + [ + [ + 0, + 0 + ] + ], + "settings": + { + "BracketHighlighterBusy": false, + "auto_name": "", + "bh_regions": + [ + "bh_tag", + "bh_tag_center", + "bh_tag_open", + "bh_tag_close", + "bh_tag_content", + "bh_double_quote", + "bh_double_quote_center", + "bh_double_quote_open", + "bh_double_quote_close", + "bh_double_quote_content", "bh_curly", "bh_curly_center", "bh_curly_open", @@ -978,14 +1043,46 @@ "bh_single_quote_center", "bh_single_quote_open", "bh_single_quote_close", - "bh_single_quote_content" + "bh_single_quote_content", + "bh_regex", + "bh_regex_center", + "bh_regex_open", + "bh_regex_close", + "bh_regex_content", + "bh_c_define", + "bh_c_define_center", + "bh_c_define_open", + "bh_c_define_close", + "bh_c_define_content", + "bh_default", + "bh_default_center", + "bh_default_open", + "bh_default_close", + "bh_default_content", + "bh_unmatched", + "bh_unmatched_center", + "bh_unmatched_open", + "bh_unmatched_close", + "bh_unmatched_content", + "bh_round", + "bh_round_center", + "bh_round_open", + "bh_round_close", + "bh_round_content", + "bh_angle", + "bh_angle_center", + "bh_angle_open", + "bh_angle_close", + "bh_angle_content", + "bh_square", + "bh_square_center", + "bh_square_open", + "bh_square_close", + "bh_square_content" ], + "default_dir": "/home/eike/Repositories/fu/ss15/ma/impl", "incomplete_sync": null, - "remote_loading": false, - "synced": false, - "syntax": "Packages/Ruby/Ruby.sublime-syntax", - "tab_size": 2, - "translate_tabs_to_spaces": true + "syntax": "Packages/Text/Plain text.tmLanguage" }, "translation.x": 0.0, "translation.y": 0.0, diff --git a/vipra-cmd/pom.xml b/vipra-cmd/pom.xml index ad440cb717d05db4ef8fc4b675dbd19b5835fea4..870fd2ac6fa8372d5801639e2789eccac8897069 100644 --- a/vipra-cmd/pom.xml +++ b/vipra-cmd/pom.xml @@ -18,6 +18,7 @@ </properties> <dependencies> + <!-- Apache Commons --> <dependency> <groupId>commons-cli</groupId> <artifactId>commons-cli</artifactId> @@ -29,18 +30,27 @@ <version>2.4</version> </dependency> + <!-- JsonSimple --> <dependency> <groupId>com.googlecode.json-simple</groupId> <artifactId>json-simple</artifactId> <version>1.1.1</version> </dependency> + <!-- ElasticSearch --> <dependency> <groupId>org.elasticsearch</groupId> <artifactId>elasticsearch</artifactId> <version>2.1.0</version> </dependency> + <!-- Stanford CoreNLP --> + <dependency> + <groupId>edu.stanford.nlp</groupId> + <artifactId>stanford-corenlp</artifactId> + <version>3.5.2</version> + </dependency> + <!-- Logging --> <dependency> <groupId>org.apache.logging.log4j</groupId> diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 572b2bbe1da81a5445d0a9096e7d595993610c7d..82b713824b48e868f42690b2a1948300063115e5 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -5,7 +5,9 @@ import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.json.simple.JSONArray; import org.json.simple.JSONObject; @@ -18,6 +20,7 @@ import de.vipra.cmd.model.Article; import de.vipra.util.Config; import de.vipra.util.ConfigException; import de.vipra.util.Constants; +import de.vipra.util.FileUtils; import de.vipra.util.StringUtils; import de.vipra.util.ex.DatabaseException; import de.vipra.util.ex.FilebaseException; @@ -98,7 +101,28 @@ public class ImportCommand implements Command { } } - private void importArticle(JSONObject obj) throws FilebaseException, DatabaseException { + private String removeStopWords(String text) throws IOException { + List<String> stopwordsList = FileUtils.readFile(FileUtils.getFile(Constants.STOPWORDS_FILE)); + Set<String> stopwords = new HashSet<>(stopwordsList); + String[] words = text.split("\\s+"); + StringBuilder sb = new StringBuilder(); + for (String word : words) { + if (stopwords.contains(word)) { + continue; + } + sb.append(word).append(" "); + } + return sb.toString().trim(); + } + + private String preprocessText(String text) throws IOException { + text = text.toLowerCase(); + text = removeStopWords(text); + text = text.replace("[^a-zA-Z0-9 ]", ""); + return text; + } + + private void importArticle(JSONObject obj) throws FilebaseException, DatabaseException, IOException { out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\""); Article article = new Article(); article.fromJSON(obj); @@ -107,6 +131,7 @@ public class ImportCommand implements Command { article = dbArticles.createSingle(article); // add article to filebase + article.setText(preprocessText(article.getText())); article = fbArticles.createSingle(article); // 3. index article via elasticsearch, include topics diff --git a/vipra-cmd/src/main/resources/stopwords.txt b/vipra-cmd/src/main/resources/stopwords.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e35caf48842d63a8139485c99a9eedfbd1fb822 --- /dev/null +++ b/vipra-cmd/src/main/resources/stopwords.txt @@ -0,0 +1,173 @@ +a +about +above +after +again +against +all +am +an +and +any +are +aren't +as +at +be +because +been +before +being +below +between +both +but +by +can't +cannot +could +couldn't +did +didn't +do +does +doesn't +doing +don't +down +during +each +few +for +from +further +had +hadn't +has +hasn't +have +haven't +having +he +he'd +he'll +he's +her +here +here's +hers +herself +him +himself +his +how +how's +i +i'd +i'll +i'm +i've +if +in +into +is +isn't +it +it's +its +itself +let's +me +more +most +mustn't +my +myself +no +nor +not +of +off +on +once +only +or +other +ought +our +ours ourselves +out +over +own +same +shan't +she +she'd +she'll +she's +should +shouldn't +so +some +such +than +that +that's +the +their +theirs +them +themselves +then +there +there's +these +they +they'd +they'll +they're +they've +this +those +through +to +too +under +until +up +very +was +wasn't +we +we'd +we'll +we're +we've +were +weren't +what +what's +when +when's +where +where's +which +while +who +who's +whom +why +why's +with +won't +would +wouldn't +you +you'd +you'll +you're +you've +your +yours +yourself +yourselves \ No newline at end of file diff --git a/vipra-util/pom.xml b/vipra-util/pom.xml index 03b50d666602ba831e271cbb6881b3356e9b4b27..59d8436d1af0eb92ddd79512ee90221ea4b66d8b 100644 --- a/vipra-util/pom.xml +++ b/vipra-util/pom.xml @@ -13,6 +13,7 @@ </properties> <dependencies> + <!-- Apache Commons --> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> diff --git a/vipra-util/src/main/java/de/vipra/util/Config.java b/vipra-util/src/main/java/de/vipra/util/Config.java index b4b795b3bc93dfdb7db7407dbb21842f0a855f57..a2032a4c9c93f7cd35645c22b32af7f6a476a6c6 100644 --- a/vipra-util/src/main/java/de/vipra/util/Config.java +++ b/vipra-util/src/main/java/de/vipra/util/Config.java @@ -19,11 +19,7 @@ public class Config { private final Properties props = new Properties(); public Config() throws IOException, ConfigException { - InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream(Constants.CONFIG_FILE); - if (is == null) { - is = Config.class.getResourceAsStream(Constants.CONFIG_FILE); - } - load(is); + load(FileUtils.getResource(Constants.CONFIG_FILE)); } public Config(InputStream is) throws IOException, ConfigException { diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 4146b9c2c7878ce182b35e55ed64d76730a73f18..4aa09bf529ea532180df0bdcd4d4b8fb31a73a01 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -7,8 +7,9 @@ public class Constants { public static final String FB_DIR = "vipra"; public static final Charset FB_ENCODING = StandardCharsets.UTF_8; - + public static final String CONFIG_FILE = "config.properties"; + public static final String STOPWORDS_FILE = "stopwords.txt"; public static final String DEFAULT_HOST = "localhost"; public static final int DEFAULT_PORT = 27017; diff --git a/vipra-util/src/main/java/de/vipra/util/FileUtils.java b/vipra-util/src/main/java/de/vipra/util/FileUtils.java index 397610a4c8a7310de539fab2c234e8415a170a42..42151f8d752026df2da9d7eee8e9e1288209a9c4 100644 --- a/vipra-util/src/main/java/de/vipra/util/FileUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/FileUtils.java @@ -2,6 +2,7 @@ package de.vipra.util; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Paths; import java.util.List; @@ -12,4 +13,12 @@ public class FileUtils extends org.apache.commons.io.FileUtils { return Files.readAllLines(Paths.get(file.getAbsolutePath()), Constants.FB_ENCODING); } + public static InputStream getResource(String name) { + InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream(name); + if (is == null) { + is = Config.class.getResourceAsStream(name); + } + return is; + } + }