Skip to content
Snippets Groups Projects
Commit a0f7bae4 authored by Eike Cochu's avatar Eike Cochu
Browse files

added corenlp dependency

added basic text preprocessing
added stopwords list
parent 421673e4
Branches
No related tags found
No related merge requests found
...@@ -278,6 +278,14 @@ ...@@ -278,6 +278,14 @@
"buffer_size": 955, "buffer_size": 955,
"line_ending": "Unix" "line_ending": "Unix"
} }
},
{
"contents": "",
"settings":
{
"buffer_size": 0,
"line_ending": "Unix"
}
} }
], ],
"build_system": "", "build_system": "",
...@@ -899,7 +907,7 @@ ...@@ -899,7 +907,7 @@
"groups": "groups":
[ [
{ {
"selected": 0, "selected": 1,
"sheets": "sheets":
[ [
{ {
...@@ -924,51 +932,108 @@ ...@@ -924,51 +932,108 @@
"BracketHighlighterBusy": false, "BracketHighlighterBusy": false,
"bh_regions": "bh_regions":
[ [
"bh_default",
"bh_default_center",
"bh_default_open",
"bh_default_close",
"bh_default_content",
"bh_c_define",
"bh_c_define_center",
"bh_c_define_open",
"bh_c_define_close",
"bh_c_define_content",
"bh_square", "bh_square",
"bh_square_center", "bh_square_center",
"bh_square_open", "bh_square_open",
"bh_square_close", "bh_square_close",
"bh_square_content", "bh_square_content",
"bh_default",
"bh_default_center",
"bh_default_open",
"bh_default_close",
"bh_default_content",
"bh_single_quote",
"bh_single_quote_center",
"bh_single_quote_open",
"bh_single_quote_close",
"bh_single_quote_content",
"bh_round", "bh_round",
"bh_round_center", "bh_round_center",
"bh_round_open", "bh_round_open",
"bh_round_close", "bh_round_close",
"bh_round_content", "bh_round_content",
"bh_unmatched",
"bh_unmatched_center",
"bh_unmatched_open",
"bh_unmatched_close",
"bh_unmatched_content",
"bh_regex",
"bh_regex_center",
"bh_regex_open",
"bh_regex_close",
"bh_regex_content",
"bh_double_quote",
"bh_double_quote_center",
"bh_double_quote_open",
"bh_double_quote_close",
"bh_double_quote_content",
"bh_tag", "bh_tag",
"bh_tag_center", "bh_tag_center",
"bh_tag_open", "bh_tag_open",
"bh_tag_close", "bh_tag_close",
"bh_tag_content", "bh_tag_content",
"bh_double_quote",
"bh_double_quote_center",
"bh_double_quote_open",
"bh_double_quote_close",
"bh_double_quote_content",
"bh_regex",
"bh_regex_center",
"bh_regex_open",
"bh_regex_close",
"bh_regex_content",
"bh_c_define",
"bh_c_define_center",
"bh_c_define_open",
"bh_c_define_close",
"bh_c_define_content",
"bh_curly",
"bh_curly_center",
"bh_curly_open",
"bh_curly_close",
"bh_curly_content",
"bh_angle", "bh_angle",
"bh_angle_center", "bh_angle_center",
"bh_angle_open", "bh_angle_open",
"bh_angle_close", "bh_angle_close",
"bh_angle_content", "bh_angle_content",
"bh_unmatched",
"bh_unmatched_center",
"bh_unmatched_open",
"bh_unmatched_close",
"bh_unmatched_content"
],
"incomplete_sync": null,
"remote_loading": false,
"synced": false,
"syntax": "Packages/Ruby/Ruby.sublime-syntax",
"tab_size": 2,
"translate_tabs_to_spaces": true
},
"translation.x": 0.0,
"translation.y": 0.0,
"zoom_level": 1.0
},
"stack_index": 1,
"type": "text"
},
{
"buffer": 1,
"semi_transient": false,
"settings":
{
"buffer_size": 0,
"regions":
{
},
"selection":
[
[
0,
0
]
],
"settings":
{
"BracketHighlighterBusy": false,
"auto_name": "",
"bh_regions":
[
"bh_tag",
"bh_tag_center",
"bh_tag_open",
"bh_tag_close",
"bh_tag_content",
"bh_double_quote",
"bh_double_quote_center",
"bh_double_quote_open",
"bh_double_quote_close",
"bh_double_quote_content",
"bh_curly", "bh_curly",
"bh_curly_center", "bh_curly_center",
"bh_curly_open", "bh_curly_open",
...@@ -978,14 +1043,46 @@ ...@@ -978,14 +1043,46 @@
"bh_single_quote_center", "bh_single_quote_center",
"bh_single_quote_open", "bh_single_quote_open",
"bh_single_quote_close", "bh_single_quote_close",
"bh_single_quote_content" "bh_single_quote_content",
"bh_regex",
"bh_regex_center",
"bh_regex_open",
"bh_regex_close",
"bh_regex_content",
"bh_c_define",
"bh_c_define_center",
"bh_c_define_open",
"bh_c_define_close",
"bh_c_define_content",
"bh_default",
"bh_default_center",
"bh_default_open",
"bh_default_close",
"bh_default_content",
"bh_unmatched",
"bh_unmatched_center",
"bh_unmatched_open",
"bh_unmatched_close",
"bh_unmatched_content",
"bh_round",
"bh_round_center",
"bh_round_open",
"bh_round_close",
"bh_round_content",
"bh_angle",
"bh_angle_center",
"bh_angle_open",
"bh_angle_close",
"bh_angle_content",
"bh_square",
"bh_square_center",
"bh_square_open",
"bh_square_close",
"bh_square_content"
], ],
"default_dir": "/home/eike/Repositories/fu/ss15/ma/impl",
"incomplete_sync": null, "incomplete_sync": null,
"remote_loading": false, "syntax": "Packages/Text/Plain text.tmLanguage"
"synced": false,
"syntax": "Packages/Ruby/Ruby.sublime-syntax",
"tab_size": 2,
"translate_tabs_to_spaces": true
}, },
"translation.x": 0.0, "translation.x": 0.0,
"translation.y": 0.0, "translation.y": 0.0,
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
</properties> </properties>
<dependencies> <dependencies>
<!-- Apache Commons -->
<dependency> <dependency>
<groupId>commons-cli</groupId> <groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId> <artifactId>commons-cli</artifactId>
...@@ -29,18 +30,27 @@ ...@@ -29,18 +30,27 @@
<version>2.4</version> <version>2.4</version>
</dependency> </dependency>
<!-- JsonSimple -->
<dependency> <dependency>
<groupId>com.googlecode.json-simple</groupId> <groupId>com.googlecode.json-simple</groupId>
<artifactId>json-simple</artifactId> <artifactId>json-simple</artifactId>
<version>1.1.1</version> <version>1.1.1</version>
</dependency> </dependency>
<!-- ElasticSearch -->
<dependency> <dependency>
<groupId>org.elasticsearch</groupId> <groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId> <artifactId>elasticsearch</artifactId>
<version>2.1.0</version> <version>2.1.0</version>
</dependency> </dependency>
<!-- Stanford CoreNLP -->
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.5.2</version>
</dependency>
<!-- Logging --> <!-- Logging -->
<dependency> <dependency>
<groupId>org.apache.logging.log4j</groupId> <groupId>org.apache.logging.log4j</groupId>
......
...@@ -5,7 +5,9 @@ import java.io.FileReader; ...@@ -5,7 +5,9 @@ import java.io.FileReader;
import java.io.FilenameFilter; import java.io.FilenameFilter;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set;
import org.json.simple.JSONArray; import org.json.simple.JSONArray;
import org.json.simple.JSONObject; import org.json.simple.JSONObject;
...@@ -18,6 +20,7 @@ import de.vipra.cmd.model.Article; ...@@ -18,6 +20,7 @@ import de.vipra.cmd.model.Article;
import de.vipra.util.Config; import de.vipra.util.Config;
import de.vipra.util.ConfigException; import de.vipra.util.ConfigException;
import de.vipra.util.Constants; import de.vipra.util.Constants;
import de.vipra.util.FileUtils;
import de.vipra.util.StringUtils; import de.vipra.util.StringUtils;
import de.vipra.util.ex.DatabaseException; import de.vipra.util.ex.DatabaseException;
import de.vipra.util.ex.FilebaseException; import de.vipra.util.ex.FilebaseException;
...@@ -98,7 +101,28 @@ public class ImportCommand implements Command { ...@@ -98,7 +101,28 @@ public class ImportCommand implements Command {
} }
} }
private void importArticle(JSONObject obj) throws FilebaseException, DatabaseException { private String removeStopWords(String text) throws IOException {
// Returns text with every whitespace-separated token that equals an entry of the
// stop word list removed; the remaining tokens are joined by single spaces.
// NOTE(review): the stop word file is read again on every call — consider caching the set.
List<String> stopwordsList = FileUtils.readFile(FileUtils.getFile(Constants.STOPWORDS_FILE));
Set<String> stopwords = new HashSet<>(stopwordsList);
// split on runs of whitespace; punctuation stays attached to its token, and the
// lookup below is an exact, case-sensitive match
String[] words = text.split("\\s+");
StringBuilder sb = new StringBuilder();
for (String word : words) {
if (stopwords.contains(word)) {
continue;
}
sb.append(word).append(" ");
}
// trim() drops the separator appended after the last kept word
return sb.toString().trim();
}
/**
 * Normalizes article text: lower-cases it, removes stop words and finally
 * strips every character that is not an ASCII letter, digit or space.
 *
 * @param text the raw article text
 * @return the normalized text
 * @throws IOException propagated from {@code removeStopWords}
 */
private String preprocessText(String text) throws IOException {
    text = text.toLowerCase();
    text = removeStopWords(text);
    // String.replace treats its first argument as a literal character sequence,
    // so the character class was never applied. replaceAll interprets it as a
    // regular expression and actually strips the unwanted characters.
    text = text.replaceAll("[^a-zA-Z0-9 ]", "");
    return text;
}
private void importArticle(JSONObject obj) throws FilebaseException, DatabaseException, IOException {
out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\""); out.info("importing \"" + StringUtils.ellipsize(obj.get("title").toString(), 80) + "\"");
Article article = new Article(); Article article = new Article();
article.fromJSON(obj); article.fromJSON(obj);
...@@ -107,6 +131,7 @@ public class ImportCommand implements Command { ...@@ -107,6 +131,7 @@ public class ImportCommand implements Command {
article = dbArticles.createSingle(article); article = dbArticles.createSingle(article);
// add article to filebase // add article to filebase
article.setText(preprocessText(article.getText()));
article = fbArticles.createSingle(article); article = fbArticles.createSingle(article);
// 3. index article via elasticsearch, include topics // 3. index article via elasticsearch, include topics
......
a
about
above
after
again
against
all
am
an
and
any
are
aren't
as
at
be
because
been
before
being
below
between
both
but
by
can't
cannot
could
couldn't
did
didn't
do
does
doesn't
doing
don't
down
during
each
few
for
from
further
had
hadn't
has
hasn't
have
haven't
having
he
he'd
he'll
he's
her
here
here's
hers
herself
him
himself
his
how
how's
i
i'd
i'll
i'm
i've
if
in
into
is
isn't
it
it's
its
itself
let's
me
more
most
mustn't
my
myself
no
nor
not
of
off
on
once
only
or
other
ought
our
ours
ourselves
out
over
own
same
shan't
she
she'd
she'll
she's
should
shouldn't
so
some
such
than
that
that's
the
their
theirs
them
themselves
then
there
there's
these
they
they'd
they'll
they're
they've
this
those
through
to
too
under
until
up
very
was
wasn't
we
we'd
we'll
we're
we've
were
weren't
what
what's
when
when's
where
where's
which
while
who
who's
whom
why
why's
with
won't
would
wouldn't
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
\ No newline at end of file
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
</properties> </properties>
<dependencies> <dependencies>
<!-- Apache Commons -->
<dependency> <dependency>
<groupId>commons-io</groupId> <groupId>commons-io</groupId>
<artifactId>commons-io</artifactId> <artifactId>commons-io</artifactId>
......
...@@ -19,11 +19,7 @@ public class Config { ...@@ -19,11 +19,7 @@ public class Config {
private final Properties props = new Properties(); private final Properties props = new Properties();
public Config() throws IOException, ConfigException { public Config() throws IOException, ConfigException {
InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream(Constants.CONFIG_FILE); load(FileUtils.getResource(Constants.CONFIG_FILE));
if (is == null) {
is = Config.class.getResourceAsStream(Constants.CONFIG_FILE);
}
load(is);
} }
public Config(InputStream is) throws IOException, ConfigException { public Config(InputStream is) throws IOException, ConfigException {
......
...@@ -7,8 +7,9 @@ public class Constants { ...@@ -7,8 +7,9 @@ public class Constants {
public static final String FB_DIR = "vipra"; public static final String FB_DIR = "vipra";
public static final Charset FB_ENCODING = StandardCharsets.UTF_8; public static final Charset FB_ENCODING = StandardCharsets.UTF_8;
public static final String CONFIG_FILE = "config.properties"; public static final String CONFIG_FILE = "config.properties";
public static final String STOPWORDS_FILE = "stopwords.txt";
public static final String DEFAULT_HOST = "localhost"; public static final String DEFAULT_HOST = "localhost";
public static final int DEFAULT_PORT = 27017; public static final int DEFAULT_PORT = 27017;
......
...@@ -2,6 +2,7 @@ package de.vipra.util; ...@@ -2,6 +2,7 @@ package de.vipra.util;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.List; import java.util.List;
...@@ -12,4 +13,12 @@ public class FileUtils extends org.apache.commons.io.FileUtils { ...@@ -12,4 +13,12 @@ public class FileUtils extends org.apache.commons.io.FileUtils {
return Files.readAllLines(Paths.get(file.getAbsolutePath()), Constants.FB_ENCODING); return Files.readAllLines(Paths.get(file.getAbsolutePath()), Constants.FB_ENCODING);
} }
/**
 * Opens a resource as a stream.
 * <p>
 * The thread context class loader is tried first. If it is unavailable or
 * does not find the resource, the lookup falls back to {@code Config.class}.
 *
 * @param name name of the resource to open
 * @return a stream for the resource, or {@code null} if neither lookup finds it
 */
public static InputStream getResource(String name) {
    // getContextClassLoader() is allowed to return null; guard against it so the
    // fallback lookup is still attempted instead of throwing a NullPointerException
    ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
    InputStream is = contextLoader != null ? contextLoader.getResourceAsStream(name) : null;
    if (is == null) {
        is = Config.class.getResourceAsStream(name);
    }
    return is;
}
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment