Skip to content
Snippets Groups Projects
Commit 030e1980 authored by Eike Cochu's avatar Eike Cochu
Browse files

fixed cmd error logging

removed corenlp errors from output
removed jars from repository and added to gitignore
parent 3c95d790
No related branches found
No related tags found
No related merge requests found
*.log
*.jar
.vagrant/
vm/webapps/
......@@ -86,11 +86,6 @@
<artifactId>log4j-slf4j-impl</artifactId>
<version>${log4jVersion}</version>
</dependency>
<dependency>
<groupId>uk.org.lidalia</groupId>
<artifactId>sysout-over-slf4j</artifactId>
<version>1.0.2</version>
</dependency>
<!-- MongoDB Database Adapter -->
<dependency>
......
......@@ -30,7 +30,6 @@ import de.vipra.util.ConsoleUtils;
import de.vipra.util.ConsoleUtils.Choice;
import de.vipra.util.StringUtils;
import de.vipra.util.Timer;
import uk.org.lidalia.sysoutslf4j.context.SysOutOverSLF4J;
public class Main {
......@@ -39,7 +38,8 @@ public class Main {
static {
MorphiaLoggerFactory.registerLogger(SLF4JLoggerImplFactory.class);
SysOutOverSLF4J.sendSystemOutAndErrToSLF4J();
// close stderr to mute corenlp messages
System.err.close();
}
public static void main(String[] args) {
......
......@@ -4,7 +4,6 @@ import java.util.List;
import java.util.Properties;
import de.vipra.cmd.ex.PreprocessorException;
import de.vipra.util.Constants;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
......@@ -42,7 +41,7 @@ public class CoreNLPProcessor extends Processor {
sb.append(word.word()).append(" ");
}
}
String text = sb.toString().trim().replaceAll(Constants.CHARS_DISALLOWED, "").replaceAll("\\s+", " ");
String text = clean(sb.toString());
return new ProcessedText(text);
}
......
......@@ -4,8 +4,6 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import de.vipra.util.Constants;
public class CustomProcessor extends Processor {
private final Set<String> stopWords;
......@@ -31,7 +29,7 @@ public class CustomProcessor extends Processor {
public ProcessedText preprocess(String input) {
input = input.toLowerCase();
input = removeStopWords(input);
input = input.replace(Constants.CHARS_DISALLOWED, "");
input = clean(input);
return new ProcessedText(input);
}
......
......@@ -36,7 +36,11 @@ public class LuceneProcessor extends Processor {
stream.reset();
stream = new PorterStemFilter(stream);
stream = new TrimFilter(stream);
stream = new PatternReplaceFilter(stream, Pattern.compile(Constants.REGEX_EMAIL), "", true);
stream = new PatternReplaceFilter(stream, Pattern.compile(Constants.REGEX_URL), "", true);
stream = new PatternReplaceFilter(stream, Pattern.compile(Constants.REGEX_NUMBER), "", true);
stream = new PatternReplaceFilter(stream, Pattern.compile(Constants.CHARS_DISALLOWED), "", true);
stream = new PatternReplaceFilter(stream, Pattern.compile("\\s+"), " ", true);
ArrayList<String> result = new ArrayList<>();
while (stream.incrementToken()) {
result.add(stream.getAttribute(CharTermAttribute.class).toString());
......
......@@ -36,4 +36,10 @@ public abstract class Processor {
}
}
/**
 * Strips unwanted content from a piece of text. The removal steps run in a
 * fixed order: matches of {@code Constants.REGEX_EMAIL}, then
 * {@code Constants.REGEX_URL}, then {@code Constants.REGEX_NUMBER}, then
 * {@code Constants.CHARS_DISALLOWED}. Afterwards every run of whitespace is
 * collapsed into a single space and the result is trimmed.
 *
 * @param in
 *            the text to clean
 * @return the cleaned text
 */
public static String clean(String in) {
	// order matters: emails and urls must be stripped while their
	// punctuation is still present, before disallowed chars are dropped
	String withoutEmails = in.replaceAll(Constants.REGEX_EMAIL, "");
	String withoutUrls = withoutEmails.replaceAll(Constants.REGEX_URL, "");
	String withoutNumbers = withoutUrls.replaceAll(Constants.REGEX_NUMBER, "");
	String allowedOnly = withoutNumbers.replaceAll(Constants.CHARS_DISALLOWED, "");
	String singleSpaced = allowedOnly.replaceAll("\\s+", " ");
	return singleSpaced.trim();
}
}
......@@ -143,6 +143,21 @@ public class Constants {
*/
public static final String CHARS_DISALLOWED = "[^a-zA-Z0-9 ]";
/**
* Regular expression to find and remove email addresses from text.
*/
public static final String REGEX_EMAIL = "[^\\s]*@[^\\s]*";
/**
* Regular expression to find and remove URLs from text.
*/
public static final String REGEX_URL = "(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)(([\\w\\-]+\\.){1,}?([\\w\\-.~]+\\/?)*[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)";
/**
* Regular expression to find and remove numbers from text.
*/
public static final String REGEX_NUMBER = "\\b[0-9]+\\b";
/*
* OTHER
*/
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment