diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java index a9600df9df1754c0e9afc2e9f9f9634d97957e6b..314ae1535d608daa4c925d1c3af0165c7e23d222 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/SearchResource.java @@ -65,8 +65,8 @@ public class SearchResource { SearchResponse response = null; try { response = client.prepareSearch("articles") - .setQuery(QueryBuilders.multiMatchQuery(query, "topics^" + Constants.BOOST_TOPICS, - "title^" + Constants.BOOST_TITLES, "_all")) + .setQuery(QueryBuilders.multiMatchQuery(query, "topics^" + Constants.ES_BOOST_TOPICS, + "title^" + Constants.ES_BOOST_TITLES, "_all")) .setFrom(skip).setSize(limit).execute().actionGet(); } catch (Exception e) { e.printStackTrace(); diff --git a/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java b/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java index 619d28c86c904fe6d92f5d6a4fa9f2d90b016cd9..6a0d7bde5fdc71e57f17b988124d9865e4e4b0f7 100644 --- a/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java +++ b/vipra-backend/src/main/java/de/vipra/rest/resource/TopicResource.java @@ -34,7 +34,6 @@ import de.vipra.util.model.Topic; import de.vipra.util.model.TopicFull; import de.vipra.util.service.MongoService; import de.vipra.util.service.Service.QueryBuilder; -import de.vipra.ws.WebSocket; @Path("topics") public class TopicResource { @@ -138,7 +137,6 @@ public class TopicResource { try { dbTopics.replaceSingle(topic); - WebSocket.sendToState("topics.show", "{\"msg\":\"topic updated\"}"); return res.ok(topic); } catch (DatabaseException e) { e.printStackTrace(); diff --git a/vipra-backend/src/main/java/de/vipra/ws/State.java b/vipra-backend/src/main/java/de/vipra/ws/State.java new file mode 100644 index 0000000000000000000000000000000000000000..ec76e527854c2102693717b5b661f4f8990e7588 --- /dev/null +++ b/vipra-backend/src/main/java/de/vipra/ws/State.java @@ -0,0 +1,29 @@ +package de.vipra.ws; + +public enum State { + INDEX("index"), + ABOUT("about"), + NETWORK("network"), + ARTICLES("articles"), + TOPICS("topics"), + WORDS("words"), + ID(null); + + private final String state; + + State(String state) { + this.state = state; + } + + public String getState() { + return state; + } + + public static State find(String str) { + for (State state : State.values()) + if (state.state.equalsIgnoreCase(str)) + return state; + return ID; + } + +} diff --git a/vipra-backend/src/main/java/de/vipra/ws/StateSession.java b/vipra-backend/src/main/java/de/vipra/ws/StateSession.java new file mode 100644 index 0000000000000000000000000000000000000000..2552bdf6cf753bbb2acb150016658310053f5d19 --- /dev/null +++ b/vipra-backend/src/main/java/de/vipra/ws/StateSession.java @@ -0,0 +1,51 @@ +package de.vipra.ws; + +import javax.websocket.Session; + +public class StateSession { + + private final Session session; + private State state; + + public StateSession(Session session) { + if (session == null) + throw new NullPointerException("session cannot be null"); + this.session = session; + } + + public State getState() { + return state; + } + + public void setState(State state) { + this.state = state; + } + + public void setState(String state) { + this.state = State.find(state); + } + + public Session getSession() { + return session; + } + + @Override + public boolean equals(Object o) { + if (o == null) + return false; + + if (o instanceof StateSession) + o = ((StateSession) o).getSession(); + + if (o instanceof Session) + return o.equals(session); + + return false; + } + + @Override + public int hashCode() { + return session.hashCode(); + } + +} diff --git a/vipra-backend/src/main/java/de/vipra/ws/StateSessionMap.java b/vipra-backend/src/main/java/de/vipra/ws/StateSessionMap.java new file mode 100644 index 0000000000000000000000000000000000000000..1d8d5ac8c19d1c8bb74617646e6b293b7b790865 --- /dev/null +++ b/vipra-backend/src/main/java/de/vipra/ws/StateSessionMap.java @@ -0,0 +1,20 @@ +package de.vipra.ws; + +import java.util.HashMap; +import java.util.stream.Stream; + +import javax.websocket.Session; + +public class StateSessionMap extends HashMap<StateSession, StateSession> { + + private static final long serialVersionUID = 1L; + + public Stream<Session> stream(State state) { + return this.entrySet().stream().filter(s -> s.getKey().getState() == state).map(s -> s.getKey().getSession()); + } + + public void add(StateSession session) { + this.put(session, session); + } + +} diff --git a/vipra-backend/src/main/java/de/vipra/ws/WebSocket.java b/vipra-backend/src/main/java/de/vipra/ws/WebSocket.java index 5e7fa899b8441c656cef0393d7a3697b1e09e3d1..46a2b58bf01cbd422ce1cf5d4e296f8cafbe4829 100644 --- a/vipra-backend/src/main/java/de/vipra/ws/WebSocket.java +++ b/vipra-backend/src/main/java/de/vipra/ws/WebSocket.java @@ -1,9 +1,6 @@ package de.vipra.ws; import java.io.IOException; -import java.util.Collection; -import java.util.HashSet; -import java.util.Set; import javax.websocket.OnClose; import javax.websocket.OnError; @@ -16,10 +13,10 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; -import de.vipra.util.MultiMap; import de.vipra.ws.msg.InitMessage; import de.vipra.ws.msg.WebSocketMessage; @@ -29,13 +26,12 @@ public class WebSocket { public static final Logger log = LogManager.getLogger(WebSocket.class); public static final ObjectMapper mapper = new ObjectMapper(); - public static final Set<Session> sessions = new HashSet<>(); - public static final MultiMap<String, Session> states = new MultiMap<>(); + public static final StateSessionMap sessions = new StateSessionMap(); @OnOpen public void open(Session session) { log.debug("connect"); - sessions.add(session); + sessions.add(new StateSession(session)); } @OnClose @@ -45,7 +41,9 @@ public class WebSocket { } @OnError - public void onError(Throwable error) {} + public void onError(Throwable error) { + log.error(error); + } @OnMessage public void handleMessage(String input, Session session) @@ -65,20 +63,18 @@ public class WebSocket { public void handleInitMessage(InitMessage message, Session session) { log.debug("init message received. state = " + message.getState()); - states.put(message.getState(), session); + sessions.get(session).setState(message.getState()); } - public static void sendToState(String state, String message) { - Collection<Session> sessions = states.get(state); - if (sessions != null) { - for (Session session : sessions) { - try { - session.getBasicRemote().sendText(message); - } catch (IOException e) { - log.error(e); - } - } + public static void sendToState(State state, Object message) { + String json; + try { + json = mapper.writeValueAsString(message); + } catch (JsonProcessingException e) { + log.error(e); + return; } + sessions.stream(state).forEach(s -> s.getAsyncRemote().sendText(json)); } } diff --git a/vipra-backend/src/main/resources/config.properties b/vipra-backend/src/main/resources/config.properties index 0778073f0a04af174ec97a6263bcf433f71a90d6..07030840d45dd8fed8c2c2d5bff5fe0a54939a4f 100644 --- a/vipra-backend/src/main/resources/config.properties +++ b/vipra-backend/src/main/resources/config.properties @@ -1,6 +1,3 @@ db.host=localhost db.port=27017 -db.name=test -tm.processor=corenlp -tm.analyzer=jgibb -tm.saveallwords=false \ No newline at end of file +db.name=test \ No newline at end of file diff --git a/vipra-cmd/runcfg/CMD - Import 2.launch b/vipra-cmd/runcfg/CMD - Import 2.launch new file mode 100644 index 0000000000000000000000000000000000000000..89c246dd7ba64c418b3875e63aada477ff6caa5b --- /dev/null +++ b/vipra-cmd/runcfg/CMD - Import 2.launch @@ -0,0 +1,18 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<launchConfiguration type="org.eclipse.jdt.launching.localJavaApplication"> +<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS"> +<listEntry value="/vipra-cmd/src/main/java/de/vipra/cmd/Main.java"/> +</listAttribute> +<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES"> +<listEntry value="1"/> +</listAttribute> +<listAttribute key="org.eclipse.debug.ui.favoriteGroups"> +<listEntry value="org.eclipse.debug.ui.launchGroup.run"/> +</listAttribute> +<stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/> +<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="de.vipra.cmd.Main"/> +<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="-i /home/eike/repos/master/ma-impl/vm/data/test-2.json"/> +<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="vipra-cmd"/> +<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/> +<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Dlog4j.configurationFile=log4j2dev.xml"/> +</launchConfiguration> diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java index 8d16922892460d035590df23777c543d7fc1ddb0..4e3cccb39d460af8b3548104ccbdec852b1d5144 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java @@ -10,7 +10,7 @@ import java.util.Iterator; import java.util.List; import de.vipra.cmd.ex.FilebaseException; -import de.vipra.cmd.file.DTMDateIndex.DTMDateIndexEntry; +import de.vipra.cmd.file.DTMSequenceIndex.DTMDateIndexEntry; import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.FileUtils; @@ -19,10 +19,9 @@ import de.vipra.util.model.ArticleFull; public class DTMFilebase extends Filebase { - public static final String FILE_MODEL = "dtm-mult.dat"; - public static final String FILE_VOCAB = "vocab"; + public static final String FILE_NAME = "dtm-mult.dat"; - private final DTMDateIndex dateindex; + private final DTMSequenceIndex seqindex; private final DTMVocabulary vocab; private final File modelFile; @@ -37,7 +36,7 @@ public class DTMFilebase extends Filebase { File modelDir = getModelDir(); try { - this.dateindex = new DTMDateIndex(modelDir, config.windowResolution, false); + this.seqindex = new DTMSequenceIndex(modelDir, config.windowResolution, false); } catch (IOException | ParseException e) { throw new FilebaseException("could not read date index file", e); } @@ -47,24 +46,22 @@ public class DTMFilebase extends Filebase { } catch (IOException e) { throw new FilebaseException("could not read vocabulary file", e); } - this.modelFile = getModelFile(FILE_MODEL); + this.modelFile = getModelFile(FILE_NAME); } @Override public synchronized void write(List<ArticleFull> articles) throws IOException { if (!articles.isEmpty()) { - // index new articles - for (ArticleFull article : articles) { - dateindex.add(article.getDate(), vocab.indexText(article.getProcessedText())); - } + for (ArticleFull article : articles) + seqindex.add(article.getDate(), vocab.transform(article.getProcessedText())); // write temp file - File modelFileTmp = getModelFile(FILE_MODEL + ".tmp"); + File modelFileTmp = getModelFile(FILE_NAME + ".tmp"); Iterator<String> lines = null; if (modelFile.exists()) lines = FileUtils.iterateFileLines(modelFile); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(modelFileTmp))); - for (DTMDateIndexEntry e : dateindex) { + for (DTMDateIndexEntry e : seqindex) { if (e.exists) { if (lines == null) { writer.close(); @@ -95,7 +92,7 @@ public class DTMFilebase extends Filebase { // write vocabulary and windows vocab.close(); - dateindex.close(); + seqindex.close(); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java similarity index 89% rename from vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java rename to vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java index c9a47040e764ab823ad20925a818e00735ecb259..7dc6b2228accb5f33657d3cf91736c277cfb317a 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMSequenceIndex.java @@ -20,9 +20,9 @@ import de.vipra.util.Constants; import de.vipra.util.Constants.WindowResolution; import de.vipra.util.FileUtils; -public class DTMDateIndex implements Closeable, Iterable<DTMDateIndex.DTMDateIndexEntry> { +public class DTMSequenceIndex implements Closeable, Iterable<DTMSequenceIndex.DTMDateIndexEntry> { - public static final String FILE_WINDOWS = "dtm-seq.dat"; + public static final String FILE_NAME = "dtm-seq.dat"; public static class DTMDateIndexEntry implements Comparable<DTMDateIndexEntry> { public Date date; @@ -50,7 +50,7 @@ public class DTMDateIndex implements Closeable, Iterable<DTMDateIndex.DTMDateInd private static List<DTMDateIndexEntry> entries; private static SimpleDateFormat df = new SimpleDateFormat(Constants.DATETIME_FORMAT); - public DTMDateIndex(File modelDir, WindowResolution wr, boolean reread) throws IOException, ParseException { + public DTMSequenceIndex(File modelDir, WindowResolution wr, boolean reread) throws IOException, ParseException { this.file = new File(modelDir, "dates"); windowResolution = wr; if (file.exists()) { @@ -99,7 +99,7 @@ public class DTMDateIndex implements Closeable, Iterable<DTMDateIndex.DTMDateInd writer.close(); // write window index - File seqFile = new File(file.getParentFile(), FILE_WINDOWS); + File seqFile = new File(file.getParentFile(), FILE_NAME); writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(seqFile, false))); writer.write(Integer.toString(windows.size())); writer.write(Constants.LINE_SEP); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java index 72399624df6ff722b724d5fd9c209375c82cf44f..fc056a3a3d9a40e8541e62e7556a51d561c83380 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMVocabulary.java @@ -4,7 +4,6 @@ import java.io.Closeable; import java.io.File; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -16,13 +15,15 @@ import de.vipra.util.FileUtils; public class DTMVocabulary implements Closeable, Iterable<String> { + public static final String FILE_NAME = "vocab"; + private File file; private static List<String> vocables; private static Map<String, Integer> vocablesMap; private static int nextIndex = 1; public DTMVocabulary(File modelDir, boolean reread) throws IOException { - this.file = new File(modelDir, "vocab"); + this.file = new File(modelDir, FILE_NAME); if (file.exists()) { if (vocables == null || reread) vocables = new ArrayList<>(FileUtils.readFile(file)); @@ -42,20 +43,7 @@ public class DTMVocabulary implements Closeable, Iterable<String> { FileUtils.writeLines(file, Constants.FILEBASE_ENCODING.name(), vocables, null, false); } - public void addVocabulary(String text) { - addVocabulary(text.split("\\s+")); - } - - public void addVocabulary(String[] text) { - for (String word : text) { - if (!vocablesMap.containsKey(word)) { - vocablesMap.put(word, nextIndex++); - vocables.add(word); - } - } - } - - public int index(String word) { + private int index(String word) { Integer index = vocablesMap.get(word); if (index == null) { index = nextIndex++; @@ -69,11 +57,10 @@ public class DTMVocabulary implements Closeable, Iterable<String> { return vocablesMap.size(); } - public String indexText(String in) { + public String transform(String[] words) { // count unique words - List<String> wordList = Arrays.asList(in.split("\\s+")); - Map<String, Integer> wordMap = new HashMap<>(wordList.size()); - for (String word : wordList) { + Map<String, Integer> wordMap = new HashMap<>(words.length); + for (String word : words) { Integer count = wordMap.get(word); if (count == null) wordMap.put(word, 1); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java index 5f205147c8756c5d1fb04ca9d42e0a2a60a720f5..f15c290819abcfd719ca640331d49d50bea563a7 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/JGibbFilebase.java @@ -9,6 +9,7 @@ import java.util.List; import de.vipra.cmd.ex.FilebaseException; import de.vipra.util.model.ArticleFull; +import edu.stanford.nlp.util.StringUtils; public class JGibbFilebase extends Filebase { @@ -24,7 +25,7 @@ public class JGibbFilebase extends Filebase { if (!articles.isEmpty()) { BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(modelFile))); for (ArticleFull article : articles) - writer.write(article.getProcessedText() + "\n"); + writer.write(StringUtils.join(article.getProcessedText()) + "\n"); writer.close(); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java index 5c49c9286557e7c1bf2ee1bcc69e927d70548635..55163b1abd7dd2e120d7e861a0fe9269595ee00e 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ImportCommand.java @@ -7,7 +7,6 @@ import java.io.FilenameFilter; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; @@ -29,12 +28,10 @@ import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.StringUtils; import de.vipra.util.Timer; -import de.vipra.util.WordMap; import de.vipra.util.ex.ConfigException; import de.vipra.util.ex.DatabaseException; import de.vipra.util.model.ArticleFull; import de.vipra.util.model.ArticleStats; -import de.vipra.util.model.Word; import de.vipra.util.service.MongoService; public class ImportCommand implements Command { @@ -79,24 +76,29 @@ public class ImportCommand implements Command { @Override public void run() { - log.info("importing \"" + object.get("title") + "\""); ArticleFull article = articleFromJSON(object); try { // preprocess text ProcessedText processedText = processor.process(article.getText()); + article.setProcessedText(processedText.getWords()); - // generate text stats - ArticleStats articleStats = ArticleStats.generateFromText(processedText.getText()); + // generate article stats + ArticleStats stats = new ArticleStats(); + stats.setWordCount(processedText.getWordCount()); + stats.setProcessedWordCount(processedText.getReducedWordCount()); + stats.setReductionRatio(processedText.getReductionRatio()); + article.setStats(stats); // add article to mongodb - article.setProcessedText(processedText.getText()); - article.setStats(articleStats); buffer.add(article); // add article to filebase if long enough - if (processedText.getWords().length >= Constants.DOCUMENT_MIN_LENGTH) + if (processedText.getReducedWordCount() >= Constants.DOCUMENT_MIN_LENGTH) filebase.add(article); + + log.info("imported \"" + object.get("title") + "\"\r\n └ text reduction: " + + (processedText.getReductionRatio() * 100) + "%"); } catch (ProcessorException e) { log.error("could not preprocess text of article '" + article.getTitle() + "'"); } catch (DatabaseException e) { @@ -128,10 +130,8 @@ public class ImportCommand implements Command { private JSONParser parser = new JSONParser(); private Config config; private MongoService<ArticleFull, ObjectId> dbArticles; - private MongoService<Word, String> dbWords; private Filebase filebase; private Processor processor; - private WordMap wordMap; private ArticleBuffer buffer; private ExecutorService executor; @@ -229,10 +229,8 @@ public class ImportCommand implements Command { int threadCount = Runtime.getRuntime().availableProcessors() * 10; config = Config.getConfig(); dbArticles = MongoService.getDatabaseService(config, ArticleFull.class); - dbWords = MongoService.getDatabaseService(config, Word.class); filebase = Filebase.getFilebase(config); processor = Processor.getProcessor(config); - wordMap = new WordMap(dbWords); buffer = new ArticleBuffer(dbArticles); executor = Executors.newFixedThreadPool(threadCount); @@ -260,20 +258,10 @@ public class ImportCommand implements Command { filebase.close(); timer.lap("filebase write"); - /* - * save words - */ - log.info("saving words"); - Set<Word> importedWords = wordMap.getNewWords(); - wordMap.create(); - timer.lap("saving words"); - /* * run information */ log.info("imported " + imported + " new " + StringUtils.quantity(imported, "article")); - int newWordsCount = importedWords.size(); - log.info("imported " + newWordsCount + " new " + StringUtils.quantity(newWordsCount, "word")); log.info(timer.toString()); log.info("done in " + StringUtils.timeString(timer.total())); } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/CoreNLPProcessor.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/CoreNLPProcessor.java index 620c9189c615474b86806761253ce7503e52f203..3e4ab99172e662117285aa2a95418a8bff6aa39c 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/CoreNLPProcessor.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/CoreNLPProcessor.java @@ -3,7 +3,12 @@ package de.vipra.cmd.text; import java.util.List; import java.util.Properties; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + import de.vipra.cmd.ex.ProcessorException; +import de.vipra.util.Constants; +import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; import edu.stanford.nlp.ling.CoreLabel; @@ -14,6 +19,8 @@ import edu.stanford.nlp.util.StringUtils; public class CoreNLPProcessor extends Processor { + public static final Logger log = LogManager.getLogger(CoreNLPProcessor.class); + private StanfordCoreNLP nlp; public CoreNLPProcessor(List<String> stopWordsList) { @@ -21,7 +28,14 @@ public class CoreNLPProcessor extends Processor { Properties props = new Properties(); props.setProperty("customAnnotatorClass.stopwords", StopwordsAnnotator.class.getCanonicalName()); - props.setProperty("annotators", "tokenize, ssplit, stopwords, pos, lemma"); + props.setProperty("customAnnotatorClass.frequency", FrequencyAnnotator.class.getCanonicalName()); + // tokenize: transform words to tokens + // ssplit: split by and group into sentences + // stopwords: mark stopwords + // frequency: count word frequency + // pos: mark word position + // lemma: lemmatize words + props.setProperty("annotators", "tokenize, ssplit, stopwords, pos, lemma, frequency"); props.setProperty("stopwords", StringUtils.join(stopWordsList)); nlp = new StanfordCoreNLP(props); @@ -32,17 +46,30 @@ public class CoreNLPProcessor extends Processor { Annotation doc = new Annotation(input.toLowerCase()); nlp.annotate(doc); StringBuilder sb = new StringBuilder(); - List<CoreMap> sentences = doc.get(SentencesAnnotation.class); - for (CoreMap sentence : sentences) { + long wordCount = 0; + // loop sentences + for (CoreMap sentence : doc.get(SentencesAnnotation.class)) { List<CoreLabel> words = sentence.get(TokensAnnotation.class); + // count words + wordCount += words.size(); + // loop words for (CoreLabel word : words) { + // filter out stopwords Boolean b = word.get(StopwordsAnnotator.class); - if (b == null || !b) - sb.append(word.word()).append(" "); + if (b == null || !b) { + // filter out infrequent words + Long count = word.get(FrequencyAnnotator.class); + if (count != null && count >= Constants.DOCUMENT_MIN_WORD_FREQ) { + String lemma = word.get(LemmaAnnotation.class); + // collect unique words + sb.append(lemma).append(" "); + } + } } } + String text = clean(sb.toString()); - return new ProcessedText(text); + return new ProcessedText(text, wordCount); } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java new file mode 100644 index 0000000000000000000000000000000000000000..8f339c497633b848658ed3801cc7d5191cf4f898 --- /dev/null +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/FrequencyAnnotator.java @@ -0,0 +1,45 @@ +package de.vipra.cmd.text; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import edu.stanford.nlp.ling.CoreAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.Annotator; + +public class FrequencyAnnotator implements Annotator, CoreAnnotation<Long> { + + public static final String NAME = "frequency"; + + @Override + public void annotate(Annotation annotation) { + List<CoreLabel> tokens = annotation.get(TokensAnnotation.class); + Map<String, Long> words = tokens.stream() + .collect(Collectors.groupingBy(p -> p.get(LemmaAnnotation.class), Collectors.counting())); + for (CoreLabel token : tokens) { + token.set(FrequencyAnnotator.class, words.get(token.get(LemmaAnnotation.class))); + } + } + + @Override + public Set<Requirement> requirementsSatisfied() { + return Collections.singleton(new Requirement(NAME)); + } + + @Override + public Set<Requirement> requires() { + return TOKENIZE_SSPLIT_POS_LEMMA; + } + + @Override + public Class<Long> getType() { + return Long.class; + } + +} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java index 84d3b6b01b5c1c88b6f0d56bf3cecd7556de22bb..dcc842518205229beb8fca4f9bf5e3d94a437e3b 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/ProcessedText.java @@ -1,21 +1,33 @@ package de.vipra.cmd.text; -import de.vipra.util.StringUtils; - -public final class ProcessedText { +public class ProcessedText { private final String[] words; + private final long originalWordCount; + private final long reducedWordCount; + private final double reductionRatio; - public ProcessedText(String text) { + public ProcessedText(String text, long wordCount) { this.words = text.split("\\s+"); + this.originalWordCount = wordCount; + this.reducedWordCount = this.words.length; + this.reductionRatio = 1 - ((double) reducedWordCount / wordCount); } public String[] getWords() { return words; } - public String getText() { - return StringUtils.join(words); + public long getWordCount() { + return originalWordCount; + } + + public long getReducedWordCount() { + return reducedWordCount; + } + + public double getReductionRatio() { + return reductionRatio; } } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/text/StopwordsAnnotator.java b/vipra-cmd/src/main/java/de/vipra/cmd/text/StopwordsAnnotator.java index d2701feee64c504c0f2d98a441e0582dd36f6de5..7d5ab90a920e88a8051b84d464d102184014c074 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/text/StopwordsAnnotator.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/text/StopwordsAnnotator.java @@ -8,8 +8,8 @@ import java.util.Properties; import java.util.Set; import edu.stanford.nlp.ling.CoreAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.Annotator; @@ -21,16 +21,13 @@ public class StopwordsAnnotator implements Annotator, CoreAnnotation<Boolean> { public StopwordsAnnotator(String input, Properties props) { stopWords = new HashSet<String>(Arrays.asList(props.getProperty(NAME).split(" "))); + stopWords.addAll(Arrays.asList("-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-")); } @Override public void annotate(Annotation annotation) { List<CoreLabel> tokens = annotation.get(TokensAnnotation.class); - for (CoreLabel token : tokens) { - if (stopWords.contains(token.word())) { - token.set(StopwordsAnnotator.class, true); - } - } + tokens.stream().filter(t -> stopWords.contains(t.word())).forEach(t -> t.set(StopwordsAnnotator.class, true)); } @Override diff --git a/vipra-ui/app/html/articles/show.html b/vipra-ui/app/html/articles/show.html index 800544880be9541add49170827bd7b7da20e8b0a..00b486cff90d30d06eabb82c3c96b62a393c2b07 100644 --- a/vipra-ui/app/html/articles/show.html +++ b/vipra-ui/app/html/articles/show.html @@ -5,7 +5,9 @@ <table class="item-actions"> <tr> <td> - <a class="btn btn-default" ui-sref="network({type:'articles', id:article.id})">Network graph</a> + <a class="btn btn-default" ui-sref="network({type:'articles', id:article.id})"> + Network graph + </a> </td> </tr> </table> diff --git a/vipra-ui/app/html/index.html b/vipra-ui/app/html/index.html index d6be6e5e35a27c94a48f0215b73a78092d63de2e..cbbe697bc86ad5b52503860aa688b10e81a4aa4b 100644 --- a/vipra-ui/app/html/index.html +++ b/vipra-ui/app/html/index.html @@ -1,66 +1,62 @@ <div ng-cloak> - <div class="container"> - - <div class="row" ng-hide="search"> - <div class="col-md-12"> - <div class="heading"></div> - </div> + <div class="row" ng-hide="search"> + <div class="col-md-12"> + <div class="heading"></div> </div> + </div> - <div class="row" ng-hide="search"> - <div class="col-md-6 text-center"> - <h4>Latest articles</h4> - <ul class="list-unstyled"> - <li class="ellipsize" ng-repeat="article in latestArticles"> - <a ui-sref="articles.show({id:article.id})" ng-bind="article.title"></a> - </li> - </ul> - </div> - <div class="col-md-3 text-center"> - <h4>Latest topics</h4> - <ul class="list-unstyled"> - <li class="ellipsize" ng-repeat="topic in latestTopics"> - <a ui-sref="topics.show({id:topic.id})" ng-bind="topic.name"></a> - </li> - </ul> - </div> - <div class="col-md-3 text-center"> - <h4>Latest words</h4> - <ul class="list-unstyled"> - <li class="ellipsize" ng-repeat="word in latestWords"> - <a ui-sref="words.show({id:word.id})" ng-bind="word.id"></a> - </li> - </ul> - </div> + <div class="row" ng-hide="search"> + <div class="col-md-6 text-center"> + <h4>Latest articles</h4> + <ul class="list-unstyled"> + <li class="ellipsize" ng-repeat="article in latestArticles"> + <a ui-sref="articles.show({id:article.id})" ng-bind="article.title"></a> + </li> + </ul> </div> - - <div class="row row-spaced"> - <div class="col-md-12"> - <input type="text" class="form-control input-lg" placeholder="Search..." ng-model="search" ng-model-options="{debounce:500}"> - </div> + <div class="col-md-3 text-center"> + <h4>Latest topics</h4> + <ul class="list-unstyled"> + <li class="ellipsize" ng-repeat="topic in latestTopics"> + <a ui-sref="topics.show({id:topic.id})" ng-bind="topic.name"></a> + </li> + </ul> </div> + <div class="col-md-3 text-center"> + <h4>Latest words</h4> + <ul class="list-unstyled"> + <li class="ellipsize" ng-repeat="word in latestWords"> + <a ui-sref="words.show({id:word.id})" ng-bind="word.id"></a> + </li> + </ul> + </div> + </div> - <div class="row row-spaced"> - <div class="text-center" ng-show="searching"> - Searching... - </div> - <div class="col-md-12" ng-show="!searching && search && (!searchResults || searchResults.length == 0)"> - <h4>No Results</h4> - </div> - <div class="col-md-12" ng-show="searchResults.length > 0"> - <h4>Results</h4> - <ul class="list-unstyled search-results"> - <li class="search-result" ng-repeat="article in searchResults"> - <a ui-sref="articles.show({id:article.id})" ng-bind="article.title"></a> - <p> - <span class="text" ng-bind="article.text"></span> - <br> - <small class="text-muted" ng-bind-template="{{article.meta.score | toPercent}}% – {{article.date | formatDate}}"></small> - </p> - </li> - </ul> - </div> + <div class="row row-spaced"> + <div class="col-md-12"> + <input type="text" class="form-control input-lg" placeholder="Search..." ng-model="search" ng-model-options="{debounce:500}"> </div> + </div> + <div class="row row-spaced"> + <div class="text-center" ng-show="searching"> + Searching... + </div> + <div class="col-md-12" ng-show="!searching && search && (!searchResults || searchResults.length == 0)"> + <h4>No Results</h4> + </div> + <div class="col-md-12" ng-show="searchResults.length > 0"> + <h4>Results</h4> + <ul class="list-unstyled search-results"> + <li class="search-result" ng-repeat="article in searchResults"> + <a ui-sref="{{::articles.show}}({id:article.id})" ng-bind="article.title"></a> + <p> + <span class="text" ng-bind="article.text"></span> + <br> + <small class="text-muted" ng-bind-template="{{article.meta.score | toPercent}}% – {{article.date | formatDate}}"></small> + </p> + </li> + </ul> + </div> </div> </div> \ No newline at end of file diff --git a/vipra-ui/app/html/network.html b/vipra-ui/app/html/network.html index b1a8667a91fe7782774dd7e583326973c0f3e070..449bed477f4ed888002790bdd41f7233cbddd95e 100644 --- a/vipra-ui/app/html/network.html +++ b/vipra-ui/app/html/network.html @@ -2,13 +2,13 @@ <div class="fullsize navpadding"> <div class="graph-legend overlay"> <label style="color:{{colors.articles}}"> - <input type="checkbox" ng-model="shown.articles" store-value="showArticles"> Articles + <input type="checkbox" ng-model="shown.articles" store-value="showArticles" store-default="type == 'articles'" ng-disabled="type == 'articles'"> Articles </label> <label style="color:{{colors.topics}}"> - <input type="checkbox" ng-model="shown.topics" store-value="showTopics" store-default="true"> Topics + <input type="checkbox" ng-model="shown.topics" store-value="showTopics" store-default="true" ng-disabled="type == 'topics'"> Topics </label> <label style="color:{{colors.words}}"> - <input type="checkbox" ng-model="shown.words" store-value="showWords" store-default="true"> Words + <input type="checkbox" ng-model="shown.words" store-value="showWords" store-default="true" ng-disabled="type == 'words'"> Words </label> </div> <div class="fullsize navpadding" id="visgraph"></div> diff --git a/vipra-ui/app/index.html b/vipra-ui/app/index.html index 30b113bd77faf1fbde92cd76043925d347bcc7b3..b4995c130191a52e091396f76f60039060c5372f 100644 --- a/vipra-ui/app/index.html +++ b/vipra-ui/app/index.html @@ -51,9 +51,15 @@ <!-- Collect the nav links, forms, and other content for toggling --> <div class="collapse navbar-collapse" id="vipra-navbar-collapse-1"> <ul class="nav navbar-nav"> - <li ng-class="{active:$state.includes('articles')}"><a ui-sref="articles">Articles</a></li> - <li ng-class="{active:$state.includes('topics')}"><a ui-sref="topics">Topics</a></li> - <li ng-class="{active:$state.includes('words')}"><a ui-sref="words">Words</a></li> + <li ng-class="{active:$state.includes('articles')}"> + <a ui-sref="articles">Articles</a> + </li> + <li ng-class="{active:$state.includes('topics')}"> + <a ui-sref="topics">Topics</a> + </li> + <li ng-class="{active:$state.includes('words')}"> + <a ui-sref="words">Words</a> + </li> </ul> <ul class="nav navbar-nav navbar-right"> diff --git a/vipra-ui/app/js/config.js b/vipra-ui/app/js/config.js index e57dd2d136d48076a8f2371e62539307ed3b4995..643b5e335d4b33eaea31940ee2c5ba76725c47b0 100644 --- a/vipra-ui/app/js/config.js +++ b/vipra-ui/app/js/config.js @@ -3,8 +3,8 @@ window.Vipra = window.Vipra || {}; Vipra.config = { - restUrl: '//' + location.hostname + ':8080/vipra/rest', - websocketUrl: 'ws://' + location.hostname + ':8080/vipra/ws' + restUrl: '//' + location.hostname + ':8000/vipra/rest', + websocketUrl: 'ws://' + location.hostname + ':8000/vipra/ws' }; })(); \ No newline at end of file diff --git a/vipra-ui/app/js/controllers.js b/vipra-ui/app/js/controllers.js index 326c167905ce52582fad717935b19d4a27495be6..684e43366c7d53d9512efb889c7b771cd837fb16 100644 --- a/vipra-ui/app/js/controllers.js +++ b/vipra-ui/app/js/controllers.js @@ -85,6 +85,7 @@ nodes: $scope.nodes, edges: $scope.edges }; + $scope.type = $stateParams.type; $scope.options = { nodes: { font: { size: 14 }, diff --git a/vipra-ui/app/less/app.less b/vipra-ui/app/less/app.less index cd5b31d1c38a9359bb7f4f98ab451507f07db9e1..4ed64e66f96434abaee9f07d9fba3682e1cdee03 100644 --- a/vipra-ui/app/less/app.less +++ b/vipra-ui/app/less/app.less @@ -1,3 +1,5 @@ +@basecolor: #007aa3; + html { position: relative; min-height: 100%; @@ -57,6 +59,7 @@ ul.dashed { &> a, &> a:hover, &> a:focus { + border-color: @basecolor !important; border-bottom: 3px solid; padding-bottom: 12px; } diff --git a/vipra-util/src/main/java/de/vipra/util/Constants.java b/vipra-util/src/main/java/de/vipra/util/Constants.java index 9e9fdf9ab89de21a326e271a39e6daf083301bd7..9f431ebb3dc784630071665147f8a9d73e37ec6d 100644 --- a/vipra-util/src/main/java/de/vipra/util/Constants.java +++ b/vipra-util/src/main/java/de/vipra/util/Constants.java @@ -47,12 +47,12 @@ public class Constants { /** * Topic boost parameter. Boosts topic importance in queries. */ - public static final int BOOST_TOPICS = 4; + public static final int ES_BOOST_TOPICS = 4; /** * Title boost parameter. Boosts title importance in queries. */ - public static final int BOOST_TITLES = 2; + public static final int ES_BOOST_TITLES = 2; /* * TOPIC MODELING @@ -74,7 +74,7 @@ public class Constants { * Number of words in a discovered topic, if the selected topic modeling * library supports this parameter. */ - public static final int K_TOPIC_WORDS = 80; + public static final int K_TOPIC_WORDS = 50; /** * Precision of likeliness numbers. Likeliness is calculated for words to @@ -89,15 +89,16 @@ public class Constants { public static final double TOPIC_THRESHOLD = 0.01; /** - * Minumum number of words per document. + * Minimum word frequency for words to be used for topic modeling. All words + * below this frequency in a document are filtered out before generating the + * topic model. */ - public static final int DOCUMENT_MIN_LENGTH = 10; + public static final int DOCUMENT_MIN_WORD_FREQ = 20; /** - * Set this to true to save all found words in the database. If false, will - * save only topic related words found by topic modeling. + * Minumum number of words per document. */ - public static final boolean SAVE_ALL_WORDS = false; + public static final int DOCUMENT_MIN_LENGTH = 10; /** * Stopwords list. Extensive list of stopwords used to clean imported diff --git a/vipra-util/src/main/java/de/vipra/util/MultiMap.java b/vipra-util/src/main/java/de/vipra/util/MultiMap.java deleted file mode 100644 index 6f0fe47f3f1784eec7c695bdb0ebf47b38f83999..0000000000000000000000000000000000000000 --- a/vipra-util/src/main/java/de/vipra/util/MultiMap.java +++ /dev/null @@ -1,104 +0,0 @@ -package de.vipra.util; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; - -public class MultiMap<T, U> implements Map<T, Collection<U>> { - - private final Map<T, Collection<U>> map; - private final boolean unique; - - public MultiMap() { - this(false); - } - - public MultiMap(boolean unique) { - this.map = new HashMap<>(); - this.unique = unique; - } - - @Override - public int size() { - return map.size(); - } - - @Override - public boolean isEmpty() { - return map.isEmpty(); - } - - @Override - public boolean containsKey(Object key) { - return map.containsKey(key); - } - - @Override - public boolean containsValue(Object value) { - return map.containsValue(value); - } - - @Override - public Collection<U> get(Object key) { - return map.get(key); - } - - public Iterator<U> each(Object key) { - Collection<U> c = map.get(key); - if (c == null) - return null; - return c.iterator(); - } - - @Override - public Collection<U> put(T key, Collection<U> value) { - return map.put(key, value); - } - - public void put(T key, U value) { - Collection<U> c = map.get(key); - if (c == null) { - if (unique) - c = new HashSet<>(); - else - c = new ArrayList<>(); - } - c.add(value); - map.put(key, c); - } - - @Override - public Collection<U> remove(Object key) { - return map.remove(key); - } - - @Override - public void putAll(Map<? extends T, ? extends Collection<U>> m) { - map.putAll(m); - } - - @Override - public void clear() { - map.clear(); - } - - @Override - public Set<T> keySet() { - return map.keySet(); - } - - @Override - public Collection<Collection<U>> values() { - return map.values(); - } - - @Override - public Set<java.util.Map.Entry<T, Collection<U>>> entrySet() { - return map.entrySet(); - } - -} diff --git a/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java b/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java index 8a17356a72544aaf547474f3b41250fe0d104b70..b3c6b025faeb103cd2937ca8d73869b53d003d95 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java +++ b/vipra-util/src/main/java/de/vipra/util/model/ArticleFull.java @@ -7,9 +7,7 @@ import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; -import java.util.HashMap; import java.util.List; -import java.util.Map; import org.bson.types.ObjectId; import org.mongodb.morphia.annotations.Embedded; @@ -46,9 +44,8 @@ public class ArticleFull extends FileModel<ObjectId> implements Serializable { @QueryIgnore(multi = true) private String text; - @ElasticIndex("text") @QueryIgnore(all = true) - private String processedText; + private String[] processedText; private String url; @@ -67,9 +64,6 @@ public class ArticleFull extends FileModel<ObjectId> implements Serializable { private Date modified; - @Transient - private Map<String, String> links; - @Transient private NestedMap meta; @@ -104,15 +98,20 @@ public class ArticleFull extends FileModel<ObjectId> implements Serializable { } @ElasticIndex("excerpt") - public String serializeText() { + public String serializeExcerpt() { return StringUtils.ellipsize(text, Constants.EXCERPT_LENGTH); } - public String getProcessedText() { + @ElasticIndex("text") + public String serializeText() { + return StringUtils.join(processedText); + } + + public String[] getProcessedText() { return processedText; } - public void setProcessedText(String processedText) { + public void setProcessedText(String[] processedText) { this.processedText = processedText; } @@ -185,20 +184,6 @@ public class ArticleFull extends FileModel<ObjectId> implements Serializable { this.modified = modified; } - public Map<String, String> getLinks() { - return links; - } - - public void setLinks(Map<String, String> links) { - this.links = links; - } - - public void addLink(String key, String link) { - if (links == null) - links = new HashMap<>(); - links.put(key, link); - } - public NestedMap getMeta() { return meta; } diff --git a/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java b/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java index e2066212d7391db6ed72419e6231c8bdc5906b02..5e1dfedc955a7492be6a13e770a44ac72c179cbe 100644 --- a/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java +++ b/vipra-util/src/main/java/de/vipra/util/model/ArticleStats.java @@ -10,6 +10,9 @@ public class ArticleStats implements Serializable { private static final long serialVersionUID = -4712841724990200627L; private Long wordCount; + private Long uniqueWordCount; + private Long processedWordCount; + private Double reductionRatio; public Long getWordCount() { return wordCount; @@ -19,16 +22,34 @@ public class ArticleStats implements Serializable { this.wordCount = wordCount; } - public static ArticleStats generateFromText(final String text) { - ArticleStats stats = new ArticleStats(); - String[] words = text.split("\\s+"); - stats.setWordCount((long) words.length); - return stats; + public Long getUniqueWordCount() { + return uniqueWordCount; + } + + public void setUniqueWordCount(Long uniqueWordCount) { + this.uniqueWordCount = uniqueWordCount; + } + + public Long getProcessedWordCount() { + return processedWordCount; + } + + public void setProcessedWordCount(Long processedWordCount) { + this.processedWordCount = processedWordCount; + } + + public Double getReductionRatio() { + return reductionRatio; + } + + public void setReductionRatio(Double reductionRatio) { + this.reductionRatio = reductionRatio; } @Override public String toString() { - return ArticleStats.class.getSimpleName() + "[wordCount:" + wordCount + "]"; + return ArticleStats.class.getSimpleName() + "[wordCount:" + wordCount + ", processedWordCount:" + + processedWordCount + ", reductionRatio:" + reductionRatio + "]"; } } \ No newline at end of file