Skip to content
Snippets Groups Projects
Select Git revision
  • 773fb518797b4ca7aba3b2a1fa183521143e7f2a
  • master default protected
  • dev_moritz
  • 0.2.0
  • 0.1.0
5 results

Dockerfile

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    Constants.java 17.21 KiB
    package de.vipra.util;
    
    import java.nio.charset.Charset;
    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;
    import java.util.Calendar;
    import java.util.Date;
    import java.util.GregorianCalendar;
    import java.util.List;
    import java.util.TimeZone;
    
    /**
     * Central collection of compile-time defaults: filebase, database and
     * Elasticsearch connection settings, topic modeling parameters, text cleaning
     * patterns and the enums describing the selectable processors, analyzers and
     * time window resolutions.
     */
    public class Constants {
    
    	/*
    	 * FILEBASE
    	 */
    
    	/** Name of the filebase directory. */
    	public static final String FILEBASE_DIR = "vipra";
    
    	/** Character encoding used for the filebase. */
    	public static final Charset FILEBASE_ENCODING = StandardCharsets.UTF_8;
    
    	/** Platform line separator, captured once at class initialization. */
    	public static final String LINE_SEP = System.lineSeparator();
    
    	/**
    	 * Buffer used while importing files into the database and filebase, in
    	 * number of articles.
    	 */
    	public static final int IMPORT_BUFFER_MAX = 1000;
    
    	/*
    	 * FILES
    	 */
    
    	/** Name of the configuration file. */
    	public static final String CONFIG_FILE = "config.properties";
    
    	/*
    	 * DATABASE
    	 */
    
    	/** Default database host. */
    	public static final String DATABASE_HOST = "localhost";
    
    	/** Default database port. */
    	public static final int DATABASE_PORT = 27017;
    
    	/** Default database name. */
    	public static final String DATABASE_NAME = "test";
    
    	/*
    	 * ELASTICSEARCH
    	 */
    
    	/** Default Elasticsearch host. */
    	public static final String ES_HOST = "127.0.0.1";
    
    	/** Default Elasticsearch port. */
    	public static final int ES_PORT = 9300;
    
    	/**
    	 * Topic boost parameter. Boosts topic importance in queries. Default 4.
    	 */
    	public static final int ES_BOOST_TOPICS = 4;
    
    	/**
    	 * Title boost parameter. Boosts title importance in queries. Default 2.
    	 */
    	public static final int ES_BOOST_TITLES = 2;
    
    	/*
    	 * TOPIC MODELING
    	 */
    
    	/**
    	 * The number of words to be used to generate a topic name. The top n words
    	 * (sorted by likeliness) are used to generate a name for unnamed topics.
    	 * Default 4.
    	 */
    	public static final int TOPIC_AUTO_NAMING_WORDS = 4;
    
    	/**
    	 * Number of topics to discover with topic modeling, if the selected topic
    	 * modeling library supports this parameter. Default 20.
    	 */
    	public static final int K_TOPICS = 20;
    
    	/**
    	 * Number of words in a discovered topic, if the selected topic modeling
    	 * library supports this parameter. Default 50.
    	 */
    	public static final int K_TOPIC_WORDS = 50;
    
    	/**
    	 * Minimum likeliness of words. Words with lower likeliness are ignored.
    	 * Default 0.01.
    	 */
    	public static final double MINIMUM_RELATIVE_PROB = 0.01;
    
    	/**
    	 * Dynamic minimum iterations. Used for dynamic topic modeling. Default 100.
    	 */
    	public static final int DYNAMIC_MIN_ITER = 100;
    
    	/**
    	 * Dynamic maximum iterations. Used for dynamic topic modeling. Default
    	 * 1000.
    	 */
    	public static final int DYNAMIC_MAX_ITER = 1000;
    
    	/**
    	 * Static iterations. Used for static topic modeling. Default 200.
    	 */
    	public static final int STATIC_ITER = 200;
    
    	/**
    	 * Minimum word frequency for words to be used for topic modeling. All words
    	 * below this frequency in a document are filtered out before generating the
    	 * topic model. Default 10.
    	 */
    	public static final int DOCUMENT_MIN_WORD_FREQ = 10;
    
    	/**
    	 * Minimum number of words per document. Default 10.
    	 */
    	public static final int DOCUMENT_MIN_LENGTH = 10;
    
    	/**
    	 * The text processor to be used. For the list of available values, see
    	 * {@link de.vipra.util.Constants.Processor}.
    	 */
    	public static final Processor PROCESSOR = Processor.CORENLP;
    
    	/**
    	 * The topic modeling analyzer to be used. For the list of available
    	 * analyzers, see {@link de.vipra.util.Constants.Analyzer}.
    	 */
    	public static final Analyzer ANALYZER = Analyzer.DTM;
    
    	/**
    	 * The dynamic topic modeling window resolution to be used. This value is
    	 * only used if the selected analyzer supports dynamic topic modeling. For
    	 * the list of available resolutions, see
    	 * {@link de.vipra.util.Constants.WindowResolution}.
    	 */
    	public static final WindowResolution WINDOW_RESOLUTION = WindowResolution.YEAR;
    
    	/**
    	 * Stopwords list. Extensive list of stopwords used to clean imported
    	 * articles of the most common words before topic modeling is applied.
    	 * <p>
    	 * The list is a fixed-size view created by {@link Arrays#asList}: elements
    	 * cannot be added or removed, but they can still be replaced through
    	 * {@code set}. Treat it as read-only.
    	 */
    	public static final List<String> STOPWORDS = Arrays.asList("'ll", "'ve", "a", "a's", "able", "about", "above",
    			"abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected",
    			"affecting", "affects", "after", "afterwards", "again", "against", "ah", "ain't", "aint", "all", "allow",
    			"allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst",
    			"amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone",
    			"anything", "anyway", "anyways", "anywhere", "apart", "apparently", "appear", "appreciate", "appropriate",
    			"approximately", "are", "area", "areas", "aren", "aren't", "arent", "arise", "around", "as", "aside", "ask",
    			"asked", "asking", "asks", "associated", "at", "auth", "available", "away", "awfully", "b", "back",
    			"backed", "backing", "backs", "be", "became", "because", "become", "becomes", "becoming", "been", "before",
    			"beforehand", "began", "begin", "beginning", "beginnings", "begins", "behind", "being", "beings", "believe",
    			"below", "beside", "besides", "best", "better", "between", "beyond", "big", "bill", "biol", "both",
    			"bottom", "brief", "briefly", "but", "by", "c", "c'mon", "c's", "ca", "call", "came", "can", "can't",
    			"cannot", "cant", "case", "cases", "cause", "causes", "certain", "certainly", "changes", "clear", "clearly",
    			"cmon", "co", "com", "come", "comes", "computer", "con", "concerning", "consequently", "consider",
    			"considering", "contain", "containing", "contains", "corresponding", "could", "couldn't", "couldnt",
    			"course", "cry", "cs", "currently", "d", "date", "de", "definitely", "describe", "described", "despite",
    			"detail", "did", "didn't", "didnt", "differ", "different", "differently", "do", "does", "doesn't", "doesnt",
    			"doing", "don't", "done", "dont", "down", "downed", "downing", "downs", "downwards", "due", "during", "e",
    			"each", "early", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "eleven", "else", "elsewhere",
    			"empty", "end", "ended", "ending", "ends", "enough", "entirely", "especially", "et", "et-al", "etc", "even",
    			"evenly", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example",
    			"except", "f", "face", "faces", "fact", "facts", "far", "felt", "few", "ff", "fifteen", "fifth", "fify",
    			"fill", "find", "finds", "fire", "first", "five", "fix", "followed", "following", "follows", "for",
    			"former", "formerly", "forth", "forty", "found", "four", "from", "front", "full", "fully", "further",
    			"furthered", "furthering", "furthermore", "furthers", "g", "gave", "general", "generally", "get", "gets",
    			"getting", "give", "given", "gives", "giving", "go", "goes", "going", "gone", "good", "goods", "got",
    			"gotten", "great", "greater", "greatest", "greetings", "group", "grouped", "grouping", "groups", "h", "had",
    			"hadn't", "hadnt", "happens", "hardly", "has", "hasn't", "hasnt", "have", "haven't", "havent", "having",
    			"he", "he'd", "he'll", "he's", "hed", "hello", "help", "hence", "her", "here", "here's", "hereafter",
    			"hereby", "herein", "heres", "hereupon", "hers", "herse", "herself", "hes", "hi", "hid", "high", "higher",
    			"highest", "him", "himse", "himself", "his", "hither", "home", "hopefully", "how", "how's", "howbeit",
    			"however", "hundred", "i", "i'd", "i'll", "i'm", "i've", "id", "ie", "if", "ignored", "ill", "im",
    			"immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index",
    			"indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest",
    			"interested", "interesting", "interests", "into", "invention", "inward", "is", "isn't", "isnt", "it",
    			"it'd", "it'll", "it's", "itd", "itll", "its", "itse", "itself", "ive", "j", "just", "k", "keep", "keeps",
    			"kept", "kg", "kind", "km", "knew", "know", "known", "knows", "l", "large", "largely", "last", "lately",
    			"later", "latest", "latter", "latterly", "least", "less", "lest", "let", "let's", "lets", "like", "liked",
    			"likely", "line", "little", "long", "longer", "longest", "look", "looking", "looks", "ltd", "m", "made",
    			"mainly", "make", "makes", "making", "man", "many", "may", "maybe", "me", "mean", "means", "meantime",
    			"meanwhile", "member", "members", "men", "merely", "mg", "might", "mill", "million", "mine", "miss", "ml",
    			"more", "moreover", "most", "mostly", "move", "mr", "mrs", "much", "mug", "must", "mustn't", "my", "myse",
    			"myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need",
    			"needed", "needing", "needs", "neither", "never", "nevertheless", "new", "newer", "newest", "next", "nine",
    			"ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted",
    			"nothing", "novel", "now", "nowhere", "number", "numbers", "o", "obtain", "obtained", "obviously", "of",
    			"off", "often", "oh", "ok", "okay", "old", "older", "oldest", "omitted", "on", "once", "one", "ones",
    			"only", "onto", "open", "opened", "opening", "opens", "or", "ord", "order", "ordered", "ordering", "orders",
    			"other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall",
    			"owing", "own", "p", "page", "pages", "part", "parted", "particular", "particularly", "parting", "parts",
    			"past", "per", "perhaps", "place", "placed", "places", "please", "plus", "point", "pointed", "pointing",
    			"points", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "presented",
    			"presenting", "presents", "presumably", "previously", "primarily", "probably", "problem", "problems",
    			"promptly", "proud", "provides", "put", "puts", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather",
    			"rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding",
    			"regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting",
    			"results", "right", "room", "rooms", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec",
    			"second", "secondly", "seconds", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen",
    			"sees", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "shan't",
    			"she", "she'd", "she'll", "she's", "shed", "shes", "should", "shouldn't", "shouldnt", "show", "showed",
    			"showing", "shown", "showns", "shows", "side", "sides", "significant", "significantly", "similar",
    			"similarly", "since", "sincere", "six", "sixty", "slightly", "small", "smaller", "smallest", "so", "some",
    			"somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere",
    			"soon", "sorry", "specifically", "specified", "specify", "specifying", "state", "states", "still", "stop",
    			"strongly", "sub", "substantially", "successfully", "such", "such as", "sufficiently", "suggest", "sup",
    			"sure", "system", "t", "t's", "take", "taken", "taking", "tell", "ten", "tends", "th", "than", "thank",
    			"thanks", "thanx", "that", "that'll", "that's", "that've", "thats", "the", "their", "theirs", "them",
    			"themselves", "then", "thence", "there", "there'll", "there's", "there've", "thereafter", "thereby",
    			"thered", "therefore", "therein", "thereof", "therere", "theres", "thereto", "thereupon", "these", "they",
    			"they'd", "they'll", "they're", "they've", "theyd", "theyll", "theyre", "theyve", "thick", "thin", "thing",
    			"things", "think", "thinks", "third", "this", "thorough", "thoroughly", "those", "thou", "though",
    			"thoughh", "thought", "thoughts", "thousand", "three", "throug", "through", "throughout", "thru", "thus",
    			"til", "tip", "to", "today", "together", "too", "took", "top", "toward", "towards", "tried", "tries",
    			"truly", "try", "trying", "ts", "turn", "turned", "turning", "turns", "twelve", "twenty", "twice", "two",
    			"u", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups",
    			"us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "uucp", "v", "value",
    			"various", "very", "via", "viz", "vol", "vols", "vs", "w", "want", "wanted", "wanting", "wants", "was",
    			"wasn't", "wasnt", "way", "ways", "we", "we'd", "we'll", "we're", "we've", "wed", "welcome", "well",
    			"wells", "went", "were", "weren't", "werent", "weve", "what", "what'll", "what's", "whatever", "whats",
    			"when", "when's", "whence", "whenever", "where", "where's", "whereafter", "whereas", "whereby", "wherein",
    			"wheres", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "who'll", "who's",
    			"whod", "whoever", "whole", "whom", "whomever", "whos", "whose", "why", "why's", "widely", "will",
    			"willing", "wish", "with", "within", "without", "won't", "wonder", "wont", "words", "work", "worked",
    			"working", "works", "world", "would", "wouldn't", "wouldnt", "www", "x", "y", "year", "years", "yes", "yet",
    			"you", "you'd", "you'll", "you're", "you've", "youd", "youll", "young", "younger", "youngest", "your",
    			"youre", "yours", "yourself", "yourselves", "youve", "z", "zero");
    
    	/**
    	 * Disallowed chars for words in processed text segments. This regular
    	 * expression matches every character that is not an ASCII letter, a digit
    	 * or a space, and is used to strip text of characters that should not be
    	 * processed.
    	 */
    	public static final String CHARS_DISALLOWED = "[^a-zA-Z0-9 ]";
    
    	/**
    	 * Regular expression to find and remove email addresses from text. Matches
    	 * any run of non-whitespace characters that contains an {@code @}.
    	 */
    	public static final String REGEX_EMAIL = "[^\\s]*@[^\\s]*";
    
    	/**
    	 * Regular expression to find and remove urls from text.
    	 */
    	public static final String REGEX_URL = "(?:^|[\\W])((ht|f)tp(s?):\\/\\/|www\\.)(([\\w\\-]+\\.){1,}?([\\w\\-.~]+\\/?)*[\\p{Alnum}.,%_=?&#\\-+()\\[\\]\\*$~@!:/{};']*)";
    
    	/**
    	 * Regular expression to find and remove numbers from text. Matches every
    	 * word token that contains at least one digit, so mixed tokens such as
    	 * {@code abc123} are matched as a whole.
    	 */
    	public static final String REGEX_NUMBER = "\\b\\w*\\d+\\w*\\b";
    
    	/**
    	 * Regular expression to find and remove single char words.
    	 */
    	public static final String REGEX_SINGLECHAR = "\\b\\w\\b";
    
    	/*
    	 * INDEX
    	 */
    
    	/**
    	 * The length of the text excerpt used for indexing and displaying text in
    	 * search results.
    	 */
    	public static final int EXCERPT_LENGTH = 250;
    
    	/*
    	 * OTHER
    	 */
    
    	/**
    	 * The global date time format. Will be used for conversion from and to
    	 * database and frontend dates.
    	 * <p>
    	 * NOTE(review): the trailing {@code 'Z'} is a quoted literal, not a time
    	 * zone pattern letter, so a formatter using this pattern presumably has to
    	 * be configured for UTC by the caller - confirm at the usage sites.
    	 */
    	public static final String DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
    
    	/**
    	 * The text processors available, including the default text processor.
    	 */
    	public enum Processor {
    		CORENLP("corenlp");
    
    		/** Lower-case identifier matched by {@link #fromString(String)}. */
    		public final String name;
    
    		Processor(final String name) {
    			this.name = name;
    		}
    
    		/**
    		 * @return the processor used when none is (validly) specified
    		 */
    		public static Processor DEFAULT() {
    			return CORENLP;
    		}
    
    		/**
    		 * Resolves a processor from its identifier, ignoring case.
    		 *
    		 * @param text
    		 *            identifier to look up, may be {@code null}
    		 * @return the matching processor, or {@link #DEFAULT()} if the text is
    		 *         {@code null} or matches no processor
    		 */
    		public static Processor fromString(final String text) {
    			if (text != null) {
    				for (final Processor candidate : Processor.values()) {
    					if (text.equalsIgnoreCase(candidate.name)) {
    						return candidate;
    					}
    				}
    			}
    			return DEFAULT();
    		}
    	}
    
    	/**
    	 * The topic modeling analyzers available, including the default analyzer.
    	 */
    	public enum Analyzer {
    		JGIBB("jgibb"),
    		DTM("dtm");
    
    		/** Lower-case identifier matched by {@link #fromString(String)}. */
    		public final String name;
    
    		Analyzer(final String name) {
    			this.name = name;
    		}
    
    		/**
    		 * @return the analyzer used when none is (validly) specified
    		 */
    		public static Analyzer DEFAULT() {
    			return DTM;
    		}
    
    		/**
    		 * Resolves an analyzer from its identifier, ignoring case.
    		 *
    		 * @param text
    		 *            identifier to look up, may be {@code null}
    		 * @return the matching analyzer, or {@link #DEFAULT()} if the text is
    		 *         {@code null} or matches no analyzer
    		 */
    		public static Analyzer fromString(final String text) {
    			if (text != null) {
    				for (final Analyzer candidate : Analyzer.values()) {
    					if (text.equalsIgnoreCase(candidate.name)) {
    						return candidate;
    					}
    				}
    			}
    			return DEFAULT();
    		}
    	}
    
    	/**
    	 * Describes the window size when using dynamic topic modeling. All window
    	 * boundaries are computed in GMT.
    	 */
    	public enum WindowResolution {
    		YEAR("year"),
    		QUARTER("quarter"),
    		MONTH("month"),
    		DAY("day"),
    		HOUR("hour"),
    		MINUTE("minute"),
    		SECOND("second");
    
    		/** Lower-case identifier matched by {@link #fromString(String)}. */
    		public final String name;
    
    		WindowResolution(final String name) {
    			this.name = name;
    		}
    
    		/**
    		 * Creates a GMT calendar positioned at the given instant.
    		 */
    		private static Calendar gmtCalendar(final Date date) {
    			final Calendar calendar = new GregorianCalendar();
    			calendar.setTimeZone(TimeZone.getTimeZone("GMT"));
    			calendar.setTime(date);
    			return calendar;
    		}
    
    		/**
    		 * Returns the zero-based first month of the quarter containing the
    		 * given zero-based month.
    		 * <p>
    		 * NOTE(review): assumes plain calendar quarters (January-March is the
    		 * first quarter), as implied by the {@code quarter * 3} month
    		 * arithmetic this replaces - confirm no other quarter scheme is
    		 * intended.
    		 */
    		private static int firstMonthOfQuarter(final int month) {
    			return month - (month % 3);
    		}
    
    		/**
    		 * Computes the first instant (GMT) of the window that contains the
    		 * given date.
    		 *
    		 * @param date
    		 *            any instant inside the window, must not be {@code null}
    		 * @return the window start with all finer-grained fields reset to
    		 *         their minimum
    		 */
    		public Date startDate(final Date date) {
    			final Calendar out = gmtCalendar(date);
    			// Read the month in GMT so the quarter agrees with the fields
    			// that are truncated below, independent of the JVM time zone.
    			final int month = out.get(Calendar.MONTH);
    			out.set(Calendar.MILLISECOND, 0);
    			if (this == YEAR) {
    				out.set(Calendar.MONTH, Calendar.JANUARY);
    			} else if (this == QUARTER) {
    				out.set(Calendar.MONTH, firstMonthOfQuarter(month));
    			}
    			switch (this) {
    				case YEAR:
    				case QUARTER:
    				case MONTH:
    					// Set together with the month (no get() in between), so
    					// a day such as the 31st cannot roll into the next month.
    					out.set(Calendar.DAY_OF_MONTH, 1);
    					// fall through
    				case DAY:
    					out.set(Calendar.HOUR_OF_DAY, 0);
    					// fall through
    				case HOUR:
    					out.set(Calendar.MINUTE, 0);
    					// fall through
    				case MINUTE:
    					out.set(Calendar.SECOND, 0);
    					// fall through
    				default:
    					break;
    			}
    			return out.getTime();
    		}
    
    		/**
    		 * Computes the last millisecond (GMT) of the window that contains the
    		 * given date.
    		 *
    		 * @param date
    		 *            any instant inside the window, must not be {@code null}
    		 * @return the window end with all finer-grained fields set to their
    		 *         maximum
    		 */
    		public Date endDate(final Date date) {
    			final Calendar out = gmtCalendar(date);
    			// Read the month in GMT so the quarter agrees with the fields
    			// that are adjusted below, independent of the JVM time zone.
    			final int month = out.get(Calendar.MONTH);
    			out.set(Calendar.MILLISECOND, 999);
    			switch (this) {
    				case YEAR:
    				case QUARTER:
    				case MONTH:
    					// Pin the day to the 1st before switching the month so
    					// the length lookup below sees the target month and not
    					// a rolled-over date.
    					out.set(Calendar.DAY_OF_MONTH, 1);
    					if (this == YEAR) {
    						out.set(Calendar.MONTH, Calendar.DECEMBER);
    					} else if (this == QUARTER) {
    						out.set(Calendar.MONTH, firstMonthOfQuarter(month) + 2);
    					}
    					out.set(Calendar.DAY_OF_MONTH, out.getActualMaximum(Calendar.DAY_OF_MONTH));
    					// fall through
    				case DAY:
    					out.set(Calendar.HOUR_OF_DAY, 23);
    					// fall through
    				case HOUR:
    					out.set(Calendar.MINUTE, 59);
    					// fall through
    				case MINUTE:
    					out.set(Calendar.SECOND, 59);
    					// fall through
    				default:
    					break;
    			}
    			return out.getTime();
    		}
    
    		/**
    		 * @return the resolution used when none is (validly) specified
    		 */
    		public static WindowResolution DEFAULT() {
    			return YEAR;
    		}
    
    		/**
    		 * Resolves a window resolution from its identifier, ignoring case.
    		 *
    		 * @param text
    		 *            identifier to look up, may be {@code null}
    		 * @return the matching resolution, or {@link #DEFAULT()} if the text
    		 *         is {@code null} or matches no resolution
    		 */
    		public static WindowResolution fromString(final String text) {
    			if (text != null) {
    				for (final WindowResolution candidate : WindowResolution.values()) {
    					if (text.equalsIgnoreCase(candidate.name)) {
    						return candidate;
    					}
    				}
    			}
    			return DEFAULT();
    		}
    	}
    
    }