diff --git a/ma-impl.sublime-project b/ma-impl.sublime-project index 24db30311b340c8d78001f0fb705810ab77a8c38..e5bab69b26eee760a8bb31f588ab9a63c2fae03a 100644 --- a/ma-impl.sublime-project +++ b/ma-impl.sublime-project @@ -3,6 +3,9 @@ [ { "path": "." + }, + { + "path": "/home/eike/Downloads/dtm_release/dtm/example/model_run2" } ] } diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/Options.java b/vipra-cmd/src/main/java/de/vipra/cmd/Options.java deleted file mode 100644 index 8f1da6be38abfb6abfe3efbe7923ba65579c9f10..0000000000000000000000000000000000000000 --- a/vipra-cmd/src/main/java/de/vipra/cmd/Options.java +++ /dev/null @@ -1,5 +0,0 @@ -package de.vipra.cmd; - -public class Options { - -} diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java index 7951fdc7192216adc8795275fb88712a36254211..7b010b73f6279f179ac3a14496574ee5d1f40cf4 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMDateIndex.java @@ -22,6 +22,8 @@ import de.vipra.util.FileUtils; public class DTMDateIndex implements Closeable, Iterable<DTMDateIndex.DTMDateIndexEntry> { + public static final String FILE_WINDOWS = "dtm-seq.dat"; + public static class DTMDateIndexEntry implements Comparable<DTMDateIndexEntry> { public Date date; public boolean exists; @@ -95,7 +97,7 @@ public class DTMDateIndex implements Closeable, Iterable<DTMDateIndex.DTMDateInd writer.close(); // write window index - File seqFile = new File(file.getParentFile(), "dtm-seq.dat"); + File seqFile = new File(file.getParentFile(), FILE_WINDOWS); writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(seqFile, false))); writer.write(Integer.toString(windows.size())); writer.write(Constants.LINE_SEP); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java index 
8c81bbfc0d98a443aa8b17a30f335eeb6710c2e2..50a9829a0a7a31828e33b0ae18496be1078aae21 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/file/DTMFilebase.java @@ -19,6 +19,10 @@ import de.vipra.util.model.ArticleFull; public class DTMFilebase extends Filebase { + public static final String FILE_MODEL = "dtm-mult.dat"; + public static final String FILE_DATES = "dtm-dates.dat"; + public static final String FILE_VOCAB = "vocab"; + private final DTMDateIndex index; private final DTMVocabulary vocab; private final File modelFile; @@ -32,16 +36,16 @@ public class DTMFilebase extends Filebase { throw new FilebaseException(e); } try { - this.index = new DTMDateIndex(getModelFile("dtm-dates.dat"), config.windowResolution); + this.index = new DTMDateIndex(getModelFile(FILE_DATES), config.windowResolution); } catch (IOException | ParseException e) { throw new FilebaseException("could not read date index file", e); } try { - this.vocab = new DTMVocabulary(getModelFile("dtm-vocab.dat")); + this.vocab = new DTMVocabulary(getModelFile(FILE_VOCAB)); } catch (IOException e) { throw new FilebaseException("could not read vocabulary file", e); } - this.modelFile = getModelFile("dtm-mult.dat"); + this.modelFile = getModelFile(FILE_MODEL); } @Override @@ -53,7 +57,7 @@ public class DTMFilebase extends Filebase { } // write temp file - File modelFileTmp = getModelFile("dtm-mult.dat.tmp"); + File modelFileTmp = getModelFile(FILE_MODEL + ".tmp"); Iterator<String> lines = null; if (modelFile.exists()) lines = FileUtils.iterateFileLines(modelFile); diff --git a/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java b/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java index 435a7ab6cbdff194157c0b0ecd551e5a203b361a..52461fc94f81d73c5fe54c2b30c3f9ea1ca085ea 100644 --- a/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java +++ b/vipra-cmd/src/main/java/de/vipra/cmd/option/ModelingCommand.java @@ 
-20,7 +20,7 @@ import de.vipra.util.Config; import de.vipra.util.Constants; import de.vipra.util.ConvertStream; import de.vipra.util.ESClient; -import de.vipra.util.ElasticSerializer; +import de.vipra.util.ESSerializer; import de.vipra.util.MongoUtils; import de.vipra.util.StringUtils; import de.vipra.util.Timer; @@ -46,7 +46,7 @@ public class ModelingCommand implements Command { private WordMap wordMap; private Analyzer analyzer; private Client elasticClient; - private ElasticSerializer<ArticleFull> elasticSerializer; + private ESSerializer<ArticleFull> elasticSerializer; @Override public void run() throws Exception { @@ -58,7 +58,7 @@ public class ModelingCommand implements Command { wordMap = new WordMap(dbWords); analyzer = Analyzer.getAnalyzer(config, wordMap); elasticClient = ESClient.getClient(config); - elasticSerializer = new ElasticSerializer<>(ArticleFull.class); + elasticSerializer = new ESSerializer<>(ArticleFull.class); out.info("using analyzer: " + analyzer.getName()); diff --git a/vipra-util/src/main/java/de/vipra/util/AbstractCache.java b/vipra-util/src/main/java/de/vipra/util/AbstractCache.java index 8e72bda88696fbff4c0b73900bb515e0a8b0afbc..3d22013c835d842d71b714210c1dc225b01bfc3a 100644 --- a/vipra-util/src/main/java/de/vipra/util/AbstractCache.java +++ b/vipra-util/src/main/java/de/vipra/util/AbstractCache.java @@ -1,15 +1,55 @@ package de.vipra.util; +/** + * Abstract cache interface. This interface is used to abstract a specific cache + * implementation. + * + * @param <T> + * Cache key type + * @param <U> + * Cache value type + */ public interface AbstractCache<T, U> { + /** + * Return the value from the cache, identified by the key t + * + * @param t + * key of value to be returned + * @return found value, or null + */ U get(T t); + /** + * Insert a value into the cache, with key t. 
+ * + * @param t + * the key to be used for insertion + * @param u + * the value to be inserted + */ void put(T t, U u); + /** + * Removes a value from the cache, identified by key t + * + * @param t + * the key to be removed + */ void remove(T t); + /** + * Returns true if the specified key is found in the cache + * + * @param t + * the key to be searched + * @return true if key is found + */ boolean contains(T t); + /** + * Clears the cache of all keys and values + */ void clear(); } diff --git a/vipra-util/src/main/java/de/vipra/util/CalendarUtils.java b/vipra-util/src/main/java/de/vipra/util/CalendarUtils.java index 62378e4c9e54f2672f6940f441819a2908a08b79..f504b9e25de07dbe43dfb0a012c7d09b4c0fe366 100644 --- a/vipra-util/src/main/java/de/vipra/util/CalendarUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/CalendarUtils.java @@ -2,8 +2,19 @@ package de.vipra.util; import java.util.Calendar; +/** + * Calendar utils to be used with java default calendar implementation. + */ public class CalendarUtils { + /** + * Returns the quarter of the passed calendar. Months are grouped into + * four quarters: Jan-Mar: 1, Apr-Jun: 2, Jul-Sep: 3, Oct-Dec: 4. NOTE(review): Calendar.MONTH is zero-based, so the expression below yields 0 for January; confirm the intended mapping. + * + * @param c + * the calendar to be used + * @return the quarter of the calendar month + */ public static final int getQuarter(Calendar c) { return (int) Math.ceil(c.get(Calendar.MONTH) / 3.0); } diff --git a/vipra-util/src/main/java/de/vipra/util/Config.java b/vipra-util/src/main/java/de/vipra/util/Config.java index 94cfb3b711fcd96eef437557e341ba7bafed160f..9dfc270ef0c9f45d6b0dda691d049b74d5eb726f 100644 --- a/vipra-util/src/main/java/de/vipra/util/Config.java +++ b/vipra-util/src/main/java/de/vipra/util/Config.java @@ -24,30 +24,62 @@ import de.vipra.util.ex.ConfigException; import de.vipra.util.model.Model; import de.vipra.util.service.DatabaseService; +/** + * Vipra configuration + */ public class Config { /* * Configuration keys */ + /** + * The database host of the application database.
+ */ @ConfigKey("db.host") public String databaseHost = Constants.DATABASE_HOST; + /** + * The database port of the application database. + */ @ConfigKey("db.port") public int databasePort = Constants.DATABASE_PORT; + /** + * The database name of the application database. Can be an arbitrary value. + */ @ConfigKey("db.name") public String databaseName = Constants.DATABASE_NAME; + /** + * The text processor to be used. To find a list of available values, see + * {@link de.vipra.util.Constants.Processor}. + */ @ConfigKey("tm.processor") public Processor processor = Constants.Processor.DEFAULT(); + /** + * The topic modeling analyzer to be used. To find a list of available + * analyzers, see {@link de.vipra.util.Constants.Analyzer}. + */ @ConfigKey("tm.analyzer") public Analyzer analyzer = Constants.Analyzer.DEFAULT(); + /** + * The dynamic topic modeling window resolution to be used. This value is + * only used if the selected analyzer supports dynamic topic modeling. To + * find a list of available resolutions, see + * {@link de.vipra.util.Constants.WindowResolution}. + */ @ConfigKey("tm.windowresolution") public WindowResolution windowResolution = Constants.WindowResolution.DEFAULT(); + /** + * Set to true to save all words of each document, instead of only words + * discovered by topic modeling analysis. Mind that this will result in lots + * of words saved into the database, slowing down import processes and web + * requests.
+ */ @ConfigKey("tm.saveallwords") public boolean saveAllWords = Constants.SAVE_ALL_WORDS; @@ -158,7 +190,7 @@ public class Config { parsedValue = Double.parseDouble(value); } else if (Enum.class.isAssignableFrom(clazz)) { // enum - parsedValue = searchEnum((Class<Enum>) entry.getValue().getType(), value); + parsedValue = EnumTools.searchEnum((Class<Enum>) entry.getValue().getType(), value); } else { // something else log.warn("unrecognized config value type: " + clazz); @@ -173,6 +205,13 @@ public class Config { } } + /** + * Returns the data directory used for topic modeling and configuration + * storage + * + * @return the data directory to be used + * @throws ConfigException + */ public File getDataDirectory() throws ConfigException { File dataDir = getGenericDataDir(); @@ -185,10 +224,24 @@ public class Config { return dataDir; } + /** + * Returns a representation of the used mongodb connection + * + * @return mongo connection + * @throws ConfigException + */ public Mongo getMongo() throws ConfigException { return Mongo.getInstance(this); } + /** + * Create a database service abstraction to interact with the database. + * + * @param clazz + * the DAO class, extending Model + * @return the database service + * @throws ConfigException + */ public <Type extends Model<IdType>, IdType> DatabaseService<Type, IdType> getDatabaseService(Class<Type> clazz) throws ConfigException { return DatabaseService.getDatabaseService(this, clazz); @@ -233,21 +286,34 @@ public class Config { pw.flush(); } - public String hash() { - String config = databaseHost + databasePort + databaseName + processor + analyzer + saveAllWords; - return DigestUtils.md5(config); - } - + /** + * Returns a generic data directory, if none is configured. + * + * @return generic data directory + */ public static File getGenericDataDir() { File base = PathUtils.appDataDir(); return new File(base, Constants.FILEBASE_DIR); } + /** + * Returns a generic config directory, if none is configured. 
+ * + * @return generic config directory + */ public static File getGenericConfigDir() { File base = PathUtils.appConfigDir(); return new File(base, Constants.FILEBASE_DIR); } + /** + * Config class is a singleton. This method returns its instantiation. + * + * @return config object instantiation + * @throws IOException + * if reading the config file fails + * @throws ConfigException + */ public static Config getConfig() throws IOException, ConfigException { if (config == null) { config = new Config(); @@ -255,13 +321,4 @@ public class Config { return config; } - public static <T extends Enum<?>> T searchEnum(Class<T> enumeration, String search) { - for (T each : enumeration.getEnumConstants()) { - if (each.name().compareToIgnoreCase(search) == 0) { - return each; - } - } - return null; - } - } diff --git a/vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java b/vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java index a5eefb8f09426027b32a35f51de94dfa01b25690..05741bc111962e83ba7cd07d408997394f57e40d 100644 --- a/vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java +++ b/vipra-util/src/main/java/de/vipra/util/ConsoleUtils.java @@ -14,6 +14,9 @@ public class ConsoleUtils { public static final Logger log = LoggerFactory.getLogger(ConsoleUtils.class); + /** + * Choice enum. 
Describes choices that can be prompted + */ public static enum Choice { ABORT("[a]bort", "a"), CONTINUE("[c]ontinue", "c"), @@ -44,6 +47,11 @@ public class ConsoleUtils { } } + /** + * Read a line from the console + * + * @return read line, without newline character + */ public static String readLine() { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); try { @@ -54,12 +62,30 @@ public class ConsoleUtils { } } + /** + * Request user confirmation + * + * @param confirm + * the confirm string, has to be typed in by the user + * @return whether user input matches confirm string + */ public static boolean confirm(String confirm) { System.out.print("> "); String in = readLine().toLowerCase().trim(); return in.equals(confirm); } + /** + * Prompt a user choice from multiple options + * + * @param choice + * default choice + * @param acceptDefault + * immediately return with default choice + * @param choices + * all choices that can be selected + * @return the selected choice + */ public static String prompt(String choice, boolean acceptDefault, String... choices) { if (acceptDefault && choice != null) return choice; @@ -80,6 +106,17 @@ public class ConsoleUtils { return null; } + /** + * Prompt a user choice from multiple options + * + * @param choice + * default choice + * @param acceptDefault + * immediately return with default choice + * @param choices + * all choices that can be selected + * @return the selected choice + */ public static Choice prompt(Choice choice, boolean acceptDefault, Choice... choices) { return Choice.fromString( prompt(choice != null ?
choice.choice : null, acceptDefault, ListUtils.toStringArray(choices))); diff --git a/vipra-util/src/main/java/de/vipra/util/ConvertStream.java b/vipra-util/src/main/java/de/vipra/util/ConvertStream.java index c5dc60580bc56c8ef0290e946fae18e6f78fe196..f7d926ffb0cf7f499971c6630f873f49c8dd84ac 100644 --- a/vipra-util/src/main/java/de/vipra/util/ConvertStream.java +++ b/vipra-util/src/main/java/de/vipra/util/ConvertStream.java @@ -13,6 +13,16 @@ import java.util.Queue; import de.vipra.util.ex.NotImplementedException; +/** + * The convert stream class is used to create a converting stream of objects + * from a file resource. The file is read sequentially and objects are + * deserialized from the file contents, according to some convert method of the + * convert stream. The amount of lines read by the stream is decided by the + * convert method. + * + * @param <T> + * object type returned by the stream + */ public abstract class ConvertStream<T> implements Closeable, AutoCloseable, Iterator<T>, Iterable<T> { private final BufferedReader reader; @@ -28,6 +38,11 @@ public abstract class ConvertStream<T> implements Closeable, AutoCloseable, Iter reader.close(); } + /** + * Returns the next file line. + * + * @return next file line, if available + */ protected String nextLine() { if (buffer.isEmpty()) { try { @@ -37,10 +52,20 @@ public abstract class ConvertStream<T> implements Closeable, AutoCloseable, Iter return buffer.poll(); } + /** + * Push back line into a line buffer, if not processed + * + * @param line + * line to buffer + */ protected void buffer(String line) { buffer.offer(line); } + /** + * Returns true if next line available. Reads and buffers the next line of + * the selected file. 
+ */ @Override public boolean hasNext() { String line = null; @@ -54,6 +79,11 @@ public abstract class ConvertStream<T> implements Closeable, AutoCloseable, Iter return false; } + /** + * Returns the next object in the stream, converted by the convert method + * + * @return converted object + */ @Override public T next() { if (buffer.isEmpty()) { @@ -74,6 +104,15 @@ public abstract class ConvertStream<T> implements Closeable, AutoCloseable, Iter return this; } + /** + * Convert method. This method is used to deserialize a file line into an + * object of type T. If more lines are required for deserialization, they + * can be requested by using nextLine(). + * + * @param line + * the line to be converted + * @return the converted object + */ public abstract T convert(String line); } diff --git a/vipra-util/src/main/java/de/vipra/util/DigestUtils.java b/vipra-util/src/main/java/de/vipra/util/DigestUtils.java deleted file mode 100644 index 3fa20e86387c1fcedc655cb972c3e0712ce4351f..0000000000000000000000000000000000000000 --- a/vipra-util/src/main/java/de/vipra/util/DigestUtils.java +++ /dev/null @@ -1,26 +0,0 @@ -package de.vipra.util; - -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; - -import javax.xml.bind.annotation.adapters.HexBinaryAdapter; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class DigestUtils { - - public static final Logger log = LoggerFactory.getLogger(DigestUtils.class); - - public static String md5(String in) { - MessageDigest md = null; - try { - md = MessageDigest.getInstance("MD5"); - return (new HexBinaryAdapter()).marshal(md.digest(in.getBytes())); - } catch (NoSuchAlgorithmException e) { - log.error("md5 algorithm not available"); - return null; - } - } - -} diff --git a/vipra-util/src/main/java/de/vipra/util/ESClient.java b/vipra-util/src/main/java/de/vipra/util/ESClient.java index efae45afc1bd94b693e18ada8fde18c6242db124..e496048e80dad47bf2a5d72af922d0dbaadb0faa 100644 ---
a/vipra-util/src/main/java/de/vipra/util/ESClient.java +++ b/vipra-util/src/main/java/de/vipra/util/ESClient.java @@ -6,10 +6,21 @@ import java.net.UnknownHostException; import org.elasticsearch.client.transport.TransportClient; import org.elasticsearch.common.transport.InetSocketTransportAddress; +/** + * ElasticSearch client to generate elasticsearch transport client connections. + */ public abstract class ESClient { private static TransportClient client; + /** + * Get an elasticsearch transport client + * + * @param config + * application configuration singleton + * @return elasticsearch transport client + * @throws UnknownHostException + */ public static TransportClient getClient(Config config) throws UnknownHostException { if (client == null) { client = TransportClient.builder().build().addTransportAddress( diff --git a/vipra-util/src/main/java/de/vipra/util/ElasticSerializer.java b/vipra-util/src/main/java/de/vipra/util/ESSerializer.java similarity index 96% rename from vipra-util/src/main/java/de/vipra/util/ElasticSerializer.java rename to vipra-util/src/main/java/de/vipra/util/ESSerializer.java index 01eec7f98f3b78f120997b26c1a07e8c295c8c2c..333eb1e95a4a0d4b45797118e65f35e8b21aed70 100644 --- a/vipra-util/src/main/java/de/vipra/util/ElasticSerializer.java +++ b/vipra-util/src/main/java/de/vipra/util/ESSerializer.java @@ -11,12 +11,12 @@ import java.util.Set; import de.vipra.util.an.ElasticIndex; -public class ElasticSerializer<T> { +public class ESSerializer<T> { private Set<Entry<String, Field>> fields; private Set<Entry<String, Method>> methods; - public ElasticSerializer(Class<T> clazz) { + public ESSerializer(Class<T> clazz) { Map<String, Field> foundFields = new HashMap<>(); Map<String, Method> foundMethods = new HashMap<>(); diff --git a/vipra-util/src/main/java/de/vipra/util/EnumTools.java b/vipra-util/src/main/java/de/vipra/util/EnumTools.java new file mode 100644 index 0000000000000000000000000000000000000000..9afa4e8a16b3b663e01b40284ddf79c642023c90
--- /dev/null +++ b/vipra-util/src/main/java/de/vipra/util/EnumTools.java @@ -0,0 +1,26 @@ +package de.vipra.util; + +/** + * Enum tools to work with java enums. + */ +public class EnumTools { + + /** + * Finds an enum value by its name, ignoring case. + * + * @param enumeration + * Enum to be searched + * @param search + * Enum value to be searched + * @return the found enum value, or null + */ + public static <T extends Enum<?>> T searchEnum(Class<T> enumeration, String search) { + for (T each : enumeration.getEnumConstants()) { + if (each.name().compareToIgnoreCase(search) == 0) { + return each; + } + } + return null; + } + +}