Commit 0aa855ed authored by Eike Cochu's avatar Eike Cochu

added patched dtm project

added patched version of dtm project
updated dtm analyzer, unfinished
added sequence to models, topicfull, stores a single sequence
parent 840a49ee
Showing with 5545 additions and 0 deletions
*.o
/dtm/main
***************************
Dynamic Topic Models and the Document Influence Model
***************************
This code is the result of work by
David M. Blei
blei[at]cs.princeton.edu
and
Sean M Gerrish
sgerrish[at]cs.princeton.edu.
(C) Copyright 2006, David M. Blei
(blei [at] cs [dot] princeton [dot] edu)
(C) Copyright 2011, Sean M. Gerrish
(sgerrish [at] cs [dot] princeton [dot] edu)
It includes software corresponding to models described in the
following papers:
[1] D. Blei and J. Lafferty. Dynamic topic models. In
Proceedings of the 23rd International Conference on Machine Learning,
2006.
[2] S. Gerrish and D. Blei. A Language-based Approach to Measuring
Scholarly Impact. In Proceedings of the 27th International Conference
on Machine Learning, 2010.
These files are part of DIM.
DIM is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
DIM is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA
------------------------------------------------------------------------
A. COMPILING
You will need to have several libraries installed to compile this
package. One of these is gsl-devel. Depending on your package
manager, you may be able to install this with *one* of the following
commands:
sudo aptitude install libgsl0-dev # Ubuntu 10.04
sudo zypper install gsl-devel # OpenSUSE 11.2
sudo yum install gsl-devel # CentOS 5.5
You can make the main program by changing your working directory to
dtm/ and typing:
make
This software has been compiled on Ubuntu 10.04, OpenSUSE 11.2, and
CentOS 5.5. Depending on your environment, you may need to install
additional libraries.
B. RUNNING
Once everything is compiled, you can run this software by typing the
command "./main <flags>", where flags is a list of command-line
options. An example command and a description of the input and output
files is given in dtm/sample.sh. You can see all command-line options
by typing
./main --help
(although we suggest you start out with the example in dtm/sample.sh).
C. SUPPORT and QUESTIONS
This software is provided as-is, without any warranty or support
whatsoever. If you have any questions about running this software,
you can post your question to the topic-models mailing list at
topic-models@lists.cs.princeton.edu. You are welcome to submit
modifications or bug-fixes of this software to the authors, although
not all submissions may be posted.
D. USAGE
This program takes as input a collection of text documents and creates
as output a list of topics over time, a description of each document
as a mixture of these topics, and (possibly) a measure of how
"influential" each document is, based on its language.
We have provided an example dataset, instructions for formatting input
data and processing output files, and example command lines for
running this software in the file dtm/sample.sh.
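For reference, the input layout expected by read_corpus and
read_corpus_seq (included below) is the LDA-C style format: a
<prefix>-mult.dat file with one document per line, written as
`unique_terms term_id:count term_id:count ...`, and a <prefix>-seq.dat
file holding the number of time slices followed by the number of
documents in each slice. A tiny illustrative example (the "foo" prefix
and all term ids and counts here are made up):

foo-mult.dat (three documents):

```
3 0:2 4:1 7:3
2 1:1 4:2
2 0:1 2:5
```

foo-seq.dat (two time slices, holding 2 and 1 documents respectively;
the slice counts must sum to the number of lines in foo-mult.dat):

```
2 2 1
```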
E. CHANGES
Changes in this version include:
- Change the default top_obs_var flag to 0.5 (from -1.0)
- Change to use more iterations and a tighter convergence criterion in each doc's E-step.
- Change to initialize random topics to be a bit more "flat".
.SUFFIXES: .c .u
CC= g++
LIB=../lib
GSLWRAP_LIB=../gslwrap
LOCAL_DIR=../local
CFLAGS = -I ${LIB} -I ${LIB}/math -I ${GSLWRAP_LIB}/include -I \
${GSLWRAP_LIB}/include/gslwrap -I ${LOCAL_DIR}/include -I \
${LIB}/util/gflags-1.1/src/gflags
GDB_CFLAGS = -ggdb -I ${LIB} -I ${LIB}/math -I ${GSLWRAP_LIB}/include \
-I ${GSLWRAP_LIB}/include/gslwrap -I ${LOCAL_DIR}/include -I \
${LIB}/util/gflags-1.1/src/gflags
BASE_INCLUDES = -I ${LIB}
LDFLAGS = -L${LOCAL_DIR}/lib -L${LOCAL_DIR}/lib/stl -lgsl -lm -lgslcblas
LOBJECTS = ss-lm.o gsl-wrappers.o data.o param.o util.o lda-seq.o \
lda.o params.o main.o
GFLAGS = gflags.o gflags_reporting.o gflags_completions.o
#TOPICS_DIR=/home/sgerrish/portal/src1/topics
TOPICS_DIR=/u/sgerrish/src1/topics
GGDB = -ggdb
#GGDB =
E_STEP_FLAGS = ${GGDB} -lgsl -lm -lgslcblas -L/sw/lib
E_STEP_SOURCE = gsl-wrappers.c params.c data.c util.c lda.c main-lda-e-step.c c2_lib.c strutil.c
E_STEP_OBJECTS = gsl-wrappers.o params.o data.o util.o lda.o main-lda-e-step.o c2_lib.o strutil.o
M_STEP_FLAGS = ${GGDB} -lgsl -lm -lgslcblas -L/sw/lib
M_STEP_SOURCE = gsl-wrappers.c params.c data.c util.c lda.c main-lda-m-step.c c2_lib.c strutil.c
M_STEP_OBJECTS = gsl-wrappers.o params.o data.o util.o lda.o main-lda-m-step.o c2_lib.o strutil.o
AGGREGATE_FLAGS = ${GGDB} -lgsl -lm -lgslcblas -L/sw/lib
AGGREGATE_SOURCE = gsl-wrappers.c params.c data.c util.c lda.c main-aggregate-ss.c c2_lib.c strutil.c
AGGREGATE_OBJECTS = gsl-wrappers.o params.o data.o util.o lda.o main-aggregate-ss.o c2_lib.o strutil.o
ST_FLAGS = ${GGDB} -lgsl -lm -lgslcblas -L/sw/lib
ST_SOURCE = ss-lm.c gsl-wrappers.c data.c param.c util.c lda.c lda-seq.c params.c main-fit_seq_topics.c c2_lib.c strutil.c
ST_OBJECTS = ss-lm.o gsl-wrappers.o data.o param.o util.o lda.o lda-seq.o params.o main-fit_seq_topics.o c2_lib.o strutil.o
SD_FLAGS = ${GGDB} -lgsl -lm -lgslcblas -L/sw/lib
SD_SOURCE = gsl-wrappers.c data.c param.c util.c lda-seq.c lda.c params.c main-fit_seq_docs.c ss-lm.c c2_lib.c strutil.c
SD_OBJECTS = gsl-wrappers.o data.o param.o util.o lda-seq.o lda.o params.o main-fit_seq_docs.o ss-lm.o c2_lib.o strutil.o
gflags: ${LIB}/util/gflags-1.1/src/gflags.cc
$(CC) -c $(CFLAGS) \
${LIB}/util/gflags-1.1/src/gflags.cc \
${LIB}/util/gflags-1.1/src/gflags_reporting.cc \
${LIB}/util/gflags-1.1/src/gflags_completions.cc
main: $(LOBJECTS) gflags
$(CC) $(CFLAGS) $(LOBJECTS) ${GFLAGS} -o main $(LDFLAGS)
st: $(ST_OBJECTS) gflags
$(CC) $(CFLAGS) $(ST_OBJECTS) $(GFLAGS) -o fit-seq-topics $(ST_FLAGS)
sd: $(SD_OBJECTS) gflags
$(CC) $(CFLAGS) $(SD_OBJECTS) $(GFLAGS) -o fit-seq-docs $(SD_FLAGS)
lda-e: $(E_STEP_OBJECTS) gflags
$(CC) $(CFLAGS) $(E_STEP_OBJECTS) $(GFLAGS) -o lda-e-step $(E_STEP_FLAGS)
lda-m: $(M_STEP_OBJECTS) gflags
$(CC) $(CFLAGS) $(M_STEP_OBJECTS) $(GFLAGS) -o lda-m-step $(M_STEP_FLAGS)
aggregate: $(AGGREGATE_OBJECTS) gflags
$(CC) $(CFLAGS) $(AGGREGATE_OBJECTS) $(GFLAGS) -o main-aggregate-ss $(AGGREGATE_FLAGS)
all: gflags main st sd lda-e lda-m aggregate
clean:
-rm -f *.o
***************************
Document Influence Model and Dynamic Topic Models
***************************
This code is the result of work by
David M. Blei
blei[at]cs.princeton.edu
and
Sean M Gerrish
sgerrish[at]cs.princeton.edu.
(C) Copyright 2006, David M. Blei
(blei [at] cs [dot] princeton [dot] edu)
(C) Copyright 2009, 2010, Sean M. Gerrish
(sgerrish [at] cs [dot] princeton [dot] edu)
This directory contains an implementation of the dynamic influence
model intended to be run on a cluster of computers. Compiling and
running this software has not been tested in other environments, and
it is therefore entirely unsupported.
This implementation was designed with a PBS cluster in mind, with a
driver to be run on the head node. If you are using a different
cluster, you can modify the cluster interface in c2_lib.c.
./main \
--mode=fit \
--model=fixed \
--corpus_prefix=/n/fs/topics/users/sgerrish/data/docinf/data/nature_data/test_corpus_train \
--outname=/n/fs/topics/users/sgerrish/data/docinf/data/test_data \
--root_directory=/n/fs/topics/users/sgerrish/data/docinf/data/test_data \
--prp_obs_var=0.5 \
--top_obs_var=0.5 \
--phi_squared_factor=1.0 \
--prp_chain_var=0.5 \
--top_chain_var=0.005 \
--alpha=0.01 \
--influence_mean_years=20 \
--sigma_d=0.1 \
--new_phi=true \
--ntopics=5 \
--number_tasks=10 \
--lda_max_em_iter=10
#include "gflags.h"
#include "strutil.h"
#include "c2_lib.h"
#include <sys/stat.h>
#include <sys/wait.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <ext/hash_set>
#include <ext/hash_map>
#include <vector>
using namespace __gnu_cxx;
DEFINE_bool(run_locally, false,
"If true, run locally.");
DEFINE_string(bin_directory, ".",
"The directory in which binaries live.");
DEFINE_bool(resume, false,
"If true, resume a halted job.");
namespace c2_lib {
bool FileRemove(string file_name) {
return remove(file_name.c_str()) == 0;
}
string CurrentWorkingDirectory() {
char* cwd = getcwd(NULL, 0);
string result(cwd);
free(cwd);
return result;
}
bool FileExists(string file_name) {
struct stat file_info;
// stat returns 0 iff the file exists and its attributes are readable.
return stat(file_name.c_str(), &file_info) == 0;
}
void WaitOnTask(const Task& task) {
bool done = false;
double backoff(1.0);
while (!done) {
done = true;
if (task.Status() != Task::SUCCESS) {
done = false;
if (task.Status() == Task::FAIL) {
printf("Error. Task failed. Debug: %s\n",
task.DebugString().c_str());
exit(1);
}
}
if (backoff > 2.0) {
sleep(backoff);
}
backoff *= 1.1;
if (backoff > 30) {
backoff = 30;
}
}
}
void WaitOnTasks(const vector<Task*>& tasks) {
double backoff(1.0);
vector<Task*> pending_tasks = tasks;
hash_map<long, int> task_tries;
while (!pending_tasks.empty()) {
for (vector<Task*>::iterator it=pending_tasks.begin();
it != pending_tasks.end();
) {
Task* task = *it;
if (task->Status() != Task::SUCCESS) {
if (task->Status() == Task::FAIL) {
if (task_tries.find((long) task) == task_tries.end()) {
task_tries[(long) task] = 1;
}
// Sometimes tasks fail for no good reason.
// Retry up to 3 times.
if (task_tries[(long) task] > 3) {
printf("Error. Task %d failed.\n",
task->id);
printf("Debug string: %s\n",
task->DebugString().c_str());
exit(1);
} else {
printf("Task %d failed. Retrying. Try: %d\n",
task->id,
task_tries[(long) task]);
printf("Debug string: %s\n",
task->DebugString().c_str());
++task_tries[(long) task];
task->Start();
}
}
// Don't break, since it's worth
// checking whether any tasks have failed
// (in which case, we should die).
++it;
} else {
printf("Removing task.\n");
// Delete this element.
it = pending_tasks.erase(it);
}
}
printf("Waiting on %zu tasks.\n",
pending_tasks.size());
if (backoff > 2.0) {
sleep(backoff);
}
backoff *= 1.1;
if (backoff > 30) {
backoff = 30;
}
}
}
Resource::Resource(string filename)
: filename(filename) {
}
Resource::~Resource() {
// We should not let this resource go out of scope if tasks are
// still using it.
if (references_.size() != 0) {
printf("Error. Resource deleted while tasks still reference it.\n");
exit(1);
}
}
bool Resource::Available() {
return FileExists(filename);
}
void Resource::TaskDone(int id) {
hash_set<int>::iterator it = references_.find(id);
if (it != references_.end()) {
references_.erase(it);
}
}
// Task definitions.
string Task::working_directory_ = "";
void Task::AddResource(Resource* resource) {
resources_.push_back(resource);
resource->AddReference(id);
}
Task::Task(string name, int id)
: command_(""),
id(id),
name(name),
done_(false),
resources_()
{
}
Task::~Task() {
for (vector<Resource*>::iterator it = resources_.begin();
it != resources_.end();
++it) {
(*it)->TaskDone(id);
}
}
void Task::ResourcesAvailable_(bool* available,
string* resource_filename) {
*available = true;
for (vector<Resource*>::iterator it = resources_.begin();
it != resources_.end() && *available;
++it) {
if (!(*it)->Available()) {
*available = false;
*resource_filename = (*it)->filename;
}
}
return;
}
}
namespace dtm {
using c2_lib::FileRemove;
using c2_lib::FileExists;
using c2_lib::CurrentWorkingDirectory;
string QSubTask::binary_ = "";
double ReadLikelihoods(const vector<string>& done_sentinels) {
double l_hood = 0.0;
for (vector<string>::const_iterator it = done_sentinels.begin();
it != done_sentinels.end();
++it) {
double l_hood_tmp = 0.0;
FILE* f = fopen((*it).c_str(), "r");
if (f) {
fscanf(f, "%lf", &l_hood_tmp);
fclose(f);
} else {
printf("Error reading likelihood from file %s.\n",
(*it).c_str());
}
l_hood += l_hood_tmp;
printf("Likelihood: %.2lf\n", l_hood);
}
return l_hood;
}
string QSubTask::DebugString() const {
return StringPrintf("stderr: %s\n"
"stdout: %s\n"
"sentinel: %s\n"
"full commandline: %s\n",
stderr_filename_.c_str(),
stdout_filename_.c_str(),
done_sentinel_.c_str(),
full_commandline_.c_str());
}
static int vsystem(string command) {
int pid;
int return_code = 0;
pid = vfork();
if (pid == 0) {
// We're the child process. Execute.
execl("/bin/sh",
"/bin/sh",
command.c_str(),
(char*) 0);
_exit (EXIT_FAILURE);
} else if (pid < 0) {
// The fork failed. Report failure.
return_code = -1;
} else {
// This is the parent process. Wait for the
// child to complete.
if (waitpid(pid, &return_code, 0) != pid) {
return_code = -1;
}
}
return return_code;
}
void QSubTask::Start() {
string resource_filename;
bool available;
ResourcesAvailable_(&available,
&resource_filename);
if (!available) {
printf("Error. Task resource unavailable: %s\n",
resource_filename.c_str());
}
if (done_sentinel_.empty()) {
stdout_filename_ = StringPrintf(
"%s/%s_%d.out",
working_directory_.c_str(),
name.c_str(),
id);
stderr_filename_ = StringPrintf(
"%s/%s_%d.err",
working_directory_.c_str(),
name.c_str(),
id);
done_sentinel_ = StringPrintf(
"%s/%s_%d.done",
working_directory_.c_str(),
name.c_str(),
id);
}
if (FileExists(done_sentinel_)) {
if (!FileRemove(done_sentinel_)) {
printf("Error removing file. Failing.\n");
exit(1);
}
}
if (FileExists(stdout_filename_)) {
if (!FileRemove(stdout_filename_)) {
printf("Error removing file. Failing.\n");
exit(1);
}
}
if (FileExists(stderr_filename_)) {
if (!FileRemove(stderr_filename_)) {
printf("Error removing file. Failing.\n");
exit(1);
}
}
string job_command = StringPrintf("%s/%s",
FLAGS_bin_directory.c_str(),
command_.c_str());
const string kFilename = "/tmp/command_asdf.sh";
if (FLAGS_run_locally) {
full_commandline_ = StringPrintf("nohup ./%s",
job_command.c_str());
} else {
// TODO(sgerrish): Add proper temp file naming.
full_commandline_ = StringPrintf(
"echo \'cd %s ; %s ; \n \' "
"| %s -o %s -e %s "
"-l walltime=%d:00:00,mem=%dmb\n",
CurrentWorkingDirectory().c_str(),
job_command.c_str(),
binary_.c_str(),
stdout_filename_.c_str(),
stderr_filename_.c_str(),
walltime_hours_,
memory_mb_);
}
printf("Current working directory: %s.\n", CurrentWorkingDirectory().c_str());
FILE* f = fopen(kFilename.c_str(), "w");
if (f) {
fprintf(f, "%s", full_commandline_.c_str());
fclose(f);
chmod(kFilename.c_str(), 0744);
} else {
printf("Error opening file %s. Skipping.\n", kFilename.c_str());
}
printf("Running command: %s.\n", full_commandline_.c_str());
int return_code = vsystem(kFilename.c_str());
if (return_code) {
printf("Return code %d when running command:\n"
"%s\n",
return_code,
full_commandline_.c_str());
exit(1);
}
}
Task::RunStatus QSubTask::Status() const {
// The sentinel file exists iff the task completed successfully.
if (FileExists(done_sentinel_)) {
return Task::SUCCESS;
}
// The stderr file appears once the job has been scheduled; if it
// exists but the sentinel does not, the job ended without success.
if (!FileExists(stderr_filename_)) {
return Task::RUN;
}
return Task::FAIL;
}
QSubTask* TaskFactory::NewTask(int id,
string job_name,
const char* root,
const hash_map<std::string, std::string>& flags,
const vector<Resource*>& resources,
const string& done_sentinel,
string binary) {
QSubTask* task = new QSubTask(job_name, id);
string flags_string;
/*
for (int i=0; i < resources.size(); ++i) {
task->AddResource(resources[i]);
if (i) {
resource_string += " ";
}
resource_string += ("--"
+ resources[i]->name
+ "="
+ resources[i]->filename);
}
*/
for (hash_map<string, string>::const_iterator it=flags.begin();
it != flags.end();
++it) {
if (it != flags.begin()) {
flags_string += " ";
}
flags_string += ("--"
+ it->first
+ "="
+ it->second);
}
task->set_done_sentinel(done_sentinel);
task->set_working_directory(root);
task->set_command(binary + " " + flags_string);
if (flags.find("done_sentinel") != flags.end()) {
// task->set_done_sentinel(flags["done_sentinel"]);
task->set_done_sentinel(flags.find("done_sentinel")->second);
}
// Set the parallel-processing binary appropriately.
task->set_binary("/opt/torque/bin/qsub");
return(task);
}
void CreateSentinel(const string& sentinel_filename,
double l_hood) {
FILE* f = fopen(sentinel_filename.c_str(), "w");
if (!f) {
printf("Error opening sentinel file %s.\n", sentinel_filename.c_str());
return;
}
fprintf(f, "%lf\n", l_hood);
fclose(f);
}
} // namespace dtm
#ifndef c2_lib_h__
#define c2_lib_h__
#include "strutil.h"
#include <string>
#include <iostream>
#include <vector>
#include <ext/hash_set>
#include <ext/hash_map>
using namespace std;
using namespace __gnu_cxx;
namespace __gnu_cxx {
template<> struct hash< std::string >
{
size_t operator()( const std::string& x ) const
{
return hash< const char* >()( x.c_str() );
}
};
}
namespace c2_lib {
bool FileRemove(string file_name);
string CurrentWorkingDirectory();
bool FileExists(string file_name);
class Task;
// Block until all tasks in tasks are complete.
// If any task has failed, print a message and exit(1).
void WaitOnTask(const Task& task);
void WaitOnTasks(const vector<Task*>& tasks);
// Generally corresponds to a single file.
class Resource {
public:
Resource(string filename);
~Resource();
// True iff the resource is available.
virtual bool Available();
// Removes a task from references_.
void TaskDone(int id);
void AddReference(int id) {
references_.insert(id);
}
// The filename corresponding to this resource.
string filename;
// A name given to this resource.
string name;
private:
__gnu_cxx::hash_set<int> references_;
};
// Generally corresponds to one "parallel" task.
class Task {
public:
// Name can be anything. Id should be unique.
Task(string name, int id);
~Task();
// Add a resource.
virtual void AddResource(Resource* resource);
// Launch the task.
virtual void Start() = 0;
virtual int Priority() const {
return 1;
}
virtual string DebugString() const {
return "";
}
// The run status of a task.
enum RunStatus {
RUN,
SUCCESS,
FAIL,
};
virtual RunStatus Status() const = 0;
void set_command(string command) {
command_ = command;
}
static void set_working_directory(string working_directory) {
working_directory_ = working_directory;
}
// A numeric integer describing this job's id. Should be unique at
// any given time (could be recycled over the duration of the
// program).
int id;
// A string describing this job. Need not be unique.
string name;
// The working directory for parallel tasks.
static string working_directory_;
protected:
// If the resources are available
void ResourcesAvailable_(bool* available,
string* resource_filename);
// The command which has been requested run.
string command_;
private:
bool done_;
vector<Resource*> resources_;
};
} // namespace c2_lib
using namespace c2_lib;
namespace dtm {
// Helper functions
double ReadLikelihoods(
const vector<string>& done_sentinels);
class QSubTask
: public Task {
public:
QSubTask(string job_name, int id) : Task(job_name, id),
stdout_filename_(""),
stderr_filename_(""),
full_commandline_(""),
done_sentinel_(""),
walltime_hours_(13),
memory_mb_(15000) {}
void Start();
RunStatus Status() const;
int Priority() const {
return walltime_hours_;
}
string DebugString() const;
static void set_binary(const string& binary) {
binary_ = binary;
}
void set_done_sentinel(const string& sentinel) {
done_sentinel_ = sentinel;
stdout_filename_ = sentinel + ".out";
stderr_filename_ = sentinel + ".err";
}
void set_memory_mb(int mb) {
memory_mb_ = mb;
}
void set_walltime_hours(int hours) {
walltime_hours_ = hours;
}
private:
// The parallel processing binary.
static string binary_;
string stdout_filename_;
string stderr_filename_;
string full_commandline_;
int walltime_hours_;
int memory_mb_;
// The name of a small file which should exist iff the program has
// completed successfully.
string done_sentinel_;
};
class TaskFactory {
public:
TaskFactory() {}
~TaskFactory() {}
static Task* NewEStepTask(int id,
const char* root,
const hash_map<std::string, std::string>& flags,
const vector<Resource*>& resources,
const string& done_sentinel) {
QSubTask* task = NewTask(id, "lda-e-step",
root,
flags,
resources,
done_sentinel,
"lda-e-step");
task->set_memory_mb(4000);
task->set_walltime_hours(5);
return task;
}
static Task* NewMStepTask(int id,
const char* root,
const hash_map<std::string, std::string>& flags,
const vector<Resource*>& resources,
const string& done_sentinel) {
QSubTask* task = NewTask(id, "lda-m-step",
root,
flags,
resources,
done_sentinel,
"lda-m-step");
task->set_memory_mb(4000);
task->set_walltime_hours(3);
return task;
}
static Task* NewTopicsFitTask(int id,
const char* root,
const hash_map<std::string, std::string>& flags,
const vector<Resource*>& resources,
const string& done_sentinel) {
QSubTask* task = NewTask(id, "dtm-fit_seq_topics",
root,
flags,
resources,
done_sentinel,
"fit-seq-topics");
task->set_memory_mb(8000);
task->set_walltime_hours(10);
return task;
}
static Task* NewDocsFitTask(int id,
const char* root,
const hash_map<std::string, std::string>& flags,
const vector<Resource*>& resources,
const string& done_sentinel) {
QSubTask* task = NewTask(id, "dtm-fit_seq_docs",
root,
flags,
resources,
done_sentinel,
"fit-seq-docs");
task->set_memory_mb(4000);
task->set_walltime_hours(18);
return task;
}
static QSubTask* NewTask(int id,
string job_name,
const char* root,
const hash_map<std::string, std::string>& flags,
const vector<Resource*>& resources,
const string& done_sentinel,
string binary);
private:
};
void CreateSentinel(const string& sentinel_filename,
double l_hood);
} // namespace dtm
#endif
#include <string>
#include "gflags.h"
#include "params.h"
#include "strutil.h"
#include "ss-lm-fit_seq_topics.h"
#include "data.h"
DEFINE_double(sigma_l,
0.05,
"l stdev.");
DEFINE_double(sigma_d,
0.05,
"d stdev.");
DEFINE_double(sigma_c,
0.05,
"c stdev.");
DEFINE_double(resolution,
1,
"The resolution. Used to determine how far out the beta mean should be.");
DEFINE_int32(max_number_time_points,
200,
"Used for the influence window.");
DEFINE_double(time_resolution,
0.5,
"This is the number of years per time slice.");
DEFINE_double(influence_mean_years,
20.0,
"How many years is the mean number of citations?");
DEFINE_double(influence_stdev_years,
15.0,
"How many years is the stdev number of citations?");
DEFINE_int32(influence_flat_years,
-1,
"How many years is the influence nonzero?"
"If nonpositive, a lognormal distribution is used.");
DEFINE_int32(fix_topics,
0,
"Fix all topics below this number.");
DECLARE_string(normalize_docs);
using namespace std;
namespace dtm {
/*
* seq corpus range: [start, end)
*
* creates a subset of time slices
*
*/
corpus_seq_t* make_corpus_seq_subset(corpus_seq_t* all,
int start,
int end) {
int n;
corpus_seq_t* subset_corpus = (corpus_seq_t*) malloc(sizeof(corpus_seq_t));
subset_corpus->nterms = all->nterms;
subset_corpus->len = end - start;
subset_corpus->ndocs = 0;
subset_corpus->corpus = (corpus_t**) malloc(sizeof(corpus_t*) * subset_corpus->len);
for (n = start; n < end; n++)
{
subset_corpus->corpus[n - start] = all->corpus[n];
subset_corpus->ndocs += all->corpus[n]->ndocs;
}
return(subset_corpus);
}
/*
* collapse a sequential corpus to a flat corpus
*
*/
corpus_t* collapse_corpus_seq(corpus_seq_t* c) {
corpus_t* collapsed = (corpus_t*) malloc(sizeof(corpus_t));
collapsed->ndocs = c->ndocs;
collapsed->nterms = c->nterms;
collapsed->doc = (doc_t**) malloc(sizeof(doc_t*) * c->ndocs);
collapsed->max_unique = 0;
int t, n, doc_idx = 0;
for (t = 0; t < c->len; t++)
{
for (n = 0; n < c->corpus[t]->ndocs; n++)
{
collapsed->doc[doc_idx] = c->corpus[t]->doc[n];
if (collapsed->doc[doc_idx]->nterms > collapsed->max_unique)
collapsed->max_unique = collapsed->doc[doc_idx]->nterms;
doc_idx++;
}
}
assert(doc_idx == collapsed->ndocs);
return(collapsed);
}
/*
* read corpus
*
*/
corpus_t* read_corpus(const char* name)
{
int length, count, word, n;
corpus_t* c;
char filename[400];
sprintf(filename, "%s-mult.dat", name);
outlog("reading corpus from %s", filename);
c = (corpus_t*) malloc(sizeof(corpus_t));
c->max_unique = 0;
FILE* fileptr = fopen(filename, "r");
if (fileptr == NULL) {
outlog("Error reading corpus prefix %s. Failing.",
filename);
exit(1);
}
c->ndocs = 0; c->nterms = 0;
c->doc = (doc_t**) malloc(sizeof(doc_t*));
int grand_total = 0;
while ((fscanf(fileptr, "%10d", &length) != EOF))
{
if (length > c->max_unique) c->max_unique = length;
c->doc = (doc_t**) realloc(c->doc, sizeof(doc_t*)*(c->ndocs+1));
c->doc[c->ndocs] = (doc_t*) malloc(sizeof(doc_t));
c->doc[c->ndocs]->nterms = length;
c->doc[c->ndocs]->total = 0;
c->doc[c->ndocs]->log_likelihood = 0.0;
c->doc[c->ndocs]->word = (int*) malloc(sizeof(int)*length);
c->doc[c->ndocs]->count = (int*) malloc(sizeof(int)*length);
c->doc[c->ndocs]->lambda = (double*) malloc(sizeof(double)*length);
c->doc[c->ndocs]->log_likelihoods = (double*) malloc(sizeof(double)*length);
for (n = 0; n < length; n++)
{
fscanf(fileptr, "%10d:%10d", &word, &count);
word = word - OFFSET;
if (FLAGS_normalize_docs == "occurrence") {
count = 1;
}
c->doc[c->ndocs]->word[n] = word;
c->doc[c->ndocs]->count[n] = count;
c->doc[c->ndocs]->total += count;
// Is there a better value for initializing lambda?
c->doc[c->ndocs]->lambda[n] = 0.0;
c->doc[c->ndocs]->log_likelihoods[n] = 0.0;
if (word >= c->nterms) { c->nterms = word + 1; }
}
grand_total += c->doc[c->ndocs]->total;
c->ndocs = c->ndocs + 1;
}
fclose(fileptr);
outlog("read corpus (ndocs = %d; nterms = %d; nwords = %d)\n",
c->ndocs, c->nterms, grand_total);
return(c);
}
void free_corpus(corpus_t* corpus) {
for (int i=0; i < corpus->ndocs; ++i) {
// These arrays were allocated with malloc in read_corpus, so they
// must be released with free, not delete.
free(corpus->doc[i]->word);
free(corpus->doc[i]->count);
free(corpus->doc[i]->lambda);
free(corpus->doc[i]->log_likelihoods);
free(corpus->doc[i]);
}
free(corpus->doc);
free(corpus);
}
/*
* read corpus sequence
*
*/
corpus_seq_t* read_corpus_seq(const char* name)
{
char filename[400];
corpus_seq_t* corpus_seq = (corpus_seq_t*) malloc(sizeof(corpus_seq_t));
// read corpus
corpus_t* raw_corpus = read_corpus(name);
corpus_seq->nterms = raw_corpus->nterms;
// read sequence information
sprintf(filename, "%s-seq.dat", name);
outlog("Reading corpus sequence %s.", filename);
FILE* fileptr = fopen(filename, "r");
if (!fileptr) {
outlog("Error opening dtm sequence file %s.\n",
filename);
exit(1);
}
fscanf(fileptr, "%d", &(corpus_seq->len));
corpus_seq->corpus = (corpus_t**) malloc(sizeof(corpus_t*) * corpus_seq->len);
// allocate corpora
int doc_idx = 0;
int ndocs, i, j;
corpus_seq->ndocs = 0;
for (i = 0; i < corpus_seq->len; ++i)
{
fscanf(fileptr, "%d", &ndocs);
corpus_seq->ndocs += ndocs;
corpus_seq->corpus[i] = (corpus_t*) malloc(sizeof(corpus_t));
corpus_seq->corpus[i]->ndocs = ndocs;
corpus_seq->corpus[i]->doc = (doc_t**) malloc(sizeof(doc_t*) * ndocs);
for (j = 0; j < ndocs; j++)
{
if (doc_idx >= raw_corpus->ndocs) {
outlog("Error: too few documents listed in dtm sequence file %s.\n"
"Current line: %d %d %d.\n",
filename,
doc_idx,
ndocs,
j);
exit(1);
}
// outlog("%d %d %d %d\n", i, j, doc_idx, raw_corpus->ndocs);
corpus_seq->corpus[i]->doc[j] = raw_corpus->doc[doc_idx];
doc_idx++;
}
}
corpus_seq->max_nterms = compute_max_nterms(corpus_seq);
outlog("read corpus of length %d\n", corpus_seq->len);
return(corpus_seq);
}
/*
* write sequential corpus
*
*/
void write_corpus_seq(corpus_seq_t* c, char* name)
{
char tmp_string[400];
int n;
outlog("writing %d slices to %s (%d total docs)", c->len, name, c->ndocs);
sprintf(tmp_string, "%s-seq.dat", name);
FILE* seq_file = fopen(tmp_string, "w");
fprintf(seq_file, "%d", c->len);
for (n = 0; n < c->len; n++)
fprintf(seq_file, " %d", c->corpus[n]->ndocs);
fclose(seq_file);
corpus_t* flat = collapse_corpus_seq(c);
sprintf(tmp_string, "%s-mult.dat", name);
write_corpus(flat, tmp_string);
}
/*
* write corpus
*
*/
void write_corpus(corpus_t* c, char* filename)
{
int i, j;
FILE * fileptr;
doc_t * d;
outlog("writing %d docs to %s\n", c->ndocs, filename);
fileptr = fopen(filename, "w");
for (i = 0; i < c->ndocs; i++)
{
d = c->doc[i];
fprintf(fileptr, "%d", d->nterms);
for (j = 0; j < d->nterms; j++)
{
fprintf(fileptr, " %d:%d", d->word[j], d->count[j]);
}
fprintf(fileptr, "\n");
}
fclose(fileptr);
}
/*
* read and write lda sequence variational distribution
*
*/
void write_lda_seq_docs(const string& model_type,
const string& root_directory,
int min_time,
int max_time,
const lda_seq* model) {
string filename = StringPrintf("%s/times_%d_%d_info.dat",
root_directory.c_str(),
min_time,
max_time);
FILE* f = fopen(filename.c_str(), "w");
if (!f) {
outlog("Error opening file %s for writing. Failing.", filename.c_str());
exit(1);
}
params_write_int(f, "NUM_TOPICS", model->ntopics);
params_write_int(f, "NUM_TERMS", model->nterms);
params_write_int(f, "SEQ_LENGTH", model->nseq);
params_write_gsl_vector(f, "ALPHA", model->alpha);
fclose(f);
for (int k = 0; k < model->ntopics; k++) {
if (model_type == "fixed") {
gsl_matrix_view view = gsl_matrix_submatrix(
model->topic[k]->w_phi_l,
0, min_time, model->nterms, max_time - min_time);
filename = StringPrintf(
"%s/w_phi_l-topic_%d-time_%d_%d.dat",
root_directory.c_str(), k, min_time, max_time);
mtx_fprintf(filename.c_str(), &view.matrix);
// Note that in this case, we write the entire sequence.
view = gsl_matrix_submatrix(
model->topic[k]->m_update_coeff,
0, 0, model->nterms, max_time - min_time);
filename = StringPrintf("%s/m_update_coeff-topic_%d-time_%d_%d.dat",
root_directory.c_str(),
k, min_time, max_time);
mtx_fprintf(filename.c_str(), &view.matrix);
}
}
for (int t=min_time; t < max_time; ++t) {
filename = StringPrintf(
"%s/influence_time-%03d",
root_directory.c_str(), t);
outlog("\nwriting influence weights for time %d to %s",
t, filename.c_str());
gsl_matrix* influence_t =
model->influence->doc_weights[t];
assert(model->ntopics == influence_t->size2);
mtx_fprintf(filename.c_str(), influence_t);
filename = StringPrintf("%s/renormalized_influence_time-%03d",
root_directory.c_str(), t);
outlog("\nwriting influence weights for time %d to %s",
t, filename.c_str());
influence_t =
model->influence->renormalized_doc_weights[t];
assert(model->ntopics == influence_t->size2);
mtx_fprintf(filename.c_str(), influence_t);
}
}
/*
* compute the maximum nterms in a corpus sequence
*
*/
int compute_max_nterms(const corpus_seq_t* c)
{
int i,j;
int max = 0;
for (i = 0; i < c->len; i++)
{
corpus_t* corpus = c->corpus[i];
for (j = 0; j < corpus->ndocs; j++)
if (corpus->doc[j]->nterms > max)
max = corpus->doc[j]->nterms;
}
return(max);
}
/*
* compute the total matrix of counts (W x T)
*
*/
gsl_matrix* compute_total_counts(const corpus_seq_t* c)
{
int t, d, n;
// Use calloc so the count matrix starts at zero before minc
// increments into it.
gsl_matrix* ret = gsl_matrix_calloc(c->nterms, c->len);
for (t = 0; t < c->len; t++)
{
corpus_t* corpus = c->corpus[t];
for (d = 0; d < corpus->ndocs; d++)
{
doc_t* doc = corpus->doc[d];
for (n = 0; n < doc->nterms; n++)
{
minc(ret, doc->word[n], t, (double) doc->count[n]);
}
}
}
return(ret);
}
// Creates a new array of doubles with size elements.
double* NewScaledInfluence(int size) {
double* scaled_influence = new double[size];
if (FLAGS_influence_flat_years > 0) {
// Note that we round up, to make sure we have at least one epoch.
int number_epochs = (int) ceil(FLAGS_influence_flat_years * FLAGS_time_resolution);
double epoch_weight = 1.0 / number_epochs;
for (int i=0; i < number_epochs; ++i) {
scaled_influence[i] = epoch_weight;
}
for (int i=number_epochs; i < size; ++i) {
scaled_influence[i] = 0.0;
}
return scaled_influence;
}
/*
// Use the simple distribution: 1 at [0], 0 everywhere else.
for (int i=0; i < size; ++i) {
scaled_influence[i] = 0.0;
}
scaled_influence[0] = 1.0;
// scaled_beta[1] = 0;
return scaled_influence;
*/
/*
// Simulate a beta distribution with specified mean and variance.
double total = 0.0;
double tmp = (scaled_beta_mean
* (1 - scaled_beta_mean)
/ scaled_beta_variance) - 1.0;
double beta_alpha = scaled_beta_mean * tmp;
double beta_beta = (1 - scaled_beta_mean) * tmp;
for (int i=0; i < scaled_beta_max; ++i) {
// Offset tmp by 0.5 so we get a centered distribution
// and don't run into degeneracy issues.
tmp = (i + 0.5) / (scaled_beta_max);
scaled_beta[i] = (pow(tmp, beta_alpha - 1.0)
* pow(1 - tmp, beta_beta - 1.0));
total += scaled_beta[i];
}
*/
// Handle the log-normal distribution.
double total = 0.0;
// Here, we're interested more in the median.
// So we treat the variable mean as median and note this in
// our paper.
double scaled_influence_mean = FLAGS_influence_mean_years;
double scaled_influence_variance = (FLAGS_influence_stdev_years
* FLAGS_influence_stdev_years);
double tmp = (1.0
+ (scaled_influence_variance
/ (scaled_influence_mean
* scaled_influence_mean)));
double lognormal_sigma_squared = log(tmp);
double lognormal_mu = (log(scaled_influence_mean)
- 0.5 * lognormal_sigma_squared);
printf("Median: %.2f\n", exp(lognormal_mu));
for (int i = 0; i < size; ++i) {
// Shift right by half a timeframe to avoid corner cases.
double x = (i / FLAGS_time_resolution) + (1.0 / FLAGS_time_resolution) / 2;
double tmp2 = (log(x) - lognormal_mu);
scaled_influence[i] = (1.0
			  / (x * sqrt(lognormal_sigma_squared * 2 * M_PI))
* exp(-tmp2 * tmp2
/ (2.0
* lognormal_sigma_squared)));
total += scaled_influence[i];
}
for (int i = 0; i < size; ++i) {
scaled_influence[i] /= total;
}
return scaled_influence;
}
} // namespace dtm
#ifndef DATA_H
#define DATA_H
#include "gsl-wrappers.h"
#include "param.h"
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <hash_map>
#define OFFSET 0
//class string;
using namespace std;
namespace dtm {
// Create the scaled beta distribution, which describes
// how much weight documents have after n years.
const int kScaledInfluenceMax = 200;
// This mean and variance are relative to the interval [0, 1.0].
const double kScaledInfluenceMean = 10.0 / kScaledInfluenceMax;
const double kScaledInfluenceVariance = ((10.0 / kScaledInfluenceMax)
* (10.0 / kScaledInfluenceMax));
/*
* a document is a collection of counts and terms
*
*/
typedef struct doc_t
{
int total;
int nterms;
int* word;
int* count;
// A parameter for finding phi.
double* lambda;
// Used for measuring perplexity.
double log_likelihood;
double* log_likelihoods;
} doc_t;
/*
* a corpus is a collection of documents
*
*/
typedef struct corpus_t
{
doc_t** doc;
int ndocs;
int nterms;
int max_unique; // maximum number of unique terms in a document
} corpus_t;
/*
* a sequence is a sequence of corpora
*
*/
typedef struct corpus_seq_t
{
corpus_t** corpus;
int nterms;
int max_nterms;
int len;
int ndocs;
} corpus_seq_t;
typedef struct inf_var
{
gsl_matrix** doc_weights; // T matrices of document weights.
// each matrix is d_t x K.
gsl_matrix** renormalized_doc_weights; // T matrices of document weights.
// each matrix is d_t x K.
int ntime;
} inf_var;
/*
* variational posterior structure
*
*/
typedef struct sslm_var {
// properties
int W; // vocabulary size
int T; // sequence length
// variational parameters
gsl_matrix* obs; // observations, W x T
// biproducts of the variational parameters
double obs_variance; // observation variance
double chain_variance; // chain variance
gsl_vector* zeta; // extra variational parameter, T
gsl_matrix* e_log_prob; // E log prob(w | t), W x T
// convenient quantities for inference
gsl_matrix* fwd_mean; // forward posterior mean, W x T
gsl_matrix* fwd_variance; // forward posterior variance, W x T
gsl_matrix* mean; // posterior mean, W x T
gsl_matrix* variance; // posterior variance, W x T
gsl_matrix* mean_t; // W x T
gsl_matrix* variance_t;
gsl_matrix* influence_sum_lgl; // The sum exp * w_phi_l
// Recent copy of w_phi_l.
gsl_matrix* w_phi_l; // W x T
// gsl_matrix* w_phi_sum; // W x T
// gsl_matrix* w_phi_l_sq; // Square term involving various
gsl_matrix* m_update_coeff; // Terms involving squares of
// W, l, and phi. W x T.
gsl_matrix* m_update_coeff_g; // \sum_i=0..t phi_l(t) r(i-t)
// W x T.
// useful temporary vector
gsl_vector* T_vct;
} sslm_var;
typedef struct lda_seq
{
int ntopics; // number of topics
int nterms; // number of terms
int nseq; // length of sequence
gsl_vector* alpha; // dirichlet parameters
sslm_var** topic; // topic chains.
inf_var* influence; // document weights
gsl_matrix** influence_sum_lgl; // Sum of document weights at time t (see g in the regression formula)
// gsl_vector** influence_sum_g; // Sum of document weights at time t.
// gsl_vector** influence_sum_h; // Sum of document weights at time t.
inf_var* renormalized_influence; // document weights
// gsl_matrix** w_phi_l; // Product term for the \beta update.
// gsl_matrix** w_phi_l_sq; // Square term involving various
// coefficients for the \beta update.
pair<int, float>**** top_doc_phis; // T x D_t x n of document phis.
} lda_seq;
/*
* functions
*
*/
corpus_t* read_corpus(const char* name);
void free_corpus(corpus_t* corpus);
corpus_seq_t* read_corpus_seq(const char* name);
int compute_max_nterms(const corpus_seq_t* c);
gsl_matrix * compute_total_counts(const corpus_seq_t* c);
corpus_seq_t* make_seq_corpus_subset(corpus_seq_t* all, int start, int end);
void write_corpus(corpus_t* c, char* filename);
void write_corpus_seq(corpus_seq_t* c, char* name);
corpus_seq_t* make_corpus_seq_subset(corpus_seq_t* all, int start, int end);
corpus_t* collapse_corpus_seq(corpus_seq_t* c);
double* NewScaledInfluence(int size);
/*
* Reading and writing.
*
*/
// read and write an lda sequence
void write_lda_seq_docs(const string& model_type,
const string& root_directory,
int min_time,
int max_time,
const lda_seq* m);
} // namespace dtm
#endif
#include "gflags.h"
#include "gsl-wrappers.h"
static gsl_rng* RANDOM_NUMBER_GENERATOR = NULL;
DEFINE_int64(rng_seed,
0,
"Specifies the random seed. If 0, seeds pseudo-randomly.");
// The maximum number of iterations for each update.
const int MAX_ITER = 15;
/*
* safe logarithm function
*
*/
double safe_log(double x)
{
if (x == 0)
{
return(-1000);
}
else
{
return(log(x));
}
}
/*
* given log(a) and log(b), return log(a+b)
*
*/
double log_sum(double log_a, double log_b)
{
double v;
if (log_a == -1) return(log_b);
if (log_a < log_b)
{
v = log_b+log(1 + exp(log_a-log_b));
}
else
{
v = log_a+log(1 + exp(log_b-log_a));
}
return(v);
}
void vinc(gsl_vector* v, int i, double x)
{
vset(v, i, vget(v, i) + x);
}
void minc(gsl_matrix* m, int i, int j, double x)
{
mset(m, i, j, mget(m, i, j) + x);
}
void msetrow(gsl_matrix* m, int r, const gsl_vector* val)
{
int i;
gsl_vector v = gsl_matrix_row(m, r).vector;
for (i = 0; i < v.size; i++)
vset(&v, i, vget(val, i));
}
void msetcol(gsl_matrix* m, int r, const gsl_vector* val)
{
int i;
gsl_vector v = gsl_matrix_column(m, r).vector;
for (i = 0; i < v.size; i++)
vset(&v, i, vget(val, i));
}
/*
* compute the column sums of a matrix
*
*/
void col_sum(gsl_matrix* m, gsl_vector* val)
{
int i, j;
gsl_vector_set_all(val, 0);
for (i = 0; i < m->size1; i++)
for (j = 0; j < m->size2; j++)
vinc(val, j, mget(m, i, j));
}
/*
* print a vector to standard out
*
*/
void vct_printf(const gsl_vector * v)
{
int i;
for (i = 0; i < v->size; i++)
printf("%5.5f ", vget(v, i));
printf("\n\n");
}
/*
* print a matrix to standard out
*
*/
void mtx_printf(const gsl_matrix * m)
{
int i, j;
for (i = 0; i < m->size1; i++)
{
for (j = 0; j < m->size2; j++)
printf("%5.5f ", mget(m, i, j));
printf("\n");
}
}
/*
* read/write a vector/matrix from a file
*
*/
void vct_fscanf(const char* filename, gsl_vector* v)
{
  outlog("reading %ld vector from %s", v->size, filename);
  FILE* fileptr = fopen(filename, "r");
  if (!fileptr) {
    outlog("Error opening file %s. Failing.", filename);
    exit(1);
  }
  gsl_vector_fscanf(fileptr, v);
  fclose(fileptr);
}
void mtx_fscanf(const char* filename, gsl_matrix * m)
{
FILE* fileptr = fopen(filename, "r");
outlog("reading %ld x %ld matrix from %s",
m->size1, m->size2, filename);
if (!fileptr) {
outlog("Error opening file %s. Failing.", filename);
exit(1);
}
gsl_matrix_fscanf(fileptr, m);
fclose(fileptr);
}
void vct_fprintf(const char* filename, gsl_vector* v)
{
outlog( "writing %ld vector to %s", v->size, filename);
FILE* fileptr;
fileptr = fopen(filename, "w");
if (!fileptr) {
outlog("Error opening file %s. Failing.", filename);
exit(1);
}
gsl_vector_fprintf(fileptr, v, "%20.17e");
fclose(fileptr);
}
void mtx_fprintf(const char* filename, const gsl_matrix * m)
{
outlog( "writing %ld x %ld matrix to %s",
m->size1, m->size2, filename);
FILE* fileptr;
fileptr = fopen(filename, "w");
if (!fileptr) {
outlog("Error opening file: %s", filename);
exit(1);
}
gsl_matrix_fprintf(fileptr, m, "%20.17e");
fclose(fileptr);
}
/*
* matrix inversion using blas
*
*/
void matrix_inverse(gsl_matrix* m, gsl_matrix* inverse)
{
gsl_matrix *lu;
gsl_permutation* p;
int signum;
p = gsl_permutation_alloc(m->size1);
lu = gsl_matrix_alloc(m->size1, m->size2);
gsl_matrix_memcpy(lu, m);
gsl_linalg_LU_decomp(lu, p, &signum);
gsl_linalg_LU_invert(lu, p, inverse);
gsl_matrix_free(lu);
gsl_permutation_free(p);
}
/*
* log determinant using blas
*
*/
double log_det(gsl_matrix* m)
{
gsl_matrix* lu;
gsl_permutation* p;
double result;
int signum;
p = gsl_permutation_alloc(m->size1);
lu = gsl_matrix_alloc(m->size1, m->size2);
gsl_matrix_memcpy(lu, m);
gsl_linalg_LU_decomp(lu, p, &signum);
result = gsl_linalg_LU_lndet(lu);
gsl_matrix_free(lu);
gsl_permutation_free(p);
return(result);
}
/*
* eigenvalues of a symmetric matrix using blas
*
*/
void sym_eigen(gsl_matrix* m, gsl_vector* vals, gsl_matrix* vects)
{
gsl_eigen_symmv_workspace* wk;
gsl_matrix* mcpy;
int r;
mcpy = gsl_matrix_alloc(m->size1, m->size2);
wk = gsl_eigen_symmv_alloc(m->size1);
gsl_matrix_memcpy(mcpy, m);
r = gsl_eigen_symmv(mcpy, vals, vects, wk);
gsl_eigen_symmv_free(wk);
gsl_matrix_free(mcpy);
}
/*
* sum of a vector
*
*/
double sum(const gsl_vector* v)
{
double val = 0;
int i, size = v->size;
for (i = 0; i < size; i++)
val += vget(v, i);
return(val);
}
/*
* take log of each element
*
*/
void vct_log(gsl_vector* v)
{
int i, size = v->size;
for (i = 0; i < size; i++)
vset(v, i, safe_log(vget(v, i)));
}
/*
* l2 norm of a vector
*
*/
// !!! this can be BLASified
double norm(gsl_vector *v)
{
double val = 0;
int i;
for (i = 0; i < v->size; i++)
val += vget(v, i) * vget(v, i);
return(sqrt(val));
}
/*
* draw K random integers from 0..N-1
*
*/
void choose_k_from_n(int k, int n, int* result)
{
int i, x[n];
if (RANDOM_NUMBER_GENERATOR == NULL)
RANDOM_NUMBER_GENERATOR = gsl_rng_alloc(gsl_rng_taus);
for (i = 0; i < n; i++)
x[i] = i;
gsl_ran_choose (RANDOM_NUMBER_GENERATOR, (void *) result, k,
(void *) x, n, sizeof(int));
}
/*
* normalize a vector in log space
*
* x_i = log(a_i)
* v = log(a_1 + ... + a_k)
* x_i = x_i - v
*
*/
void log_normalize(gsl_vector* x)
{
double v = vget(x, 0);
int i;
for (i = 1; i < x->size; i++)
v = log_sum(v, vget(x, i));
for (i = 0; i < x->size; i++)
vset(x, i, vget(x,i)-v);
}
/*
* normalize a positive vector
*
*/
void normalize(gsl_vector* x)
{
double v = 0;
int i;
for (i = 0; i < x->size; i++)
v += vget(x, i);
for (i = 0; i < x->size; i++)
vset(x, i, vget(x, i) / v);
}
/*
* exponentiate a vector
*
*/
void vct_exp(gsl_vector* x)
{
int i;
for (i = 0; i < x->size; i++)
vset(x, i, exp(vget(x, i)));
}
/*
* maximize a function using its derivative
*
*/
void optimize_fdf(int dim,
gsl_vector* x,
void* params,
void (*fdf)(const gsl_vector*, void*, double*, gsl_vector*),
void (*df)(const gsl_vector*, void*, gsl_vector*),
double (*f)(const gsl_vector*, void*),
double* f_val,
double* conv_val,
int* niter)
{
gsl_multimin_function_fdf obj;
obj.f = f;
obj.df = df;
obj.fdf = fdf;
obj.n = dim;
obj.params = params;
// const gsl_multimin_fdfminimizer_type * method =
// gsl_multimin_fdfminimizer_vector_bfgs;
const gsl_multimin_fdfminimizer_type * method =
gsl_multimin_fdfminimizer_conjugate_fr;
gsl_multimin_fdfminimizer * opt =
gsl_multimin_fdfminimizer_alloc(method, dim);
gsl_multimin_fdfminimizer_set(opt, &obj, x, 0.01, 1e-3);
int iter = 0, status;
double converged, f_old = 0;
do
{
iter++;
status = gsl_multimin_fdfminimizer_iterate(opt);
// assert(status==0);
converged = fabs((f_old - opt->f) / (dim * f_old));
// status = gsl_multimin_test_gradient(opt->gradient, 1e-3);
// printf("f = %1.15e; conv = %5.3e; norm = %5.3e; niter = %03d\n",
// opt->f, converged, norm(opt->gradient), iter);
f_old = opt->f;
}
while (converged > 1e-8 && iter < MAX_ITER);
// while (status == GSL_CONTINUE);
*f_val = opt->f;
*conv_val = converged;
*niter = iter;
gsl_multimin_fdfminimizer_free(opt);
}
/*
* maximize a function
*
*/
void optimize_f(int dim,
gsl_vector* x,
void* params,
double (*f)(const gsl_vector*, void*))
{
gsl_multimin_function obj;
obj.f = f;
obj.n = dim;
obj.params = params;
const gsl_multimin_fminimizer_type * method =
gsl_multimin_fminimizer_nmsimplex;
gsl_multimin_fminimizer * opt =
gsl_multimin_fminimizer_alloc(method, dim);
gsl_vector * step_size = gsl_vector_alloc(dim);
gsl_vector_set_all(step_size, 1);
gsl_multimin_fminimizer_set(opt, &obj, x, step_size);
int iter = 0, status;
double converged, f_old;
do
{
iter++;
f_old = opt->fval;
status = gsl_multimin_fminimizer_iterate(opt);
converged = fabs((f_old - opt->fval) / f_old);
printf("f = %1.15e; conv = %5.3e; size = %5.3e; niter = %03d\n",
opt->fval, converged, opt->size, iter);
}
  while ((converged > 1e-10) && (iter < 10000));
// while (status == GSL_CONTINUE);
printf("f = %1.15e; conv = %5.3e; niter = %03d\n",
opt->fval, converged, iter);
gsl_multimin_fminimizer_free(opt);
gsl_vector_free(step_size);
}
/*
* check if a directory exists
*
* !!! shouldn't be here
*/
int directory_exist(const char *dname)
{
struct stat st;
int ret;
if (stat(dname,&st) != 0)
{
return 0;
}
ret = S_ISDIR(st.st_mode);
if(!ret)
{
errno = ENOTDIR;
}
return ret;
}
void make_directory(const char* name)
{
mkdir(name, S_IRUSR|S_IWUSR|S_IXUSR);
}
gsl_rng* new_random_number_generator()
{
gsl_rng* random_number_generator = gsl_rng_alloc(gsl_rng_taus);
long t1;
(void) time(&t1);
if (FLAGS_rng_seed) {
t1 = FLAGS_rng_seed;
}
// !!! DEBUG
// t1 = 1147530551;
printf("RANDOM SEED = %ld\n", t1);
gsl_rng_set(random_number_generator, t1);
return(random_number_generator);
}
#ifndef GSL_WRAPPERS_H
#define GSL_WRAPPERS_H
// #include <gsl/gsl_check_range.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_permutation.h>
#include <gsl/gsl_linalg.h>
#include <gsl/gsl_eigen.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_multimin.h>
#include <gsl/gsl_math.h>
#include <gsl/gsl_blas.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#define outlog(format, args...) \
  do { \
    fprintf(stderr, format, ##args); \
    fprintf(stderr, "\n"); \
  } while (0)
double safe_log(double);
double log_sum(double, double);
static inline double vget(const gsl_vector* v, int i)
{ return(gsl_vector_get(v, i)); };
static inline void vset(gsl_vector* v, int i, double x)
{ gsl_vector_set(v, i, x); };
// Increment a vector element by a double.
void vinc(gsl_vector*, int, double);
static inline double mget(const gsl_matrix* m, int i, int j)
{ return(gsl_matrix_get(m, i, j)); };
static inline void mset(gsl_matrix* m, int i, int j, double x)
{ gsl_matrix_set(m, i, j, x); };
void msetcol(gsl_matrix* m, int r, const gsl_vector* val);
// Increment a matrix element by a double.
void minc(gsl_matrix*, int, int, double);
void msetrow(gsl_matrix*, int, const gsl_vector*);
void col_sum(gsl_matrix*, gsl_vector*);
void vct_printf(const gsl_vector* v);
void mtx_printf(const gsl_matrix* m);
void vct_fscanf(const char*, gsl_vector* v);
void mtx_fscanf(const char*, gsl_matrix* m);
void vct_fprintf(const char* filename, gsl_vector* v);
void mtx_fprintf(const char* filename, const gsl_matrix* m);
double log_det(gsl_matrix*);
void matrix_inverse(gsl_matrix*, gsl_matrix*);
void sym_eigen(gsl_matrix*, gsl_vector*, gsl_matrix*);
double sum(const gsl_vector* v);
double norm(gsl_vector * v);
void vct_log(gsl_vector* v);
void vct_exp(gsl_vector* x);
void choose_k_from_n(int k, int n, int* result);
void log_normalize(gsl_vector* x);
void normalize(gsl_vector* x);
void optimize(int dim,
gsl_vector* x,
void* params,
void (*fdf)(const gsl_vector*, void*, double*, gsl_vector*),
void (*df)(const gsl_vector*, void*, gsl_vector*),
double (*f)(const gsl_vector*, void*));
void optimize_fdf(int dim,
gsl_vector* x,
void* params,
void (*fdf)(const gsl_vector*, void*, double*, gsl_vector*),
void (*df)(const gsl_vector*, void*, gsl_vector*),
double (*f)(const gsl_vector*, void*),
double* f_val,
double* conv_val,
int* niter);
void log_write(FILE* f, char* string);
int directory_exist(const char *dname);
void make_directory(const char* name);
gsl_rng* new_random_number_generator();
#endif
#ifndef LDASEQ_H
#define LDASEQ_H
#include <sys/stat.h>
#include <sys/types.h>
#include "gsl-wrappers.h"
#include "lda.h"
#include <string>
#include <vector>
#define LDA_SEQ_EM_THRESH 1e-4
#define SAVE_LAG 10
/*
* an lda sequence is a collection of simplex sequences for K topics
* and an alpha vector
*
*/
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <stdlib.h>
#include <assert.h>
#include "param.h"
#include "ss-lm.h"
#include "data.h"
#include "lda.h"
#include "c2_lib.h"
#define LDA_SEQ_EM_THRESHOLD 1e-5
using c2_lib::Task;
using c2_lib::Resource;
using namespace std;
namespace dtm {
// lda sequence variational posterior distribution
// === allocation and initialization ===
bool FitParallelLDASeq(const vector<int>& topic_boundaries);
inf_var* inf_var_alloc(int number_topics,
int* ndocs,
int len);
void inf_var_free(inf_var* ptr);
// === fitting ===
// infer a corpus with an lda-seq
double update_inf_var(lda_seq* seq,
const corpus_seq_t* data,
gsl_matrix** phi,
size_t t,
const char* root);
double update_inf_var_multiple(lda_seq* seq,
const corpus_seq_t* data,
gsl_matrix** phi,
size_t t,
const char* root);
void update_inf_reg(lda_seq* seq,
const corpus_seq_t* data,
gsl_matrix** phi,
size_t t,
const char* root);
double lda_seq_infer(lda_seq* model,
const corpus_seq_t* data,
gsl_matrix** suffstats,
gsl_matrix* gammas,
gsl_matrix* lhoods,
const char* file_root);
// fit lda sequence from sufficient statistics
double fit_lda_seq_st();
double fit_lda_seq_topic(int topic,
lda_seq* model,
gsl_matrix* topic_ss);
double fit_lda_seq_sd();
void update_lda_seq_ss(int time,
const doc_t* doc,
const lda_post* post,
gsl_matrix** ss);
// === reading and writing ===
// read and write a lda sequence
void write_lda_seq(const lda_seq* m, const char* root);
// lda_seq* read_lda_seq(const char* root, corpus_seq_t* data);
lda_seq* read_lda_seq(const string& model_type,
const char* root,
corpus_seq_t* data);
// write lda sequence sufficient statistics
void init_lda_seq(
const vector<int> time_boundaries,
const string& root_directory,
const string& model_type,
lda_seq* model,
int* ndocs,
int len,
double topic_chain_variance,
double topic_obs_variance,
double alpha,
int min_topic,
int max_topic,
bool seq_docs);
void write_lda_seq_suffstats(lda_seq* m,
gsl_matrix** topic_ss,
const char* root);
// new lda sequence
lda_seq* new_lda_seq(corpus_seq_t* data,
int* ndocs,
int W,
int T,
int K);
void make_lda_from_seq_slice(lda* lda_m,
lda_seq* lda_seq_m,
int time);
} // namespace dtm
#endif
#ifndef LDA_H
#define LDA_H
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_sf_gamma.h>
#include <gsl/gsl_sf_psi.h>
#include <gsl/gsl_sf_lambert.h>
#include <gsl/gsl_rng.h>
#include "param.h"
#include "data.h"
#include "gsl-wrappers.h"
#include <vector>
/*
* functions for posterior inference in the latent dirichlet
* allocation model.
*
*/
#define LDA_INFERENCE_CONVERGED 1e-7
#define LDA_SEED_INIT 1
#define LDA_INIT_SMOOTH 1.0
#define LDA_EM_CONVERGED 5e-5
#define LDA_USE_VAR_BAYES 0
#define LDA_TOPIC_DIR_PARAM 0.001
namespace dtm {
// lda model
typedef struct lda
{
int ntopics; // number of topics
int nterms; // vocabulary size
gsl_matrix* topics; // each column is a topic (V X K)
gsl_vector* alpha; // dirichlet parameters
} lda;
// lda posterior
typedef struct lda_post
{
doc_t* doc; // document associated to this posterior
lda* model; // lda model
gsl_matrix* phi; // variational mult parameters (nterms x K)
gsl_matrix* log_phi; // convenient for computation (nterms x K)
gsl_vector* gamma; // variational dirichlet parameters (K)
gsl_vector* lhood; // a K+1 vector, sums to the lhood bound
gsl_vector* doc_weight; // Not owned by this structure.
gsl_vector* renormalized_doc_weight; // Not owned by this structure.
} lda_post;
// lda sufficient statistics
typedef struct lda_suff_stats {
gsl_matrix* topics_ss;
} lda_suff_stats;
// Run LDA in parallel.
bool RunParallelLDA(const vector<int>& topic_boundaries);
void AggregateSuffStats();
// new lda model and suff stats
lda* new_lda_model(int ntopics, int nterms);
void free_lda_model(lda* m);
lda_suff_stats* new_lda_suff_stats(lda* model);
void reset_lda_suff_stats(lda_suff_stats* ss);
lda_post* new_lda_post(int ntopics, int max_length);
void free_lda_post(lda_post* p);
void initialize_lda_ss_from_data(corpus_t* data, lda_suff_stats* ss);
// posterior inference
double fit_lda_post(int doc_number, int time,
lda_post* p, lda_seq* var,
gsl_matrix* g,
gsl_matrix* g3,
gsl_matrix* g4,
gsl_matrix* g5);
void init_lda_post(lda_post* p);
void update_gamma(lda_post* p);
void update_phi(int doc_number, int time,
lda_post* p, lda_seq* var,
gsl_matrix* g);
void update_phi_dim(int doc_number, int time,
lda_post* p, lda_seq* var,
gsl_matrix* g);
void update_phi_fixed(int doc_number, int time,
lda_post* p, lda_seq* var,
gsl_matrix* g3_matrix,
gsl_matrix* g4_matrix,
gsl_matrix* g5_matrix);
void update_phi_multiple(int doc_number, int time,
lda_post* p, lda_seq* var,
gsl_matrix* g);
// compute the likelihood bound
double compute_lda_lhood(lda_post* p);
// EM algorithm
double RunEStep();
double RunMStep();
double lda_e_step(lda* model, corpus_t* data, lda_suff_stats* ss);
double lda_m_step(lda* model, lda_suff_stats* ss);
void lda_em(lda* model,
lda_suff_stats* ss,
corpus_t* data,
int max_iter,
char* outname);
// reading and writing
lda_suff_stats* read_lda_suff_stats(
const char* filename, int ntopics, int nterms);
void read_lda_topics(lda* model);
void read_lda_topics(const vector<int>& topic_boundaries,
lda* model);
void write_lda(lda* model, const char* name);
void write_lda_suff_stats(lda_suff_stats* ss, const char* name);
void write_lda_topics(lda_suff_stats* ss, const char* name);
lda* read_lda(int ntopics, int nterms, const char* name);
void initialize_lda_ss_from_random(lda_suff_stats* ss);
} // namespace
#endif
#include "gflags.h"
#include "lda.h"
#include "main.h"
#include "c2_lib.h"
// Note that this is not necessary here.
DEFINE_string(model, "", "Model name.");
DEFINE_string(sentinel_filename,
"",
"A sentinel filename.");
using namespace dtm;
int main(int argc, char* argv[]) {
google::ParseCommandLineFlags(&argc, &argv, 0);
AggregateSuffStats();
CreateSentinel(FLAGS_sentinel_filename,
0.0);
return(0);
}
#include "gflags.h"
#include "lda-seq.h"
#include "main.h"
#include "c2_lib.h"
DEFINE_string(sentinel_filename,
"",
"");
DECLARE_string(outname);
DECLARE_string(root_directory);
DECLARE_double(top_obs_var);
DECLARE_double(top_chain_var);
DECLARE_int32(ntopics);
DECLARE_string(corpus_prefix);
DECLARE_double(alpha);
/*
* read the parameters
*
* !!! use the cleaner functions in params.h
*
*/
/*
* fit a model from data
*
*/
using namespace dtm;
/*
* main function
*
* supports fitting a dynamic topic model
*
*/
int main(int argc, char* argv[])
{
// Initialize the flag objects.
// InitFlags(argc, argv);
google::ParseCommandLineFlags(&argc, &argv, 0);
double lhood = fit_lda_seq_sd();
CreateSentinel(FLAGS_sentinel_filename.c_str(),
lhood);
printf("... Job complete.\n");
return(0);
}
#include "gflags.h"
#include <stdlib.h>
#include <string.h>
#include "data.h"
#include "lda-seq.h"
#include "lda.h"
#include "main.h"
#include <gsl/gsl_matrix.h>
DEFINE_string(sentinel_filename,
"",
"");
DECLARE_string(outname);
DECLARE_string(root_directory);
DECLARE_double(top_obs_var);
DECLARE_double(top_chain_var);
DECLARE_int32(ntopics);
DECLARE_string(corpus_prefix);
DECLARE_double(alpha);
// using namespace dtm;
/*
* main function
*
* supports fitting a dynamic topic model
*
*/
int main(int argc, char* argv[]) {
// Initialize the flag objects.
// InitFlags(argc, argv);
google::ParseCommandLineFlags(&argc, &argv, 0);
// mode for fitting a dynamic topic model
double lhood = dtm::fit_lda_seq_st();
dtm::CreateSentinel(FLAGS_sentinel_filename,
lhood);
printf("... Job complete.\n");
return(0);
}
#ifndef MAINH
#define MAINH
typedef struct dtm_fit_params
{
char* datafile;
char* outname;
char* heldout;
int start;
int end;
int ntopics;
int lda_max_em_iter;
double top_obs_var;
double top_chain_var;
double alpha;
} dtm_fit_params;
#endif
#include "gflags.h"
#include "lda.h"
#include "main.h"
#include "c2_lib.h"
DEFINE_string(model,
"dim",
"The function to perform. "
"Can be dtm or dim.");
DEFINE_string(sentinel_filename,
"",
"A sentinel filename.");
DEFINE_string(outname, "", "");
using namespace dtm;
int main(int argc, char* argv[]) {
google::ParseCommandLineFlags(&argc, &argv, 0);
double l_hood = RunEStep();
CreateSentinel(FLAGS_sentinel_filename,
l_hood);
return(0);
}