// data.c
#include <math.h>  // log, exp, sqrt, pow, ceil used below
#include <string>
#include "gflags.h"
#include "params.h"
#include "strutil.h"
#include "ss-lm-fit_seq_topics.h"
#include "data.h"
DEFINE_double(sigma_l,
              0.05,
              "l stdev.");
DEFINE_double(sigma_d,
              0.05,
              "d stdev.");
DEFINE_double(sigma_c,
0.05,
"c stdev.");
DEFINE_double(resolution,
1,
"The resolution. Used to determine how far out the beta mean should be.");
DEFINE_int32(max_number_time_points,
200,
"Used for the influence window.");
DEFINE_double(time_resolution,
0.5,
"This is the number of years per time slice.");
DEFINE_double(influence_mean_years,
              20.0,
              "The mean of the influence distribution, in years.");
DEFINE_double(influence_stdev_years,
              15.0,
              "The stdev of the influence distribution, in years.");
DEFINE_int32(influence_flat_years,
-1,
"How many years is the influence nonzero?"
"If nonpositive, a lognormal distribution is used.");
DEFINE_int32(fix_topics,
0,
"Fix all topics below this number.");
DECLARE_string(normalize_docs);
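// Usage sketch (hypothetical invocation; the binary name is an
// assumption, not part of this file): these gflags are set on the
// command line when fitting, e.g.
//
//   ./main --influence_mean_years=20 --influence_stdev_years=15 \
//          --influence_flat_years=-1 --time_resolution=0.5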
using namespace std;
namespace dtm {
/*
* seq corpus range: [start, end)
*
* creates a subset of time slices
*
*/
corpus_seq_t* make_corpus_seq_subset(corpus_seq_t* all,
int start,
int end) {
int n;
corpus_seq_t* subset_corpus = (corpus_seq_t*) malloc(sizeof(corpus_seq_t));
subset_corpus->nterms = all->nterms;
subset_corpus->len = end - start;
subset_corpus->ndocs = 0;
subset_corpus->corpus = (corpus_t**) malloc(sizeof(corpus_t*) * subset_corpus->len);
for (n = start; n < end; n++)
{
subset_corpus->corpus[n - start] = all->corpus[n];
subset_corpus->ndocs += all->corpus[n]->ndocs;
}
return(subset_corpus);
}
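// Usage sketch (hypothetical file prefix "example"): keep only time
// slices [2, 5) of a sequential corpus. The subset aliases the
// original slices; no documents are copied.
//
//   corpus_seq_t* all = read_corpus_seq("example");
//   corpus_seq_t* window = make_corpus_seq_subset(all, 2, 5);
//   // window->corpus[0] == all->corpus[2]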
/*
* collapse a sequential corpus to a flat corpus
*
*/
corpus_t* collapse_corpus_seq(corpus_seq_t* c) {
corpus_t* collapsed = (corpus_t*) malloc(sizeof(corpus_t));
collapsed->ndocs = c->ndocs;
collapsed->nterms = c->nterms;
collapsed->doc = (doc_t**) malloc(sizeof(doc_t*) * c->ndocs);
collapsed->max_unique = 0;
int t, n, doc_idx = 0;
for (t = 0; t < c->len; t++)
{
for (n = 0; n < c->corpus[t]->ndocs; n++)
{
collapsed->doc[doc_idx] = c->corpus[t]->doc[n];
if (collapsed->doc[doc_idx]->nterms > collapsed->max_unique)
collapsed->max_unique = collapsed->doc[doc_idx]->nterms;
doc_idx++;
}
}
assert(doc_idx == collapsed->ndocs);
return(collapsed);
}
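// Usage sketch: flatten a sequential corpus, e.g. before writing it
// back out with write_corpus. The flat corpus shares doc_t pointers
// with the sequence, so the documents must only be freed once.
//
//   corpus_t* flat = collapse_corpus_seq(seq);
//   write_corpus(flat, "out-mult.dat");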
/*
* read corpus
*
*/
corpus_t* read_corpus(const char* name)
{
int length, count, word, n;
corpus_t* c;
char filename[400];
sprintf(filename, "%s-mult.dat", name);
outlog("reading corpus from %s", filename);
c = (corpus_t*) malloc(sizeof(corpus_t));
c->max_unique = 0;
FILE* fileptr = fopen(filename, "r");
if (fileptr == NULL) {
outlog("Error reading corpus prefix %s. Failing.",
filename);
exit(1);
}
c->ndocs = 0; c->nterms = 0;
c->doc = (doc_t**) malloc(sizeof(doc_t*));
int grand_total = 0;
// Read one document per iteration; stop at EOF or malformed input.
while (fscanf(fileptr, "%10d", &length) == 1)
{
if (length > c->max_unique) c->max_unique = length;
c->doc = (doc_t**) realloc(c->doc, sizeof(doc_t*)*(c->ndocs+1));
c->doc[c->ndocs] = (doc_t*) malloc(sizeof(doc_t));
c->doc[c->ndocs]->nterms = length;
c->doc[c->ndocs]->total = 0;
c->doc[c->ndocs]->log_likelihood = 0.0;
c->doc[c->ndocs]->word = (int*) malloc(sizeof(int)*length);
c->doc[c->ndocs]->count = (int*) malloc(sizeof(int)*length);
c->doc[c->ndocs]->lambda = (double*) malloc(sizeof(double)*length);
c->doc[c->ndocs]->log_likelihoods = (double*) malloc(sizeof(double)*length);
for (n = 0; n < length; n++)
{
fscanf(fileptr, "%10d:%10d", &word, &count);
word = word - OFFSET;
if (FLAGS_normalize_docs == "occurrence") {
count = 1;
}
c->doc[c->ndocs]->word[n] = word;
c->doc[c->ndocs]->count[n] = count;
c->doc[c->ndocs]->total += count;
// Is there a better value for initializing lambda?
c->doc[c->ndocs]->lambda[n] = 0.0;
c->doc[c->ndocs]->log_likelihoods[n] = 0.0;
if (word >= c->nterms) { c->nterms = word + 1; }
}
grand_total += c->doc[c->ndocs]->total;
c->ndocs = c->ndocs + 1;
}
fclose(fileptr);
outlog("read corpus (ndocs = %d; nterms = %d; nwords = %d)\n",
c->ndocs, c->nterms, grand_total);
return(c);
}
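// Input format sketch (LDA-C style, as parsed above): each line of
// <name>-mult.dat describes one document as
// "<nterms> <word>:<count> ...", e.g.
//
//   3 0:2 5:1 9:4
//
// is a document with 3 unique terms and 2 + 1 + 4 = 7 total words.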
void free_corpus(corpus_t* corpus) {
  for (int i = 0; i < corpus->ndocs; ++i) {
    // These arrays are allocated with malloc in read_corpus, so they
    // must be released with free rather than delete.
    free(corpus->doc[i]->word);
    free(corpus->doc[i]->count);
    free(corpus->doc[i]->lambda);
    free(corpus->doc[i]->log_likelihoods);
    free(corpus->doc[i]);
  }
  free(corpus->doc);
  free(corpus);
}
/*
* read corpus sequence
*
*/
corpus_seq_t* read_corpus_seq(const char* name)
{
char filename[400];
corpus_seq_t* corpus_seq = (corpus_seq_t*) malloc(sizeof(corpus_seq_t));
// read corpus
corpus_t* raw_corpus = read_corpus(name);
corpus_seq->nterms = raw_corpus->nterms;
// read sequence information
sprintf(filename, "%s-seq.dat", name);
outlog("Reading corpus sequence %s.", filename);
FILE* fileptr = fopen(filename, "r");
if (!fileptr) {
outlog("Error opening dtm sequence file %s.\n",
filename);
exit(1);
}
fscanf(fileptr, "%d", &(corpus_seq->len));
corpus_seq->corpus = (corpus_t**) malloc(sizeof(corpus_t*) * corpus_seq->len);
// allocate corpora
int doc_idx = 0;
int ndocs, i, j;
corpus_seq->ndocs = 0;
for (i = 0; i < corpus_seq->len; ++i)
{
fscanf(fileptr, "%d", &ndocs);
corpus_seq->ndocs += ndocs;
corpus_seq->corpus[i] = (corpus_t*) malloc(sizeof(corpus_t));
corpus_seq->corpus[i]->ndocs = ndocs;
corpus_seq->corpus[i]->doc = (doc_t**) malloc(sizeof(doc_t*) * ndocs);
for (j = 0; j < ndocs; j++)
{
if (doc_idx >= raw_corpus->ndocs) {
outlog("Error: too few documents listed in dtm sequence file %s.\n"
"Current line: %d %d %d.\n",
filename,
doc_idx,
ndocs,
j);
exit(1);
}
// outlog("%d %d %d %d\n", i, j, doc_idx, raw_corpus->ndocs);
corpus_seq->corpus[i]->doc[j] = raw_corpus->doc[doc_idx];
doc_idx++;
}
}
corpus_seq->max_nterms = compute_max_nterms(corpus_seq);
outlog("read corpus of length %d\n", corpus_seq->len);
return(corpus_seq);
}
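// Input format sketch (as parsed above): <name>-seq.dat holds the
// number of slices followed by each slice's document count, e.g.
//
//   3 10 8 12
//
// i.e. 3 time slices covering the first 10, next 8, and last 12
// documents of <name>-mult.dat, in order.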
/*
* write sequential corpus
*
*/
void write_corpus_seq(corpus_seq_t* c, char* name)
{
char tmp_string[400];
int n;
outlog("writing %d slices to %s (%d total docs)", c->len, name, c->ndocs);
sprintf(tmp_string, "%s-seq.dat", name);
FILE* seq_file = fopen(tmp_string, "w");
fprintf(seq_file, "%d", c->len);
for (n = 0; n < c->len; n++)
fprintf(seq_file, " %d", c->corpus[n]->ndocs);
fclose(seq_file);
corpus_t* flat = collapse_corpus_seq(c);
sprintf(tmp_string, "%s-mult.dat", name);
write_corpus(flat, tmp_string);
// The flat corpus shares doc_t pointers with c, so free only the
// wrapper and its pointer array, not the documents themselves.
free(flat->doc);
free(flat);
}
/*
* write corpus
*
*/
void write_corpus(corpus_t* c, char* filename)
{
int i, j;
FILE * fileptr;
doc_t * d;
outlog("writing %d docs to %s\n", c->ndocs, filename);
fileptr = fopen(filename, "w");
for (i = 0; i < c->ndocs; i++)
{
d = c->doc[i];
fprintf(fileptr, "%d", d->nterms);
for (j = 0; j < d->nterms; j++)
{
fprintf(fileptr, " %d:%d", d->word[j], d->count[j]);
}
fprintf(fileptr, "\n");
}
fclose(fileptr);
}
/*
* read and write lda sequence variational distribution
*
*/
void write_lda_seq_docs(const string& model_type,
const string& root_directory,
int min_time,
int max_time,
const lda_seq* model) {
string filename = StringPrintf("%s/times_%d_%d_info.dat",
root_directory.c_str(),
min_time,
max_time);
FILE* f = fopen(filename.c_str(), "w");
params_write_int(f, "NUM_TOPICS", model->ntopics);
params_write_int(f, "NUM_TERMS", model->nterms);
params_write_int(f, "SEQ_LENGTH", model->nseq);
params_write_gsl_vector(f, "ALPHA", model->alpha);
fclose(f);
for (int k = 0; k < model->ntopics; k++) {
if (model_type == "fixed") {
gsl_matrix_view view = gsl_matrix_submatrix(
model->topic[k]->w_phi_l,
0, min_time, model->nterms, max_time - min_time);
filename = StringPrintf(
"%s/w_phi_l-topic_%d-time_%d_%d.dat",
root_directory.c_str(), k, min_time, max_time);
mtx_fprintf(filename.c_str(), &view.matrix);
// Note that here, unlike w_phi_l above, the submatrix starts at column 0.
view = gsl_matrix_submatrix(
model->topic[k]->m_update_coeff,
0, 0, model->nterms, max_time - min_time);
filename = StringPrintf("%s/m_update_coeff-topic_%d-time_%d_%d.dat",
root_directory.c_str(),
k, min_time, max_time);
mtx_fprintf(filename.c_str(), &view.matrix);
}
}
for (int t = min_time; t < max_time; ++t) {
  filename = StringPrintf("%s/influence_time-%03d",
                          root_directory.c_str(), t);
  outlog("\nwriting influence weights for time %d to %s",
         t, filename.c_str());
gsl_matrix* influence_t =
model->influence->doc_weights[t];
assert(model->ntopics == influence_t->size2);
mtx_fprintf(filename.c_str(), influence_t);
filename = StringPrintf("%s/renormalized_influence_time-%03d",
root_directory.c_str(), t);
outlog("\nwriting influence weights for time %d to %s",
t, filename.c_str());
influence_t =
model->influence->renormalized_doc_weights[t];
assert(model->ntopics == influence_t->size2);
mtx_fprintf(filename.c_str(), influence_t);
}
}
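// Output sketch: for min_time = 0, max_time = 10, and model_type
// "fixed", this writes times_0_10_info.dat, per-topic files
// w_phi_l-topic_<k>-time_0_10.dat and
// m_update_coeff-topic_<k>-time_0_10.dat, and per-slice files
// influence_time-NNN and renormalized_influence_time-NNN, where NNN
// is the zero-padded slice index.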
/*
* compute the maximum nterms in a corpus sequence
*
*/
int compute_max_nterms(const corpus_seq_t* c)
{
int i,j;
int max = 0;
for (i = 0; i < c->len; i++)
{
corpus_t* corpus = c->corpus[i];
for (j = 0; j < corpus->ndocs; j++)
if (corpus->doc[j]->nterms > max)
max = corpus->doc[j]->nterms;
}
return(max);
}
/*
* compute the total matrix of counts (W x T)
*
*/
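// Sketch: entry (w, t) of the returned matrix is the occurrence count
// of term w summed over every document in time slice t; minc adds
// doc->count[n] to one cell of the gsl_matrix.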
gsl_matrix* compute_total_counts(const corpus_seq_t* c)
{
int t, d, n;
// Use calloc so every count starts at zero before minc accumulates.
gsl_matrix* ret = gsl_matrix_calloc(c->nterms, c->len);
for (t = 0; t < c->len; t++)
{
corpus_t* corpus = c->corpus[t];
for (d = 0; d < corpus->ndocs; d++)
{
doc_t* doc = corpus->doc[d];
for (n = 0; n < doc->nterms; n++)
{
minc(ret, doc->word[n], t, (double) doc->count[n]);
}
}
}
return(ret);
}
// Creates a new array of `size` doubles holding the scaled influence
// weights (one per epoch), normalized to sum to one.
double* NewScaledInfluence(int size) {
double* scaled_influence = new double[size];
if (FLAGS_influence_flat_years > 0) {
// Round up so the influence covers at least one full epoch.
int number_epochs =
    (int) ceil(FLAGS_influence_flat_years * FLAGS_time_resolution);
double epoch_weight = 1.0 / number_epochs;
for (int i=0; i < number_epochs; ++i) {
scaled_influence[i] = epoch_weight;
}
for (int i=number_epochs; i < size; ++i) {
scaled_influence[i] = 0.0;
}
return scaled_influence;
}
/*
// Use the simple distribution: 1 at [0], 0 everywhere else.
for (int i=0; i < size; ++i) {
scaled_influence[i] = 0.0;
}
scaled_influence[0] = 1.0;
// scaled_beta[1] = 0;
return scaled_influence;
*/
/*
// Simulate a beta distribution with specified mean and variance.
double total = 0.0;
double tmp = (scaled_beta_mean
* (1 - scaled_beta_mean)
/ scaled_beta_variance) - 1.0;
double beta_alpha = scaled_beta_mean * tmp;
double beta_beta = (1 - scaled_beta_mean) * tmp;
for (int i=0; i < scaled_beta_max; ++i) {
// Offset tmp by 0.5 so we get a centered distribution
// and don't run into degeneracy issues.
tmp = (i + 0.5) / (scaled_beta_max);
scaled_beta[i] = (pow(tmp, beta_alpha - 1.0)
* pow(1 - tmp, beta_beta - 1.0));
total += scaled_beta[i];
}
*/
// Handle the log-normal distribution.
double total = 0.0;
// Here, we're interested more in the median.
// So we treat the variable mean as median and note this in
// our paper.
double scaled_influence_mean = FLAGS_influence_mean_years;
double scaled_influence_variance = (FLAGS_influence_stdev_years
* FLAGS_influence_stdev_years);
double tmp = (1.0
+ (scaled_influence_variance
/ (scaled_influence_mean
* scaled_influence_mean)));
double lognormal_sigma_squared = log(tmp);
double lognormal_mu = (log(scaled_influence_mean)
- 0.5 * lognormal_sigma_squared);
printf("Median: %.2f\n", exp(lognormal_mu));
for (int i = 0; i < size; ++i) {
// Shift right by half a timeframe to avoid corner cases.
double x = (i / FLAGS_time_resolution) + (1.0 / FLAGS_time_resolution) / 2;
double tmp2 = (log(x) - lognormal_mu);
scaled_influence[i] = (1.0
/ (x * sqrt(lognormal_sigma_squared * 2 * 3.1415926))
* exp(-tmp2 * tmp2
/ (2.0
* lognormal_sigma_squared)));
total += scaled_influence[i];
}
// Normalize over the same number of entries filled above.
for (int i = 0; i < size; ++i) {
  scaled_influence[i] /= total;
}
return scaled_influence;
}
} // namespace dtm