diff --git a/hlda/.#gibbs.h.1.2 b/hlda/.#gibbs.h.1.2
new file mode 100755
index 0000000..e990eb1
--- /dev/null
+++ b/hlda/.#gibbs.h.1.2
@@ -0,0 +1,40 @@
+#ifndef GIBBSH
+#define GIBBSH
+
+#include "utils.h"
+#include "typedefs.h"
+#include "doc.h"
+#include "topic.h"
+
+#include <stdio.h> /* original header name missing; stdio.h is an assumption */
+
+#define WRITE_MODE_CORPUS 1
+
+#define DEFAULT_OUTPUT_LAG 100
+#define DEFAULT_HYPER_LAG 1
+#define DEFAULT_SHUFFLE_LAG 100
+#define DEFAULT_LEVEL_LAG 1
+
+void write_gibbs_state(gibbs_state * state, char* filename);
+
+void write_gibbs_output(gibbs_state * state);
+
+void compute_gibbs_score(gibbs_state * state);
+
+void iterate_gibbs_state(gibbs_state * state);
+
+void initialize_gibbs_state(gibbs_state * state);
+
+gibbs_state * new_gibbs_state(char* corpus, char* settings, char* out_dir);
+
+gibbs_state * new_heldout_gibbs_state(corpus* corp, gibbs_state* orig);
+
+double mean_heldout_score(corpus* corp,
+                          gibbs_state* orig,
+                          int burn,
+                          int lag,
+                          int niter);
+
+void free_gibbs_state(gibbs_state* state);
+
+#endif
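The header above is the sampler's entire public interface. For orientation, here is a minimal sketch of a driver for that interface, mirroring the main.c files that follow; the file paths and the iteration count are placeholders, not values from this repository:

    #include "gibbs.h"

    void run_one_chain(void)
    {
        /* allocate a sampler state from a corpus, a settings file,
           and an output directory (placeholder paths) */
        gibbs_state* state =
            new_gibbs_state("corpus.dat", "settings.txt", "out");
        initialize_gibbs_state(state);

        /* each call performs one Gibbs sweep: resample each
           document's path through the tree, then its level
           assignments, and periodically the hyperparameters */
        int iter;
        for (iter = 0; iter < 1000; iter++)
            iterate_gibbs_state(state);

        free_gibbs_state(state);
    }

main.c below wraps exactly this loop in NRESTARTS independent restarts.
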
diff --git a/hlda/.#main.c.1.44 b/hlda/.#main.c.1.44
new file mode 100755
index 0000000..4ea53bd
--- /dev/null
+++ b/hlda/.#main.c.1.44
@@ -0,0 +1,92 @@
+#include "utils.h"
+#include "typedefs.h"
+#include "doc.h"
+#include "topic.h"
+#include "gibbs.h"
+#include <stdio.h>   /* reconstructed from usage: sprintf, fopen, fprintf */
+#include <string.h>  /* reconstructed from usage: strcmp */
+#include <assert.h>  /* reconstructed from usage: assert */
+#include <stdlib.h>  /* fourth header name was lost; stdlib.h assumed */
+
+#define MAX_ITER 10000
+#define TEST_LAG 100
+#define NRESTARTS 10
+
+// simple gibbs sampling on a data set
+
+void main_gibbs(int ac, char* av[])
+{
+    assert(ac == 5);
+
+    char* corpus = av[2];
+    char* settings = av[3];
+    char* out_dir = av[4];
+
+    int restart;
+    for (restart = 0; restart < NRESTARTS; restart++)
+    {
+        gibbs_state* state = new_gibbs_state(corpus, settings, out_dir);
+        initialize_gibbs_state(state);
+        int iter;
+        for (iter = 0; iter < MAX_ITER; iter++)
+        {
+            iterate_gibbs_state(state);
+        }
+        free_gibbs_state(state);
+    }
+}
+
+void main_heldout(int ac, char* av[])
+{
+    assert(ac == 6);
+
+    char* train = av[2];
+    char* test = av[3];
+    char* settings = av[4];
+    char* out_dir = av[5];
+
+    gibbs_state* state = new_gibbs_state(train, settings, out_dir);
+    initialize_gibbs_state(state);
+    corpus* heldout_corp = corpus_new(state->corp->gem_mean,
+                                      state->corp->gem_scale);
+    read_corpus(test, heldout_corp, state->tr->depth);
+
+    char filename[100];
+    sprintf(filename, "%s/test.dat", state->run_dir);
+    FILE* test_log = fopen(filename, "w");
+    int iter;
+    for (iter = 0; iter < MAX_ITER; iter++)
+    {
+        iterate_gibbs_state(state);
+        if ((state->iter % TEST_LAG) == 0)
+        {
+            double score = mean_heldout_score(heldout_corp, state,
+                                              200, 1, 1000);
+            fprintf(test_log, "%04d %10.3f %d\n",
+                    state->iter, score, ntopics_in_tree(state->tr));
+            fflush(test_log);
+        }
+    }
+    fclose(test_log);
+}
+
+
+int main(int ac, char* av[])
+{
+    if (ac > 1)
+    {
+        if (strcmp(av[1], "gibbs") == 0)
+        {
+            main_gibbs(ac, av);
+            return(0);
+        }
+        else if (strcmp(av[1], "heldout") == 0)
+        {
+            main_heldout(ac, av);
+            return(0);
+        }
+    }
+    outlog("USAGE: ./main gibbs corpus settings out");
+    outlog("       ./main heldout train test settings out");
+    return(0);
+}
diff --git a/hlda/.#main.c.1.46 b/hlda/.#main.c.1.46
new file mode 100755
index 0000000..d85b9fa
--- /dev/null
+++ b/hlda/.#main.c.1.46
@@ -0,0 +1,91 @@
+#include "utils.h"
+#include "typedefs.h"
+#include "doc.h"
+#include "topic.h"
+#include "gibbs.h"
+#include <stdio.h>   /* reconstructed from usage: sprintf, fopen, fprintf */
+#include <string.h>  /* reconstructed from usage: strcmp */
+#include <assert.h>  /* reconstructed from usage: assert */
+#include <stdlib.h>  /* fourth header name was lost; stdlib.h assumed */
+
+#define MAX_ITER 10000
+#define TEST_LAG 100
+#define NRESTARTS 1
+
+// simple gibbs sampling on a data set
+
+void main_gibbs(int ac, char* av[])
+{
+    assert(ac == 5);
+
+    char* corpus = av[2];
+    char* settings = av[3];
+    char* out_dir = av[4];
+
+    int restart;
+    for (restart = 0; restart < NRESTARTS; restart++)
+    {
+        gibbs_state* state =
+            init_gibbs_state_w_rep(corpus, settings, out_dir);
+        int iter;
+        for (iter = 0; iter < MAX_ITER; iter++)
+        {
+            iterate_gibbs_state(state);
+        }
+        free_gibbs_state(state);
+    }
+}
+
+void main_heldout(int ac, char* av[])
+{
+    assert(ac == 6);
+
+    char* train = av[2];
+    char* test = av[3];
+    char* settings = av[4];
+    char* out_dir = av[5];
+
+    gibbs_state* state = init_gibbs_state_w_rep(train, settings, out_dir);
+    corpus* heldout_corp = corpus_new(state->corp->gem_mean,
+                                      state->corp->gem_scale);
+    read_corpus(test, heldout_corp, state->tr->depth);
+
+    char filename[100];
+    sprintf(filename, "%s/test.dat", state->run_dir);
+    FILE* test_log = fopen(filename, "w");
+    int iter;
+    for (iter = 0; iter < MAX_ITER; iter++)
+    {
+        iterate_gibbs_state(state);
+        if ((state->iter % TEST_LAG) == 0)
+        {
+            double score = mean_heldout_score(heldout_corp, state,
+                                              200, 1, 1000);
+            fprintf(test_log, "%04d %10.3f %d\n",
+                    state->iter, score, ntopics_in_tree(state->tr));
+            fflush(test_log);
+        }
+    }
+    fclose(test_log);
+}
+
+
+int main(int ac, char* av[])
+{
+    if (ac > 1)
+    {
+        if (strcmp(av[1], "gibbs") == 0)
+        {
+            main_gibbs(ac, av);
+            return(0);
+        }
+        else if (strcmp(av[1], "heldout") == 0)
+        {
+            main_heldout(ac, av);
+            return(0);
+        }
+    }
+    outlog("USAGE: ./main gibbs corpus settings out");
+    outlog("       ./main heldout train test settings out");
+    return(0);
+}
diff --git a/hlda/.Rhistory b/hlda/.Rhistory
new file mode 100755
index 0000000..443225f
--- /dev/null
+++ b/hlda/.Rhistory
@@ -0,0 +1,18 @@
+scan("/Users/blei/3.Current/DP-nested/src/full-jacm/run012/score.log")
+foo <- scan("/Users/blei/3.Current/DP-nested/src/full-jacm/run012/score.log")
+foo[,4]
+foo <- read.table("/Users/blei/3.Current/DP-nested/src/full-jacm/run012/score.log")
+foo[,4]
+foo[,5]
+foo[,6]
+plot(foo[,5], type="b")
+plot(foo[,7], type="b")
+plot(foo[,8], type="b")
+plot(foo[,1], type="b")
+plot(foo[,2], type="b")
+plot(foo[,2], type="l")
+plot(foo[,3], type="l")
+plot(foo[,4], type="l")
+plot(foo[,5], type="l")
+quit()
+n
diff --git a/hlda/.gdb_history b/hlda/.gdb_history
new file mode 100755
index 0000000..8e5d792
--- /dev/null
+++ b/hlda/.gdb_history
@@ -0,0 +1,6 @@
+run ~/data/jacm/current-jacm/jacm-mult/jacm.dat settings.txt ~/SANDBOX/FOO
+up
+up
+up
+run
+quit
diff --git a/hlda/Makefile b/hlda/Makefile
new file mode 100755
index 0000000..f1cf52d
--- /dev/null
+++ b/hlda/Makefile
@@ -0,0 +1,29 @@
+.SUFFIXES: .c .u
+CC= gcc
+# CFLAGS_MAC = -g -Wall -O3 -DHAVE_INLINE -DGSL_RANGE_CHECK_OFF -Winline -funroll-loops -fstrict-aliasing -fsched-interblock -falign-loops=16 -falign-jumps=16 -falign-functions=16 -falign-jumps-max-skip=15 -falign-loops-max-skip=15 -malign-natural -ffast-math -mdynamic-no-pic -mpowerpc-gpopt -force_cpusubtype_ALL -fstrict-aliasing -mcpu=7450 -faltivec
+CFLAGS_MAC = -g -Wall -O3 -DHAVE_INLINE -DGSL_RANGE_CHECK_OFF -Winline -fast -I/opt/local/include/gsl
+CFLAGS_PTON = -g -Wall -O3 -DHAVE_INLINE=1 -DGSL_RANGE_CHECK_OFF=1
+CFLAGS_DEBUG = -g -Wall
+CFLAGS = -g -Wall -I/opt/local/include/gsl/ -I/usr/include/sys/ -I/usr/include/
+
+# MAC_LDFLAGS = -lgsl -latlas -lcblas -L/sw/li
+MAC_LDFLAGS = -lgsl -lgslcblas -L/opt/local/lib
+C2_LDFLAGS = -lgsl -lcblas -latlas
+CYCLES_LDFLAGS = -lgsl -lgslcblas
+LSOURCE = utils.c topic.c doc.c hyperparameter.c main.c gibbs.c
+LOBJECTS = utils.o topic.o doc.o hyperparameter.o main.o gibbs.o
+
+main: $(LOBJECTS)
+	$(CC) $(CFLAGS_MAC) $(LOBJECTS) -o main $(MAC_LDFLAGS)
+
+c2: $(LOBJECTS)
+	$(CC) $(CFLAGS_PTON) $(LOBJECTS) -o main $(C2_LDFLAGS)
+
+cycles: $(LOBJECTS)
+	$(CC) $(CFLAGS_PTON) $(LOBJECTS) -o main $(CYCLES_LDFLAGS)
+
+debug: $(LOBJECTS)
+	$(CC) $(CFLAGS_DEBUG) $(LOBJECTS) -o main $(MAC_LDFLAGS)
+
+clean:
+	-rm -f *.o
diff --git a/hlda/README b/hlda/README
new file mode 100644
index 0000000..d9dc80e
--- /dev/null
+++ b/hlda/README
@@ -0,0 +1,34 @@
+This code implements hierarchical LDA with a fixed-depth tree and a
+stick-breaking prior on the depth weights. An infinite-depth tree can
+be approximated by setting the depth very high. This code requires
+that you have installed the GSL package.
+
+The input format of the data is the same as in the LDA-C package.
+Each line contains
+
+  [# of unique terms] [term #]:[count] ...
+
+The settings file controls various parameters of the model. Several
+example settings files are contained in this directory.
+
+
+IMPORTANT:
+
+I hope that this code is useful to you, but please note that this code
+is UNSUPPORTED. Do not email me with questions. I like posting as
+much code as possible, but I unfortunately do not have the time to
+support all of it. (This paragraph is my solution to the problem.)
+
+HLDA-C is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2 of the License, or (at your
+option) any later version.
+
+HLDA-C is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+
+
+
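To make the README's input format concrete, here is a hypothetical three-document corpus in that format (term IDs and counts invented for illustration). The first field of each line is the number of unique terms in the document, followed by one term:count pair per unique term:

  2 0:3 5:1
  3 0:1 2:2 7:4
  1 5:6

Here the first document contains three occurrences of term 0 and one occurrence of term 5. Word order within a document is not represented, which matches the exchangeability assumption of the model.
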
diff --git a/hlda/TAGS b/hlda/TAGS new file mode 100755 index 0000000..39148be --- /dev/null +++ b/hlda/TAGS @@ -0,0 +1,183 @@ + +doc.c,423 +void doc_sample_levels(8,66 +void compute_log_p_level(65,1506 +void doc_update_level(102,2469 +void read_corpus(113,2599 +corpus* corpus_new(181,4319 +void free_corpus(193,4555 +void free_doc(204,4720 +void write_corpus_assignment(219,4995 +void write_corpus_levels(237,5487 +double gem_score(260,5987 +void corpus_mh_update_gem(324,8287 +void corpus_mh_update_gem_mean(362,9385 +void corpus_mh_update_gem_scale(394,10220 + +doc.h,133 +#define DOCH2,13 +#define OFFSET 10,136 +#define MH_GEM_STDEV 11,153 +#define MH_GEM_MEAN_STDEV 12,179 +#define MH_GEM_STDEV 13,210 + +gibbs.c,400 +void write_gibbs_state(5,49 +void write_gibbs_output(33,971 +void compute_gibbs_score(86,2648 +void iterate_gibbs_state(101,3087 +void init_gibbs_state(164,4800 +gibbs_state* init_gibbs_state_w_rep(191,5636 +gibbs_state * new_gibbs_state(231,6828 +void set_up_directories(272,8294 +gibbs_state * new_heldout_gibbs_state(322,9943 +double mean_heldout_score(342,10412 +void free_gibbs_state(376,11369 + +gibbs.h,253 +#define GIBBSH2,15 +#define WRITE_MODE_CORPUS 11,129 +#define DEFAULT_OUTPUT_LAG 12,157 +#define DEFAULT_HYPER_LAG 13,189 +#define DEFAULT_SHUFFLE_LAG 14,217 +#define DEFAULT_LEVEL_LAG 15,249 +#define DEFAULT_SAMPLE_GAM 16,278 +#define NINITREP 17,307 + +hyperparameter.c,39 +double gibbs_sample_DP_scaling(21,390 + +hyperparameter.h,29 +#define HYPERPARAMETERH2,24 + +main.c,144 +#define MAX_ITER 11,176 +#define TEST_LAG 12,199 +#define NRESTARTS 13,220 +void main_gibbs(17,281 +void main_heldout(39,760 +int main(73,1775 + +topic.c,1016 +void topic_update_word(8,72 +void topic_update_doc_cnt(29,511 +double eta_score(42,702 +void tree_mh_update_eta(70,1354 +void 
tree_update_from_doc(113,2362 +void tree_remove_doc_from_path(139,2895 +void tree_add_doc_to_path(146,3053 +void tree_sample_doc_path(171,3479 +double log_gamma_ratio(208,4294 +double log_gamma_ratio_new(246,5242 +void populate_prob_dfs(278,5881 +void tree_prune(344,7749 +void delete_node(363,7995 +topic* tree_fill(398,8577 +topic* topic_add_child(417,8790 +topic* topic_new(434,9158 +topic* tree_sample_path(473,10302 +topic* tree_sample_dfs(491,10657 +double gamma_score(524,11335 +void dfs_sample_scaling(552,11981 +tree* tree_new(576,12496 +void write_tree_topics_dfs(603,13002 +int ntopics_in_tree(635,13729 +int ntopics_in_tree_dfs(640,13808 +void free_tree(657,14056 +void free_tree_dfs(663,14130 +tree * copy_tree(680,14415 +void copy_tree_dfs(694,14801 +void copy_topic(707,15066 +void write_tree_levels(728,15609 +void write_tree_level_dfs(736,15745 +void write_tree(754,16100 + +topic.h,78 +#define TOPICH2,15 +#define MH_ETA_STDEV 10,147 +#define MH_GAM_STDEV 11,174 + +typedefs.h,320 +#define TYPEDEFS2,17 +#define MH_REPS 4,35 +#define TRUE 5,55 +#define FALSE 6,70 +typedef struct topic11,135 +} topic;30,1011 +typedef struct tree33,1022 +} tree;42,1488 +typedef struct doc45,1498 +} doc;57,1899 +typedef struct corpus60,1908 +} corpus;67,2020 +typedef struct gibbs_state69,2031 +} gibbs_state;96,2539 + +unused-code.C,601 + gsl_permutation* perm 5,75 + int temp1[temp16,132 + int temp2[temp27,162 + for (i = 0; i < d->word->size;size8,192 + temp1[temp110,238 + temp2[temp211,286 + for (i = 0; i < d->word->size;size13,342 + d->word->val[val15,388 + d->levels->val[val16,424 +void tree_mh_update_gam(25,575 +double gamma_score_PY(69,1661 +void initialize_corpus_old(140,3362 +void reinitialize_corpus(179,4231 +void doc_initialize_levels(202,4799 +void remove_state(222,5381 +void tree_sample_all(241,5833 +double alpha_score(261,6185 +void corpus_mh_update_alpha(288,6704 + +utils.c,854 +gsl_rng* RANDNUMGEN 2,19 +void init_random_number_generator(9,105 +int directory_exist(29,445 +void make_directory(55,712 +int sample_from_log(66,850 +int_vector* new_int_vector(94,1332 +void delete_int_vector(107,1579 +void ivappend(119,1705 +void vct_fscanf(132,1905 +int_vector* iv_copy(146,2172 +void iv_permute(165,2478 +void iv_permute_from_perm(173,2640 +gsl_permutation* rpermutation(191,2946 +double runif(205,3189 +double rgauss(211,3251 +double rgamma(219,3406 +double log_dgamma(226,3538 +double rbeta(237,3788 +int rbernoulli(244,3901 +double sum(255,4019 +void print_vector(271,4190 +void write_vect(286,4391 +gsl_vector* read_vect(298,4624 +int read_int(324,5048 +void write_int(334,5246 +void write_double(341,5354 +double read_double(347,5472 +FILE* LOG;356,5640 +static time_t TVAL;357,5651 +void outlog(358,5671 +void resize(382,6026 + +utils.h,354 +#define UTILSH2,15 +time_t TIME;17,312 +char line1[line118,325 +typedef struct int_vector22,372 +} int_vector;26,428 +static inline int ivget(33,562 +static inline void ivset(36,633 +static inline void vset(39,712 +static inline double vget(42,799 +static inline void vinc(45,889 +static inline double log_sum(48,979 +static inline double lgam(56,1164 + +ncrp-gibbs.c,0 diff --git a/hlda/alpha b/hlda/alpha new file mode 100755 index 0000000..783875f --- /dev/null +++ b/hlda/alpha @@ -0,0 +1,976 @@ +doc.c:13: int depth = d->path[0]->tr->depth; +doc.c:18: gsl_permutation* p = rpermutation(d->levels->size); +doc.c:19: iv_permute_from_perm(d->levels, p); +doc.c:20: iv_permute_from_perm(d->word, p); +doc.c:26: for (i = 0; i < d->word->size; i++) +doc.c:28: int w = 
ivget(d->word, i); +doc.c:31: int l = ivget(d->levels, i); +doc.c:32: doc_update_level(d, l, -1.0); +doc.c:33: topic_update_word(d->path[l], w, -1.0); +doc.c:39: compute_log_p_level(d, *(d->gem_mean), *(d->gem_scale)); +doc.c:43: vget(d->log_p_level, k) + +doc.c:44: vget(d->path[k]->log_prob_w, w)); +doc.c:50: topic_update_word(d->path[new_l], w, 1.0); +doc.c:51: ivset(d->levels, i, new_l); +doc.c:69: double levels_sum = sum(d->tot_levels); +doc.c:73: for (i = 0; i < d->tot_levels->size; i++) +doc.c:76: ((1 - gem_mean) * gem_scale + vget(d->tot_levels, i)) / +doc.c:77: (gem_scale + vget(d->tot_levels, i) + levels_sum); +doc.c:79: vset(d->log_p_level, +doc.c:83: levels_sum -= vget(d->tot_levels, i); +doc.c:84: sum_log_prob += log(1 - expected_stick_len); +doc.c:95: vinc(d->tot_levels, l, update); +doc.c:111: c->nterms = 0; +doc.c:112: c->ndoc = 0; +doc.c:118: c->ndoc = c->ndoc + 1; +doc.c:120: if ((c->ndoc % 10) == 0) outlog("read doc %d", c->ndoc); +doc.c:122: c->doc = (doc**) realloc(c->doc, sizeof(doc*) * c->ndoc); +doc.c:123: c->doc[c->ndoc-1] = malloc(sizeof(doc)); +doc.c:124: d = c->doc[c->ndoc-1]; +doc.c:125: d->id = c->ndoc-1; +doc.c:126: d->word = new_int_vector(0); +doc.c:134: word = word - OFFSET; +doc.c:137: if (word >= c->nterms) +doc.c:139: c->nterms = word + 1; +doc.c:143: ivappend(d->word, word); +doc.c:149: d->levels = new_int_vector(d->word->size); +doc.c:150: d->path = malloc(sizeof(topic*) * depth); +doc.c:151: d->tot_levels = gsl_vector_calloc(depth); +doc.c:152: d->log_p_level = gsl_vector_calloc(depth); +doc.c:153: d->gem_mean = &(c->gem_mean); +doc.c:154: d->gem_scale = &(c->gem_scale); +doc.c:156: for (n = 0; n < d->levels->size; n++) +doc.c:157: ivset(d->levels, n, -1); +doc.c:161: outlog("number of docs : %d", c->ndoc); +doc.c:162: outlog("number of words : %d", c->nterms); +doc.c:176: c->gem_mean = gem_mean; +doc.c:177: c->gem_scale = gem_scale; +doc.c:178: c->ndoc = 0; +doc.c:179: c->doc = malloc(sizeof(doc*) * c->ndoc); +doc.c:186: * - each line contains a space delimited list of topic IDs +doc.c:193: int depth = corp->doc[0]->path[0]->tr->depth; +doc.c:194: for (d = 0; d < corp->ndoc; d++) +doc.c:196: fprintf(file, "%d", corp->doc[d]->id); +doc.c:197: fprintf(file, " %1.9e", corp->doc[d]->score); +doc.c:200: fprintf(file, " %d", corp->doc[d]->path[l]->id); +doc.c:215: int depth = corp->doc[0]->path[0]->tr->depth; +doc.c:216: double prior_a = (1 - corp->gem_mean) * corp->gem_scale; +doc.c:217: double prior_b = corp->gem_mean * corp->gem_scale; +doc.c:219: for (i = 0; i < corp->ndoc; i++) +doc.c:221: doc* curr_doc = corp->doc[i]; +doc.c:222: curr_doc->score = 0; +doc.c:227: double count = vget(curr_doc->tot_levels, l); +doc.c:236: double a = vget(curr_doc->tot_levels, l) + prior_a; +doc.c:238: curr_doc->score += +doc.c:239: lgamma(a) + lgamma(b) - lgamma(a + b) - +doc.c:240: lgamma(prior_b) - lgamma(prior_a) + +doc.c:243: score += curr_doc->score; +doc.c:245: // exponential 1 prior: log(1) - 1 * s +doc.c:246: score += -corp->gem_scale; +doc.c:260: gsl_vector* alpha = corp->alpha; +doc.c:262: int depth = corp->doc[0]->path[0]->tr->depth; +doc.c:264: for (i = 0; i < corp->ndoc; i++) +doc.c:266: doc* curr_doc = corp->doc[i]; +doc.c:269: score += lgamma(ivget(curr_doc->levels, l) + vget(alpha, l)); +doc.c:271: score -= lgamma(alphatot + curr_doc->word->size); +doc.c:286: gsl_vector* alpha = corp->alpha; +doc.c:289: int accept[alpha->size]; +doc.c:292: for (l = 0; l < alpha->size; l++) accept[l] = 0; +doc.c:295: for (l = 0; l < alpha->size; l++) +doc.c:305: if (r > 
exp(new_score - current_score)) +doc.c:329: double old_mean = corp->gem_mean; +doc.c:330: double old_scale = corp->gem_scale; +doc.c:331: double old_alpha = corp->gem_mean * corp->gem_scale; +doc.c:338: corp->gem_mean = new_mean; +doc.c:339: corp->gem_scale = new_scale; +doc.c:342: if (r > exp(new_score - current_score)) +doc.c:344: corp->gem_mean = old_mean; +doc.c:345: corp->gem_scale = old_scale; +doc.c:355: accept, corp->gem_mean * corp->gem_scale); +doc.c:368: double old_mean = corp->gem_mean; +doc.c:373: corp->gem_mean = new_mean; +doc.c:376: if (r > exp(new_score - current_score)) +doc.c:378: corp->gem_mean = old_mean; +doc.c:387: printf("ACCEPTED %d; GEM MEAN = %5.3f\n", accept, corp->gem_mean); +doc.c:399: double old_scale = corp->gem_scale; +doc.c:404: corp->gem_scale = new_scale; +doc.c:407: if (r > exp(new_score - current_score)) +doc.c:409: corp->gem_scale = old_scale; +doc.c:418: // printf("ACCEPTED %d; GEM SCALE = %5.3f\n", accept, corp->gem_scale); +doc.c.~1.33.~:13: int depth = d->path[0]->tr->depth; +doc.c.~1.33.~:18: gsl_permutation* p = rpermutation(d->levels->size); +doc.c.~1.33.~:19: iv_permute_from_perm(d->levels, p); +doc.c.~1.33.~:20: iv_permute_from_perm(d->word, p); +doc.c.~1.33.~:26: for (i = 0; i < d->word->size; i++) +doc.c.~1.33.~:28: int w = ivget(d->word, i); +doc.c.~1.33.~:31: int l = ivget(d->levels, i); +doc.c.~1.33.~:32: doc_update_level(d, l, -1.0); +doc.c.~1.33.~:33: topic_update_word(d->path[l], w, -1.0); +doc.c.~1.33.~:39: compute_log_p_level(d, *(d->gem_mean), *(d->gem_scale)); +doc.c.~1.33.~:43: vget(d->log_p_level, k) + +doc.c.~1.33.~:44: vget(d->path[k]->log_prob_w, w)); +doc.c.~1.33.~:50: topic_update_word(d->path[new_l], w, 1.0); +doc.c.~1.33.~:51: ivset(d->levels, i, new_l); +doc.c.~1.33.~:69: double levels_sum = sum(d->tot_levels); +doc.c.~1.33.~:73: for (i = 0; i < d->tot_levels->size; i++) +doc.c.~1.33.~:76: ((1 - gem_mean) * gem_scale + vget(d->tot_levels, i)) / +doc.c.~1.33.~:77: (gem_scale + vget(d->tot_levels, i) + levels_sum); +doc.c.~1.33.~:79: vset(d->log_p_level, +doc.c.~1.33.~:83: // sum += vget(d->tot_levels, i); +doc.c.~1.33.~:85: levels_sum -= vget(d->tot_levels, i); +doc.c.~1.33.~:86: sum_log_prob += log(1 - expected_stick_len); +doc.c.~1.33.~:97: vinc(d->tot_levels, l, update); +doc.c.~1.33.~:113: c->nterms = 0; +doc.c.~1.33.~:114: c->ndoc = 0; +doc.c.~1.33.~:120: c->ndoc = c->ndoc + 1; +doc.c.~1.33.~:122: if ((c->ndoc % 10) == 0) outlog("read doc %d", c->ndoc); +doc.c.~1.33.~:124: c->doc = (doc**) realloc(c->doc, sizeof(doc*) * c->ndoc); +doc.c.~1.33.~:125: c->doc[c->ndoc-1] = malloc(sizeof(doc)); +doc.c.~1.33.~:126: d = c->doc[c->ndoc-1]; +doc.c.~1.33.~:127: d->id = c->ndoc-1; +doc.c.~1.33.~:128: d->word = new_int_vector(0); +doc.c.~1.33.~:136: word = word - OFFSET; +doc.c.~1.33.~:139: if (word >= c->nterms) +doc.c.~1.33.~:141: c->nterms = word + 1; +doc.c.~1.33.~:145: ivappend(d->word, word); +doc.c.~1.33.~:151: d->levels = new_int_vector(d->word->size); +doc.c.~1.33.~:152: d->path = malloc(sizeof(topic*) * depth); +doc.c.~1.33.~:153: d->tot_levels = gsl_vector_calloc(depth); +doc.c.~1.33.~:154: d->log_p_level = gsl_vector_calloc(depth); +doc.c.~1.33.~:155: d->gem_mean = &(c->gem_mean); +doc.c.~1.33.~:156: d->gem_scale = &(c->gem_scale); +doc.c.~1.33.~:158: for (n = 0; n < d->levels->size; n++) +doc.c.~1.33.~:159: ivset(d->levels, n, -1); +doc.c.~1.33.~:163: outlog("number of docs : %d", c->ndoc); +doc.c.~1.33.~:164: outlog("number of words : %d", c->nterms); +doc.c.~1.33.~:178: c->gem_mean = gem_mean; +doc.c.~1.33.~:179: 
c->gem_scale = gem_scale; +doc.c.~1.33.~:180: c->ndoc = 0; +doc.c.~1.33.~:181: c->doc = malloc(sizeof(doc*) * c->ndoc); +doc.c.~1.33.~:188: * - each line contains a space delimited list of topic IDs +doc.c.~1.33.~:195: int depth = corp->doc[0]->path[0]->tr->depth; +doc.c.~1.33.~:196: for (d = 0; d < corp->ndoc; d++) +doc.c.~1.33.~:198: fprintf(file, "%d", corp->doc[d]->id); +doc.c.~1.33.~:199: fprintf(file, " %1.9e", corp->doc[d]->score); +doc.c.~1.33.~:202: fprintf(file, " %d", corp->doc[d]->path[l]->id); +doc.c.~1.33.~:217: int depth = corp->doc[0]->path[0]->tr->depth; +doc.c.~1.33.~:218: double prior_a = (1 - corp->gem_mean) * corp->gem_scale; +doc.c.~1.33.~:219: double prior_b = corp->gem_mean * corp->gem_scale; +doc.c.~1.33.~:221: for (i = 0; i < corp->ndoc; i++) +doc.c.~1.33.~:223: doc* curr_doc = corp->doc[i]; +doc.c.~1.33.~:224: curr_doc->score = 0; +doc.c.~1.33.~:229: double count = vget(curr_doc->tot_levels, l); +doc.c.~1.33.~:238: double a = vget(curr_doc->tot_levels, l) + prior_a; +doc.c.~1.33.~:240: curr_doc->score += +doc.c.~1.33.~:241: lgamma(a) + lgamma(b) - lgamma(a + b) - +doc.c.~1.33.~:242: lgamma(prior_b) - lgamma(prior_a) + +doc.c.~1.33.~:245: score += curr_doc->score; +doc.c.~1.33.~:247: // exponential 1 prior: log(1) - 1 * s +doc.c.~1.33.~:248: score += -corp->gem_scale; +doc.c.~1.33.~:262: gsl_vector* alpha = corp->alpha; +doc.c.~1.33.~:264: int depth = corp->doc[0]->path[0]->tr->depth; +doc.c.~1.33.~:266: for (i = 0; i < corp->ndoc; i++) +doc.c.~1.33.~:268: doc* curr_doc = corp->doc[i]; +doc.c.~1.33.~:271: score += lgamma(ivget(curr_doc->levels, l) + vget(alpha, l)); +doc.c.~1.33.~:273: score -= lgamma(alphatot + curr_doc->word->size); +doc.c.~1.33.~:288: gsl_vector* alpha = corp->alpha; +doc.c.~1.33.~:291: int accept[alpha->size]; +doc.c.~1.33.~:294: for (l = 0; l < alpha->size; l++) accept[l] = 0; +doc.c.~1.33.~:297: for (l = 0; l < alpha->size; l++) +doc.c.~1.33.~:307: if (r > exp(new_score - current_score)) +doc.c.~1.33.~:331: double old_mean = corp->gem_mean; +doc.c.~1.33.~:332: double old_scale = corp->gem_scale; +doc.c.~1.33.~:333: double old_alpha = corp->gem_mean * corp->gem_scale; +doc.c.~1.33.~:340: corp->gem_mean = new_mean; +doc.c.~1.33.~:341: corp->gem_scale = new_scale; +doc.c.~1.33.~:344: if (r > exp(new_score - current_score)) +doc.c.~1.33.~:346: corp->gem_mean = old_mean; +doc.c.~1.33.~:347: corp->gem_scale = old_scale; +doc.c.~1.33.~:357: accept, corp->gem_mean * corp->gem_scale); +doc.c.~1.33.~:370: double old_mean = corp->gem_mean; +doc.c.~1.33.~:375: corp->gem_mean = new_mean; +doc.c.~1.33.~:378: if (r > exp(new_score - current_score)) +doc.c.~1.33.~:380: corp->gem_mean = old_mean; +doc.c.~1.33.~:389: printf("ACCEPTED %d; GEM MEAN = %5.3f\n", accept, corp->gem_mean); +doc.c.~1.33.~:401: double old_scale = corp->gem_scale; +doc.c.~1.33.~:406: corp->gem_scale = new_scale; +doc.c.~1.33.~:409: if (r > exp(new_score - current_score)) +doc.c.~1.33.~:411: corp->gem_scale = old_scale; +doc.c.~1.33.~:420: // printf("ACCEPTED %d; GEM SCALE = %5.3f\n", accept, corp->gem_scale); +Binary file doc.o matches +funs.R:1:plot.scores <- function(filename, ...) 
+funs.R:3: x <- read.table(filename); +funs.R:12:plot.score <- function(v, title) +funs.R:20:monitor.scores <- function(filename, lag = 15) +funs.R:31:compute.gamma <- function(n, k.1, k.2) +funs.R:33: gam.1 <- k.1 / log(n); +funs.R:34: gam.2 <- k.2 / (log(n) - log(gam.1) - log(log(n))) +gibbs.c:9: tree * tr = state->tr; +gibbs.c:10: corpus * corp = state->corp; +gibbs.c:11: double score = state->score; +gibbs.c:16: write_vect(tr->eta, "ETA", file); +gibbs.c:17: write_vect(tr->gam, "GAMMA", file); +gibbs.c:18: write_double(corp->gem_mean, "GEM_MEAN", file); +gibbs.c:19: write_double(corp->gem_scale, "GEM_SCALE", file); +gibbs.c:20: write_double(tr->scaling_shape, "SCALING_SHAPE", file); +gibbs.c:21: write_double(tr->scaling_scale, "SCALING_SCALE", file); +gibbs.c:36: FILE* score_f = state->score_log; +gibbs.c:37: corpus * corp = state->corp; +gibbs.c:38: tree * tr = state->tr; +gibbs.c:39: int depth = tr->depth; +gibbs.c:43: fprintf(state->score_log, +gibbs.c:45: state->iter, state->gem_score, state->eta_score, +gibbs.c:46: state->gamma_score, state->score, +gibbs.c:47: corp->gem_mean, corp->gem_scale); +gibbs.c:49: for (l = 0; l < depth - 1; l++) +gibbs.c:51: fprintf(state->score_log, " %7.4e", vget(tr->gam,l)); +gibbs.c:55: fprintf(state->score_log, " %7.4e", vget(tr->eta,l)); +gibbs.c:58: fprintf(state->score_log, "\n"); +gibbs.c:59: fflush(state->score_log); +gibbs.c:61: if (state->tree_structure_log != NULL) +gibbs.c:63: write_tree_levels(tr, state->tree_structure_log); +gibbs.c:65: if (state->run_dir != NULL) +gibbs.c:68: if ((state->output_lag > 0) && +gibbs.c:69: (state->iter % state->output_lag) == 0) +gibbs.c:71: sprintf(filename, "%s/iter=%06d", state->run_dir, state->iter); +gibbs.c:74: if (state->score == state->max_score) +gibbs.c:76: outlog("mode at iteration %04d", state->iter); +gibbs.c:77: sprintf(filename, "%s/mode", state->run_dir); +gibbs.c:85: tree * tr = state->tr; +gibbs.c:86: corpus * corp = state->corp; +gibbs.c:88: state->gem_score = gem_score(corp); +gibbs.c:89: state->eta_score = eta_score(tr->root); +gibbs.c:90: state->gamma_score = gamma_score(tr->root); +gibbs.c:91: state->score = state->gem_score + state->eta_score + state->gamma_score; +gibbs.c:92: if ((state->score > state->max_score) || (state->iter == 0)) +gibbs.c:94: state->max_score = state->score; +gibbs.c:100: tree* tr = state->tr; +gibbs.c:101: corpus* corp = state->corp; +gibbs.c:102: state->iter = state->iter + 1; +gibbs.c:103: int iter = state->iter; +gibbs.c:105: // set up the sampling level (or fix at the depth - 1) +gibbs.c:107: if (state->level_lag == -1) +gibbs.c:109: sampling_level = tr->depth - 1; +gibbs.c:111: else if ((iter % state->level_lag) == 0) +gibbs.c:113: int level_inc = iter / state->level_lag; +gibbs.c:114: sampling_level = level_inc % (tr->depth - 1); +gibbs.c:118: if (state->shuffle_lag > 0) +gibbs.c:120: do_shuffle = 1 - (iter % state->shuffle_lag); +gibbs.c:124: gsl_ran_shuffle(RANDNUMGEN, corp->doc, corp->ndoc, sizeof(doc*)); +gibbs.c:128: for (d = 0; d < corp->ndoc; d++) +gibbs.c:130: tree_sample_doc_path(tr, corp->doc[d], 1, sampling_level); +gibbs.c:132: for (d = 0; d < corp->ndoc; d++) +gibbs.c:134: doc_sample_levels(corp->doc[d], do_shuffle, 1); +gibbs.c:136: // sample hyper-parameters +gibbs.c:137: if ((state->hyper_lag > 0) && (iter % state->hyper_lag) == 0) +gibbs.c:141: // !!! 
FOR NOW, ALL HYPER-PARAMETER LEARNING IS OFF +gibbs.c:142: // dfs_sample_scaling(tr->root); +gibbs.c:153: tree * tr = state->tr; +gibbs.c:154: corpus * corp = state->corp; +gibbs.c:156: gsl_ran_shuffle(RANDNUMGEN, corp->doc, corp->ndoc, sizeof(doc*)); +gibbs.c:157: int depth = tr->depth; +gibbs.c:159: for (i = 0; i < corp->ndoc; i++) +gibbs.c:161: doc* d = corp->doc[i]; +gibbs.c:162: gsl_vector_set_zero(d->tot_levels); +gibbs.c:163: gsl_vector_set_zero(d->log_p_level); +gibbs.c:164: iv_permute(d->word); +gibbs.c:165: d->path[depth - 1] = tree_fill(tr->root); +gibbs.c:166: topic_update_doc_cnt(d->path[depth - 1], 1.0); +gibbs.c:167: for (j = depth - 2; j >= 0; j--) +gibbs.c:169: d->path[j] = d->path[j+1]->parent; +gibbs.c:170: topic_update_doc_cnt(d->path[j], 1.0); +gibbs.c:177: if (state->run_dir != NULL) +gibbs.c:180: sprintf(filename, "%s/initial", state->run_dir); +gibbs.c:182: sprintf(filename, "%s/mode", state->run_dir); +gibbs.c:198: gsl_vector* gam = read_vect("GAM", depth - 1, init); +gibbs.c:205: state->iter = 0; +gibbs.c:206: state->corp = corpus_new(gem_mean, gem_scale); +gibbs.c:207: read_corpus(corpus, state->corp, depth); +gibbs.c:208: state->tr = tree_new(depth, state->corp->nterms, +gibbs.c:214: state->shuffle_lag = DEFAULT_SHUFFLE_LAG; +gibbs.c:215: state->hyper_lag = DEFAULT_HYPER_LAG; +gibbs.c:216: state->level_lag = DEFAULT_LEVEL_LAG; +gibbs.c:217: state->output_lag = DEFAULT_OUTPUT_LAG; +gibbs.c:218: state->run_dir = NULL; +gibbs.c:222: state->run_dir = malloc(sizeof(char) * 100); +gibbs.c:225: sprintf(state->run_dir, "%s/run%03d", out_dir, id); +gibbs.c:226: while (directory_exist(state->run_dir)) +gibbs.c:229: sprintf(state->run_dir, "%s/run%03d", out_dir, id); +gibbs.c:231: mkdir(state->run_dir, S_IRUSR|S_IWUSR|S_IXUSR); +gibbs.c:234: sprintf(filename, "%s/tree.log", state->run_dir); +gibbs.c:235: state->tree_structure_log = fopen(filename, "w"); +gibbs.c:236: sprintf(filename, "%s/score.log", state->run_dir); +gibbs.c:237: state->score_log = fopen(filename, "w"); +gibbs.c:245: state->corp = corp; +gibbs.c:246: state->tr = copy_tree(orig->tr); +gibbs.c:248: state->run_dir = NULL; +gibbs.c:249: state->score_log = NULL; +gibbs.c:250: state->tree_structure_log = NULL; +gibbs.c:252: state->shuffle_lag = orig->shuffle_lag; +gibbs.c:253: state->hyper_lag = -1; +gibbs.c:254: state->level_lag = orig->level_lag; +gibbs.c:255: state->output_lag = -1; +gibbs.c:276: if ((iter % 100) == 0) outlog("held-out iter %04d", iter); +gibbs.c:280: double this_score = state->score - orig->score; +gibbs.c:286: outlog("mean held-out score = %7.3f (%d samples)", score, nsamples); +gibbs.c:287: free_tree(state->tr); +hyperparameter.c:12: where \pi/(1-\pi) = a+k-1 / n(b-log(\eta)) +hyperparameter.c:15: sample \alpha' ~ G(a + k, b - log(\eta)) +hyperparameter.c:17: sample \alpha' ~ G(a + k - 1, b - log(\eta)) +hyperparameter.c:31: double pi = shape + k - 1; +hyperparameter.c:32: double rate = 1.0/scale - log(eta); +hyperparameter.c:39: alpha_new = rgamma(shape + k - 1, 1.0/rate); +hyperparameter.c:46: printf("-----\nnew alpha=%g\n-----\n", alpha_new); +Binary file hyperparameter.o matches +main.c:43: corpus* heldout_corp = corpus_new(state->corp->gem_mean, +main.c:44: state->corp->gem_scale); +main.c:45: read_corpus(test, heldout_corp, state->tr->depth); +main.c:48: sprintf(filename, "%s/test.dat", state->run_dir); +main.c:54: iter, ntopics_in_tree(state->tr)); +main.c:56: if ((state->iter % TEST_LAG) == 0) +main.c:61: state->iter, score, ntopics_in_tree(state->tr)); +Binary file main.o matches 
+notes.txt:14: - distribution over words +notes.txt:15: - scaling parameter +notes.txt:19: - tree of topics +notes.txt:20: - gem mean and scaling +notes.txt:21: - eta for each level +notes.txt:22: - gamma can be left over +notes.txt:23: - hyperparameters for scaling +notes.txt:29: - for each document +notes.txt:30: - sample the z's +notes.txt:31: - sample the path +notes.txt:33: - for each tree node +notes.txt:34: - sample gamma +notes.txt:36: - sample other hyperparameters (maybe) +notes.txt:46:new design: - permute docs if score goes down (is this a proper MC?) +notes.txt:47: - never permute words +notes.txt:48: - always sample words as a block +notes.txt:59: - always permute documents +notes.txt:60: - always permute document words and sample as a block +notes.txt:61: - sometimes resample the tree +notes.txt:62: - sometimes don't sample the tree +notes.txt:63: - sometimes don't sample the levels +notes.txt:64: - rarely, completely restart +notes.txt:72:to-do +notes.txt:74:- make everthing a parameter to be read: +notes.txt:75: - in main.c, topics.h, ... grep "#define" +notes.txt:76:- print code and organize it +notes.txt:77:- make the MH code more general by passing in a score function +notes.txt:78:- no need to recurse always in computing the scores (?) +notes.txt:80:longer term to-do +notes.txt:82:- "warm" start from states +notes.txt:86:- add number of words and total words to corpus structure +notes.txt:87:- add an environment variable and corresponding code for priority of verbosity +notes.txt:88:- make alpha like eta, gamma: point documents back to the corpus to get it +notes.txt:89:- divide up the main function +notes.txt:93:pseudo-code +notes.txt:122: level[i,d] = -1 for all i, d +notes.txt:132:--- +notes.txt:141:- tree +notes.txt:142:- corpus +notes.txt:143:- eta, alpha, gamma +notes.txt:146:- depth +notes.txt:147:- root +notes.txt:148:- eta* +notes.txt:149:- gamma* +notes.txt:150:- total_doc_count +notes.txt:153:- doc_count +notes.txt:154:- word_count +notes.txt:155:- word_total +notes.txt:156:- log_word_prob +notes.txt:157:- nchild +notes.txt:158:- child*[1..nchild] +notes.txt:159:- parent* +notes.txt:160:- prob +notes.txt:161:- eta* +notes.txt:162:- total_doc_count* +notes.txt:165:- words +notes.txt:166:- levels +notes.txt:167:- path[1..L] +notes.txt:168:- tree* +notes.txt:171:- ndoc +notes.txt:172:- doc +notes.txt:173:- alpha* +notes.txt:176:- size +notes.txt:177:- int[] +notes.txt:180:- size +notes.txt:181:- double[] +notes.txt:186:- update word in word_count +notes.txt:187:- update word in word_total +notes.txt:188:- change log_word_prob +notes.txt:191:- populate probabilities [eta, gamma] +notes.txt:192:- sample probability +notes.txt:193:- fill in tree +notes.txt:194:- populate path of the document +notes.txt:195:- (do not update!) 
+notes.txt:198:- for each document +notes.txt:199: - sample path +notes.txt:200: - sample levels +notes.txt:203:- write log probabilities on a line +notes.txt:206:- DFS through topics +notes.txt:207: - write level and log probabilities +notes.txt:211: - for each word +notes.txt:212: - if not init remove word from the topic +notes.txt:213: - sample level [alpha] +notes.txt:214: - add word to the topic +notes.txt:219: - if not init, update document along path (doc, -1.0) +notes.txt:220: - tree_sample_doc_path(tree, doc) +notes.txt:221: - update document along path(doc, +1.0) +topic.c:10: vinc(t->w_cnt, w, update); +topic.c:11: t->w_tot += update; +topic.c:15: double eta = vget(t->tr->eta, t->level); +topic.c:16: vset(t->log_prob_w, w, +topic.c:17: log(vget(t->w_cnt, w) + eta) - +topic.c:18: log(t->w_tot + t->w_cnt->size * eta)); +topic.c:20: vset(t->lgam_w_plus_eta, w, lgam(vget(t->w_cnt, w) + eta)); +topic.c:32: t->doc_tot += update; +topic.c:33: t->log_doc_tot = log(t->doc_tot); +topic.c:46: int nwords = t->w_cnt->size; +topic.c:47: double eta = vget(t->tr->eta, t->level); +topic.c:49: score = lgam(nwords * eta) - nwords * lgam(eta); +topic.c:53: // score += lgam(vget(t->w_cnt, w) + eta); +topic.c:54: score += vget(t->lgam_w_plus_eta, w); +topic.c:57: score -= lgam(t->w_tot + nwords * eta); +topic.c:58: // exponential(1) prior: log(1) - 1 * eta +topic.c:59: score -= eta; +topic.c:61: for (c = 0; c < t->nchild; c++) +topic.c:63: score += eta_score(t->child[c]); +topic.c:71: gsl_vector* vect = tr->eta; +topic.c:72: int depth = vect->size; +topic.c:80: double current_score = eta_score(tr->root); +topic.c:89: double new_score = eta_score(tr->root); +topic.c:91: if (r > exp(new_score - current_score)) +topic.c:113: int depth = d->path[0]->tr->depth; +topic.c:114: int nword = d->word->size; +topic.c:119: int level = ivget(d->levels, n); +topic.c:121: topic_update_word(d->path[level], ivget(d->word, n), update); +topic.c:125: topic_update_doc_cnt(d->path[n], update); +topic.c:139: tree_update_from_doc(d, -1.0, root_level); +topic.c:140: tree_prune(d->path[tr->depth - 1]); +topic.c:148: int depth = node->tr->depth; +topic.c:149: int l = depth-1; +topic.c:152: d->path[l] = node; +topic.c:153: node = node->parent; +topic.c:154: l--; +topic.c:184: double path_prob[tr->depth]; +topic.c:185: populate_prob_dfs(d->path[root_level], d, &logsum, path_prob, root_level); +topic.c:189: topic* node = tree_sample_path(d->path[root_level], logsum); +topic.c:208: int nterms = t->log_prob_w->size; +topic.c:209: int nword = d->word->size; +topic.c:215: count[ivget(d->word, n)] = 0; +topic.c:219: if (ivget(d->levels, n) == level) +topic.c:221: count[ivget(d->word, n)]++; +topic.c:225: double eta = vget(t->tr->eta, t->level); +topic.c:226: result = lgam(t->w_tot + nterms * eta); // !!! this should be precomputed +topic.c:227: result -= lgam(t->w_tot + vget(d->tot_levels, level) + nterms * eta); +topic.c:231: int wd = ivget(d->word, n); +topic.c:234: // result -= vget(t->lgam_w_plus_eta, wd); +topic.c:235: result -= lgam(vget(t->w_cnt, wd) + eta); // !!! 
this should be precomputed +topic.c:236: result += lgam(vget(t->w_cnt, wd) + count[wd] + eta); +topic.c:248: int nword = d->word->size; +topic.c:252: count[ivget(d->word, n)] = 0; +topic.c:256: if (ivget(d->levels, n) == level) +topic.c:257: count[ivget(d->word, n)]++; +topic.c:260: result -= lgam(vget(d->tot_levels, level) + nterms * eta); +topic.c:264: int wd = ivget(d->word, n); +topic.c:267: result -= lgam(eta); +topic.c:283: int level = node->level; +topic.c:284: int depth = node->tr->depth; +topic.c:293: denom = log(node->parent->doc_tot + node->parent->scaling); +topic.c:295: // denom = log(node->parent->doc_tot + node->parent->nchild * (level - 1) * 0.01 + vget(node->tr->gam, level - 1)); +topic.c:297: // pprob[level] += node->log_doc_tot - denom; +topic.c:298: pprob[level] += log(node->doc_tot) - denom; +topic.c:303: if (level < depth - 1) +topic.c:305: int nterms = node->log_prob_w->size; +topic.c:308: double eta = vget(node->tr->eta, l); +topic.c:312: // double gam = vget(node->tr->gam, level) + node->nchild * level * 0.01; +topic.c:315: pprob[level+1] += log(node->scaling); +topic.c:316: pprob[level+1] -= log(node->doc_tot + node->scaling); +topic.c:320: node->prob = 0; +topic.c:321: for (l = root_level; l < depth; l++) node->prob += pprob[l]; +topic.c:325: *logsum = node->prob; +topic.c:327: *logsum = log_sum(*logsum, node->prob); +topic.c:330: for (c = 0; c < node->nchild; c++) +topic.c:331: populate_prob_dfs(node->child[c], d, logsum, pprob, root_level); +topic.c:343: topic* parent = t->parent; +topic.c:344: if (t->doc_tot == 0) +topic.c:365: for (c = 0; c < t->nchild; c++) +topic.c:367: delete_node(t->child[c]); +topic.c:371: int nc = t->parent->nchild; +topic.c:374: if (t->parent->child[c] == t) +topic.c:376: t->parent->child[c] = t->parent->child[nc - 1]; +topic.c:377: t->parent->nchild--; +topic.c:382: gsl_vector_free(t->w_cnt); +topic.c:383: gsl_vector_free(t->log_prob_w); +topic.c:384: gsl_vector_free(t->lgam_w_plus_eta); +topic.c:385: free(t->child); +topic.c:397: if (t->level < t->tr->depth-1) +topic.c:417: t->nchild++; +topic.c:419: t->child = (topic**) realloc(t->child, sizeof(topic*) * t->nchild); +topic.c:420: t->child[t->nchild - 1] = topic_new(t->w_cnt->size, t->level+1, t, t->tr); +topic.c:422: return(t->child[t->nchild - 1]); +topic.c:435: t->w_tot = 0; +topic.c:436: t->w_cnt = gsl_vector_calloc(nwords); +topic.c:437: t->log_prob_w = gsl_vector_calloc(nwords); +topic.c:438: t->lgam_w_plus_eta = gsl_vector_calloc(nwords); +topic.c:439: t->log_doc_tot = 0; // !!! make this a NAN? 
+topic.c:440: t->doc_tot = 0; +topic.c:441: t->level = level; +topic.c:442: t->nchild = 0; +topic.c:443: t->child = NULL; +topic.c:444: t->parent = parent; +topic.c:445: t->tr = tr; +topic.c:446: t->id = tr->next_id++; +topic.c:449: // t->scaling = rgamma(tr->scaling_shape, tr->scaling_scale); +topic.c:450: t->scaling = t->tr->scaling_shape * t->tr->scaling_scale; +topic.c:454: double eta = vget(t->tr->eta, t->level); +topic.c:455: double log_p_w = log(eta) - log(eta * nwords); +topic.c:456: gsl_vector_set_all(t->log_prob_w, log_p_w); +topic.c:483: * sum : pointer to running sum -- updated at each call +topic.c:489: *sum = *sum + exp(node->prob - logsum); +topic.c:497: for (i = 0; i < node->nchild; i++) +topic.c:499: topic* val = tree_sample_dfs(r, node->child[i], sum, logsum); +topic.c:521: if (t->nchild > 0) +topic.c:523: score += log_dgamma(t->scaling, +topic.c:524: t->tr->scaling_shape, +topic.c:525: t->tr->scaling_scale); +topic.c:526: score += log(t->scaling) * t->nchild; +topic.c:527: score -= lgam(t->scaling + t->doc_tot); +topic.c:528: for (c = 0; c < t->nchild; c++) +topic.c:530: score += lgam(t->scaling + t->child[c]->doc_tot); +topic.c:531: score += gamma_score(t->child[c]); +topic.c:545: if (t->nchild > 0) +topic.c:547: t->scaling = gibbs_sample_DP_scaling(t->scaling, +topic.c:548: t->tr->scaling_shape, +topic.c:549: t->tr->scaling_scale, +topic.c:550: t->nchild, +topic.c:551: t->doc_tot); +topic.c:554: for (c = 0; c < t->nchild; c++) +topic.c:556: dfs_sample_scaling(t->child[c]); +topic.c:576: tr->depth = depth; +topic.c:577: tr->eta = eta; +topic.c:578: tr->gam = gam; +topic.c:579: tr->next_id = 0; +topic.c:580: tr->scaling_shape = scaling_shape; +topic.c:581: tr->scaling_scale = scaling_scale; +topic.c:583: tr->root = topic_new(nwords, 0, NULL, tr); +topic.c:598: fprintf(file, "%-6d", root_topic->id); +topic.c:600: if (root_topic->parent != NULL) +topic.c:601: fprintf(file, " %-6d", root_topic->parent->id); +topic.c:603: fprintf(file, " %-6d", -1); +topic.c:605: fprintf(file, " %06.0f", root_topic->doc_tot); +topic.c:606: fprintf(file, " %06.0f", root_topic->w_tot); +topic.c:607: fprintf(file, " %06.3e", root_topic->scaling); +topic.c:609: for (i = 0; i < root_topic->w_cnt->size; i++) +topic.c:611: fprintf(file, " %6.0f", vget(root_topic->w_cnt, i)); +topic.c:615: for (i = 0; i < root_topic->nchild; i++) +topic.c:617: write_tree_topics_dfs(root_topic->child[i], file); +topic.c:628: return(ntopics_in_tree_dfs(tr->root)); +topic.c:635: for (c = 0; c < t->nchild; c++) +topic.c:637: topics_below += ntopics_in_tree_dfs(t->child[c]); +topic.c:639: return(t->nchild + topics_below); +topic.c:650: free_tree_dfs(tr->root); +topic.c:656: gsl_vector_free(t->w_cnt); +topic.c:657: gsl_vector_free(t->log_prob_w); +topic.c:658: gsl_vector_free(t->lgam_w_plus_eta); +topic.c:660: for (c = 0; c < t->nchild; c++) +topic.c:661: free_tree_dfs(t->child[c]); +topic.c:672: tree* tree_copy = tree_new(tr->depth, +topic.c:673: tr->root->w_cnt->size, +topic.c:674: tr->eta, +topic.c:675: tr->gam, +topic.c:676: tr->scaling_shape, +topic.c:677: tr->scaling_scale); +topic.c:679: copy_tree_dfs(tr->root, tree_copy->root); +topic.c:689: for (c = 0; c < src->nchild; c++) +topic.c:692: child->parent = dest; +topic.c:693: copy_tree_dfs(src->child[c], child); +topic.c:699: dest->w_tot = src->w_tot; +topic.c:700: gsl_vector_memcpy(dest->w_cnt, src->w_cnt); +topic.c:701: gsl_vector_memcpy(dest->log_prob_w, src->log_prob_w); +topic.c:702: gsl_vector_memcpy(dest->lgam_w_plus_eta, src->lgam_w_plus_eta); +topic.c:703: 
dest->doc_tot = src->doc_tot; +topic.c:704: dest->log_doc_tot = src->log_doc_tot; +topic.c:705: dest->id = src->id; +topic.c:706: dest->level = src->level; +topic.c:707: dest->nchild = 0; // children get added separately +topic.c:708: dest->scaling = src->scaling; +topic.c:709: dest->prob = src->prob; +topic.c:720: write_tree_level_dfs(tr->root, file); +topic.c:730: if (root_topic->parent == NULL) +topic.c:732: fprintf(file, "%d", root_topic->level); +topic.c:736: fprintf(file, " %d", root_topic->level); +topic.c:738: for (i = 0; i < root_topic->nchild; i++) +topic.c:740: write_tree_level_dfs(root_topic->child[i], file); +topic.c:747: fprintf(file, "%-6s %-6s %-6s %-6s %-9s %-6s\n", +topic.c:749: write_tree_topics_dfs(tf->root, file); +topic.c.~1.17.~:10: vinc(t->w_cnt, w, update); +topic.c.~1.17.~:11: t->w_tot += update; +topic.c.~1.17.~:15: double eta = vget(t->tr->eta, t->level); +topic.c.~1.17.~:16: vset(t->log_prob_w, w, +topic.c.~1.17.~:17: log(vget(t->w_cnt, w) + eta) - +topic.c.~1.17.~:18: log(t->w_tot + t->w_cnt->size * eta)); +topic.c.~1.17.~:20: vset(t->lgam_w_plus_eta, w, lgam(vget(t->w_cnt, w) + eta)); +topic.c.~1.17.~:32: t->doc_tot += update; +topic.c.~1.17.~:33: t->log_doc_tot = log(t->doc_tot); +topic.c.~1.17.~:46: int nwords = t->w_cnt->size; +topic.c.~1.17.~:47: double eta = vget(t->tr->eta, t->level); +topic.c.~1.17.~:49: score = lgam(nwords * eta) - nwords * lgam(eta); +topic.c.~1.17.~:53: // score += lgam(vget(t->w_cnt, w) + eta); +topic.c.~1.17.~:54: score += vget(t->lgam_w_plus_eta, w); +topic.c.~1.17.~:57: score -= lgam(t->w_tot + nwords * eta); +topic.c.~1.17.~:58: // exponential(1) prior: log(1) - 1 * eta +topic.c.~1.17.~:59: score -= eta; +topic.c.~1.17.~:61: for (c = 0; c < t->nchild; c++) +topic.c.~1.17.~:63: score += eta_score(t->child[c]); +topic.c.~1.17.~:71: gsl_vector* vect = tr->eta; +topic.c.~1.17.~:72: int depth = vect->size; +topic.c.~1.17.~:80: double current_score = eta_score(tr->root); +topic.c.~1.17.~:89: double new_score = eta_score(tr->root); +topic.c.~1.17.~:91: if (r > exp(new_score - current_score)) +topic.c.~1.17.~:113: int depth = d->path[0]->tr->depth; +topic.c.~1.17.~:114: int nword = d->word->size; +topic.c.~1.17.~:119: int level = ivget(d->levels, n); +topic.c.~1.17.~:121: topic_update_word(d->path[level], ivget(d->word, n), update); +topic.c.~1.17.~:125: topic_update_doc_cnt(d->path[n], update); +topic.c.~1.17.~:139: tree_update_from_doc(d, -1.0, root_level); +topic.c.~1.17.~:140: tree_prune(d->path[tr->depth - 1]); +topic.c.~1.17.~:148: int depth = node->tr->depth; +topic.c.~1.17.~:149: int l = depth-1; +topic.c.~1.17.~:152: d->path[l] = node; +topic.c.~1.17.~:153: node = node->parent; +topic.c.~1.17.~:154: l--; +topic.c.~1.17.~:167: for (n = 1; n < corp->ndoc; n++) +topic.c.~1.17.~:169: doc* d = corp->doc[n]; +topic.c.~1.17.~:172: for (n = 1; n < corp->ndoc; n++) +topic.c.~1.17.~:174: tree_sample_doc_path(tr, corp->doc[n], 0, 0); +topic.c.~1.17.~:199: double path_prob[tr->depth]; +topic.c.~1.17.~:200: populate_prob_dfs(d->path[root_level], d, &logsum, path_prob, root_level); +topic.c.~1.17.~:204: topic* node = tree_sample_path(d->path[root_level], logsum); +topic.c.~1.17.~:223: int nterms = t->log_prob_w->size; +topic.c.~1.17.~:224: int nword = d->word->size; +topic.c.~1.17.~:230: count[ivget(d->word, n)] = 0; +topic.c.~1.17.~:234: if (ivget(d->levels, n) == level) +topic.c.~1.17.~:236: count[ivget(d->word, n)]++; +topic.c.~1.17.~:240: double eta = vget(t->tr->eta, t->level); +topic.c.~1.17.~:241: result = lgam(t->w_tot + nterms * 
eta); // !!! this should be precomputed +topic.c.~1.17.~:242: result -= lgam(t->w_tot + vget(d->tot_levels, level) + nterms * eta); +topic.c.~1.17.~:246: int wd = ivget(d->word, n); +topic.c.~1.17.~:249: // result -= vget(t->lgam_w_plus_eta, wd); +topic.c.~1.17.~:250: result -= lgam(vget(t->w_cnt, wd) + eta); // !!! this should be precomputed +topic.c.~1.17.~:251: result += lgam(vget(t->w_cnt, wd) + count[wd] + eta); +topic.c.~1.17.~:263: int nword = d->word->size; +topic.c.~1.17.~:267: count[ivget(d->word, n)] = 0; +topic.c.~1.17.~:271: if (ivget(d->levels, n) == level) +topic.c.~1.17.~:272: count[ivget(d->word, n)]++; +topic.c.~1.17.~:275: result -= lgam(vget(d->tot_levels, level) + nterms * eta); +topic.c.~1.17.~:279: int wd = ivget(d->word, n); +topic.c.~1.17.~:282: result -= lgam(eta); +topic.c.~1.17.~:298: int level = node->level; +topic.c.~1.17.~:299: int depth = node->tr->depth; +topic.c.~1.17.~:308: denom = log(node->parent->doc_tot + node->parent->scaling); +topic.c.~1.17.~:310: // denom = log(node->parent->doc_tot + node->parent->nchild * (level - 1) * 0.01 + vget(node->tr->gam, level - 1)); +topic.c.~1.17.~:312: // pprob[level] += node->log_doc_tot - denom; +topic.c.~1.17.~:313: pprob[level] += log(node->doc_tot) - denom; +topic.c.~1.17.~:318: if (level < depth - 1) +topic.c.~1.17.~:320: int nterms = node->log_prob_w->size; +topic.c.~1.17.~:323: double eta = vget(node->tr->eta, l); +topic.c.~1.17.~:327: // double gam = vget(node->tr->gam, level) + node->nchild * level * 0.01; +topic.c.~1.17.~:330: pprob[level+1] += log(node->scaling); +topic.c.~1.17.~:331: pprob[level+1] -= log(node->doc_tot + node->scaling); +topic.c.~1.17.~:335: node->prob = 0; +topic.c.~1.17.~:336: for (l = root_level; l < depth; l++) node->prob += pprob[l]; +topic.c.~1.17.~:340: *logsum = node->prob; +topic.c.~1.17.~:342: *logsum = log_sum(*logsum, node->prob); +topic.c.~1.17.~:345: for (c = 0; c < node->nchild; c++) +topic.c.~1.17.~:346: populate_prob_dfs(node->child[c], d, logsum, pprob, root_level); +topic.c.~1.17.~:358: topic* parent = t->parent; +topic.c.~1.17.~:359: if (t->doc_tot == 0) +topic.c.~1.17.~:380: for (c = 0; c < t->nchild; c++) +topic.c.~1.17.~:382: delete_node(t->child[c]); +topic.c.~1.17.~:386: int nc = t->parent->nchild; +topic.c.~1.17.~:389: if (t->parent->child[c] == t) +topic.c.~1.17.~:391: t->parent->child[c] = t->parent->child[nc - 1]; +topic.c.~1.17.~:392: t->parent->nchild--; +topic.c.~1.17.~:397: gsl_vector_free(t->w_cnt); +topic.c.~1.17.~:398: gsl_vector_free(t->log_prob_w); +topic.c.~1.17.~:399: gsl_vector_free(t->lgam_w_plus_eta); +topic.c.~1.17.~:400: free(t->child); +topic.c.~1.17.~:412: if (t->level < t->tr->depth-1) +topic.c.~1.17.~:432: t->nchild++; +topic.c.~1.17.~:434: t->child = (topic**) realloc(t->child, sizeof(topic*) * t->nchild); +topic.c.~1.17.~:435: t->child[t->nchild - 1] = topic_new(t->w_cnt->size, t->level+1, t, t->tr); +topic.c.~1.17.~:437: return(t->child[t->nchild - 1]); +topic.c.~1.17.~:450: t->w_tot = 0; +topic.c.~1.17.~:451: t->w_cnt = gsl_vector_calloc(nwords); +topic.c.~1.17.~:452: t->log_prob_w = gsl_vector_calloc(nwords); +topic.c.~1.17.~:453: t->lgam_w_plus_eta = gsl_vector_calloc(nwords); +topic.c.~1.17.~:454: t->log_doc_tot = 0; // !!! make this a NAN? 
+typedefs.h:18: double doc_tot; // total number of doc-level instances
+unused-code.C:1:--- to permute the words before resampling them ---
+unused-code.C:5: gsl_permutation* perm = rpermutation(d->word->size);
+unused-code.C:6: int temp1[d->word->size];
+unused-code.C:7: int temp2[d->word->size];
+unused-code.C:8: for (i = 0; i < d->word->size; i++)
+unused-code.C:10: temp1[i] = d->word->val[perm->data[i]];
+unused-code.C:11: temp2[i] = d->levels->val[perm->data[i]];
+unused-code.C:13: for (i = 0; i < d->word->size; i++)
+unused-code.C:15: d->word->val[i] = temp1[i];
+unused-code.C:16: d->levels->val[i] = temp2[i];
+unused-code.C:21:--- MH sampling of level gammas ---
+unused-code.C:27: outlog(stderr, "%-10s updating gam", "[TOPIC]");
+unused-code.C:28: gsl_vector* vect = tr->gam;
+unused-code.C:29: int depth = vect->size;
+unused-code.C:30: double current_score = gamma_score(tr->root);
+unused-code.C:47: double new_score = gamma_score(tr->root);
+unused-code.C:49: if (r > exp(new_score - current_score))
+unused-code.C:67:-- PY score --
+unused-code.C:73: double gam = vget(t->tr->gam, t->level) + gam_add;
+unused-code.C:75: if (t->nchild > 0)
+unused-code.C:77: score = log(gam) * t->nchild;
+unused-code.C:78: score -= lgam(gam + t->doc_tot);
+unused-code.C:79: for (c = 0; c < t->nchild; c++)
+unused-code.C:81: score += gamma_score_PY(t->child[c], (double) c * t->level * 0.01);
+unused-code.C:99:/* gsl_vector* log_prob = gsl_vector_alloc(d->path[0]->tr->depth); */
+unused-code.C:103:/* for (i = 0; i < d->word->size; i++) */
+unused-code.C:105:/* l = ivget(d->levels, i); */
+unused-code.C:106:/* doc_update_level(d, l, -1.0); */
+unused-code.C:107:/* topic_update_word(d->path[l], ivget(d->word, i), -1.0); */
+unused-code.C:110:/* if (do_permute == 1) iv_permute(d->word); */
+unused-code.C:112:/* for (i = 0; i < d->word->size; i++) */
+unused-code.C:114:/* w = ivget(d->word, i); */
+unused-code.C:119:/* vget(d->log_p_level, k) + vget(d->path[k]->log_prob_w, w)); */
+unused-code.C:125:/* topic_update_word(d->path[new_l], w, 1.0); */
+unused-code.C:126:/* ivset(d->levels, i, new_l); */
+unused-code.C:144: int depth = tr->depth;
+unused-code.C:147: for (i = 0; i < c->ndoc; i++)
+unused-code.C:149: doc* d = c->doc[i];
+unused-code.C:150: iv_permute(d->word);
+unused-code.C:154: d->path[depth - 1] = tree_fill(tr->root);
+unused-code.C:155: topic_update_doc_cnt(d->path[depth - 1], 1.0);
+unused-code.C:156: for (j = depth - 2; j >= 0; j--)
+unused-code.C:158: d->path[j] = d->path[j+1]->parent;
+unused-code.C:159: topic_update_doc_cnt(d->path[j], 1.0);
+unused-code.C:164: d->path[0] = tr->root;
+unused-code.C:169: for (i = 0; i < c->ndoc; i++)
+unused-code.C:171: doc* d = c->doc[i];
+unused-code.C:182: int depth = tr->depth;
+unused-code.C:184: for (n = 1; n < corp->ndoc; n++)
+unused-code.C:186: doc* d = corp->doc[n];
+unused-code.C:187: d->path[depth - 1] = tree_fill(tr->root);
+unused-code.C:188: topic_update_doc_cnt(d->path[depth - 1], 1.0);
+unused-code.C:190: for (j = depth - 2; j >= 0; j--)
+unused-code.C:192: d->path[j] = d->path[j+1]->parent;
+unused-code.C:193: topic_update_doc_cnt(d->path[j], 1.0);
+unused-code.C:205: int depth = d->path[0]->tr->depth;
+unused-code.C:207: for (i = 0; i < d->word->size; i++)
+unused-code.C:209: int w = ivget(d->word, i);
+unused-code.C:212: int new_l = depth - 1;
+unused-code.C:213: topic_update_word(d->path[new_l], w, 1.0);
+unused-code.C:214: ivset(d->levels, i, new_l);
+unused-code.C:225: for (n = 1; n < corp->ndoc; n++)
+unused-code.C:227: doc* d = corp->doc[n];
+unused-code.C:228: tree_remove_doc_from_path(tr, d, -1);
+unused-code.C:231: for (i = 0; i < d->word->size; i++)
+unused-code.C:233: int l = ivget(d->levels, i);
+unused-code.C:234: ivset(d->levels, i, -1);
+unused-code.C:235: doc_update_level(d, l, -1.0);
+unused-code.C:244: for (n = 1; n < corp->ndoc; n++)
+unused-code.C:246: doc* d = corp->doc[n];
+unused-code.C:249: for (n = 1; n < corp->ndoc; n++)
+unused-code.C:251: tree_sample_doc_path(tr, corp->doc[n], 0, 0);
+utils.c:67: for (i = 1; i < log_prob->size; i++)
+utils.c:74: double rolling_sum = exp(vget(log_prob, 0) - logsum);
+utils.c:79: rolling_sum += exp(vget(log_prob, result) - logsum);
+utils.c:95: iv->size = size;
+utils.c:96: iv->val = malloc(sizeof(int) * iv->size);
+utils.c:98: for (i = 0; i < iv->size; i++)
+utils.c:106: free(iv->val);
+utils.c:118: iv->size += 1;
+utils.c:119: iv->val = (int*) realloc(iv->val, sizeof(int) * (iv->size));
+utils.c:120: iv->val[iv->size - 1] = val;
+utils.c:131: outlog("reading %ld vector from %s", v->size, filename);
+utils.c:146: ivc->size = iv->size;
+utils.c:147: ivc->val = malloc(sizeof(int) * ivc->size);
+utils.c:150: for (i = 0; i < ivc->size; i++)
+utils.c:151: ivc->val[i] = iv->val[i];
+utils.c:165: iv->val,
+utils.c:166: iv->size,
+utils.c:172: assert(iv->size == p->size);
+utils.c:175: for (i = 0; i < p->size; i++)
+utils.c:177: ivset(iv, i, ivget(ivc, p->data[i]));
+utils.c:192: perm->data, size, sizeof(size_t));
+utils.c:227: // f(x)= - a * log(s) + log_gamma(a) + (a-1) * log(x) - x/s
+utils.c:229: double v = - shape*log(scale)+lgam(shape)+(shape-1)*log(x)-x/scale;
+utils.c:256: for (i = 0; i < v->size; i++)
+utils.c:271: for (i = 0; i < v->size; i++)
+utils.c:286: fprintf(file, "%-10s", name);
+utils.c:287: for (i = 0; i < vect->size; i++)
+utils.c:297: outlog("reading %d-vector %s", size, name);
+utils.c:304: for (i = 0; i < ret->size; i++)
+utils.h:34:{ return(v->val[n]); };
+utils.h:37:{ v->val[n] = val; };
+utils.h:51: return(log_b+log(1 + exp(log_a-log_b)));
+utils.h:53: return(log_a+log(1 + exp(log_b-log_a)));
diff --git a/hlda/doc.c b/hlda/doc.c
new file mode 100755
index 0000000..373a87c
--- /dev/null
+++ b/hlda/doc.c
@@ -0,0 +1,423 @@
+#include "doc.h"
+
+/*
+ * resample the levels of a document
+ *
+ */
+
+void doc_sample_levels(doc* d,
+                       short do_permute,
+                       short do_remove)
+{
+    int i;
+    int depth = d->path[0]->tr->depth;
+    gsl_vector* log_prob = gsl_vector_alloc(depth);
+
+    if (do_permute == 1)
+    {
+        gsl_permutation* p = rpermutation(d->levels->size);
+        iv_permute_from_perm(d->levels, p);
+        iv_permute_from_perm(d->word, p);
+        gsl_permutation_free(p);
+    }
+
+    // resample levels
+
+    for (i = 0; i < d->word->size; i++)
+    {
+        int w = ivget(d->word, i);
+        if (do_remove == 1)
+        {
+            int l = ivget(d->levels, i);
+            doc_update_level(d, l, -1.0);
+            topic_update_word(d->path[l], w, -1.0);
+        }
+
+        // compute probabilities
+
+        int k;
+        compute_log_p_level(d, *(d->gem_mean), *(d->gem_scale));
+        for (k = 0; k < depth; k++)
+        {
+            vset(log_prob, k,
+                 vget(d->log_p_level, k) +
+                 vget(d->path[k]->log_prob_w, w));
+        }
+
+        // sample new level and update
+
+        int new_l = sample_from_log(log_prob);
+        topic_update_word(d->path[new_l], w, 1.0);
+        ivset(d->levels, i, new_l);
+        // !!!
this should take the word position, and remove or add it + doc_update_level(d, new_l, 1.0); + } + + gsl_vector_free(log_prob); +} + + +/* + compute the log probability of each level, conditioned on the + current level counts +*/ + +void compute_log_p_level(doc* d, double gem_mean, double gem_scale) +{ + // first, compute E[stick size] + + double levels_sum = sum(d->tot_levels); + double sum_log_prob = 0; + double last_section = 0; + int i; + + for (i = 0; i < d->tot_levels->size-1; i++) + { + levels_sum -= vget(d->tot_levels, i); + + double expected_stick_len = + ((1 - gem_mean) * gem_scale + vget(d->tot_levels, i)) / + (gem_scale + vget(d->tot_levels, i) + levels_sum); + + vset(d->log_p_level, + i, + log(expected_stick_len) + sum_log_prob); + + if (i == 0) + last_section = vget(d->log_p_level, i); + else + last_section = log_sum(vget(d->log_p_level, i), last_section); + + sum_log_prob += log(1 - expected_stick_len); + } + last_section = log(1.0 - exp(last_section)); + vset(d->log_p_level, d->tot_levels->size-1, last_section); +} + +/* + * update the level counts + * + */ + +void doc_update_level(doc* d, int l, double update) +{ + vinc(d->tot_levels, l, update); +} + + +/* + * read corpus from data + * + */ + +void read_corpus(char* data_filename, corpus* c, int depth) +{ + outlog("READING CORPUS FROM %s", data_filename); + + FILE *fileptr; + int nunique, count, word, n, i, total = 0; + doc *d; + c->nterms = 0; + c->ndoc = 0; + + fileptr = fopen(data_filename, "r"); + + while (fscanf(fileptr, "%10d", &nunique) != EOF) + { + c->ndoc = c->ndoc + 1; + + if ((c->ndoc % 100) == 0) outlog("read doc %d", c->ndoc); + + c->doc = (doc**) realloc(c->doc, sizeof(doc*) * c->ndoc); + c->doc[c->ndoc-1] = malloc(sizeof(doc)); + d = c->doc[c->ndoc-1]; + d->id = c->ndoc-1; + d->word = new_int_vector(0); + + // read document + + for (n = 0; n < nunique; n++) + { + fscanf(fileptr, "%10d:%10d", &word, &count); + total += count; + word = word - OFFSET; + assert(word >= 0); + + if (word >= c->nterms) + { + c->nterms = word + 1; + } + for (i = 0; i < count; i++) + { + ivappend(d->word, word); + } + } + + // set up gibbs state variables + + d->levels = new_int_vector(d->word->size); + d->path = malloc(sizeof(topic*) * depth); + d->tot_levels = gsl_vector_calloc(depth); + d->log_p_level = gsl_vector_calloc(depth); + d->gem_mean = &(c->gem_mean); + d->gem_scale = &(c->gem_scale); + + for (n = 0; n < d->levels->size; n++) + ivset(d->levels, n, -1); + } + + fclose(fileptr); + outlog("number of docs : %d", c->ndoc); + outlog("number of words : %d", c->nterms); + outlog("total word count : %d", total); +} + + +/* + * allocate a new corpus + * + */ + +corpus* corpus_new(double gem_mean, double gem_scale) +{ + corpus* c = malloc(sizeof(corpus)); + + c->gem_mean = gem_mean; + c->gem_scale = gem_scale; + c->ndoc = 0; + c->doc = malloc(sizeof(doc*) * c->ndoc); + + return(c); +} + +void free_corpus(corpus* corp) +{ + int d; + for (d = 0; d < corp->ndoc; d++) + { + free_doc(corp->doc[d]); + } + free(corp->doc); + free(corp); +} + +void free_doc(doc* d) +{ + delete_int_vector(d->word); + delete_int_vector(d->levels); + gsl_vector_free(d->tot_levels); + gsl_vector_free(d->log_p_level); + free(d); +} + +/* + * write corpus assignment + * each line contains a space delimited list of topic IDs + * + */ + +void write_corpus_assignment(corpus* corp, FILE* file) +{ + int d, l; + int depth = corp->doc[0]->path[0]->tr->depth; + for (d = 0; d < corp->ndoc; d++) + { + fprintf(file, "%d", corp->doc[d]->id); + fprintf(file, " %1.9e", 
(corp->doc[d]->score /
+                (double) corp->doc[d]->word->size));
+        for (l = 0; l < depth; l++)
+        {
+            fprintf(file, " %d", corp->doc[d]->path[l]->id);
+        }
+        fprintf(file, "\n");
+    }
+}
+
+
+void write_corpus_levels(corpus* corp, FILE* file)
+{
+    outlog("writing all corpus level variables");
+    int d, n;
+    for (d = 0; d < corp->ndoc; d++)
+    {
+        for (n = 0; n < corp->doc[d]->word->size; n++)
+        {
+            if (n > 0) fprintf(file, " ");
+            fprintf(file, "%d:%d",
+                    ivget(corp->doc[d]->word, n),
+                    ivget(corp->doc[d]->levels, n));
+        }
+        fprintf(file, "\n");
+    }
+}
+
+
+/*
+ * corpus score (i.e., GEM score)
+ *
+ */
+
+double gem_score(corpus* corp)
+{
+    double score = 0;
+    int depth = corp->doc[0]->path[0]->tr->depth;
+    double prior_a = (1 - corp->gem_mean) * corp->gem_scale;
+    double prior_b = corp->gem_mean * corp->gem_scale;
+    int i, l, k;
+    for (i = 0; i < corp->ndoc; i++)
+    {
+        doc* curr_doc = corp->doc[i];
+        curr_doc->score = 0;
+        double count_gt_k[depth];
+        for (l = 0; l < depth; l++)
+        {
+            count_gt_k[l] = 0;
+            double count = vget(curr_doc->tot_levels, l);
+            for (k = 0; k < l; k++)
+                count_gt_k[k] += count;
+        }
+        double sum_log_prob = 0;
+        double levels_sum = sum(curr_doc->tot_levels);
+        double last_log_prob = 0;
+        for (l = 0; l < depth-1; l++)
+        {
+            double a = vget(curr_doc->tot_levels, l) + prior_a;
+            double b = count_gt_k[l] + prior_b;
+            curr_doc->score +=
+                lgamma(a) + lgamma(b) - lgamma(a + b) -
+                lgamma(prior_b) - lgamma(prior_a) +
+                lgamma(prior_a + prior_b);
+
+            // compute the probability of this level for computing the
+            // probability of the bottom level later.
+
+            levels_sum -= vget(curr_doc->tot_levels, l);
+            double expected_stick_len =
+                (prior_a + vget(curr_doc->tot_levels, l)) /
+                (corp->gem_scale + vget(curr_doc->tot_levels, l) + levels_sum);
+
+            double log_p = log(expected_stick_len) + sum_log_prob;
+            // accumulate log P(level <= l), as in compute_log_p_level()
+            if (l==0)
+                last_log_prob = log_p;
+            else
+                last_log_prob = log_sum(log_p, last_log_prob);
+            sum_log_prob += log(1 - expected_stick_len);
+        }
+        last_log_prob = log(1 - exp(last_log_prob));
+
+        // now handle the bottom levels, which are conditionally
+        // independent given everything else. (more z's allocated to
+        // the last level don't make others any more likely because the
+        // probability of reaching the last level has only to do with the
+        // previous stick lengths.)
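+        // (for reference: the expected stick length used here and in
+        // compute_log_p_level() is
+        //
+        //   E[V_l] = ((1 - gem_mean) * gem_scale + n_l) / (gem_scale + n_l + n_{>l})
+        //
+        // where n_l is the number of words assigned to level l and
+        // n_{>l} the number assigned below it; prior_a and prior_b are
+        // the corresponding Beta pseudo-counts.)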
+
+        curr_doc->score += vget(curr_doc->tot_levels, depth-1) * last_log_prob;
+
+        score += curr_doc->score;
+    }
+    // exponential 1 prior: log(1) - 1 * s
+    score += -corp->gem_scale;
+    return(score);
+}
+
+
+void corpus_mh_update_gem(corpus* corp)
+{
+    double current_score = gem_score(corp);
+
+    int accept = 0;
+    int iter;
+    for (iter = 0; iter < MH_REPS; iter++)
+    {
+        double old_mean = corp->gem_mean;
+        double old_scale = corp->gem_scale;
+        double old_alpha = corp->gem_mean * corp->gem_scale;
+        double new_alpha = rgauss(old_alpha, MH_GEM_STDEV);
+        double new_mean = new_alpha / (1.0 + new_alpha);
+        double new_scale = 1.0 + new_alpha;
+
+        if (new_alpha < 0) continue;
+
+        corp->gem_mean = new_mean;
+        corp->gem_scale = new_scale;
+        double new_score = gem_score(corp);
+        double r = runif();
+        if (r > exp(new_score - current_score))
+        {
+            corp->gem_mean = old_mean;
+            corp->gem_scale = old_scale;
+        }
+        else
+        {
+            current_score = new_score;
+            accept++;
+        }
+    }
+    outlog("sampled gem: accepted %d; mean = %5.3f scale = %5.3f",
+           accept, corp->gem_mean, corp->gem_scale);
+}
+
+
+void corpus_mh_update_gem_mean(corpus* corp)
+{
+    outlog("updating gem");
+    double current_score = gem_score(corp);
+
+    int accept = 0;
+    int iter;
+    for (iter = 0; iter < MH_REPS; iter++)
+    {
+        double old_mean = corp->gem_mean;
+        double new_mean = rgauss(old_mean, MH_GEM_MEAN_STDEV);
+
+        if ((new_mean > 0) && (new_mean < 1))
+        {
+            corp->gem_mean = new_mean;
+            double new_score = gem_score(corp);
+            double r = runif();
+            if (r > exp(new_score - current_score))
+            {
+                corp->gem_mean = old_mean;
+            }
+            else
+            {
+                current_score = new_score;
+                accept++;
+            }
+        }
+    }
+    outlog("sampled gem mean [accepted %d; mean = %5.3f]",
+           accept, corp->gem_mean);
+}
+
+void corpus_mh_update_gem_scale(corpus* corp)
+{
+    // outlog("updating gem");
+    double current_score = gem_score(corp);
+
+    int accept = 0;
+    int iter;
+    for (iter = 0; iter < MH_REPS; iter++)
+    {
+        double old_scale = corp->gem_scale;
+        double new_scale = rgauss(old_scale, MH_GEM_STDEV);
+
+        if (new_scale > 0)
+        {
+            corp->gem_scale = new_scale;
+            double new_score = gem_score(corp);
+            double r = runif();
+            if (r > exp(new_score - current_score))
+            {
+                corp->gem_scale = old_scale;
+            }
+            else
+            {
+                current_score = new_score;
+                accept++;
+            }
+        }
+    }
+    outlog("sampled gem scale [ accepted %d; scale = %5.3f]",
+           accept, corp->gem_scale);
+}
diff --git a/hlda/doc.h b/hlda/doc.h
new file mode 100755
index 0000000..308922e
--- /dev/null
+++ b/hlda/doc.h
@@ -0,0 +1,75 @@
+#ifndef DOCH
+#define DOCH
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+#include "topic.h"
+#include "typedefs.h"
+
+#define OFFSET 0
+#define MH_GEM_STDEV 0.05
+#define MH_GEM_MEAN_STDEV 0.05
+
+/*
+ * resample the levels of a document
+ *
+ */
+
+void doc_sample_levels(doc* d, short do_permute, short do_remove);
+
+/*
+ * update a level count
+ *
+ */
+
+void doc_update_level(doc* d, int l, double update);
+
+
+/*
+ * read corpus from data
+ *
+ */
+
+void read_corpus(char* filename, corpus* c, int depth);
+
+/*
+ * allocate a new corpus
+ *
+ */
+
+corpus* corpus_new(double mean, double scale);
+
+/*
+ * score the corpus
+ *
+ */
+
+double gem_score(corpus* corp);
+
+/*
+ * GEM MH updates
+ *
+ */
+
+void corpus_mh_update_gem(corpus* corp);
+void corpus_mh_update_gem_mean(corpus* corp);
+void corpus_mh_update_gem_scale(corpus* corp);
+
+void compute_log_p_level(doc* d, double gem_mean, double gem_scale);
+
+/*
+ * write the document clustering to a file
+ *
+ */
+
+void write_corpus_assignment(corpus* corp, FILE* file);
+void write_corpus_levels(corpus* corp, FILE* file);
+
+// free a corpus
+
+void free_corpus(corpus* corp);
+void free_doc(doc* d);
+
+#endif
diff --git a/hlda/funs.R b/hlda/funs.R
new file mode 100755
index 0000000..52e592a
--- /dev/null
+++ b/hlda/funs.R
@@ -0,0 +1,36 @@
+plot.scores <- function(filename, ...)
+{
+  x <- read.table(filename);
+  par(mfrow=c(2,2));
+  plot.score(x[,2], title="GEM");
+  plot.score(x[,3], title="ETA");
+  plot.score(x[,4], title="GAMMA");
+  plot.score(x[,5], title="TOTAL");
+}
+
+
+plot.score <- function(v, title)
+{
+  plot(v, type="b", col="blue", lwd=2, main=title)
+  abline(h=max(v), col="gray", lwd=2, lty=2);
+  points(which.max(v), max(v), col="red", pch=16.0, cex=2);
+}
+
+
+monitor.scores <- function(filename, lag = 15)
+{
+  while (TRUE)
+  {
+    cat("plotting\n");
+    plot.scores(filename, lwd=2, col="red");
+    Sys.sleep(lag);
+  }
+}
+
+
+compute.gamma <- function(n, k.1, k.2)
+{
+  gam.1 <- k.1 / log(n);
+  gam.2 <- k.2 / (log(n) - log(gam.1) - log(log(n)))
+  return(c(gam.1, gam.2))
+}
diff --git a/hlda/gibbs.c b/hlda/gibbs.c
new file mode 100755
index 0000000..0952886
--- /dev/null
+++ b/hlda/gibbs.c
@@ -0,0 +1,387 @@
+#include "gibbs.h"
+
+extern gsl_rng* RANDNUMGEN;
+
+void write_gibbs_state(gibbs_state * state, char* filename)
+{
+    tree * tr = state->tr;
+    corpus * corp = state->corp;
+    double score = state->score;
+
+    FILE* file = fopen(filename, "w");
+    write_double(score, "SCORE", file);
+    write_int(state->iter, "ITER", file);
+    write_vect(tr->eta, "ETA", file);
+    write_vect(tr->gam, "GAMMA", file);
+    write_double(corp->gem_mean, "GEM_MEAN", file);
+    write_double(corp->gem_scale, "GEM_SCALE", file);
+    write_double(tr->scaling_shape, "SCALING_SHAPE", file);
+    write_double(tr->scaling_scale, "SCALING_SCALE", file);
+
+    write_tree(tr, file);
+    char assign_filename[100];
+    sprintf(assign_filename, "%s.assign", filename);
+    FILE* assign_file = fopen(assign_filename, "w");
+    write_corpus_assignment(corp, assign_file);
+    fclose(assign_file);
+    fclose(file);
+}
+
+
+void write_gibbs_output(gibbs_state * state)
+{
+    FILE* score_f = state->score_log;
+    corpus * corp = state->corp;
+    tree * tr = state->tr;
+    int depth = tr->depth;
+
+    if (score_f != NULL)
+    {
+        fprintf(state->score_log,
+                "%06d %14.3f %14.3f %14.3f %14.3f %7.4e %7.4e",
+                state->iter, state->gem_score, state->eta_score,
+                state->gamma_score, state->score,
+                corp->gem_mean, corp->gem_scale);
+        int l;
+        for (l = 0; l < depth - 1; l++)
+        {
+            fprintf(state->score_log, " %7.4e", vget(tr->gam,l));
+        }
+        for (l = 0; l < depth; l++)
+        {
+            fprintf(state->score_log, " %7.4e", vget(tr->eta,l));
+        }
+
+        fprintf(state->score_log, "\n");
+        fflush(state->score_log);
+    }
+    if (state->tree_structure_log != NULL)
+    {
+        write_tree_levels(tr, state->tree_structure_log);
+    }
+    if (state->run_dir != NULL)
+    {
+        char filename[100];
+        if ((state->output_lag > 0) &&
+            (state->iter % state->output_lag) == 0)
+        {
+            sprintf(filename, "%s/iter=%06d", state->run_dir, state->iter);
+            write_gibbs_state(state, filename);
+        }
+        if (state->score == state->max_score)
+        {
+            outlog("mode at iteration %04d", state->iter);
+            sprintf(filename, "%s/mode", state->run_dir);
+            write_gibbs_state(state, filename);
+            sprintf(filename, "%s/mode.levels", state->run_dir);
+            FILE* levels_file = fopen(filename, "w");
+            write_corpus_levels(state->corp, levels_file);
+            fclose(levels_file);
+        }
+    }
+}
+
+void compute_gibbs_score(gibbs_state * state)
+{
+    tree *
tr = state->tr; + corpus * corp = state->corp; + + state->gem_score = gem_score(corp); + state->eta_score = eta_score(tr->root); + state->gamma_score = gamma_score(tr->root); + state->score = state->gem_score + state->eta_score + state->gamma_score; + if ((state->score > state->max_score) || (state->iter == 0)) + { + state->max_score = state->score; + } +} + +void iterate_gibbs_state(gibbs_state * state) +{ + tree* tr = state->tr; + corpus* corp = state->corp; + state->iter = state->iter + 1; + int iter = state->iter; + outlog("iteration %04d (%04d topics)", + iter, ntopics_in_tree(state->tr)); + + // set up the sampling level (or fix at the depth - 1) + int sampling_level = 0; + if (state->level_lag == -1) + { + sampling_level = 0; + } + else if ((iter % state->level_lag) == 0) + { + int level_inc = iter / state->level_lag; + sampling_level = level_inc % (tr->depth - 1); + outlog("sampling at level %d", sampling_level); + } + // set up shuffling + int do_shuffle = 0; + if (state->shuffle_lag > 0) + { + do_shuffle = 1 - (iter % state->shuffle_lag); + } + if (do_shuffle == TRUE) + { + gsl_ran_shuffle(RANDNUMGEN, corp->doc, corp->ndoc, sizeof(doc*)); + } + // sample paths and level allocations + int d; + for (d = 0; d < corp->ndoc; d++) + { + tree_sample_doc_path(tr, corp->doc[d], 1, sampling_level); + } + for (d = 0; d < corp->ndoc; d++) + { + doc_sample_levels(corp->doc[d], do_shuffle, 1); + } + // sample hyper-parameters + if ((state->hyper_lag > 0) && (iter % state->hyper_lag) == 0) + { + if (state->sample_eta == 1) + { + tree_mh_update_eta(tr); + } + if (state->sample_gem == 1) + { + corpus_mh_update_gem_scale(corp); + corpus_mh_update_gem_mean(corp); + } + if (state->sample_gam) + { + dfs_sample_scaling(tr->root); + // tree_mh_update_gam(tr); + } + } + compute_gibbs_score(state); + write_gibbs_output(state); +} + +void init_gibbs_state(gibbs_state* state) +{ + tree* tr = state->tr; + corpus* corp = state->corp; + gsl_ran_shuffle(RANDNUMGEN, corp->doc, corp->ndoc, sizeof(doc*)); + int depth = tr->depth; + int i, j; + for (i = 0; i < corp->ndoc; i++) + { + doc* d = corp->doc[i]; + gsl_vector_set_zero(d->tot_levels); + gsl_vector_set_zero(d->log_p_level); + iv_permute(d->word); + d->path[depth - 1] = tree_fill(tr->root); + topic_update_doc_cnt(d->path[depth - 1], 1.0); + for (j = depth - 2; j >= 0; j--) + { + d->path[j] = d->path[j+1]->parent; + topic_update_doc_cnt(d->path[j], 1.0); + } + doc_sample_levels(d, 0, 0); + if (i > 0) tree_sample_doc_path(tr, d, 1, 0); + doc_sample_levels(d, 0, 1); + } + compute_gibbs_score(state); +} + +gibbs_state* init_gibbs_state_w_rep(char* corpus_fname, + char* settings, + char* out_dir) +{ + outlog("initializing state"); + gibbs_state* best_state; + double best_score = 0; + int rep; + for (rep = 0; rep < NINITREP; rep++) + { + gibbs_state* state = new_gibbs_state(corpus_fname, + settings); + init_gibbs_state(state); + + if ((rep == 0) || (state->score > best_score)) + { + outlog("best initial state at rep = %03d; score = %10.7e", + rep, state->score); + best_state = state; + best_score = state->score; + } + else + { + free_gibbs_state(state); + } + } + if (out_dir != NULL) + { + set_up_directories(best_state, out_dir); + char filename[100]; + sprintf(filename, "%s/initial", best_state->run_dir); + write_gibbs_state(best_state, filename); + sprintf(filename, "%s/mode", best_state->run_dir); + write_gibbs_state(best_state, filename); + } + outlog("done initializing state"); + return(best_state); +} + + +gibbs_state * new_gibbs_state(char* corpus, char* 
settings) +{ + gibbs_state * state = malloc(sizeof(gibbs_state)); + init_random_number_generator(); + + // read hyperparameters + FILE* init = fopen(settings, "r"); + int depth = read_int("DEPTH", init); + gsl_vector* eta = read_vect("ETA", depth, init); + gsl_vector* gam = read_vect("GAM", depth - 1, init); + double gem_mean = read_double("GEM_MEAN", init); + double gem_scale = read_double("GEM_SCALE", init); + double scaling_shape = read_double("SCALING_SHAPE", init); + double scaling_scale = read_double("SCALING_SCALE", init); + int sample_eta = read_int("SAMPLE_ETA", init); + int sample_gem = read_int("SAMPLE_GEM", init); + + // set up the gibbs state + state->iter = 0; + state->corp = corpus_new(gem_mean, gem_scale); + read_corpus(corpus, state->corp, depth); + state->tr = tree_new(depth, state->corp->nterms, + eta, + gam, + scaling_shape, + scaling_scale); + + state->shuffle_lag = DEFAULT_SHUFFLE_LAG; + state->hyper_lag = DEFAULT_HYPER_LAG; + state->level_lag = DEFAULT_LEVEL_LAG; + state->output_lag = DEFAULT_OUTPUT_LAG; + state->sample_eta = sample_eta; + state->sample_gem = sample_gem; + state->sample_gam = DEFAULT_SAMPLE_GAM; + state->run_dir = NULL; + state->score_log = NULL; + state->tree_structure_log = NULL; + return(state); +} + + +void set_up_directories(gibbs_state * state, char * out_dir) +{ + state->run_dir = malloc(sizeof(char) * 100); + // set up the run directory + int id = 0; + sprintf(state->run_dir, "%s/run%03d", out_dir, id); + while (directory_exist(state->run_dir)) + { + id++; + sprintf(state->run_dir, "%s/run%03d", out_dir, id); + } + mkdir(state->run_dir, S_IRUSR|S_IWUSR|S_IXUSR); + // set up output files + char filename[100]; + sprintf(filename, "%s/tree.log", state->run_dir); + state->tree_structure_log = fopen(filename, "w"); + sprintf(filename, "%s/score.log", state->run_dir); + state->score_log = fopen(filename, "w"); + fprintf(state->score_log, + "%6s %14s %14s %14s %14s %10s %10s", + "iter", "gem.score", "eta.score", "gamma.score", + "total.score", "gem.mean", "gem.scale"); + int l; + for (l = 0; l < state->tr->depth - 1; l++) + fprintf(state->score_log, " %8s.%d", "gamma", l); + for (l = 0; l < state->tr->depth; l++) + fprintf(state->score_log, " %8s.%d", "eta", l); + fprintf(state->score_log, "\n"); + fflush(state->score_log); +} + + +/* + gibbs_state * parcopy_gibbs_state(gibbs_state* orig) + { + gibbs_state * state = malloc(sizeof(gibbs_state)); + state->corp = copy_corp(orig->corp); + state->tr = copy_tree(orig->tr); + state->run_dir = orig->run_dir; + state->score_log = orig->score_log; + state->tree_structure_log = orig->tree_structure_log; + state->shuffle_lag = orig->shuffle_lag; + state->hyper_lag = orig->hyper_lag; + state->level_lag = orig->level_lag; + state->output_lag = orig->output_lag; + state->iter = orig->iter; + } +*/ + + +gibbs_state * new_heldout_gibbs_state(corpus* corp, gibbs_state* orig) +{ + gibbs_state * state = malloc(sizeof(gibbs_state)); + state->corp = corp; + state->tr = copy_tree(orig->tr); + + state->run_dir = NULL; + state->score_log = NULL; + state->tree_structure_log = NULL; + + state->shuffle_lag = orig->shuffle_lag; + state->hyper_lag = -1; + state->level_lag = orig->level_lag; + state->output_lag = -1; + state->iter = 0; + + return(state); +} + + +double mean_heldout_score(corpus* corp, + gibbs_state* orig, + int burn, + int lag, + int niter) +{ + double score = 0; + int nsamples = 0; + + gibbs_state* state = new_heldout_gibbs_state(corp, orig); + + init_gibbs_state(state); + int iter = 0; + for (iter = 0; iter < 
niter; iter++) + { + if ((iter % 100) == 0) outlog("held-out iter %04d", iter); + iterate_gibbs_state(state); + if ((iter >= burn) && ((iter % lag) == 0)) + { + double this_score = state->score - orig->score; + this_score -= state->gamma_score; + this_score += orig->gamma_score; + score += this_score; + nsamples += 1; + } + } + score = score / nsamples; + outlog("mean held-out score = %7.3f (%d samples)", score, nsamples); + free_tree(state->tr); + free(state); + return(score); +} + + +void free_gibbs_state(gibbs_state* state) +{ + free_corpus(state->corp); + free_tree(state->tr); + if (state->score_log != NULL) + fclose(state->score_log); + if (state->tree_structure_log != NULL) + fclose(state->tree_structure_log); + if (state->run_dir != NULL) + free(state->run_dir); + free(state); +} diff --git a/hlda/gibbs.h b/hlda/gibbs.h new file mode 100755 index 0000000..4af1b74 --- /dev/null +++ b/hlda/gibbs.h @@ -0,0 +1,47 @@ +#ifndef GIBBSH +#define GIBBSH + +#include "utils.h" +#include "typedefs.h" +#include "doc.h" +#include "topic.h" + +#include + +#define WRITE_MODE_CORPUS 1 +#define DEFAULT_OUTPUT_LAG 1000 +#define DEFAULT_HYPER_LAG 1 +#define DEFAULT_SHUFFLE_LAG 100 +#define DEFAULT_LEVEL_LAG -1 +#define DEFAULT_SAMPLE_GAM 0 +#define NINITREP 100 + +void write_gibbs_state(gibbs_state * state, char* filename); + +void write_gibbs_output(gibbs_state * state); + +void compute_gibbs_score(gibbs_state * state); + +void iterate_gibbs_state(gibbs_state * state); + +gibbs_state * new_gibbs_state(char* corpus, char* settings); + +gibbs_state * new_heldout_gibbs_state(corpus* corp, gibbs_state* orig); + +double mean_heldout_score(corpus* corp, + gibbs_state* orig, + int burn, + int lag, + int niter); + +void free_gibbs_state(gibbs_state* state); + +void init_gibbs_state(gibbs_state * state); + +gibbs_state * init_gibbs_state_w_rep(char* corpus, + char* settings, + char* out_dir); + +void set_up_directories(gibbs_state * state, char * out_dir); + +#endif diff --git a/hlda/hyperparameter.c b/hlda/hyperparameter.c new file mode 100755 index 0000000..7ac014b --- /dev/null +++ b/hlda/hyperparameter.c @@ -0,0 +1,51 @@ +#include "hyperparameter.h" + +/* + (p 585 in escobar and west) + + function of a,b,k,n,\alpha + + 1. sample \eta ~ B(\alpha + 1, n) + where n = the number of documents allocated to me + + 2. sample c ~ Bern(\pi) + where \pi/(1-\pi) = a+k-1 / n(b-log(\eta)) + + 3. 
if c = 1 then
+        sample \alpha' ~ G(a + k, b - log(\eta))
+     else
+        sample \alpha' ~ G(a + k - 1, b - log(\eta))
+
+*/
+
+double gibbs_sample_DP_scaling(double alpha, // current alpha
+                               double shape, // Gamma shape parameter
+                               double scale, // Gamma scale parameter
+                               int k,        // number of components
+                               int n)        // number of data points
+{
+    printf("alpha=%g\nshape=%g\nscale=%g\nk=%d\nn=%d\n",
+           alpha, shape, scale, k, n);
+
+    double eta = rbeta(alpha + 1, (double) n);
+    double pi = shape + k - 1;
+    double rate = 1.0/scale - log(eta);
+    pi = pi / (pi + rate * n);
+    int c = rbernoulli(pi);
+
+    double alpha_new = 0;
+    if (c == 0)
+    {
+        alpha_new = rgamma(shape + k - 1, 1.0/rate);
+    }
+    else
+    {
+        alpha_new = rgamma(shape + k, 1.0/rate);
+    }
+
+    printf("-----\nnew alpha=%g\n-----\n", alpha_new);
+
+    return(alpha_new);
+}
diff --git a/hlda/hyperparameter.h b/hlda/hyperparameter.h
new file mode 100755
index 0000000..4e2d54f
--- /dev/null
+++ b/hlda/hyperparameter.h
@@ -0,0 +1,12 @@
+#ifndef HYPERPARAMETERH
+#define HYPERPARAMETERH
+
+#include "utils.h"
+
+double gibbs_sample_DP_scaling(double alpha,
+                               double shape,
+                               double scale,
+                               int k,
+                               int n);
+
+#endif
diff --git a/hlda/initial.txt b/hlda/initial.txt
new file mode 100755
index 0000000..5c55752
--- /dev/null
+++ b/hlda/initial.txt
@@ -0,0 +1,4 @@
+DEPTH 3
+ALPHA 50 20 10
+ETA 0.5 0.5 0.5
+GAM 0.5 0.5 0.5
diff --git a/hlda/main b/hlda/main
new file mode 100755
index 0000000..e273f18
Binary files /dev/null and b/hlda/main differ
diff --git a/hlda/main.c b/hlda/main.c
new file mode 100755
index 0000000..8bd56cd
--- /dev/null
+++ b/hlda/main.c
@@ -0,0 +1,91 @@
+#include "utils.h"
+#include "typedefs.h"
+#include "doc.h"
+#include "topic.h"
+#include "gibbs.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define MAX_ITER 10000
+#define TEST_LAG 100
+#define NRESTARTS 1
+
+// simple gibbs sampling on a data set
+
+void main_gibbs(int ac, char* av[])
+{
+    assert(ac == 5);
+
+    char* corpus = av[2];
+    char* settings = av[3];
+    char* out_dir = av[4];
+
+    int restart;
+    for (restart = 0; restart < NRESTARTS; restart++)
+    {
+        gibbs_state* state =
+            init_gibbs_state_w_rep(corpus, settings, out_dir);
+        int iter;
+        for (iter = 0; iter < MAX_ITER; iter++)
+        {
+            iterate_gibbs_state(state);
+        }
+        free_gibbs_state(state);
+    }
+}
+
+void main_heldout(int ac, char* av[])
+{
+    assert(ac == 6);
+
+    char* train = av[2];
+    char* test = av[3];
+    char* settings = av[4];
+    char* out_dir = av[5];
+
+    gibbs_state* state = init_gibbs_state_w_rep(train, settings, out_dir);
+    corpus* heldout_corp = corpus_new(state->corp->gem_mean,
+                                      state->corp->gem_scale);
+    read_corpus(test, heldout_corp, state->tr->depth);
+
+    char filename[100];
+    sprintf(filename, "%s/test.dat", state->run_dir);
+    FILE* test_log = fopen(filename, "w");
+    int iter;
+    for (iter = 0; iter < MAX_ITER; iter++)
+    {
+        iterate_gibbs_state(state);
+        if ((state->iter % TEST_LAG) == 0)
+        {
+            double score = mean_heldout_score(heldout_corp, state,
+                                              200, 1, 1000);
+            fprintf(test_log, "%04d %10.3f %d\n",
+                    state->iter, score, ntopics_in_tree(state->tr));
+            fflush(test_log);
+        }
+    }
+    fclose(test_log);
+}
+
+
+int main(int ac, char* av[])
+{
+    if (ac > 1)
+    {
+        if (strcmp(av[1], "gibbs") == 0)
+        {
+            main_gibbs(ac, av);
+            return(0);
+        }
+        else if (strcmp(av[1], "heldout") == 0)
+        {
+            main_heldout(ac, av);
+            return(0);
+        }
+    }
+    outlog("USAGE: ./main gibbs corpus settings out");
+    outlog("       ./main heldout train test settings out");
+    return(0);
+}
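For reference, the corpus files read by read_corpus() use the LDA-C sparse
format: one document per line, an integer number of unique terms followed by
term:count pairs, with zero-based term IDs (OFFSET is 0). A toy
three-document corpus, with made-up term IDs, would look like:

3 0:2 4:1 7:3
2 1:1 2:5
4 0:1 3:2 5:1 6:1

A run then combines such a file with a settings file like the ones below;
the file names here are illustrative only:

./main gibbs corpus.dat settings-d4.txt out

diff --git a/hlda/ncrp-gibbs.c b/hlda/ncrp-gibbs.c
new file mode 100755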
index 0000000..e69de29 diff --git a/hlda/settings-d4.txt b/hlda/settings-d4.txt new file mode 100755 index 0000000..d4f62a3 --- /dev/null +++ b/hlda/settings-d4.txt @@ -0,0 +1,9 @@ +DEPTH 4 +ETA 1.0 0.5 0.25 0.125 +GAM 0.5 0.5 0.5 +GEM_MEAN 0.35 +GEM_SCALE 100 +SCALING_SHAPE 1.0 +SCALING_SCALE 0.5 +SAMPLE_ETA 0 +SAMPLE_GEM 0 diff --git a/hlda/settings-d5.txt b/hlda/settings-d5.txt new file mode 100755 index 0000000..cb944e7 --- /dev/null +++ b/hlda/settings-d5.txt @@ -0,0 +1,7 @@ +DEPTH 5 +ETA 18 18 18 18 18 +GAM 1.0 1.0 1.0 1.0 +GEM_MEAN 0.5 +GEM_SCALE 10 +SCALING_SHAPE 1.0 +SCALING_SCALE 0.5 diff --git a/hlda/settings.txt b/hlda/settings.txt new file mode 100755 index 0000000..6e5fe03 --- /dev/null +++ b/hlda/settings.txt @@ -0,0 +1,9 @@ +DEPTH 8 +ETA 5e-4 5e-4 5e-4 5e-4 5e-4 5e-4 5e-4 5e-4 +GAM 1.0 1.0 1.0 1.0 1.0 1.0 1.0 +GEM_MEAN 0.5 +GEM_SCALE 100 +SCALING_SHAPE 1.0 +SCALING_SCALE 0.5 +SAMPLE_ETA 1 +SAMPLE_GEM 1 diff --git a/hlda/topic.c b/hlda/topic.c new file mode 100755 index 0000000..185942b --- /dev/null +++ b/hlda/topic.c @@ -0,0 +1,759 @@ +#include "topic.h" + +/* + * update the count of a word in a topic + * + */ + +void topic_update_word(topic* t, int w, double update) +{ + vinc(t->w_cnt, w, update); + t->w_tot += update; + + // change log probability and log gammas + + double eta = vget(t->tr->eta, t->level); + vset(t->log_prob_w, w, + log(vget(t->w_cnt, w) + eta) - + log(t->w_tot + t->w_cnt->size * eta)); + + vset(t->lgam_w_plus_eta, w, lgam(vget(t->w_cnt, w) + eta)); +} + + +/* + * update the document count in a topic + * + */ + +void topic_update_doc_cnt(topic* t, + double update) +{ + t->doc_tot += update; + t->log_doc_tot = log(t->doc_tot); +} + + +/* + * topic score (i.e., eta score) + * + */ + +double eta_score(topic* t) +{ + int w, c; + double score = 0; + int nwords = t->w_cnt->size; + double eta = vget(t->tr->eta, t->level); + + score = lgam(nwords * eta) - nwords * lgam(eta); + + for (w = 0; w < nwords; w++) + { + // score += lgam(vget(t->w_cnt, w) + eta); + score += vget(t->lgam_w_plus_eta, w); + } + + score -= lgam(t->w_tot + nwords * eta); + // exponential(1) prior: log(1) - 1 * eta + // score -= eta; (!!! note : this should not be added for each topic.) 
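+    // (the recursion below skips empty topics: when w_tot == 0 the
+    // lgam terms cancel and the topic's contribution is exactly zero.)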
+ + for (c = 0; c < t->nchild; c++) + { + if (t->child[c]->w_tot > 0) + score += eta_score(t->child[c]); + } + return(score); +} + + +void tree_mh_update_eta(tree* tr) +{ + gsl_vector* vect = tr->eta; + int depth = vect->size; + int accept[depth]; + int iter; + int l; + for (l = 0; l < depth; l++) accept[l] = 0; + + for (iter = 0; iter < MH_REPS; iter++) + { + double current_score = eta_score(tr->root); + for (l = 0; l < depth; l++) + { + double old = vget(vect, l); + double new = rgauss(old, MH_ETA_STDEV); + + if (new > 0) + { + vset(vect, l, new); + double new_score = eta_score(tr->root); + double r = runif(); + if (r > exp(new_score - current_score)) + { + vset(vect, l, old); + } + else + { + current_score = new_score; + accept[l]++; + } + } + } + } + outlog("sampled eta"); +} + + +/* + * update the topics from a document beginning at a specified level + * + */ + +void tree_update_from_doc(doc* d, double update, int root_level) +{ + int depth = d->path[0]->tr->depth; + int nword = d->word->size; + int n; + + for (n = 0; n < nword; n++) + { + int level = ivget(d->levels, n); + if (level > root_level) + topic_update_word(d->path[level], ivget(d->word, n), update); + } + for (n = root_level + 1; n < depth; n++) + { + topic_update_doc_cnt(d->path[n], update); + } +} + + +/* + * sample a document path from a tree + * + */ + +// !!! these should be in doc.c + +void tree_remove_doc_from_path(tree* tr, doc* d, int root_level) +{ + tree_update_from_doc(d, -1.0, root_level); + tree_prune(d->path[tr->depth - 1]); +} + + +void tree_add_doc_to_path(topic* node, doc* d, int root_level) +{ + // set path + + int depth = node->tr->depth; + int l = depth-1; + do + { + d->path[l] = node; + node = node->parent; + l--; + } + while (l >= root_level); + + // update new path with this document + + tree_update_from_doc(d, +1.0, root_level); +} + + +/* + * sample the path of a document starting from a particular level. + * + */ + +void tree_sample_doc_path(tree* tr, + doc* d, + short do_remove, + int root_level) +{ + // possibly remove document from path + + if (do_remove == 1) + { + tree_remove_doc_from_path(tr, d, root_level); + } + + // compute probability + + double logsum = 0; + double path_prob[tr->depth]; + populate_prob_dfs(d->path[root_level], d, &logsum, path_prob, root_level); + + // sample node and fill tree + + topic* node = tree_sample_path(d->path[root_level], logsum); + node = tree_fill(node); + + // add document to this path + + tree_add_doc_to_path(node, d, root_level); +} + + +/* + * populate the probability slot, based on a document + * + */ + +// !!! we can easily make these the same function and pass in an empty +// topic. + +double log_gamma_ratio(doc* d, topic* t, int level) +{ + int nterms = t->log_prob_w->size; + int nword = d->word->size; + int n, count[nterms]; + double result; + + for (n = 0; n < nword; n++) + { + count[ivget(d->word, n)] = 0; + } + for (n = 0; n < nword; n++) + { + if (ivget(d->levels, n) == level) + { + count[ivget(d->word, n)]++; + } + } + + double eta = vget(t->tr->eta, t->level); + result = lgam(t->w_tot + nterms * eta); // !!! this should be precomputed + result -= lgam(t->w_tot + vget(d->tot_levels, level) + nterms * eta); + + for (n = 0; n < nword; n++) + { + int wd = ivget(d->word, n); + if (count[wd] > 0) + { + // result -= vget(t->lgam_w_plus_eta, wd); + result -= lgam(vget(t->w_cnt, wd) + eta); // !!! 
this should be precomputed + result += lgam(vget(t->w_cnt, wd) + count[wd] + eta); + count[wd] = 0; + } + } + return(result); +} + + +double log_gamma_ratio_new(doc* d, int level, double eta, int nterms) +{ + int n, count[nterms]; + double result; + int nword = d->word->size; + + for (n = 0; n < nword; n++) + { + count[ivget(d->word, n)] = 0; + } + for (n = 0; n < nword; n++) + { + if (ivget(d->levels, n) == level) + count[ivget(d->word, n)]++; + } + result = lgam(nterms*eta); + result -= lgam(vget(d->tot_levels, level) + nterms * eta); + + for (n = 0; n < nword; n++) + { + int wd = ivget(d->word, n); + if (count[wd] > 0) + { + result -= lgam(eta); + result += lgam(count[wd] + eta); + count[wd] = 0; + } + } + return(result); +} + + +void populate_prob_dfs(topic* node, + doc* d, + double* logsum, + double* pprob, + int root_level) +{ + int l, c; + int level = node->level; + int depth = node->tr->depth; + + // set path_prob for current node + pprob[level] = log_gamma_ratio(d, node, level); + + double denom = 0; + if (level > root_level) + { + + denom = log(node->parent->doc_tot + node->parent->scaling); + // !!! PY process + // denom = log(node->parent->doc_tot + node->parent->nchild * (level - 1) * 0.01 + vget(node->tr->gam, level - 1)); + // !!! possibly slowing us down + // pprob[level] += node->log_doc_tot - denom; + pprob[level] += log(node->doc_tot) - denom; + } + + // set path probs for levels below this node + // !!! do we need this if statement? + if (level < depth - 1) + { + int nterms = node->log_prob_w->size; + for (l = level+1; l < depth; l++) + { + double eta = vget(node->tr->eta, l); + pprob[l] = log_gamma_ratio_new(d, l, eta, nterms); + } + // !!! PY process + // double gam = vget(node->tr->gam, level) + node->nchild * level * 0.01; + + // !!! 
precompute these logs
+        pprob[level+1] += log(node->scaling);
+        pprob[level+1] -= log(node->doc_tot + node->scaling);
+    }
+
+    // set probability for this node
+    node->prob = 0;
+    for (l = root_level; l < depth; l++) node->prob += pprob[l];
+    // printf("%d %10.7e\n", level, node->prob);
+
+    // update the normalizing constant
+    if (level==root_level)
+        *logsum = node->prob;
+    else
+        *logsum = log_sum(*logsum, node->prob);
+
+    // recurse
+    for (c = 0; c < node->nchild; c++)
+        populate_prob_dfs(node->child[c], d, logsum, pprob, root_level);
+}
+
+
+
+/*
+ * prune tree
+ *
+ */
+
+void tree_prune(topic* t)
+{
+    topic* parent = t->parent;
+    if (t->doc_tot == 0)
+    {
+        delete_node(t);
+        if (parent != NULL)
+        {
+            tree_prune(parent);
+        }
+    }
+}
+
+
+/*
+ * delete a node from the tree
+ *
+ */
+
+void delete_node(topic* t)
+{
+    int c;
+
+    // delete all children; each deleted child removes itself from
+    // t->child, so keep deleting the first child until none remain
+    while (t->nchild > 0)
+    {
+        delete_node(t->child[0]);
+    }
+
+    // update parent
+    int nc = t->parent->nchild;
+    for (c = 0; c < nc; c++)
+    {
+        if (t->parent->child[c] == t)
+        {
+            t->parent->child[c] = t->parent->child[nc - 1];
+            t->parent->nchild--;
+        }
+    }
+
+    // free allocated memory for word counts and children
+    gsl_vector_free(t->w_cnt);
+    gsl_vector_free(t->log_prob_w);
+    gsl_vector_free(t->lgam_w_plus_eta);
+    free(t->child);
+    free(t);
+}
+
+
+/*
+ * fill tree
+ *
+ */
+
+topic* tree_fill(topic* t)
+{
+    if (t->level < t->tr->depth-1)
+    {
+        topic* c = topic_add_child(t);
+        return(tree_fill(c));
+    }
+    else
+    {
+        return(t);
+    }
+}
+
+
+/*
+ * add child
+ *
+ */
+
+topic* topic_add_child(topic* t)
+{
+    // increase the number of children
+    t->nchild++;
+    // reallocate the child vector and create the new child
+    t->child = (topic**) realloc(t->child, sizeof(topic*) * t->nchild);
+    t->child[t->nchild - 1] = topic_new(t->w_cnt->size, t->level+1, t, t->tr);
+
+    return(t->child[t->nchild - 1]);
+}
+
+
+/*
+ * new topic
+ *
+ */
+
+topic* topic_new(int nwords, int level, topic* parent, tree* tr)
+{
+    topic* t = malloc(sizeof(topic));
+
+    t->w_tot = 0;
+    t->w_cnt = gsl_vector_calloc(nwords);
+    t->log_prob_w = gsl_vector_calloc(nwords);
+    t->lgam_w_plus_eta = gsl_vector_calloc(nwords);
+    t->log_doc_tot = 0; // !!! make this a NAN?
+    t->doc_tot = 0;
+    t->level = level;
+    t->nchild = 0;
+    t->child = NULL;
+    t->parent = parent;
+    t->tr = tr;
+    t->id = tr->next_id++;
+    // sample the scaling parameter from the prior
+    // !!! here we can set it to the level gamma to reproduce the old code
+    // t->scaling = rgamma(tr->scaling_shape, tr->scaling_scale);
+    t->scaling = t->tr->scaling_shape * t->tr->scaling_scale;
+
+    // set log probabilities
+
+    double eta = vget(t->tr->eta, t->level);
+    double log_p_w = log(eta) - log(eta * nwords);
+    gsl_vector_set_all(t->log_prob_w, log_p_w);
+
+    return(t);
+}
+
+
+/*
+ * sample_node draws a random number and then selects a node in the
+ * tree based on prob and that random number. it takes the log
+ * normalizer as an argument and normalizes the probabilities as it
+ * goes through them.
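+ *
+ * concretely, tree_sample_path() draws u ~ Uniform(0, 1) and
+ * tree_sample_dfs() walks the tree depth first, adding each node's
+ * normalized probability exp(node->prob - logsum) to a running sum;
+ * the first node at which the running sum reaches u is returned.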
+ * + */ + +topic* tree_sample_path(topic* root, double logsum) +{ + + double running_sum = 0; + double r = runif(); + // outlog("rand=%7.5f", r); + return(tree_sample_dfs(r, root, &running_sum, logsum)); +} + + +/* + * r : random number + * node : current node + * lognorm : log normalizer + * sum : pointer to running sum -- updated at each call + * + */ + +topic* tree_sample_dfs(double r, topic* node, double* sum, double logsum) +{ + *sum = *sum + exp(node->prob - logsum); + if (*sum >= r) + { + // outlog("selected node at level%d with prob %7.5e", + // node->level, exp(node->prob-logsum)); + return(node); + } + else + { + int i; + for (i = 0; i < node->nchild; i++) + { + topic* val = tree_sample_dfs(r, node->child[i], sum, logsum); + if (val != NULL) + { + return(val); + } + } + } + return(NULL); +} + + +/* + * tree score (i.e., gamma score) + * includes a flag to decide whether to sample the scaling parameter + * + */ + +// !!! is this right even if we have empty topics that aren't used? + +double gamma_score(topic* t) +{ + int c; + double score = 0; + + if (t->nchild > 0) + { + // !!! this is only appropriate when we have a prior on gamma + // score += log_dgamma(t->scaling, + // t->tr->scaling_shape, + // t->tr->scaling_scale); + // score += log(t->scaling) * t->nchild; // !!! what is this? + score -= lgam(t->scaling + t->doc_tot); + for (c = 0; c < t->nchild; c++) + { + score += lgam(t->scaling + t->child[c]->doc_tot); + score += gamma_score(t->child[c]); + } + } + return(score); +} + + +/* + * recursively sample the scaling parameters + * + */ + +void dfs_sample_scaling(topic* t) +{ + if (t->nchild > 0) + { + t->scaling = gibbs_sample_DP_scaling(t->scaling, + t->tr->scaling_shape, + t->tr->scaling_scale, + t->nchild, + t->doc_tot); + } + int c; + for (c = 0; c < t->nchild; c++) + { + dfs_sample_scaling(t->child[c]); + } +} + + + +/* + * allocate a new tree + * + */ + +tree* tree_new(int depth, + int nwords, + gsl_vector* eta, + gsl_vector* gam, + double scaling_shape, + double scaling_scale) +{ + tree* tr = malloc(sizeof(tree)); + + tr->depth = depth; + tr->eta = eta; + tr->gam = gam; + tr->next_id = 0; + tr->scaling_shape = scaling_shape; + tr->scaling_scale = scaling_scale; + + tr->root = topic_new(nwords, 0, NULL, tr); + + return(tr); +} + + +/* + * write a topic tree + * + */ + +void write_tree_topics_dfs(topic* root_topic, FILE* file) +{ + int i; + + fprintf(file, "%-6d", root_topic->id); + + if (root_topic->parent != NULL) + fprintf(file, " %-6d", root_topic->parent->id); + else + fprintf(file, " %-6d", -1); + + fprintf(file, " %06.0f", root_topic->doc_tot); + fprintf(file, " %06.0f", root_topic->w_tot); + fprintf(file, " %06.3e", root_topic->scaling); + + for (i = 0; i < root_topic->w_cnt->size; i++) + { + fprintf(file, " %6.0f", vget(root_topic->w_cnt, i)); + } + fprintf(file, "\n"); + + for (i = 0; i < root_topic->nchild; i++) + { + write_tree_topics_dfs(root_topic->child[i], file); + } +} + +/* + * compute the number of topics in a tree + * + */ + +int ntopics_in_tree(tree * tr) +{ + return(ntopics_in_tree_dfs(tr->root)); +} + +int ntopics_in_tree_dfs(topic * t) +{ + int topics_below = 0; + int c; + for (c = 0; c < t->nchild; c++) + { + topics_below += ntopics_in_tree_dfs(t->child[c]); + } + return(t->nchild + topics_below); +} + + +/* + * free a tree + * + */ + +void free_tree(tree * tr) +{ + free_tree_dfs(tr->root); + free(tr); +} + +void free_tree_dfs(topic * t) +{ + gsl_vector_free(t->w_cnt); + gsl_vector_free(t->log_prob_w); + gsl_vector_free(t->lgam_w_plus_eta); + 
int c; + for (c = 0; c < t->nchild; c++) + free_tree_dfs(t->child[c]); + free(t->child); + free(t); +} + +/* + * copy a tree + * + */ + +tree * copy_tree(const tree* tr) +{ + tree* tree_copy = tree_new(tr->depth, + tr->root->w_cnt->size, + tr->eta, + tr->gam, + tr->scaling_shape, + tr->scaling_scale); + + copy_tree_dfs(tr->root, tree_copy->root); + + return(tree_copy); +} + +void copy_tree_dfs(const topic* src, topic* dest) +{ + copy_topic(src, dest); + + int c; + for (c = 0; c < src->nchild; c++) + { + topic* child = topic_add_child(dest); + child->parent = dest; + copy_tree_dfs(src->child[c], child); + } +} + +void copy_topic(const topic* src, topic* dest) +{ + dest->w_tot = src->w_tot; + gsl_vector_memcpy(dest->w_cnt, src->w_cnt); + gsl_vector_memcpy(dest->log_prob_w, src->log_prob_w); + gsl_vector_memcpy(dest->lgam_w_plus_eta, src->lgam_w_plus_eta); + dest->doc_tot = src->doc_tot; + dest->log_doc_tot = src->log_doc_tot; + dest->id = src->id; + dest->level = src->level; + dest->nchild = 0; // children get added separately + dest->scaling = src->scaling; + dest->prob = src->prob; +} + + +/* + * write levels of the tree + * + */ + +void write_tree_levels(tree* tr, FILE* file) +{ + write_tree_level_dfs(tr->root, file); + fprintf(file, "\n"); + fflush(file); +} + + +void write_tree_level_dfs(topic* root_topic, FILE* file) +{ + int i; + if (root_topic->parent == NULL) + { + fprintf(file, "%d", root_topic->level); + } + else + { + fprintf(file, " %d", root_topic->level); + } + for (i = 0; i < root_topic->nchild; i++) + { + write_tree_level_dfs(root_topic->child[i], file); + } +} + + +void write_tree(tree* tf, FILE* file) +{ + fprintf(file, "%-6s %-6s %-6s %-6s %-9s %-6s\n", + "ID", "PARENT", "NDOCS", "NWORDS", "SCALE", "WD_CNT"); + write_tree_topics_dfs(tf->root, file); +} diff --git a/hlda/topic.h b/hlda/topic.h new file mode 100755 index 0000000..0301043 --- /dev/null +++ b/hlda/topic.h @@ -0,0 +1,175 @@ +#ifndef TOPICH +#define TOPICH + +#include +#include +#include "utils.h" +#include "typedefs.h" +#include "hyperparameter.h" + +#define MH_ETA_STDEV 0.005 +#define MH_GAM_STDEV 0.005 + +/* === topic methods === */ + +/* + * update the count of a word in a topic + * + */ + +void topic_update_word(topic* t, int w, double update); + + +/* + * update the document count in a topic + * + */ + +void topic_update_doc_cnt(topic* t, double update); + + +/* + * write log probability in a file + * + */ + +void topic_write_log_prob(topic* t, FILE* f); + + +/* === tree methods === */ + +/* + * allocate a new tree with a certain depth + * + */ + +tree* tree_new(int depth, + int nwords, + gsl_vector* eta, + gsl_vector* gam, + double scaling_shape, + double scaling_scale); + +/* + * add children to a node down to the depth of the tree; + * return the leaf of that path + * + */ + +topic* tree_fill(topic* t); + + +/* + * add a child to a topic + * + */ + +topic* topic_add_child(topic* t); + + +/* + * make a new topic + * + */ + +topic* topic_new(int nwords, int level, topic* parent, tree* tr); + + +/* + * given a leaf with 0 instances, delete the leaf and all ancestors + * which also have 0 instances. (!!! 
use asserts here) + * + */ + +void tree_prune(topic* t); + + +/* + * delete a node from the tree + * + */ + +void delete_node(topic* t); + + +/* + * write a tree to file + * + */ + +void tree_write_log_prob(tree* tree, FILE* f); + + +/* + * sample a document path from a tree + * + */ + +void populate_prob_dfs(topic* node, doc* d, double* logsum, double* pprob, int root_level); +void tree_sample_doc_path(tree* tr, doc* d, short do_remove, int root_level); + +/* + * sample a new path in the tree for a document + * + */ + +void tree_sample_path_for_doc(tree* t, doc* d); + + +/* + * update the tree from an entire document + * + */ + +void tree_update_from_doc(doc* d, double update, int root_level); + + +/* + * sample a leaf from the tree with populated probabilties + * + */ + +topic* tree_sample_path(topic* node, double logsum); +topic* tree_sample_dfs(double r, topic* node, double* sum, double logsum); +void tree_add_doc_to_path(topic* node, doc* d, int root_level); +void tree_remove_doc_from_path(tree* tr, doc* d, int root_level); + +/* + * write a tree to a file + * + */ + +void write_tree(tree* tf, FILE* file); +void write_tree_levels(tree* tr, FILE* file); +void write_tree_level_dfs(topic* t, FILE* f); +void write_tree_topics_dfs(topic* t, FILE* f); + +/* + * scores + * + */ + +double gamma_score(topic* t); +double gamma_score_PY(topic* t, double gam_add); +double eta_score(topic* t); +double log_gamma_ratio(doc* d, topic* t, int level); +double log_gamma_ratio_new(doc* d, int level, double eta, int nterms); +void tree_mh_update_eta(tree* tr); +void dfs_sample_scaling(topic* t); + +/* + * copying a tree + * + */ + +void copy_topic(const topic* src, topic* dest); +void copy_tree_dfs(const topic* src, topic* dest); +tree * copy_tree(const tree* tr); + +void free_tree(tree * tr); +void free_tree_dfs(topic * t); + +int ntopics_in_tree(tree * tr); +int ntopics_in_tree_dfs(topic * t); + +#endif diff --git a/hlda/tree.py b/hlda/tree.py new file mode 100755 index 0000000..4495909 --- /dev/null +++ b/hlda/tree.py @@ -0,0 +1,400 @@ +#! 
/usr/bin/python + +# how to use this: +# +# docmap = tree.read_jacm_docmap("/Users/blei/data/jacm/current-jacm/jacm-doc.map") +# state = tree.read_state("GOO/mode", vocab, 5) +# tree.add_assignments_to_tree('GOO/mode.assign', state['tree']) +# tree.write_topic_tree_ascii(state, docmap, "GOO.txt") +# tree.write_topic_tree_dot(goo, "GOO.dot", 0, 0) + +import sys, re, os, itertools, math + +VOCAB = '/Users/blei/data/jacm/002/jacm-vocab.dat' +DOCMAP = '/Users/blei/data/jacm/003/jacm-doc.map' + +def doc_sort_key(x, docmap, level): + return(-(x[1] + math.log(docmap[x[0]]['counts'].get(level,1e-5)))) + + +def top_n_words(topic, + vocab, + nwords): + """ + the top n words from a topic + vocab is a map from integers to words + + """ + indices = range(len(vocab)) + indices.sort(lambda x,y: -cmp(topic[x], topic[y])) + return([vocab[i] for i in indices[0:nwords]]) + + +def compute_level(id, tree): + """ + compute the level of an id in a tree + + """ + topic = tree[id] + level = 0 + while (id != 0): + level += 1 + id = topic['parent'] + topic = tree[id] + return(level) + + +def read_state(state_filename, + vocab, + sig_size): + """ + read the state from an iteration file (e.g., mode) + + """ + state = file(state_filename, 'r') + + score = float(state.readline().split()[1]) + iter = int(state.readline().split()[1]) + eta = state.readline().split() + eta = [float(x) for x in eta[1:len(eta)]] + gam = state.readline().split() + gam = [float(x) for x in gam[1:len(gam)]] + gem_mean = float(state.readline().split()[1]) + gem_scale = float(state.readline().split()[1]) + scaling_shape = float(state.readline().split()[1]) + scaling_scale = float(state.readline().split()[1]) + + header = state.readline() + tree = {} + for line in state: + + (id, parent, ndocs, nwords, scale, word_cnt) = line.split(None, 5) + (id, parent, ndocs, nwords) = [int(x) for + x in [id, parent, ndocs, nwords]] + scale = float(scale) + tree[id] = {} + tree[id]['parent'] = parent + if (parent >= 0): tree[parent]['children'].append(id) + + tree[id]['nwords'] = nwords + tree[id]['ndocs'] = ndocs + tree[id]['scale'] = scale + topic = [int(x) for x in word_cnt.split()] + tree[id]['top_words'] = top_n_words(topic, vocab, sig_size) + tree[id]['children'] = [] + + for topic in tree.values(): + topic['children'].sort(key=lambda id: -tree[id]['ndocs']) + + return({'score':score, + 'iter':iter, + 'gam':gam, + 'eta':eta, + 'gem_mean':gem_mean, + 'gem_scale':gem_scale, + 'scaling_shape':scaling_shape, + 'scaling_scale':scaling_scale, + 'tree':tree}) + + +def add_assignments_to_tree(filename, tree): + """ + reads an iter.assign file and adds document IDs to the leaf + topics. 
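(each line of the file is "doc-id score topic-ids", one topic id per
    level of the document's path, as written by write_corpus_assignment.)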
+    With a doc-map, we can associate titles with topics.
+
+    """
+    for line in file(filename, 'r'):
+        (doc_id, score, path) = line.split(None, 2)
+        doc_id = int(doc_id)
+        score = float(score)
+        path = [int(x) for x in path.split()]
+        for topic in path:
+            tree[topic].setdefault('docs', []).append((doc_id, score))
+
+
+
+def add_state_to_dmap(name, vocab, dmap):
+    """
+    read the level assignments '.levels'
+    and the topic assignments '.assign'
+    and add them to the document map
+
+    (note: vocab is a mapping from numbers to vocabulary words)
+
+    """
+    for (levels, topics) in itertools.izip(file(name+'.levels'),
+                                           file(name+'.assign')):
+
+        (id, score, path) = topics.split(None, 2)
+        id = int(id)
+        dmap[id]['score'] = float(score)
+        dmap[id]['path'] = [int(c) for c in path.split()]
+
+        zvars = {}
+        counts = {}
+        items = levels.split()
+        for item in items:
+            (word, level) = [int(x) for x in item.split(':')]
+            counts[level] = counts.get(level, 0) + 1
+            zvars.setdefault(vocab[word], []).append(level)
+        dmap[id]['levels'] = zvars
+        dmap[id]['counts'] = counts
+
+
+def read_vocab_map(vocab_file):
+    """
+    given a vocabulary file, returns a mapping from integers to words.
+
+    """
+    num = 0
+    vocab = {}
+    for word in file(vocab_file):
+        vocab[num] = word.strip()
+        num = num + 1
+    return(vocab)
+
+
+def read_jacm_docmap(filename):
+    """
+    read the jacm doc-map, which includes the title and abstract
+
+    """
+    docs = {}
+    doc_id = 0
+    for line in file(filename, 'r'):
+        (bad_doc_id, title, abstract) = [x.replace('"', '') for x in line.split(' "')]
+        nwords = len(abstract.split())
+        docs[doc_id] = {'title':title, 'abstract':abstract, 'nwords':nwords}
+        doc_id += 1
+    return(docs)
+
+
+def read_docmap(filename):
+    """
+    read a doc-map, which is assumed to be a list of titles
+
+    """
+    docs = {}
+    doc_id = 0
+    for line in file(filename, 'r'):
+        docs[doc_id] = {'title':line.strip()}
+        doc_id += 1
+    return(docs)
+
+
+def write_docs(docs, tree, outfile):
+    """
+    writes a file with all the doc information
+    (docs is a mapping from IDs to doc dictionaries)
+
+    """
+    def word_and_level(word, level):
+        return('%s_%d' % (word, level))
+
+    out = file(outfile, 'w')
+    for doc in docs.values():
+        out.write(doc['title'] + '|')
+
+        for topic in doc['path']:
+            out.write(','.join(tree[topic]['top_words'])+'|')
+
+        abstract = ' '.join([word_and_level(w, doc['levels'].get(w, [-1])[0])
+                             for w in doc['abstract'].split()])
+        out.write(abstract+'\n')
+    out.close()
+
+# write the topic tree with documents
+
+def write_topic_tree_ascii(state,
+                           docmap,
+                           out_filename,
+                           ndocs = -1,
+                           min_ndocs = 0,
+                           include_docs = False):
+
+    out = file(out_filename, 'w')
+    tree = state['tree']
+
+    eta = ' '.join(['%1.3e' % x for x in state['eta']])
+    gam = ' '.join(['%1.3e' % x for x in state['gam']])
+
+    out.write('SCORE = %s\n' % str(state['score']))
+    out.write('ITER = %s\n' % str(state['iter']))
+    out.write('ETA = %s\n' % eta)
+    out.write('GAM = %s\n' % gam)
+    out.write('GEM_MEAN = %s\n' % str(state['gem_mean']))
+    out.write('GEM_SCALE = %s\n' % str(state['gem_scale']))
+    out.write('SCALING_SHAPE = %s\n' % str(state['scaling_shape']))
+    out.write('SCALING_SCALE = %s\n\n' % str(state['scaling_scale']))
+
+    max_level = len(state['gam'])
+
+    def write_topic(topic, level):
+
+        indent = ' ' * level
+        out.write('%s' % indent)
+        out.write("[%d/%d/%d]" % (level, topic['nwords'], topic['ndocs']))
+        # out.write(' %s' % str(topic['scale']))
+        out.write(' %s\n\n' % ' '.join([x.upper() for x in topic['top_words']]))
+        if ((level == max_level) and include_docs):
+            # if ((level > 0) and include_docs):
+            docs = topic['docs']
+            if (docmap[0].has_key('counts')):
+                docs.sort(key=lambda x: doc_sort_key(x, docmap, level))
+                if (ndocs > -1): docs = docs[0:ndocs]
+                for (doc, score) in docs:
+                    out.write('%s %3.2f %s\n' %
+                              (indent, doc_sort_key([doc, score], docmap, level),
+                               docmap[doc]['title']))
+            else:
+                # fall back to the raw score when level counts are absent
+                docs.sort(key=lambda x: -x[1])
+                if (ndocs > -1): docs = docs[0:ndocs]
+                for (doc, score) in docs:
+                    out.write('%s %3.2f %s\n' %
+                              (indent, score, docmap[doc]['title']))
+        if (level == max_level): out.write('\n')
+        for id in topic['children']:
+            if ((tree[id]['ndocs'] >= min_ndocs) and
+                (tree[id]['nwords'] > 0)):
+                write_topic(tree[id], level + 1)
+
+    write_topic(tree[0], 0)
+    out.close()
+
+
+# write the topic tree
+
+def write_topic_tree_dot(state,
+                         docmap,
+                         out_filename,
+                         min_ndocs = 2,
+                         ndocs = -1,
+                         join_char='\\n',
+                         include_stats=False,
+                         include_docs=True):
+
+    outfile = file(out_filename, 'w')
+    outfile.write("digraph topic_tree {\n")
+    outfile.write("node [shape=egg, fontname=Helvetica];\n")
+    outfile.write("edge [style=bold, arrowhead=dot, arrowsize=1];\n")
+    outfile.write("graph [mindist=0];\n")
+
+    eta = ' '.join(['%1.3e' % x for x in state['eta']])
+    gamma = ' '.join(['%1.3e' % x for x in state['gam']])
+    gem_mean = str(state['gem_mean'])
+    gem_scale = str(state['gem_scale'])
+    score = str(state['score'])
+    iter = str(state['iter'])
+
+    max_level = len(state['gam'])
+
+    outfile.write('params [shape=rectangle, style=bold, color=red, fontcolor=red, fontsize=24, label="ETA = %s\\nGAMMA = %s\\nGEM MEAN = %s\\nGEM SCALE = %s\\nSCORE = %s"]\n' %
+                  (eta, gamma, gem_mean, gem_scale, score))
+
+    fontsizes = [24, 18, 12, 9]
+
+    def write_topic(topic, id, level):
+
+        label = join_char.join(topic['top_words'])
+        if include_stats:
+            label = '[%d/%d]\\n' % (topic['nwords'], topic['ndocs']) + label
+
+        outfile.write('%d [fontsize=%s, label="%s"];\n' %
+                      (id, fontsizes[level], label))
+        if (topic['parent'] >= 0):
+            # the root has no parent; writing its edge would create a
+            # phantom "-1" node in the graph
+            outfile.write('%d -> %d;\n' % (topic['parent'], id))
+
+        if ((level == max_level) and include_docs):
+            docs = topic['docs']
+            docs.sort(key=lambda x: doc_sort_key(x, docmap, level))
+            if (ndocs > -1): docs = docs[0:ndocs]
+            docs_label = join_char.join([docmap[doc[0]]['title']
+                                         for doc in docs])
+            docs_id = '%d' % (id * 10 + 1)
+            outfile.write('%s [fontsize=9, label="%s"];\n' %
+                          (docs_id, docs_label))
+            outfile.write('%d -> %s;\n' % (id, docs_id))
+
+        children = sorted(topic['children'], key=lambda x: -tree[x]['ndocs'])
+        for id in children:
+            if (tree[id]['ndocs'] >= min_ndocs):
+                write_topic(tree[id], id, level + 1)
+
+    tree = state['tree']
+    write_topic(tree[0], 0, 0)
+    outfile.write("}")
+    outfile.close()
+
+
+# walk down a directory and make both text and dot trees using a
+# single vocabulary and dmap.
+
+# tree.make_all_trees('fits/DP-nested/jacm/006/', 'data/jacm/002/jacm-vocab.dat', 'data/jacm/003/jacm-doc.map')
+
+def make_all_trees(dir,
+                   vocab_filename,
+                   dmap_filename,
+                   sig_size=10,
+                   ndocs=-1,
+                   home=os.environ['HOME']):
+
+    vocab = map(str.strip, file(home+'/'+vocab_filename, 'r').readlines())
+    # docmap = read_docmap(dmap_filename)
+    docmap = read_jacm_docmap(home+'/'+dmap_filename)
+
+    walk = os.walk(home+'/'+dir)
+    max_score = None
+    argmax_dir = None
+    for dir, _, files in walk:
+        files = filter(lambda x: x == 'mode', files)
+        for f in files:
+            sys.stderr.write('WRITING %s/%s\n' % (dir, f))
+            filename = dir+'/'+f
+            state = read_state(filename, vocab, sig_size)
+            if (state['score'] > max_score):
+                max_score = state['score']
+                argmax_dir = dir
+            add_assignments_to_tree(filename+'.assign', state['tree'])
+            add_state_to_dmap(filename, vocab, docmap)
+            txt_tree = dir+'/mode.txt'
+            dot_tree = dir+'/mode.dot'
+            write_topic_tree_ascii(state, docmap, txt_tree, ndocs=ndocs)
+            write_topic_tree_dot(state, docmap, dot_tree, ndocs=10)
+
+    sys.stderr.write("BEST RUN = %s\n" % argmax_dir)
+
+# main function
+
+def main(type,
+         iter_filename,
+         vocab_filename,
+         dmap_filename,
+         out_filename,
+         sig_size = 5,
+         ndocs = -1):
+
+    vocab = map(str.strip, file(vocab_filename, "r").readlines())
+    state = read_state(iter_filename, vocab, sig_size)
+    add_assignments_to_tree(iter_filename+'.assign', state['tree'])
+
+    if (type == 'txt'):
+        if os.path.isfile(dmap_filename):
+            docmap = read_docmap(dmap_filename)
+            write_topic_tree_ascii(state, docmap, out_filename, ndocs=ndocs)
+    else:
+        # the dot writer also needs the doc-map for its document labels
+        docmap = read_docmap(dmap_filename)
+        write_topic_tree_dot(state, docmap, out_filename)
+
+
+if (__name__ == '__main__'):
+
+    if (len(sys.argv) != 6):
+        sys.stdout.write('usage: python tree.py <txt|dot> <iter-file> <vocab-file> <doc-map> <out-file>\n')
+        sys.exit(1)
+
+    type = sys.argv[1]
+    iter_filename = sys.argv[2]
+    vocab_filename = sys.argv[3]
+    dmap_filename = sys.argv[4]
+    out_filename = sys.argv[5]
+
+    main(type, iter_filename, vocab_filename, dmap_filename, out_filename)
diff --git a/hlda/typedefs.h b/hlda/typedefs.h
new file mode 100755
index 0000000..efdf5a4
--- /dev/null
+++ b/hlda/typedefs.h
@@ -0,0 +1,98 @@
+#ifndef TYPEDEFS
+#define TYPEDEFS
+
+#define MH_REPS 100
+#define TRUE 1
+#define FALSE 0
+
+#include <gsl/gsl_vector.h>
+#include "utils.h"
+
+typedef struct topic
+{
+    double w_tot;                // total # of words assigned to the topic
+    gsl_vector* w_cnt;           // vector of word counts
+    gsl_vector* log_prob_w;      // vector of log probabilities
+    gsl_vector* lgam_w_plus_eta; // to prevent many repeated computations
+
+    double doc_tot;              // total number of doc-level instances
+    double log_doc_tot;          // precomputed log of doc_tot
+    int id;                      // a unique ID for this topic
+
+    int level;                   // level in the tree
+    int nchild;                  // number of children
+    double scaling;              // scaling factor for my DP
+    struct topic** child;        // array of pointers to child topics
+    struct topic* parent;        // pointer to the parent topic
+    struct tree* tr;             // pointer to my tree
+
+    double prob;                 // probability (used to sample a path)
+} topic;
+
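+/*
+ * illustrative sketch (not part of the original source): the parent
+ * pointers above let code walk from any topic back to the root; the
+ * helper name `depth_of_topic` is hypothetical.
+ *
+ *   static int depth_of_topic(const topic* t)
+ *   {
+ *       int depth = 0;
+ *       while (t->parent != NULL)  // only the root has no parent
+ *       {
+ *           depth++;
+ *           t = t->parent;
+ *       }
+ *       return(depth);  // should equal the starting topic's level field
+ *   }
+ */
+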
+typedef struct tree
+{
+    int depth;             // depth of the tree
+    gsl_vector* eta;       // topic dirichlet parameter
+    gsl_vector* gam;       // scaling parameter (!!! not used; see G prior)
+    double scaling_shape;  // shape parameter for the G prior on scaling
+    double scaling_scale;  // scale parameter for the G prior on scaling
+    topic* root;           // root topic of the tree
+    int next_id;           // the next id for a new topic
+} tree;
+
+
+typedef struct doc
+{
+    int_vector* word;         // each word
+    int_vector* levels;       // level assigned to each word
+
+    int id;
+    topic** path;             // path of topics
+    gsl_vector* tot_levels;   // level counts (convenience, for sampling)
+    gsl_vector* log_p_level;  // log p(level) [ unnormalized ]
+    double* gem_mean;
+    double* gem_scale;
+    double score;
+} doc;
+
+
+typedef struct corpus
+{
+    double gem_mean;
+    double gem_scale;
+    int ndoc;
+    int nterms;
+    doc** doc;
+} corpus;
+
+typedef struct gibbs_state
+{
+    // data and hidden variables
+    corpus* corp;
+    tree* tr;
+
+    // current scores and iteration
+    double score;
+    double gem_score;
+    double eta_score;
+    double gamma_score;
+    double max_score;
+    int iter;
+
+    // log files
+    char* run_dir;
+    FILE* score_log;
+    FILE* tree_structure_log;
+
+    // sampling parameters
+    int shuffle_lag;
+    int hyper_lag;
+    int level_lag;
+    int output_lag;
+    int sample_eta;
+    int sample_gem;
+    int sample_gam;
+} gibbs_state;
+
+#endif
diff --git a/hlda/utils.c b/hlda/utils.c
new file mode 100755
index 0000000..bf1df5c
--- /dev/null
+++ b/hlda/utils.c
@@ -0,0 +1,389 @@
+#include "utils.h"
+gsl_rng* RANDNUMGEN = NULL;
+
+/*
+ * initialize the GSL random number generator
+ *
+ */
+
+void init_random_number_generator()
+{
+    if (RANDNUMGEN != NULL) return;
+
+    RANDNUMGEN = gsl_rng_alloc(gsl_rng_taus);
+    long t1;
+    (void) time(&t1);
+    // !!! DEBUG
+    // t1 = 1147530551;
+    // t1 = 1201962438;
+    outlog("random seed = %ld", t1);  // outlog appends its own newline
+    gsl_rng_set(RANDNUMGEN, t1);
+}
+
+
+/*
+ * check if a directory exists
+ *
+ */
+
+int directory_exist(const char *dname)
+{
+    struct stat st;
+    int ret;
+
+    if (stat(dname, &st) != 0)
+    {
+        return 0;
+    }
+
+    ret = S_ISDIR(st.st_mode);
+
+    if (!ret)
+    {
+        errno = ENOTDIR;
+    }
+
+    return ret;
+}
+
+
+/*
+ * make a directory
+ *
+ */
+
+void make_directory(char* name)
+{
+    mkdir(name, S_IRUSR|S_IWUSR|S_IXUSR);
+}
+
+
+/*
+ * sample from unnormalized log probabilities
+ *
+ */
+
+int sample_from_log(gsl_vector* log_prob)
+{
+    double logsum = vget(log_prob, 0);
+    int i;
+    for (i = 1; i < log_prob->size; i++)
+    {
+        logsum = log_sum(logsum, vget(log_prob, i));
+    }
+
+    double x = runif();
+
+    double rolling_sum = exp(vget(log_prob, 0) - logsum);
+    int result = 0;
+    while (x >= rolling_sum)
+    {
+        result++;
+        rolling_sum += exp(vget(log_prob, result) - logsum);
+    }
+
+    return(result);
+}
+
+
+/*
+ * new integer vector
+ *
+ */
+
+int_vector* new_int_vector(int size)
+{
+    int_vector* iv = malloc(sizeof(int_vector));
+
+    iv->size = size;
+    iv->val = malloc(sizeof(int) * iv->size);
+    int i;
+    for (i = 0; i < iv->size; i++)
+        ivset(iv, i, 0);
+
+    return(iv);
+}
+
+void delete_int_vector(int_vector* iv)
+{
+    free(iv->val);
+    free(iv);
+}
+
+
+/*
+ * append a value to an integer vector
+ *
+ */
+
+void ivappend(int_vector* iv, int val)
+{
+    iv->size += 1;
+    iv->val = (int*) realloc(iv->val, sizeof(int) * (iv->size));
+    iv->val[iv->size - 1] = val;
+}
+
+
+/*
+ * read a vector from a file
+ *
+ */
+
+void vct_fscanf(const char* filename, gsl_vector* v)
+{
+    outlog("reading %ld vector from %s", v->size, filename);
+    FILE* fileptr;
+    fileptr = fopen(filename, "r");
+    gsl_vector_fscanf(fileptr, v);
+    fclose(fileptr);
+}
+
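+/*
+ * usage sketch for sample_from_log above (illustrative, not part of
+ * the original source): draw an index given only unnormalized log
+ * weights.
+ *
+ *   init_random_number_generator();
+ *   gsl_vector* logp = gsl_vector_alloc(3);
+ *   vset(logp, 0, log(5.0));
+ *   vset(logp, 1, log(3.0));
+ *   vset(logp, 2, log(2.0));
+ *   int k = sample_from_log(logp);  // 0, 1, or 2 w.p. 0.5, 0.3, 0.2
+ *   gsl_vector_free(logp);
+ */
+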
+/*
+ * copy an integer vector
+ *
+ */
+
+int_vector* iv_copy(int_vector* iv)
+{
+    int_vector* ivc = malloc(sizeof(int_vector));
+    ivc->size = iv->size;
+    ivc->val = malloc(sizeof(int) * ivc->size);
+
+    int i;
+    for (i = 0; i < ivc->size; i++)
+        ivc->val[i] = iv->val[i];
+
+    return(ivc);
+}
+
+
+/*
+ * permute an integer vector
+ *
+ */
+
+void iv_permute(int_vector* iv)
+{
+    gsl_ran_shuffle(RANDNUMGEN,
+                    iv->val,
+                    iv->size,
+                    sizeof(int));
+}
+
+void iv_permute_from_perm(int_vector* iv, gsl_permutation* p)
+{
+    assert(iv->size == p->size);
+    int_vector* ivc = iv_copy(iv);
+    int i;
+    for (i = 0; i < p->size; i++)
+    {
+        ivset(iv, i, ivget(ivc, p->data[i]));
+    }
+    delete_int_vector(ivc);
+}
+
+
+/*
+ * get a random permutation
+ *
+ */
+
+gsl_permutation* rpermutation(int size)
+{
+    gsl_permutation* perm = gsl_permutation_calloc(size);
+    gsl_ran_shuffle(RANDNUMGEN,
+                    perm->data, size, sizeof(size_t));
+    return(perm);
+}
+
+
+/*
+ * draw random numbers
+ *
+ */
+
+double runif()
+{
+    return(gsl_rng_uniform(RANDNUMGEN));
+}
+
+
+double rgauss(double mean,
+              double stdev)
+{
+    double v =
+        gsl_ran_gaussian_ratio_method(RANDNUMGEN, stdev) + mean;
+    return(v);
+}
+
+double rgamma(double shape,
+              double scale)
+{
+    double v = gsl_ran_gamma(RANDNUMGEN, shape, scale);
+    return(v);
+}
+
+double log_dgamma(double x,
+                  double shape,
+                  double scale)
+{
+    // log f(x) = - a * log(s) - log_gamma(a) + (a-1) * log(x) - x/s
+
+    double v = - shape*log(scale) - lgam(shape) + (shape-1)*log(x) - x/scale;
+    return(v);
+}
+
+
+double rbeta(double a,
+             double b)
+{
+    double v = gsl_ran_beta(RANDNUMGEN, a, b);
+    return(v);
+}
+
+int rbernoulli(double p)
+{
+    int v = gsl_ran_bernoulli(RANDNUMGEN, p);
+    return(v);
+}
+
+/*
+ * sum a vector
+ *
+ */
+
+double sum(gsl_vector* v)
+{
+    int i;
+    double sum = 0;
+    for (i = 0; i < v->size; i++)
+        sum += vget(v, i);
+
+    return(sum);
+}
+
+
+/*
+ * print a vector
+ *
+ */
+
+void print_vector(gsl_vector* v)
+{
+    int i;
+    for (i = 0; i < v->size; i++)
+    {
+        printf(" [%d] %5.2e\n", i, vget(v, i));
+    }
+}
+
+
+/*
+ * read/write a vector and name to a file
+ *
+ */
+
+void write_vect(gsl_vector* vect, char* name, FILE* file)
+{
+    int i;
+    fprintf(file, "%-10s", name);
+    for (i = 0; i < vect->size; i++)
+    {
+        fprintf(file, " %17.14e", vget(vect, i));
+    }
+    fprintf(file, "\n");
+}
+
+
+gsl_vector* read_vect(char* name, int size, FILE* file)
+{
+    outlog("reading %d-vector %s", size, name);
+
+    gsl_vector* ret = gsl_vector_alloc(size);
+    int i;
+    float v;
+
+    fscanf(file, name);
+    for (i = 0; i < ret->size; i++)
+    {
+        fscanf(file, " %f", &v);
+        vset(ret, i, v);
+    }
+    fscanf(file, "\n");
+
+    print_vector(ret);
+
+    return(ret);
+}
+
+/*
+ * read a named integer from a file
+ *
+ */
+
+int read_int(char* name, FILE* file)
+{
+    outlog("reading integer %s", name);
+    int ret;
+    fscanf(file, name);
+    fscanf(file, "%d\n", &ret);
+    outlog("read %d", ret);
+    return(ret);
+}
+
+void write_int(int x, char* name, FILE* file)
+{
+    fprintf(file, name);
+    fprintf(file, " %d\n", x);
+}
+
+
+void write_double(double x, char* name, FILE* file)
+{
+    fprintf(file, name);
+    fprintf(file, " %17.14e\n", x);
+}
+
+double read_double(char* name, FILE* file)
+{
+    double ret;
+    fscanf(file, name);
+    fscanf(file, "%lf\n", &ret);
+    outlog("read %f", ret);
+    return(ret);
+}
+
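+/*
+ * usage sketch for the named read/write helpers above (illustrative,
+ * not part of the original source; the names "GEM_MEAN" and "DEPTH"
+ * are hypothetical): values are written as "NAME value" lines and
+ * read back by passing the same name string.
+ *
+ *   FILE* f = fopen("params.dat", "w");
+ *   write_double(0.5, "GEM_MEAN", f);
+ *   write_int(4, "DEPTH", f);
+ *   fclose(f);
+ *
+ *   f = fopen("params.dat", "r");
+ *   double gem_mean = read_double("GEM_MEAN", f);
+ *   int depth = read_int("DEPTH", f);
+ *   fclose(f);
+ */
+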
+FILE* LOG;
+static time_t TVAL;
+void outlog(char* fmt, ...)
+{
+    if (LOG == NULL) { LOG = stdout; }
+
+    TVAL = time(NULL);
+    char* TIMESTR = ctime(&TVAL);
+    TIMESTR[24] = ' ';  // overwrite the trailing newline from ctime()
+
+    fprintf(LOG, "[ %20s] ", TIMESTR);
+
+    va_list args;
+    va_start(args, fmt);
+    vfprintf(LOG, fmt, args);
+    fprintf(LOG, "\n");
+    va_end(args);
+    fflush(LOG);
+}
+
+/*
+ * resize a gsl vector
+ *
+ */
+
+void resize(gsl_vector * vec, size_t newsize)
+{
+    assert(vec->stride == 1 && vec->owner && vec->block->data == vec->data);
+    double * p = realloc(vec->block->data, newsize * sizeof(double));
+    assert(p != NULL);
+    vec->block->data = vec->data = p;
+    vec->block->size = newsize;
+    vec->size = newsize;
+}
diff --git a/hlda/utils.h b/hlda/utils.h
new file mode 100755
index 0000000..b095ecb
--- /dev/null
+++ b/hlda/utils.h
@@ -0,0 +1,110 @@
+#ifndef UTILSH
+#define UTILSH
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <gsl/gsl_vector.h>
+#include <gsl/gsl_permutation.h>
+#include <gsl/gsl_rng.h>
+#include <gsl/gsl_randist.h>
+#include <gsl/gsl_sf_gamma.h>
+
+time_t TIME;
+char line1[80];
+
+void outlog(char* fmt, ...);
+
+typedef struct int_vector
+{
+    int size;
+    int* val;
+} int_vector;
+
+int_vector* new_int_vector(int size);
+void delete_int_vector(int_vector* iv);
+
+void ivappend(int_vector* v, int val);
+
+static inline int ivget(int_vector* v, int n)
+{ return(v->val[n]); };
+
+static inline void ivset(int_vector* v, int n, int val)
+{ v->val[n] = val; };
+
+static inline void vset(gsl_vector* v, int i, double x)
+{ gsl_vector_set(v, i, x); };
+
+static inline double vget(const gsl_vector* v, int i)
+{ return(gsl_vector_get(v, i)); };
+
+static inline void vinc(gsl_vector* v, int i, double x)
+{ vset(v, i, vget(v, i) + x); };
+
+static inline double log_sum(double log_a, double log_b)
+{
+    if (log_a < log_b)
+        return(log_b + log(1 + exp(log_a - log_b)));
+    else
+        return(log_a + log(1 + exp(log_b - log_a)));
+};
+
+static inline double lgam(double x)
+{ return(gsl_sf_lngamma(x)); }
+
+double runif();
+
+double rgauss(double mean, double stdev);
+
+int sample_from_log(gsl_vector* log_prob);
+
+void init_random_number_generator();
+
+void vct_fscanf(const char* filename, gsl_vector* v);
+
+void iv_permute(int_vector* iv);
+void iv_permute_from_perm(int_vector* iv, gsl_permutation* p);
+int_vector* iv_copy(int_vector* iv);
+
+double sum(gsl_vector* v);
+
+void print_vector(gsl_vector* v);
+
+gsl_permutation* rpermutation(int size);
+
+void write_vect(gsl_vector* vect, char* name, FILE* file);
+
+gsl_vector* read_vect(char* name, int size, FILE* file);
+
+int read_int(char* name, FILE* file);
+
+void write_int(int x, char* name, FILE* file);
+
+void write_double(double x, char* name, FILE* file);
+
+double read_double(char* name, FILE* file);
+
+int directory_exist(const char *dname);
+
+void make_directory(char* name);
+
+int rbernoulli(double p);
+
+double rbeta(double a,
+             double b);
+
+double rgamma(double shape,
+              double scale);
+
+
+double log_dgamma(double x,
+                  double shape,
+                  double scale);
+
+void resize(gsl_vector * vec, size_t newsize);
+
+#endif
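+/*
+ * note on log_sum above (illustrative, not part of the original
+ * source): it computes log(exp(log_a) + exp(log_b)) by factoring out
+ * the larger term, so it stays finite where naive exponentiation
+ * underflows:
+ *
+ *   double s = log_sum(-1000.0, -1001.0);
+ *   // s == -1000 + log(1 + exp(-1)) ~= -999.69,
+ *   // while exp(-1000) and exp(-1001) both underflow to 0
+ *
+ * sample_from_log in utils.c relies on this to normalize a vector of
+ * unnormalized log probabilities.
+ */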