Skip to content

Commit

Permalink
adding nonparametric hlda to determine num topics that r optimal
Browse files Browse the repository at this point in the history
  • Loading branch information
msimchowitz committed May 10, 2014
1 parent e03903c commit 53ec8b3
Show file tree
Hide file tree
Showing 29 changed files with 4,551 additions and 0 deletions.
40 changes: 40 additions & 0 deletions hlda/.#gibbs.h.1.2
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
 * gibbs.h -- declarations for building, running, scoring, writing and
 * freeing a gibbs_state.
 *
 * The gibbs_state and corpus types are not defined in this header;
 * presumably they come from typedefs.h / doc.h -- confirm.
 */
#ifndef GIBBSH
#define GIBBSH

#include "utils.h"
#include "typedefs.h"
#include "doc.h"
#include "topic.h"

#include <stdio.h>

/* NOTE(review): meaning of this mode flag is not visible here; see gibbs.c. */
#define WRITE_MODE_CORPUS 1

/*
 * Default "lag" settings. Presumably each is the number of sampler
 * iterations between two occurrences of the named action (output,
 * hyperparameter update, shuffle, level sampling) -- confirm in gibbs.c.
 */
#define DEFAULT_OUTPUT_LAG 100
#define DEFAULT_HYPER_LAG 1
#define DEFAULT_SHUFFLE_LAG 100
#define DEFAULT_LEVEL_LAG 1

/* Presumably writes `state` to the file named `filename` -- confirm. */
void write_gibbs_state(gibbs_state * state, char* filename);

/* Presumably writes the periodic output for `state` -- confirm destination. */
void write_gibbs_output(gibbs_state * state);

/* Returns nothing; presumably stores the computed score inside `state`. */
void compute_gibbs_score(gibbs_state * state);

/* Advances `state`; main.c calls this once per iteration of its loop. */
void iterate_gibbs_state(gibbs_state * state);

/* Called by main.c on a state from new_gibbs_state() before iterating. */
void initialize_gibbs_state(gibbs_state * state);

/*
 * Builds a state from three strings: a corpus, a settings and an output
 * directory argument (main.c passes command-line paths). The result is
 * released with free_gibbs_state().
 */
gibbs_state * new_gibbs_state(char* corpus, char* settings, char* out_dir);

/* Presumably builds a state for scoring `corp` against `orig` -- confirm. */
gibbs_state * new_heldout_gibbs_state(corpus* corp, gibbs_state* orig);

/*
 * Returns a double that main.c logs as the held-out score of `corp`
 * under `orig`. Presumably `burn` iterations are discarded and a sample
 * is taken every `lag` iterations out of `niter` -- confirm in gibbs.c.
 */
double mean_heldout_score(corpus* corp,
gibbs_state* orig,
int burn,
int lag,
int niter);

/* Releases a state; main.c calls this once a run has finished. */
void free_gibbs_state(gibbs_state* state);

#endif
92 changes: 92 additions & 0 deletions hlda/.#main.c.1.44
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#include "utils.h"
#include "typedefs.h"
#include "doc.h"
#include "topic.h"
#include "gibbs.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>

#define MAX_ITER 10000
#define TEST_LAG 100
#define NRESTARTS 10

// simple gibbs sampling on a data set

/*
 * Entry point for the "gibbs" sub-command.
 *
 * ac, av: the process argument vector. Exactly five entries are
 * required: av[1] is the sub-command name, av[2] the corpus, av[3] the
 * settings file and av[4] the output directory.
 *
 * Performs NRESTARTS runs. Each run builds and initializes a fresh
 * state, calls iterate_gibbs_state() MAX_ITER times, then frees it.
 *
 * The argument count is checked explicitly instead of with assert():
 * assert() is compiled out under -DNDEBUG, which would let a short
 * command line read past the end of av[]. On a bad count the function
 * prints a message to stderr and exits with EXIT_FAILURE.
 */
void main_gibbs(int ac, char* av[])
{
    if (ac != 5)
    {
        fprintf(stderr, "main_gibbs: expected 5 arguments, got %d\n", ac);
        exit(EXIT_FAILURE);
    }

    char* corpus = av[2];
    char* settings = av[3];
    char* out_dir = av[4];

    int restart;
    for (restart = 0; restart < NRESTARTS; restart++)
    {
        gibbs_state* state = new_gibbs_state(corpus, settings, out_dir);
        initialize_gibbs_state(state);

        int iter;
        for (iter = 0; iter < MAX_ITER; iter++)
        {
            iterate_gibbs_state(state);
        }

        free_gibbs_state(state);
    }
}

/*
 * Entry point for the "heldout" sub-command.
 *
 * ac, av: the process argument vector. Exactly six entries are
 * required: av[1] is the sub-command name, av[2] the training corpus,
 * av[3] the test corpus, av[4] the settings file and av[5] the output
 * directory.
 *
 * Builds a state on the training corpus, reads the test corpus, then
 * iterates MAX_ITER times. Whenever state->iter is a multiple of
 * TEST_LAG it appends one line (iteration, held-out score, topic count)
 * to <run_dir>/test.dat.
 *
 * Fixes relative to the previous version:
 *  - the argument count is checked explicitly (assert() vanishes under
 *    -DNDEBUG);
 *  - the log path is built in a buffer sized from run_dir instead of a
 *    fixed char[100] written with sprintf(), which overflowed on long
 *    directories;
 *  - a failed fopen() is reported instead of passing NULL to fprintf();
 *  - the state is released with free_gibbs_state(), as main_gibbs does.
 *
 * Any failure prints a message to stderr and exits with EXIT_FAILURE.
 */
void main_heldout(int ac, char* av[])
{
    /* arguments of mean_heldout_score(), named after its prototype */
    const int heldout_burn = 200;
    const int heldout_lag = 1;
    const int heldout_niter = 1000;
    static const char log_name[] = "/test.dat";

    if (ac != 6)
    {
        fprintf(stderr, "main_heldout: expected 6 arguments, got %d\n", ac);
        exit(EXIT_FAILURE);
    }

    char* train = av[2];
    char* test = av[3];
    char* settings = av[4];
    char* out_dir = av[5];

    gibbs_state* state = new_gibbs_state(train, settings, out_dir);
    initialize_gibbs_state(state);
    corpus* heldout_corp = corpus_new(state->corp->gem_mean,
                                      state->corp->gem_scale);
    read_corpus(test, heldout_corp, state->tr->depth);

    /* sizeof(log_name) already counts the terminating NUL */
    size_t filename_len = strlen(state->run_dir) + sizeof(log_name);
    char* filename = malloc(filename_len);
    if (filename == NULL)
    {
        fprintf(stderr, "main_heldout: out of memory\n");
        exit(EXIT_FAILURE);
    }
    snprintf(filename, filename_len, "%s%s", state->run_dir, log_name);

    FILE* test_log = fopen(filename, "w");
    if (test_log == NULL)
    {
        perror(filename);
        free(filename);
        exit(EXIT_FAILURE);
    }
    free(filename);

    int iter;
    for (iter = 0; iter < MAX_ITER; iter++)
    {
        iterate_gibbs_state(state);
        if ((state->iter % TEST_LAG) == 0)
        {
            double score = mean_heldout_score(heldout_corp, state,
                                              heldout_burn,
                                              heldout_lag,
                                              heldout_niter);
            fprintf(test_log, "%04d %10.3f %d\n",
                    state->iter, score, ntopics_in_tree(state->tr));
            /* flush so the log is usable while the run is in progress */
            fflush(test_log);
        }
    }

    if (fclose(test_log) != 0)
    {
        perror("main_heldout: closing test.dat");
    }
    free_gibbs_state(state);
}


/*
 * Program entry point: dispatches on av[1].
 *
 *   "gibbs"   -> main_gibbs(ac, av)
 *   "heldout" -> main_heldout(ac, av)
 *
 * Returns 0 after a recognized sub-command has run. With no
 * sub-command, or an unrecognized one, prints the usage text and
 * returns EXIT_FAILURE, so that scripts can detect a bad invocation
 * (the previous version returned 0 on this path as well).
 */
int main(int ac, char* av[])
{
    if (ac > 1)
    {
        if (strcmp(av[1], "gibbs") == 0)
        {
            main_gibbs(ac, av);
            return(0);
        }
        else if (strcmp(av[1], "heldout") == 0)
        {
            main_heldout(ac, av);
            return(0);
        }
    }

    /* fall-through: missing or unknown sub-command */
    outlog("USAGE: ./main gibbs corpus settings out");
    outlog(" ./main heldout train test settings out");
    return(EXIT_FAILURE);
}
91 changes: 91 additions & 0 deletions hlda/.#main.c.1.46
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#include "utils.h"
#include "typedefs.h"
#include "doc.h"
#include "topic.h"
#include "gibbs.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>

#define MAX_ITER 10000
#define TEST_LAG 100
#define NRESTARTS 1

// simple gibbs sampling on a data set

/*
 * Entry point for the "gibbs" sub-command.
 *
 * ac, av: the process argument vector. Exactly five entries are
 * required: av[1] is the sub-command name, av[2] the corpus, av[3] the
 * settings file and av[4] the output directory.
 *
 * Performs NRESTARTS runs. Each run obtains a state from
 * init_gibbs_state_w_rep(), calls iterate_gibbs_state() MAX_ITER times,
 * then frees the state.
 *
 * The argument count is checked explicitly instead of with assert():
 * assert() is compiled out under -DNDEBUG, which would let a short
 * command line read past the end of av[]. On a bad count the function
 * prints a message to stderr and exits with EXIT_FAILURE.
 */
void main_gibbs(int ac, char* av[])
{
    if (ac != 5)
    {
        fprintf(stderr, "main_gibbs: expected 5 arguments, got %d\n", ac);
        exit(EXIT_FAILURE);
    }

    char* corpus = av[2];
    char* settings = av[3];
    char* out_dir = av[4];

    int restart;
    for (restart = 0; restart < NRESTARTS; restart++)
    {
        gibbs_state* state =
            init_gibbs_state_w_rep(corpus, settings, out_dir);

        int iter;
        for (iter = 0; iter < MAX_ITER; iter++)
        {
            iterate_gibbs_state(state);
        }

        free_gibbs_state(state);
    }
}

/*
 * Entry point for the "heldout" sub-command.
 *
 * ac, av: the process argument vector. Exactly six entries are
 * required: av[1] is the sub-command name, av[2] the training corpus,
 * av[3] the test corpus, av[4] the settings file and av[5] the output
 * directory.
 *
 * Obtains a state for the training corpus from
 * init_gibbs_state_w_rep(), reads the test corpus, then iterates
 * MAX_ITER times. Whenever state->iter is a multiple of TEST_LAG it
 * appends one line (iteration, held-out score, topic count) to
 * <run_dir>/test.dat.
 *
 * Fixes relative to the previous version:
 *  - the argument count is checked explicitly (assert() vanishes under
 *    -DNDEBUG);
 *  - the log path is built in a buffer sized from run_dir instead of a
 *    fixed char[100] written with sprintf(), which overflowed on long
 *    directories;
 *  - a failed fopen() is reported instead of passing NULL to fprintf();
 *  - the state is released with free_gibbs_state(), as main_gibbs does.
 *
 * Any failure prints a message to stderr and exits with EXIT_FAILURE.
 */
void main_heldout(int ac, char* av[])
{
    /* arguments of mean_heldout_score(), named after its prototype */
    const int heldout_burn = 200;
    const int heldout_lag = 1;
    const int heldout_niter = 1000;
    static const char log_name[] = "/test.dat";

    if (ac != 6)
    {
        fprintf(stderr, "main_heldout: expected 6 arguments, got %d\n", ac);
        exit(EXIT_FAILURE);
    }

    char* train = av[2];
    char* test = av[3];
    char* settings = av[4];
    char* out_dir = av[5];

    gibbs_state* state = init_gibbs_state_w_rep(train, settings, out_dir);
    corpus* heldout_corp = corpus_new(state->corp->gem_mean,
                                      state->corp->gem_scale);
    read_corpus(test, heldout_corp, state->tr->depth);

    /* sizeof(log_name) already counts the terminating NUL */
    size_t filename_len = strlen(state->run_dir) + sizeof(log_name);
    char* filename = malloc(filename_len);
    if (filename == NULL)
    {
        fprintf(stderr, "main_heldout: out of memory\n");
        exit(EXIT_FAILURE);
    }
    snprintf(filename, filename_len, "%s%s", state->run_dir, log_name);

    FILE* test_log = fopen(filename, "w");
    if (test_log == NULL)
    {
        perror(filename);
        free(filename);
        exit(EXIT_FAILURE);
    }
    free(filename);

    int iter;
    for (iter = 0; iter < MAX_ITER; iter++)
    {
        iterate_gibbs_state(state);
        if ((state->iter % TEST_LAG) == 0)
        {
            double score = mean_heldout_score(heldout_corp, state,
                                              heldout_burn,
                                              heldout_lag,
                                              heldout_niter);
            fprintf(test_log, "%04d %10.3f %d\n",
                    state->iter, score, ntopics_in_tree(state->tr));
            /* flush so the log is usable while the run is in progress */
            fflush(test_log);
        }
    }

    if (fclose(test_log) != 0)
    {
        perror("main_heldout: closing test.dat");
    }
    free_gibbs_state(state);
}


/*
 * Program entry point: dispatches on av[1].
 *
 *   "gibbs"   -> main_gibbs(ac, av)
 *   "heldout" -> main_heldout(ac, av)
 *
 * Returns 0 after a recognized sub-command has run. With no
 * sub-command, or an unrecognized one, prints the usage text and
 * returns EXIT_FAILURE, so that scripts can detect a bad invocation
 * (the previous version returned 0 on this path as well).
 */
int main(int ac, char* av[])
{
    if (ac > 1)
    {
        if (strcmp(av[1], "gibbs") == 0)
        {
            main_gibbs(ac, av);
            return(0);
        }
        else if (strcmp(av[1], "heldout") == 0)
        {
            main_heldout(ac, av);
            return(0);
        }
    }

    /* fall-through: missing or unknown sub-command */
    outlog("USAGE: ./main gibbs corpus settings out");
    outlog(" ./main heldout train test settings out");
    return(EXIT_FAILURE);
}
18 changes: 18 additions & 0 deletions hlda/.Rhistory
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
scan("/Users/blei/3.Current/DP-nested/src/full-jacm/run012/score.log")
foo <- scan("/Users/blei/3.Current/DP-nested/src/full-jacm/run012/score.log")
foo[,4]
foo <- read.table("/Users/blei/3.Current/DP-nested/src/full-jacm/run012/score.log")
foo[,4]
foo[,5]
foo[,6]
plot(foo[,5], type="b")
plot(foo[,7], type="b")
plot(foo[,8], type="b")
plot(foo[,1], type="b")
plot(foo[,2], type="b")
plot(foo[,2], type="l")
plot(foo[,3], type="l")
plot(foo[,4], type="l")
plot(foo[,5], type="l")
quit()
n
6 changes: 6 additions & 0 deletions hlda/.gdb_history
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
run ~/data/jacm/current-jacm/jacm-mult/jacm.dat settings.txt ~/SANDBOX/FOO
up
up
up
run
quit
29 changes: 29 additions & 0 deletions hlda/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Build file for the hLDA sampler. Every link target below writes the
# same output file, "main"; they differ only in the flags and libraries
# passed on the link line.
.SUFFIXES: .c .u
CC= gcc
# CFLAGS_MAC = -g -Wall -O3 -DHAVE_INLINE -DGSL_RANGE_CHECK_OFF -Winline -funroll-loops -fstrict-aliasing -fsched-interblock -falign-loops=16 -falign-jumps=16 -falign-functions=16 -falign-jumps-max-skip=15 -falign-loops-max-skip=15 -malign-natural -ffast-math -mdynamic-no-pic -mpowerpc-gpopt -force_cpusubtype_ALL -fstrict-aliasing -mcpu=7450 -faltivec
CFLAGS_MAC = -g -Wall -O3 -DHAVE_INLINE -DGSL_RANGE_CHECK_OFF -Winline -fast -I/opt/local/include/gsl
CFLAGS_PTON = -g -Wall -O3 -DHAVE_INLINE=1 -DGSL_RANGE_CHECK_OFF=1
CFLAGS_DEBUG = -g -Wall
# NOTE(review): no explicit .c -> .o rule is visible here, so the objects
# are presumably built by make's built-in rule, which uses CFLAGS. The
# CFLAGS_* variants above are only referenced on the link lines below.
CFLAGS = -g -Wall -I/opt/local/include/gsl/ -I/usr/include/sys/ -I/usr/include/

# Link flags, one set per build host; all of them link against GSL.
# MAC_LDFLAGS = -lgsl -latlas -lcblas -L/sw/li
MAC_LDFLAGS = -lgsl -lgslcblas -L/opt/local/lib
C2_LDFLAGS = -lgsl -lcblas -latlas
CYCLES_LDFLAGS = -lgsl -lgslcblas
# LSOURCE is not referenced by any rule visible in this file.
LSOURCE = utils.c topic.c doc.c hyperparameter.c main.c gibbs.c
LOBJECTS = utils.o topic.o doc.o hyperparameter.o main.o gibbs.o

# Default target: link with CFLAGS_MAC and MAC_LDFLAGS.
main: $(LOBJECTS)
$(CC) $(CFLAGS_MAC) $(LOBJECTS) -o main $(MAC_LDFLAGS)

# Link with CFLAGS_PTON and C2_LDFLAGS (ATLAS cblas).
c2: $(LOBJECTS)
$(CC) $(CFLAGS_PTON) $(LOBJECTS) -o main $(C2_LDFLAGS)

# Link with CFLAGS_PTON and CYCLES_LDFLAGS (GSL cblas).
cycles: $(LOBJECTS)
$(CC) $(CFLAGS_PTON) $(LOBJECTS) -o main $(CYCLES_LDFLAGS)

# Link with CFLAGS_DEBUG (no -O3) and MAC_LDFLAGS.
debug: $(LOBJECTS)
$(CC) $(CFLAGS_DEBUG) $(LOBJECTS) -o main $(MAC_LDFLAGS)

# Removes object files only; the "main" binary is left in place.
# The leading '-' tells make to ignore a failing rm.
clean:
-rm -f *.o
34 changes: 34 additions & 0 deletions hlda/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
This code implements hierarchical LDA with a fixed depth tree and a
stick breaking prior on the depth weights. An infinite-depth tree can
be approximated by setting the depth to be very high. This code
requires that you have installed the GSL package.

The input format of the data is the same as in the LDA-C package.
Each line contains

[# of unique terms] [term #] : [count] ...

The settings file controls various parameters of the model. There are
several settings files contained in this directory.


IMPORTANT:

I hope that this code is useful to you, but please note that this code
is UNSUPPORTED. Do not email me with questions. I like posting as
much code as possible, but I unfortunately do not have the time to
support all of it. (This paragraph is my solution to the problem.)

HLDA-C is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

HLDA-C is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.




Loading

0 comments on commit 53ec8b3

Please sign in to comment.