Skip to content

Commit 864cc46

Browse files
authored
Merge pull request #2 from biocore/in-memory-one-off
In memory one off
2 parents 22e0302 + 80aeab0 commit 864cc46

9 files changed

+578
-107
lines changed

src/Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ install: libssu.so ssu faithpd
135135
mkdir -p ${PREFIX}/include/unifrac
136136
cp task_parameters.hpp ${PREFIX}/include/unifrac/
137137
cp api.hpp ${PREFIX}/include/unifrac/
138+
cp status_enum.hpp ${PREFIX}/include/unifrac/
138139

139140
rapi_test: main
140141
mkdir -p ~/.R

src/api.cpp

+170-67
Large diffs are not rendered by default.

src/api.hpp

+85-10
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "task_parameters.hpp"
2+
#include "status_enum.hpp"
23

34
#ifdef __cplusplus
45
#include <vector>
@@ -14,10 +15,6 @@
1415
#define PARTIAL_MAGIC_V2 0x088ABA02
1516

1617

17-
typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error} ComputeStatus;
18-
typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end, write_error} IOStatus;
19-
typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus;
20-
2118
/* a result matrix
2219
*
2320
* n_samples <uint> the number of samples.
@@ -122,14 +119,49 @@ typedef struct partial_dyn_mat {
122119
char* filename;
123120
} partial_dyn_mat_t;
124121

122+
/* support structure to carry in biom table information
123+
*
124+
* obs_ids <char**> the observation IDs
125+
* sample_ids <char**> the sample IDs
126+
* indices <uint32_t*> the indices of the data values
127+
* indptr <uint32_t*> the row offset of the data values
128+
* data <double*> the actual matrix values
129+
* n_obs <int> the number of observations, corresponding to length of obs_ids
130+
* n_samples <int> the number of samples, corresponding to the length of sample_ids
131+
* nnz <int> the number of nonzero values, corresponding to the length of data and indices
132+
*/
133+
typedef struct support_biom {
134+
char** obs_ids;
135+
char** sample_ids;
136+
uint32_t* indices;
137+
uint32_t* indptr;
138+
double* data;
139+
int n_obs;
140+
int n_samples;
141+
int nnz;
142+
} support_biom_t;
143+
144+
/* support structure to carry in bptree information
145+
*
146+
* structure <bool*> the topology of the tree
147+
* lengths <double*> the branch lengths of the tree
148+
* names <char**> the names of the tips and internal nodes of the tree
149+
* n_parens <int> the length of the structure array
150+
*/
151+
typedef struct support_bptree {
152+
bool* structure;
153+
double* lengths;
154+
char** names;
155+
int n_parens;
156+
} support_bptree_t;
125157

126158

127-
void destroy_mat(mat_t** result);
128-
void destroy_mat_full_fp64(mat_full_fp64_t** result);
129-
void destroy_mat_full_fp32(mat_full_fp32_t** result);
130-
void destroy_partial_mat(partial_mat_t** result);
131-
void destroy_partial_dyn_mat(partial_dyn_mat_t** result);
132-
void destroy_results_vec(r_vec** result);
159+
EXTERN void destroy_mat(mat_t** result);
160+
EXTERN void destroy_mat_full_fp64(mat_full_fp64_t** result);
161+
EXTERN void destroy_mat_full_fp32(mat_full_fp32_t** result);
162+
EXTERN void destroy_partial_mat(partial_mat_t** result);
163+
EXTERN void destroy_partial_dyn_mat(partial_dyn_mat_t** result);
164+
EXTERN void destroy_results_vec(r_vec** result);
133165

134166
/* Compute UniFrac - condensed form
135167
*
@@ -154,6 +186,49 @@ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filenam
154186
const char* unifrac_method, bool variance_adjust, double alpha,
155187
bool bypass_tips, unsigned int threads, mat_t** result);
156188

189+
190+
/* Compute UniFrac - against in-memory objects returning full form matrix
191+
*
192+
* table_data <const support_biom_t*> the in-memory biom table information
193+
* tree_data <const support_bptree_t*> the in-memory bptree information
194+
* unifrac_method <const char*> the requested unifrac method.
195+
* variance_adjust <bool> whether to apply variance adjustment.
196+
* alpha <double> GUniFrac alpha, only relevant if method == generalized.
197+
* bypass_tips <bool> disregard tips, reduces compute by about 50%
198+
* threads <uint> the number of threads to use.
199+
* result <mat_full_fp64_t**> the resulting distance matrix in full form, this is initialized within the method so using **
200+
*
201+
* one_off_inmem returns the following error codes:
202+
*
203+
* okay : no problems encountered
204+
* unknown_method : the requested method is unknown.
205+
* table_empty : the table does not have any entries
206+
*/
207+
EXTERN ComputeStatus one_off_inmem(const support_biom_t *table_data, const support_bptree_t *tree_data,
208+
const char* unifrac_method, bool variance_adjust, double alpha,
209+
bool bypass_tips, unsigned int threads, mat_full_fp64_t** result);
210+
211+
/* Compute UniFrac - against in-memory objects returning full form matrix, fp32
212+
*
213+
* table_data <const support_biom_t*> the in-memory biom table information
214+
* tree_data <const support_bptree_t*> the in-memory bptree information
215+
* unifrac_method <const char*> the requested unifrac method.
216+
* variance_adjust <bool> whether to apply variance adjustment.
217+
* alpha <double> GUniFrac alpha, only relevant if method == generalized.
218+
* bypass_tips <bool> disregard tips, reduces compute by about 50%
219+
* threads <uint> the number of threads to use.
220+
* result <mat_full_fp32_t**> the resulting distance matrix in full form, this is initialized within the method so using **
221+
*
222+
* one_off_inmem_fp32 returns the following error codes:
223+
*
224+
* okay : no problems encountered
225+
* unknown_method : the requested method is unknown.
226+
* table_empty : the table does not have any entries
227+
*/
228+
EXTERN ComputeStatus one_off_inmem_fp32(const support_biom_t *table_data, const support_bptree_t *tree_data,
229+
const char* unifrac_method, bool variance_adjust, double alpha,
230+
bool bypass_tips, unsigned int threads, mat_full_fp32_t** result);
231+
157232
/* Compute UniFrac - matrix form
158233
*
159234
* biom_filename <const char*> the filename to the biom table.

src/biom.cpp

+140-21
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ const std::string SAMPLE_INDICES = std::string("/sample/matrix/indices");
2626
const std::string SAMPLE_DATA = std::string("/sample/matrix/data");
2727
const std::string SAMPLE_IDS = std::string("/sample/ids");
2828

29-
biom::biom(std::string filename) {
29+
biom::biom(std::string filename) : has_hdf5_backing(true) {
3030
file = H5File(filename.c_str(), H5F_ACC_RDONLY);
3131

3232
/* establish the datasets */
@@ -55,9 +55,51 @@ biom::biom(std::string filename) {
5555
obs_id_index = std::unordered_map<std::string, uint32_t>();
5656
sample_id_index = std::unordered_map<std::string, uint32_t>();
5757

58-
create_id_index(obs_ids, obs_id_index);
59-
create_id_index(sample_ids, sample_id_index);
58+
#pragma omp parallel for schedule(static)
59+
for(int i = 0; i < 3; i++) {
60+
if(i == 0)
61+
create_id_index(obs_ids, obs_id_index);
62+
else if(i == 1)
63+
create_id_index(sample_ids, sample_id_index);
64+
else if(i == 2)
65+
malloc_resident(n_obs);
66+
}
67+
68+
uint32_t *current_indices = NULL;
69+
double *current_data = NULL;
70+
for(unsigned int i = 0; i < obs_ids.size(); i++) {
71+
std::string id_ = obs_ids[i];
72+
unsigned int n = get_obs_data_direct(id_, current_indices, current_data);
73+
obs_counts_resident[i] = n;
74+
obs_indices_resident[i] = current_indices;
75+
obs_data_resident[i] = current_data;
76+
}
77+
sample_counts = get_sample_counts();
78+
}
79+
80+
/* Release the memory owned by this table.
 *
 * Every constructor calls malloc_resident(), so the three outer resident
 * arrays (obs_indices_resident, obs_data_resident, obs_counts_resident) are
 * owned by the object regardless of how it was built and are always
 * released here.
 *
 * The per-observation buffers those arrays point at are only owned when the
 * table was loaded from HDF5. Without HDF5 backing the entries point into
 * arrays borrowed from the caller; releasing those remains the
 * responsibility of the entity that constructed this object.
 */
biom::~biom() {
    if(has_hdf5_backing && obs_indices_resident != NULL && obs_data_resident != NULL) {
        for(unsigned int i = 0; i < n_obs; i++) {
            // free(NULL) is a no-op, so no per-entry NULL checks are needed
            free(obs_indices_resident[i]);
            free(obs_data_resident[i]);
        }
    }

    // the outer arrays come from malloc_resident() in every construction
    // path; previously they were leaked when there was no HDF5 backing
    free(obs_indices_resident);
    free(obs_data_resident);
    free(obs_counts_resident);
}
60101

102+
void biom::malloc_resident(uint32_t n_obs) {
61103
/* load obs sparse data */
62104
obs_indices_resident = (uint32_t**)malloc(sizeof(uint32_t**) * n_obs);
63105
if(obs_indices_resident == NULL) {
@@ -77,30 +119,82 @@ biom::biom(std::string filename) {
77119
sizeof(unsigned int) * n_obs, __FILE__, __LINE__);
78120
exit(EXIT_FAILURE);
79121
}
122+
}
80123

81-
uint32_t *current_indices = NULL;
82-
double *current_data = NULL;
83-
for(unsigned int i = 0; i < obs_ids.size(); i++) {
84-
std::string id_ = obs_ids[i];
85-
unsigned int n = get_obs_data_direct(id_, current_indices, current_data);
86-
obs_counts_resident[i] = n;
87-
obs_indices_resident[i] = current_indices;
88-
obs_data_resident[i] = current_data;
89-
}
90-
sample_counts = get_sample_counts();
124+
/* Construct an empty table that is not backed by an HDF5 file.
 *
 * Only n_obs and the resident arrays are set up here; n_samples, nnz and
 * sample_counts are not assigned by this constructor.
 */
biom::biom() : has_hdf5_backing(false) {
    this->n_obs = 0;

    // size the resident arrays for zero observations
    malloc_resident(0);
}
92128

93-
biom::~biom() {
94-
for(unsigned int i = 0; i < n_obs; i++) {
95-
free(obs_indices_resident[i]);
96-
free(obs_data_resident[i]);
129+
// not using const on indices/indptr/data as the pointers are being borrowed
130+
biom::biom(char** obs_ids_in,
131+
char** samp_ids_in,
132+
uint32_t* indices,
133+
uint32_t* indptr,
134+
double* data,
135+
const int n_obs,
136+
const int n_samples,
137+
const int nnz) : has_hdf5_backing(false) {
138+
139+
this->nnz = nnz;
140+
this->n_samples = n_samples;
141+
this->n_obs = n_obs;
142+
143+
sample_ids = std::vector<std::string>();
144+
sample_ids.resize(n_samples);
145+
obs_ids = std::vector<std::string>();
146+
obs_ids.resize(n_obs);
147+
148+
#pragma omp parallel for schedule(static)
149+
for(int x = 0; x < 2; x++) {
150+
if(x == 0) {
151+
for(int i = 0; i < n_obs; i++) {
152+
obs_ids[i] = std::string(obs_ids_in[i]);
153+
}
154+
} else {
155+
for(int i = 0; i < n_samples; i++) {
156+
sample_ids[i] = std::string(samp_ids_in[i]);
157+
}
158+
}
97159
}
98-
free(obs_indices_resident);
99-
free(obs_data_resident);
100-
free(obs_counts_resident);
160+
161+
/* define a mapping between an ID and its corresponding offset */
162+
obs_id_index = std::unordered_map<std::string, uint32_t>();
163+
sample_id_index = std::unordered_map<std::string, uint32_t>();
164+
165+
#pragma omp parallel for schedule(static)
166+
for(int i = 0; i < 3; i++) {
167+
if(i == 0)
168+
create_id_index(obs_ids, obs_id_index);
169+
else if(i == 1)
170+
create_id_index(sample_ids, sample_id_index);
171+
else if(i == 2)
172+
malloc_resident(n_obs);
173+
}
174+
175+
#pragma omp parallel for schedule(static)
176+
for(unsigned int i = 0; i < n_obs; i++) {
177+
int32_t start = indptr[i];
178+
int32_t end = indptr[i + 1];
179+
unsigned int count = end - start;
180+
181+
uint32_t* index_ptr = (indices + start);
182+
double* data_ptr = (data + start);
183+
184+
obs_indices_resident[i] = index_ptr;
185+
obs_data_resident[i] = data_ptr;
186+
obs_counts_resident[i] = count;
187+
}
188+
sample_counts = get_sample_counts();
101189
}
102190

103191
void biom::set_nnz() {
192+
if(!has_hdf5_backing) {
193+
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
194+
__FILE__, __LINE__);
195+
exit(EXIT_FAILURE);
196+
}
197+
104198
// should these be cached?
105199
DataType dtype = obs_data.getDataType();
106200
DataSpace dataspace = obs_data.getSpace();
@@ -111,6 +205,12 @@ void biom::set_nnz() {
111205
}
112206

113207
void biom::load_ids(const char *path, std::vector<std::string> &ids) {
208+
if(!has_hdf5_backing) {
209+
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
210+
__FILE__, __LINE__);
211+
exit(EXIT_FAILURE);
212+
}
213+
114214
DataSet ds_ids = file.openDataSet(path);
115215
DataType dtype = ds_ids.getDataType();
116216
DataSpace dataspace = ds_ids.getSpace();
@@ -138,6 +238,12 @@ void biom::load_ids(const char *path, std::vector<std::string> &ids) {
138238
}
139239

140240
void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
241+
if(!has_hdf5_backing) {
242+
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
243+
__FILE__, __LINE__);
244+
exit(EXIT_FAILURE);
245+
}
246+
141247
DataSet ds = file.openDataSet(path);
142248
DataType dtype = ds.getDataType();
143249
DataSpace dataspace = ds.getSpace();
@@ -159,7 +265,7 @@ void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
159265
free(dataout);
160266
}
161267

162-
void biom::create_id_index(std::vector<std::string> &ids,
268+
void biom::create_id_index(const std::vector<std::string> &ids,
163269
std::unordered_map<std::string, uint32_t> &map) {
164270
uint32_t count = 0;
165271
map.reserve(ids.size());
@@ -169,6 +275,12 @@ void biom::create_id_index(std::vector<std::string> &ids,
169275
}
170276

171277
unsigned int biom::get_obs_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
278+
if(!has_hdf5_backing) {
279+
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
280+
__FILE__, __LINE__);
281+
exit(EXIT_FAILURE);
282+
}
283+
172284
uint32_t idx = obs_id_index.at(id);
173285
uint32_t start = obs_indptr[idx];
174286
uint32_t end = obs_indptr[idx + 1];
@@ -270,6 +382,12 @@ void biom::get_obs_data_range(const std::string &id, unsigned int start, unsigne
270382
}
271383

272384
unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
385+
if(!has_hdf5_backing) {
386+
fprintf(stderr, "Lacks HDF5 backing; [%s]:%d\n",
387+
__FILE__, __LINE__);
388+
exit(EXIT_FAILURE);
389+
}
390+
273391
uint32_t idx = sample_id_index.at(id);
274392
uint32_t start = sample_indptr[idx];
275393
uint32_t end = sample_indptr[idx + 1];
@@ -310,6 +428,7 @@ unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& cur
310428

311429
double* biom::get_sample_counts() {
312430
double *sample_counts = (double*)calloc(sizeof(double), n_samples);
431+
313432
for(unsigned int i = 0; i < n_obs; i++) {
314433
unsigned int count = obs_counts_resident[i];
315434
uint32_t *indices = obs_indices_resident[i];

0 commit comments

Comments
 (0)