@@ -26,7 +26,7 @@ const std::string SAMPLE_INDICES = std::string("/sample/matrix/indices");
26
26
const std::string SAMPLE_DATA = std::string(" /sample/matrix/data" );
27
27
const std::string SAMPLE_IDS = std::string(" /sample/ids" );
28
28
29
- biom::biom (std::string filename) {
29
+ biom::biom (std::string filename) : has_hdf5_backing( true ) {
30
30
file = H5File (filename.c_str (), H5F_ACC_RDONLY);
31
31
32
32
/* establish the datasets */
@@ -55,9 +55,51 @@ biom::biom(std::string filename) {
55
55
obs_id_index = std::unordered_map<std::string, uint32_t >();
56
56
sample_id_index = std::unordered_map<std::string, uint32_t >();
57
57
58
- create_id_index (obs_ids, obs_id_index);
59
- create_id_index (sample_ids, sample_id_index);
58
+ #pragma omp parallel for schedule(static)
59
+ for (int i = 0 ; i < 3 ; i++) {
60
+ if (i == 0 )
61
+ create_id_index (obs_ids, obs_id_index);
62
+ else if (i == 1 )
63
+ create_id_index (sample_ids, sample_id_index);
64
+ else if (i == 2 )
65
+ malloc_resident (n_obs);
66
+ }
67
+
68
+ uint32_t *current_indices = NULL ;
69
+ double *current_data = NULL ;
70
+ for (unsigned int i = 0 ; i < obs_ids.size (); i++) {
71
+ std::string id_ = obs_ids[i];
72
+ unsigned int n = get_obs_data_direct (id_, current_indices, current_data);
73
+ obs_counts_resident[i] = n;
74
+ obs_indices_resident[i] = current_indices;
75
+ obs_data_resident[i] = current_data;
76
+ }
77
+ sample_counts = get_sample_counts ();
78
+ }
79
+
80
+ biom::~biom () {
81
+ if (has_hdf5_backing) {
82
+ if (obs_indices_resident != NULL && obs_data_resident != NULL ) {
83
+ for (unsigned int i = 0 ; i < n_obs; i++) {
84
+ if (obs_indices_resident[i] != NULL )
85
+ free (obs_indices_resident[i]);
86
+ if (obs_data_resident[i] != NULL )
87
+ free (obs_data_resident[i]);
88
+ }
89
+ }
90
+
91
+ if (obs_indices_resident != NULL )
92
+ free (obs_indices_resident);
93
+ if (obs_data_resident != NULL )
94
+ free (obs_data_resident);
95
+ if (obs_counts_resident != NULL )
96
+ free (obs_counts_resident);
97
+ }
98
+ // else, it is the responsibility of the entity constructing this object
99
+ // to clean itself up
100
+ }
60
101
102
+ void biom::malloc_resident (uint32_t n_obs) {
61
103
/* load obs sparse data */
62
104
obs_indices_resident = (uint32_t **)malloc (sizeof (uint32_t **) * n_obs);
63
105
if (obs_indices_resident == NULL ) {
@@ -77,30 +119,82 @@ biom::biom(std::string filename) {
77
119
sizeof (unsigned int ) * n_obs, __FILE__, __LINE__);
78
120
exit (EXIT_FAILURE);
79
121
}
122
+ }
80
123
81
- uint32_t *current_indices = NULL ;
82
- double *current_data = NULL ;
83
- for (unsigned int i = 0 ; i < obs_ids.size (); i++) {
84
- std::string id_ = obs_ids[i];
85
- unsigned int n = get_obs_data_direct (id_, current_indices, current_data);
86
- obs_counts_resident[i] = n;
87
- obs_indices_resident[i] = current_indices;
88
- obs_data_resident[i] = current_data;
89
- }
90
- sample_counts = get_sample_counts ();
124
+ biom::biom () : has_hdf5_backing(false ) {
125
+ n_obs = 0 ;
126
+ malloc_resident (0 );
91
127
}
92
128
93
- biom::~biom () {
94
- for (unsigned int i = 0 ; i < n_obs; i++) {
95
- free (obs_indices_resident[i]);
96
- free (obs_data_resident[i]);
129
+ // not using const on indices/indptr/data as the pointers are being borrowed
130
+ biom::biom (char ** obs_ids_in,
131
+ char ** samp_ids_in,
132
+ uint32_t * indices,
133
+ uint32_t * indptr,
134
+ double * data,
135
+ const int n_obs,
136
+ const int n_samples,
137
+ const int nnz) : has_hdf5_backing(false ) {
138
+
139
+ this ->nnz = nnz;
140
+ this ->n_samples = n_samples;
141
+ this ->n_obs = n_obs;
142
+
143
+ sample_ids = std::vector<std::string>();
144
+ sample_ids.resize (n_samples);
145
+ obs_ids = std::vector<std::string>();
146
+ obs_ids.resize (n_obs);
147
+
148
+ #pragma omp parallel for schedule(static)
149
+ for (int x = 0 ; x < 2 ; x++) {
150
+ if (x == 0 ) {
151
+ for (int i = 0 ; i < n_obs; i++) {
152
+ obs_ids[i] = std::string (obs_ids_in[i]);
153
+ }
154
+ } else {
155
+ for (int i = 0 ; i < n_samples; i++) {
156
+ sample_ids[i] = std::string (samp_ids_in[i]);
157
+ }
158
+ }
97
159
}
98
- free (obs_indices_resident);
99
- free (obs_data_resident);
100
- free (obs_counts_resident);
160
+
161
+ /* define a mapping between an ID and its corresponding offset */
162
+ obs_id_index = std::unordered_map<std::string, uint32_t >();
163
+ sample_id_index = std::unordered_map<std::string, uint32_t >();
164
+
165
+ #pragma omp parallel for schedule(static)
166
+ for (int i = 0 ; i < 3 ; i++) {
167
+ if (i == 0 )
168
+ create_id_index (obs_ids, obs_id_index);
169
+ else if (i == 1 )
170
+ create_id_index (sample_ids, sample_id_index);
171
+ else if (i == 2 )
172
+ malloc_resident (n_obs);
173
+ }
174
+
175
+ #pragma omp parallel for schedule(static)
176
+ for (unsigned int i = 0 ; i < n_obs; i++) {
177
+ int32_t start = indptr[i];
178
+ int32_t end = indptr[i + 1 ];
179
+ unsigned int count = end - start;
180
+
181
+ uint32_t * index_ptr = (indices + start);
182
+ double * data_ptr = (data + start);
183
+
184
+ obs_indices_resident[i] = index_ptr;
185
+ obs_data_resident[i] = data_ptr;
186
+ obs_counts_resident[i] = count;
187
+ }
188
+ sample_counts = get_sample_counts ();
101
189
}
102
190
103
191
void biom::set_nnz () {
192
+ if (!has_hdf5_backing) {
193
+ fprintf (stderr, " Lacks HDF5 backing; [%s]:%d\n " ,
194
+ __FILE__, __LINE__);
195
+ exit (EXIT_FAILURE);
196
+ }
197
+
104
198
// should these be cached?
105
199
DataType dtype = obs_data.getDataType ();
106
200
DataSpace dataspace = obs_data.getSpace ();
@@ -111,6 +205,12 @@ void biom::set_nnz() {
111
205
}
112
206
113
207
void biom::load_ids (const char *path, std::vector<std::string> &ids) {
208
+ if (!has_hdf5_backing) {
209
+ fprintf (stderr, " Lacks HDF5 backing; [%s]:%d\n " ,
210
+ __FILE__, __LINE__);
211
+ exit (EXIT_FAILURE);
212
+ }
213
+
114
214
DataSet ds_ids = file.openDataSet (path);
115
215
DataType dtype = ds_ids.getDataType ();
116
216
DataSpace dataspace = ds_ids.getSpace ();
@@ -138,6 +238,12 @@ void biom::load_ids(const char *path, std::vector<std::string> &ids) {
138
238
}
139
239
140
240
void biom::load_indptr (const char *path, std::vector<uint32_t > &indptr) {
241
+ if (!has_hdf5_backing) {
242
+ fprintf (stderr, " Lacks HDF5 backing; [%s]:%d\n " ,
243
+ __FILE__, __LINE__);
244
+ exit (EXIT_FAILURE);
245
+ }
246
+
141
247
DataSet ds = file.openDataSet (path);
142
248
DataType dtype = ds.getDataType ();
143
249
DataSpace dataspace = ds.getSpace ();
@@ -159,7 +265,7 @@ void biom::load_indptr(const char *path, std::vector<uint32_t> &indptr) {
159
265
free (dataout);
160
266
}
161
267
162
- void biom::create_id_index (std::vector<std::string> &ids,
268
+ void biom::create_id_index (const std::vector<std::string> &ids,
163
269
std::unordered_map<std::string, uint32_t > &map) {
164
270
uint32_t count = 0 ;
165
271
map.reserve (ids.size ());
@@ -169,6 +275,12 @@ void biom::create_id_index(std::vector<std::string> &ids,
169
275
}
170
276
171
277
unsigned int biom::get_obs_data_direct (const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
278
+ if (!has_hdf5_backing) {
279
+ fprintf (stderr, " Lacks HDF5 backing; [%s]:%d\n " ,
280
+ __FILE__, __LINE__);
281
+ exit (EXIT_FAILURE);
282
+ }
283
+
172
284
uint32_t idx = obs_id_index.at (id);
173
285
uint32_t start = obs_indptr[idx];
174
286
uint32_t end = obs_indptr[idx + 1 ];
@@ -270,6 +382,12 @@ void biom::get_obs_data_range(const std::string &id, unsigned int start, unsigne
270
382
}
271
383
272
384
unsigned int biom::get_sample_data_direct (const std::string &id, uint32_t *& current_indices_out, double *& current_data_out) {
385
+ if (!has_hdf5_backing) {
386
+ fprintf (stderr, " Lacks HDF5 backing; [%s]:%d\n " ,
387
+ __FILE__, __LINE__);
388
+ exit (EXIT_FAILURE);
389
+ }
390
+
273
391
uint32_t idx = sample_id_index.at (id);
274
392
uint32_t start = sample_indptr[idx];
275
393
uint32_t end = sample_indptr[idx + 1 ];
@@ -310,6 +428,7 @@ unsigned int biom::get_sample_data_direct(const std::string &id, uint32_t *& cur
310
428
311
429
double * biom::get_sample_counts () {
312
430
double *sample_counts = (double *)calloc (sizeof (double ), n_samples);
431
+
313
432
for (unsigned int i = 0 ; i < n_obs; i++) {
314
433
unsigned int count = obs_counts_resident[i];
315
434
uint32_t *indices = obs_indices_resident[i];
0 commit comments