Skip to content

Commit

Permalink
feat(unique-id): ignore missing files
Browse files Browse the repository at this point in the history
  • Loading branch information
paulineribeyre committed Sep 26, 2018
1 parent ee37322 commit a90ced8
Showing 1 changed file with 36 additions and 23 deletions.
59 changes: 36 additions & 23 deletions peregrine/utils/pybdbag.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,13 @@ def create_bdbag(bag_info, payload, max_row=1000):
for node_name, json_data in payload.iteritems():
header_set = set()
data_file_headers = set()
has_file_header = False
for dict_row in json_data:
for key in dict_row.keys():
if (dict_row[key] is not None and dict_row[key] != []):
header_set.update([key])
if 'file' in key:
has_file_header = True
words = key.split('-')
if len(words) > 1 and is_category(words[-2], data_files):
data_file_headers.update([key])
Expand All @@ -82,31 +85,41 @@ def create_bdbag(bag_info, payload, max_row=1000):

with open(bag_path + '/data/' + node_name + '.tsv', 'w') as tsvfile:
writer = csv.writer(tsvfile, delimiter='\t')
row = []
for h in header_set:
words = h.split('-')
header = words[-1]
row = row + [header]
if header[0] == '_':
unique_id_header = 'entity:' + header[1:]
row.insert(0, unique_id_header)
writer.writerow(row)

nrow = 0
for dict_row in json_data:
row = [str(uuid.uuid4())] # unique id

# if there is no 'file' header, all the rows are missing files
# so we would ignore all the rows
if has_file_header:
row = []
for h in header_set:
if dict_row.get(h):
value = dict_row[h]
if 'file_dos' in h:
value = 'dos://' + value
row = row + [value]
else:
row = row + ["None"]
nrow = nrow + 1
words = h.split('-')
header = words[-1]
row = row + [header]
if header[0] == '_':
unique_id_header = 'entity:' + header[1:]
row.insert(0, unique_id_header)
writer.writerow(row)
if nrow >= max_row:
break

nrow = 0
for dict_row in json_data:
row = [str(uuid.uuid4())] # unique id
add_row = True
for h in header_set:
if dict_row.get(h):
value = dict_row[h]
if 'file_dos' in h:
value = 'dos://' + value
row = row + [value]
                        elif 'file' in h:
# ignoring missing file rows
add_row = False
break
else:
row = row + ["None"]
if add_row:
nrow = nrow + 1
writer.writerow(row)
if nrow >= max_row:
break

with open(bag_path + '/fetch.txt', 'w') as fetch_file:
for item in data_file_uuids:
Expand Down

0 comments on commit a90ced8

Please sign in to comment.