#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Purpose
-------
This module is used by the Chewie-NS to generate compressed
versions of schemas.
It has two execution modes: single and global. The former
enables the compression of a single schema and the latter
enables the compression of all schemas in the Chewie-NS.
Both modes verify if there is a compressed version for each
schema and if it is necessary to update it based on the last
modification date of the schema.
Expected input
--------------
It is necessary to specify the execution mode through the
following argument:
- ``-m``, ``mode`` :
- e.g.: ``single`` or ``global``
The ``single`` mode also receives the identifier of a species
and the identifier of a schema for that species:
- ``--sp``, ``species_id`` :
- e.g.: ``1``
- ``--sc``, ``schema_id`` :
- e.g.: ``4``
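
Example
-------

A sketch of typical invocations (the Virtuoso-related
arguments default to values read from environment variables
and are omitted here)::

    python schema_compressor.py -m global
    python schema_compressor.py -m single --sp 1 --sc 4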
Code documentation
------------------
"""
import os
import sys
import time
import pickle
import shutil
import logging
import argparse
import datetime as dt
from SPARQLWrapper import SPARQLWrapper
from config import Config
from app.utils import sparql_queries as sq
from app.utils import auxiliary_functions as aux
from app.utils import PrepExternalSchema
logfile = './log_files/schema_compression.log'
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
datefmt='%Y-%m-%dT%H:%M:%S',
filename=logfile)
def change_lock(schema_uri, action, virtuoso_graph, local_sparql, virtuoso_user, virtuoso_pass):
""" Changes the locking state of a schema in the Chewie-NS.
Parameters
----------
schema_uri : str
The URI of the schema in the Chewie-NS.
action : str
'LOCKED' if the schema should be locked or
'Unlocked' if it should be unlocked.

    Returns
    -------
    True if both SPARQL updates succeed,
    a list with False and the response content
    of the failed request otherwise.
    """
del_lock_query = (sq.DELETE_SCHEMA_LOCK.format(virtuoso_graph,
schema_uri))
del_lock_result = aux.send_data(del_lock_query,
local_sparql,
virtuoso_user,
virtuoso_pass)
del_status = del_lock_result.status_code
if del_status > 201:
return [False, del_lock_result.content]
else:
# insert new locking value
add_lock_query = (sq.INSERT_SCHEMA_LOCK.format(virtuoso_graph,
schema_uri,
action))
add_lock_result = aux.send_data(add_lock_query,
local_sparql,
virtuoso_user,
virtuoso_pass)
add_status = add_lock_result.status_code
if add_status > 201:
return [False, add_lock_result.content]
else:
return True
def get_species(local_sparql, virtuoso_graph):
""" Gets the list of species in the Chewie-NS.
This function has no arguments but expects
that the SPARQL endpoint and default Virtuoso
Graph be set as OS environment variables.
Returns
-------
species_list : dict
A dictionary with species URIs as keys and species
names as values. None if species has no schemas.
"""
# get the list of species in NS
species_result = aux.get_data(SPARQLWrapper(local_sparql),
(sq.SELECT_SPECIES.format(virtuoso_graph,
' typon:name ?name. ')))
species = species_result['results']['bindings']
if len(species) == 0:
species_list = None
else:
species_list = {s['species']['value']: s['name']['value']
for s in species}
return species_list
def species_schemas(species_uri, schemas, local_sparql, virtuoso_graph):
""" Gets the list of schemas for a species.
Parameters
----------
species_uri : str
The URI of the species in the Chewie-NS.
schemas : dict
An empty dictionary to store schemas' data.

    Returns
    -------
    schemas : dict
        A dictionary with species URIs as keys and lists
        of tuples as values. Each tuple contains a schema
        URI and the name of that schema.
    """
result = aux.get_data(SPARQLWrapper(local_sparql),
(sq.SELECT_SPECIES_SCHEMAS.format(virtuoso_graph,
species_uri)))
try:
ns_schemas = result['results']['bindings']
if len(ns_schemas) > 0:
for schema in ns_schemas:
schemas.setdefault(species_uri, []).append((schema['schemas']['value'],
schema['name']['value']))
except Exception:
logging.warning('Could not retrieve schemas for '
'{0}. Exception:\n{1}'.format(species_uri, result))
return schemas
def determine_date(schema_uri, local_sparql, virtuoso_graph):
""" Gets the last modification date for a schema.
Parameters
----------
schema_uri : str
The URI of the schema in the Chewie-NS.
Returns
-------
A list with the following variables:
- last_date (str): The last modification date in
the format YYYY-MM-DDTHH:MM:SS.f.
- lock_state (str): Locking state of the schema.
- schema_info (dict): A dictionary with schema
properties values.
"""
# get schema last modification date
date_result = aux.get_data(SPARQLWrapper(local_sparql),
(sq.SELECT_SPECIES_SCHEMA.format(virtuoso_graph, schema_uri)))
schema_info = date_result['results']['bindings'][0]
lock_state = schema_info['Schema_lock']['value']
last_date = schema_info['last_modified']['value']
return [last_date, lock_state, schema_info]
def compress_determiner(schemas, species_id, sp_name, compressed_schemas,
to_compress, old_zips, local_sparql, virtuoso_graph):
""" Determines if it is necessary to generate compressed
versions of schemas.
Parameters
----------
    schemas : list
        List with one sublist per schema of the species.
        Each sublist has the URI and the name of a schema.
species_id : str
Identifier of the species in the Chewie-NS.
sp_name : str
Scientific name of the species.
compressed_schemas : list
List with all the compressed schema versions that are
currently available.
to_compress : list
A list to store data about schemas that
need to be compressed.
old_zips : dict
A dictionary to store the ZIP filenames of
the outdated compressed versions.
Returns
-------
A list with the following variables:
to_compress : list
A list with one sublist per schema that needs to
be compressed. Each sublist has the following variables:
- schema_uri (str): the URI of the schema in the Chewie-NS.
- schema_date (str): last modification date of the schema.
- schema_bsr (str): BLAST Score Ratio value used to create the
schema and perform allele calling.
- schema_ml (str): minimum sequence length value.
- schema_tt (str): genetic code used to predict and translate
coding sequences.
- schema_ptf (str): BLAKE2 hash of the Prodigal training file
associated with the schema.
- schema_st (str): sequence size variation percentage
threshold.
- chewie_version (str): version of the chewBBACA suite
used to create the schema and perform allele calling.
- sp_name (str): name of the schema's species.
- schema_prefix (str): filename prefix of the compressed
version of the schema.
- schema_name (str): name of the schema.
- schema_lock (str): locking state of the schema.
old_zips : dict
A dictionary with schema URIs as keys and
ZIP filenames of the compressed versions
that are outdated or None if there is no
compressed version.
"""
for schema in schemas:
schema_name = schema[1]
schema_uri = schema[0]
schema_id = schema_uri.split('/')[-1]
schema_prefix = '{0}_{1}'.format(species_id, schema_id)
last_date, lock_state, schema_info = determine_date(schema_uri, local_sparql, virtuoso_graph)
schema_date = last_date
schema_lock = lock_state
schema_bsr = schema_info['bsr']['value']
schema_ml = schema_info['minimum_locus_length']['value']
schema_tt = schema_info['translation_table']['value']
schema_ptf = schema_info['prodigal_training_file']['value']
schema_st = schema_info['size_threshold']['value']
chewie_version = schema_info['chewBBACA_version']['value']
# get all compressed versions that have the schema prefix
comp_schema = [f for f in compressed_schemas if f.startswith(schema_prefix)]
# there is no compressed version
if len(comp_schema) == 0:
to_compress.append([schema_uri, schema_date, schema_bsr,
schema_ml, schema_tt, schema_ptf,
schema_st, chewie_version, sp_name,
schema_prefix, schema_name, schema_lock])
            logging.info('{0} ({1}) is a novel schema to compress.'.format(schema_uri, schema_name))
old_zips[schema_uri] = None
# there is a compressed version
elif len(comp_schema) == 1:
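            # compressed versions are named <species_id>_<schema_id>_<date>.zip,
            # so the date of the existing compressed version can be parsed
            # from the filename and compared to the last modification date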
comp_date = comp_schema[0].split('_')[-1]
comp_date = comp_date.split('.zip')[0]
# check if schema has been altered since compression date
if comp_date != schema_date:
to_compress.append([schema_uri, schema_date, schema_bsr,
schema_ml, schema_tt, schema_ptf,
schema_st, chewie_version, sp_name,
schema_prefix, schema_name, schema_lock])
logging.info('{0} ({1}) compressed version is '
'outdated.'.format(schema_uri, schema_name))
old_zips[schema_uri] = comp_schema[0]
else:
logging.info('{0} ({1}) is up-to-date.'.format(schema_uri, schema_name))
elif len(comp_schema) > 1:
logging.warning('{0} ({1}) is already being compressed.'.format(schema_uri, schema_name))
return [to_compress, old_zips]
def schema_loci(schema_uri, local_sparql, virtuoso_graph):
""" Gets the list of loci for a schema.
Parameters
----------
schema_uri : str
The URI of the schema in the Chewie-NS.
Returns
-------
loci_list : list of tup
A list with tuples. Each tuple has two
elements, a locus name and a locus URI.
"""
# get loci
loci_result = aux.get_data(SPARQLWrapper(local_sparql),
(sq.SELECT_SCHEMA_LOCI.format(virtuoso_graph, schema_uri)))
# check if schema has loci
loci_list = loci_result['results']['bindings']
if loci_list != []:
loci_list = [(l['name']['value'], l['locus']['value']) for l in loci_list]
return loci_list
def fasta_sequences(locus, date, local_sparql, virtuoso_graph):
""" Get the DNA sequences of all alleles of a locus.
Parameters
----------
locus : str
The URI of the locus in the Chewie-NS.
date : str
Last modification date of the schema in
the format YYYY-MM-DDTHH:MM:SS.f.
Returns
-------
fasta_seqs : list of dict
A list with one dictionary per allele.
Each dictionary has the identifier and the DNA
sequence of an allele.
"""
# setting [SPARQL] ResultSetMaxRows = 400000 in virtuoso.ini
# is important to return all sequences at once
fasta_result = aux.get_data(SPARQLWrapper(local_sparql),
(sq.SELECT_LOCUS_FASTA_BY_DATE.format(virtuoso_graph, locus, date)))
    try:
        fasta_seqs = fasta_result['results']['bindings']
    # virtuoso may return an error, probably because the
    # sequence/request length exceeded the maximum value;
    # in that case, get each allele separately
    except Exception:
logging.warning('Could not retrieve FASTA records for locus {0}\n'
'Response content:\n{1}\nTrying to get each sequence '
'separately...\n'.format(locus, fasta_result))
# get each allele separately
result = aux.get_data(SPARQLWrapper(local_sparql),
(sq.SELECT_LOCUS_SEQS_BY_DATE.format(virtuoso_graph, locus, date)))
try:
fasta_seqs = result['results']['bindings']
if len(fasta_seqs) == 0:
logging.warning('Locus {0} has 0 sequences.'.format(locus))
return False
        except Exception:
logging.warning('Could not retrieve sequences hashes '
'for locus {0}.'.format(locus))
return False
        for s in range(len(fasta_seqs)):
            # get the sequence corresponding to each hash
            result2 = aux.get_data(SPARQLWrapper(local_sparql),
                                   (sq.SELECT_SEQ_FASTA.format(virtuoso_graph, fasta_seqs[s]['sequence']['value'])))
            fasta_seqs[s]['nucSeq'] = result2['results']['bindings'][0]['nucSeq']
return fasta_seqs
def create_fasta(loci_list, date, temp_dir, local_sparql, virtuoso_graph):
""" Creates FASTA files for the loci of a schema.
Parameters
----------
loci_list : list of tup
A list with tuples, one tuple per locus.
Each tuple has two elements: the locus name
and the locus URI.
date : str
Last modification date of the schema. The
function will get all sequences that were
inserted before this date.
temp_dir : str
The path to the directory where the FASTA files
will be created.
Returns
-------
temp_files : list
List of paths to the FASTA files that were created.
"""
# download FASTA sequences and save in temp directory
temp_files = []
for locus in loci_list:
locus_name = locus[0]
locus_uri = locus[1]
sequences = fasta_sequences(locus_uri, date, local_sparql, virtuoso_graph)
if sequences is False:
logging.warning('Cannot continue compression '
'process for schema. Could not '
'retrieve sequences for one or more loci.')
return False
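        # FASTA headers follow the >locus-name_allele-id convention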
fasta_seqs = [(f['allele_id']['value'], f['nucSeq']['value']) for f in sequences]
fasta_lines = ['>{0}_{1}\n{2}'.format(locus_name, s[0], s[1]) for s in fasta_seqs]
fasta_text = '\n'.join(fasta_lines)
temp_file = '{0}/{1}.fasta'.format(temp_dir, locus_name)
temp_files.append(temp_file)
with open(temp_file, 'w') as f:
f.write(fasta_text)
return temp_files
def compress_schema(schema, old_zip, local_sparql, virtuoso_graph):
""" Generates a compressed version of a schema that is in
the Chewie-NS.
Parameters
----------
schema : list
One of the sublists with data about a schema returned
by the :py:func:`compress_determiner` function.
old_zip : str
Path to the outdated compressed version of the schema
(None if there is no compressed version).
Returns
-------
0 if the compression process completed successfully,
1 otherwise.
"""
schema_ptf_path = os.path.join(Config.SCHEMAS_PTF, schema[5])
# tutorial schemas are small and compression might start
# before the PTF has been fully uploaded
waited = 0
while waited < 360 and os.path.isfile(schema_ptf_path) is False:
waited += 20
time.sleep(20)
if os.path.isfile(schema_ptf_path) is False:
logging.warning('Could not find training file for schema {0} ({1}).'
' Aborting schema compression.'.format(schema[0], schema[-2]))
return 1
loci_list = schema_loci(schema[0], local_sparql, virtuoso_graph)
if len(loci_list) == 0:
logging.info('Could not retrieve loci for {0} ({1}).'.format(schema[0], schema[-2]))
return 1
# create temp folder
temp_dir = os.path.join(Config.SCHEMAS_ZIP, '{0}_temp'.format(schema[-3]))
os.mkdir(temp_dir)
logging.info('Downloading Fasta files for schema {0} ({1})'.format(schema[0], schema[-2]))
temp_files = create_fasta(loci_list, schema[1], temp_dir, local_sparql, virtuoso_graph)
if temp_files is False:
shutil.rmtree(temp_dir)
return 1
# run PrepExternalSchema
logging.info('Adapting schema {0} ({1})'.format(schema[0], schema[-2]))
output_directory = os.path.join(Config.SCHEMAS_ZIP, '{0}_{1}'.format(schema[-3], schema[1]))
adapted = PrepExternalSchema.main(temp_dir, output_directory, 6,
float(schema[2]), 0,
int(schema[4]), schema[5],
None, os.path.join('/app', logfile))
if adapted is True:
# copy training file to schema directory
ptf_basename = '{0}.trn'.format(schema[-4])
ptf_file = os.path.join(output_directory, ptf_basename)
shutil.copy(schema_ptf_path, ptf_file)
# write schema config file
schema_config = aux.write_schema_config(schema[2], schema[5],
schema[4], schema[3],
schema[7], schema[6],
output_directory)
# write config file with schema last modification date
ns_config = os.path.join(output_directory, '.ns_config')
with open(ns_config, 'wb') as nc:
ns_info = [schema[1], schema[0]]
pickle.dump(ns_info, nc)
# create hidden file with genes/loci list
genes_list_file = aux.write_gene_list(output_directory)
# remove old zip archive
if old_zip is not None:
os.remove(old_zip)
# compress new version
shutil.make_archive(output_directory, 'zip', output_directory)
    else:
logging.warning('Could not adapt {0} ({1}).'.format(schema[0], schema[-2]))
# remove temp directories and files
shutil.rmtree(temp_dir)
shutil.rmtree(output_directory)
return 0 if adapted is True else 1
def parse_arguments():
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-m', type=str,
dest='mode', required=True,
choices=['global', 'single'],
help='Execution mode. The "single" mode '
'compresses a single schema and the '
'"global" mode compresses all schemas '
'in the Chewie-NS. A compressed version '
'of a schema will only be generated if '
'there is no compressed version or the '
                             'current one is outdated.')
parser.add_argument('--sp', type=str, default=None,
dest='species_id', required=False,
help='The identifier of the species in the '
'Chewie-NS (only relevant for the "single" '
'mode).')
parser.add_argument('--sc', type=str, default=None,
dest='schema_id', required=False,
                        help='The identifier of the schema in the '
                             'Chewie-NS (only relevant for the "single" '
                             'mode).')
    parser.add_argument('--g', type=str,
                        dest='virtuoso_graph',
                        default=os.environ.get('DEFAULTHGRAPH'),
                        help='The default Virtuoso Graph.')
    parser.add_argument('--s', type=str,
                        dest='local_sparql',
                        default=os.environ.get('LOCAL_SPARQL'),
                        help='The URI of the local SPARQL endpoint.')
    parser.add_argument('--b', type=str,
                        dest='base_url',
                        default=os.environ.get('BASE_URL'),
                        help='The base URL of the Chewie-NS.')
    parser.add_argument('--u', type=str,
                        dest='virtuoso_user',
                        default=os.environ.get('VIRTUOSO_USER'),
                        help='Virtuoso username.')
    parser.add_argument('--p', type=str,
                        dest='virtuoso_pass',
                        default=os.environ.get('VIRTUOSO_PASS'),
                        help='Virtuoso password.')
args = parser.parse_args()
return [args.mode, args.species_id, args.schema_id,
args.virtuoso_graph, args.local_sparql, args.base_url,
args.virtuoso_user, args.virtuoso_pass]
def global_compressor(graph, sparql, base_url):
""" Determines which schemas need to be compressed and generates
compressed versions of those schemas.
"""
# get date
start_date = dt.datetime.now()
start_date_str = dt.datetime.strftime(start_date, '%Y-%m-%dT%H:%M:%S')
logging.info('Started global compressor at: {0}'.format(start_date_str))
species_list = get_species(sparql, graph)
if species_list is None:
logging.warning('Could not retrieve any species from the NS.\n\n')
sys.exit(0)
else:
logging.info('Species in NS: {0}'.format(','.join(list(species_list.values()))))
# get all schemas for all species
schemas = {}
for species in species_list:
schemas = species_schemas(species, schemas, sparql, graph)
if len(schemas) > 0:
current_schemas = schemas.get(species, None)
if current_schemas is not None:
current_schemas_strs = ['{0}, {1}'.format(s[0], s[1]) for s in current_schemas]
logging.info('Found {0} schemas for {1} ({2}): {3}'.format(len(current_schemas),
species,
species_list[species],
';'.join(current_schemas_strs)))
if len(schemas) == 0:
logging.warning('Could not find schemas for any species.\n\n')
sys.exit(0)
# list compressed schemas
compressed_schemas = os.listdir(Config.SCHEMAS_ZIP)
# iterate over each species schemas
# decide which schemas to compress
to_compress = []
old_zips = {}
for species in schemas:
sp_name = '_'.join(species_list[species].split(' '))
sp_schemas = schemas[species]
species_id = species.split('/')[-1]
to_compress, old_zips = compress_determiner(sp_schemas, species_id,
sp_name, compressed_schemas,
to_compress, old_zips,
sparql, graph)
# exclude locked schemas
locked = []
for s in to_compress:
if s[-1] != 'Unlocked':
logging.warning('{0} ({1}) is locked.'.format(s[0], s[-2]))
locked.append(s[0])
            del old_zips[s[0]]
to_compress = [s for s in to_compress if s[0] not in locked]
old_zips = {f: os.path.join(Config.SCHEMAS_ZIP, z) if z is not None else None for f, z in old_zips.items()}
if len(to_compress) == 0:
logging.info('No schemas to update.\n\n')
sys.exit(0)
else:
schemas = ['{0} ({1})'.format(s[0], s[-2]) for s in to_compress]
logging.info('Schemas to compress: {0}'.format(';'.join(schemas)))
# for each schema: get loci, download FASTA to temp folder, apply PrepExternalSchema and compress
for schema in to_compress:
response = compress_schema(schema, old_zips[schema[0]], sparql, graph)
if response == 0:
logging.info('Successfully compressed schema {0} '
'({1})'.format(schema[0], schema[-2]))
else:
logging.info('Could not compress schema {0} '
'({1})'.format(schema[0], schema[-2]))
end_date = dt.datetime.now()
end_date_str = dt.datetime.strftime(end_date, '%Y-%m-%dT%H:%M:%S')
logging.info('Finished global compressor at: {0}\n\n'.format(end_date_str))
def single_compressor(species_id, schema_id, graph, sparql, base_url, user, password):
""" Determines if a schema needs to be compressed and
generates a compressed version if needed.
"""
logging.info('Started single compressor for schema {0} '
'of species {1}'.format(schema_id, species_id))
# check if species exists
species_uri = '{0}species/{1}'.format(base_url, species_id)
species_result = aux.get_data(SPARQLWrapper(sparql),
sq.SELECT_SINGLE_SPECIES.format(graph, species_uri))
result_data = species_result['results']['bindings']
if len(result_data) == 0:
logging.warning('Could not find species with identifier {0}. '
'Aborting schema compression.\n\n'.format(species_id))
sys.exit(1)
sp_name = result_data[0]['name']['value']
sp_name = '_'.join(sp_name.split(' '))
# get schema info
# construct schema URI
schema_uri = '{0}/schemas/{1}'.format(species_uri, schema_id)
schema_info = aux.get_data(SPARQLWrapper(sparql),
(sq.SELECT_SPECIES_SCHEMA.format(graph, schema_uri)))
schema_properties = schema_info['results']['bindings']
if len(schema_properties) == 0:
logging.warning('Could not find properties values for schema with identifier {0}. '
'Aborting schema compression.\n\n'.format(schema_id))
sys.exit(1)
schema_name = schema_properties[0]['name']['value']
schemas = [(schema_uri, schema_name)]
# list compressed schemas
compressed_schemas = os.listdir(Config.SCHEMAS_ZIP)
to_compress = []
old_zips = {}
    to_compress, old_zips = compress_determiner(schemas, species_id,
                                                sp_name, compressed_schemas,
                                                to_compress, old_zips,
                                                sparql, graph)
if len(to_compress) == 0:
logging.info('Aborting schema compression.\n\n')
sys.exit(0)
else:
schemas = ['{0} ({1})'.format(s[0], s[-2]) for s in to_compress]
logging.info('Schema to compress: {0}'.format(';'.join(schemas)))
# check if schema is locked
schema_lock = aux.get_data(SPARQLWrapper(sparql),
(sq.ASK_SCHEMA_LOCK.format(schema_uri)))
lock_status = schema_lock['boolean']
if lock_status is True:
# lock schema
locked = change_lock(schema_uri, 'LOCKED', graph,
sparql, user, password)
        if isinstance(locked, list):
logging.warning('Could not lock schema {0}. Response:'
'\n{1}\n\n'.format(schema_uri, locked[1]))
sys.exit(1)
single_schema_name = to_compress[0][-2]
    if old_zips[schema_uri] is not None:
        old_zips[schema_uri] = os.path.join(Config.SCHEMAS_ZIP, old_zips[schema_uri])
    # adapt and compress schema
    response = compress_schema(to_compress[0], old_zips[schema_uri], sparql, graph)
if response == 0:
logging.info('Successfully compressed schema {0} '
'({1})'.format(schema_uri, single_schema_name))
else:
logging.info('Could not compress schema {0} '
'({1})'.format(schema_uri, single_schema_name))
# unlock schema
unlocked = change_lock(schema_uri, 'Unlocked', graph,
sparql, user, password)
    if isinstance(unlocked, list):
logging.warning('Could not unlock schema at the end of compression process.')
logging.info('Finished single compressor for schema {0} '
'of species {1}'.format(schema_id, species_id))
if __name__ == '__main__':
args = parse_arguments()
if args[0] == 'global':
global_compressor(args[3], args[4], args[5])
elif args[0] == 'single':
single_compressor(args[1], args[2], args[3],
args[4], args[5], args[6],
args[7])