Skip to content

Commit

Permalink
add xopen threading to db build scripts
Browse files — browse the repository at this point in the history
  • Loading branch information
oschwengers committed Jan 30, 2025
1 parent 5c80bdf commit 555d52a
Show file tree
Hide file tree
Showing 5 changed files with 8 additions and 8 deletions.
2 changes: 1 addition & 1 deletion db-scripts/annotate-swissprot.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -70,7 +70,7 @@ def is_taxon_child(child, LCA, taxonomy):
conn.execute('PRAGMA threads = 2;')
conn.commit()
conn.row_factory = sqlite3.Row
with xopen(str(xml_path), mode="rb") as fh, alive_bar() as bar:
with xopen(str(xml_path), mode="rb", threads=2) as fh, alive_bar() as bar:
ups_entries = []
i = 0
for event, elem in et.iterparse(fh, tag='{*}entry'):
Expand Down
2 changes: 1 addition & 1 deletion db-scripts/extract-pfam.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -30,7 +30,7 @@


non_families = 0
with xopen(str(pfam_path)) as fh_pfam, family_path.open('w') as fh_family, non_family_path.open('w') as fh_non_family:
with xopen(str(pfam_path), threads=2) as fh_pfam, family_path.open('w') as fh_family, non_family_path.open('w') as fh_non_family:
entries = fh_pfam.read().split('//')
for entry_text in entries:
id = None
Expand Down
4 changes: 2 additions & 2 deletions db-scripts/init-psc.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -75,7 +75,7 @@ def is_taxon_child(child, LCA, taxonomy):
psc_seqs = 0
psc_sorf_seqs = 0
print('parse & store PSC information...')
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, xopen(str(uniref90_path), mode='rb') as fh_xml, psc_path.open(mode='wt') as fh_fasta_psc, psc_sorf_path.open(mode='wt') as fh_fasta_psc_sorf, alive_bar() as bar:
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, xopen(str(uniref90_path), mode='rb', threads=2) as fh_xml, psc_path.open(mode='wt') as fh_fasta_psc, psc_sorf_path.open(mode='wt') as fh_fasta_psc_sorf, alive_bar() as bar:
conn.execute('PRAGMA page_size = 4096;')
conn.execute('PRAGMA cache_size = 100000;')
conn.execute('PRAGMA locking_mode = EXCLUSIVE;')
Expand Down Expand Up @@ -169,7 +169,7 @@ def is_taxon_child(child, LCA, taxonomy):

print(f'UniParc ({len(uniref90_uniparc_ids)})...')
log_psc.debug('lookup non-representative UniParc seed sequences: %s', len(uniref90_uniparc_ids))
with xopen(str(uniparc_path), mode='rt') as fh_uniparc, psc_path.open(mode='at') as fh_fasta_psc, psc_sorf_path.open(mode='at') as fh_fasta_sorf, alive_bar() as bar:
with xopen(str(uniparc_path), mode='rt', threads=2) as fh_uniparc, psc_path.open(mode='at') as fh_fasta_psc, psc_sorf_path.open(mode='at') as fh_fasta_sorf, alive_bar() as bar:
for record in SeqIO.parse(fh_uniparc, 'fasta'):
uniref90_id = uniref90_uniparc_ids.get(record.id, None)
if(uniref90_id):
Expand Down
4 changes: 2 additions & 2 deletions db-scripts/init-pscc.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -75,7 +75,7 @@ def is_taxon_child(child, LCA, taxonomy):
pscc_total = 0
uniref50_uniparc_ids = {}
print('parse & store PSCC information...')
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, xopen(str(uniref50_path), mode='rb') as fh_xml, pscc_path.open(mode='wt') as fh_fasta_pscc, pscc_sorf_path.open(mode='wt') as fh_fasta_pscc_sorf, alive_bar() as bar:
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, xopen(str(uniref50_path), mode='rb', threads=2) as fh_xml, pscc_path.open(mode='wt') as fh_fasta_pscc, pscc_sorf_path.open(mode='wt') as fh_fasta_pscc_sorf, alive_bar() as bar:
conn.execute('PRAGMA page_size = 4096;')
conn.execute('PRAGMA cache_size = 100000;')
conn.execute('PRAGMA locking_mode = EXCLUSIVE;')
Expand Down Expand Up @@ -169,7 +169,7 @@ def is_taxon_child(child, LCA, taxonomy):

print(f'UniParc ({len(uniref50_uniparc_ids)})...')
log_pscc.debug('lookup non-representative UniParc seed sequences: %s', len(uniref50_uniparc_ids))
with xopen(str(uniparc_path), mode='rt') as fh_uniparc, pscc_path.open(mode='at') as fh_fasta_psc, pscc_sorf_path.open(mode='at') as fh_fasta_sorf, alive_bar() as bar:
with xopen(str(uniparc_path), mode='rt', threads=2) as fh_uniparc, pscc_path.open(mode='at') as fh_fasta_psc, pscc_sorf_path.open(mode='at') as fh_fasta_sorf, alive_bar() as bar:
for record in SeqIO.parse(fh_uniparc, 'fasta'):
uniref50_id = uniref50_uniparc_ids.get(record.id, None)
if(uniref50_id):
Expand Down
4 changes: 2 additions & 2 deletions db-scripts/init-ups-ips.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -83,7 +83,7 @@ def is_taxon_child(child, LCA, taxonomy):
db_updates = 0
ups_seqs = 0
ips_seqs = 0
with xopen(str(uniref100_path), mode='rb') as fh_xml, ips_path.open(mode='wt') as fh_fasta_ips, alive_bar() as bar:
with xopen(str(uniref100_path), mode='rb', threads=2) as fh_xml, ips_path.open(mode='wt') as fh_fasta_ips, alive_bar() as bar:
for event, elem in et.iterparse(fh_xml, tag='{*}entry'):
if('Fragment' not in elem.find('./{*}name').text): # skip protein fragments
common_tax_id = elem.find('./{*}property[@type="common taxon ID"]')
Expand Down Expand Up @@ -170,7 +170,7 @@ def is_taxon_child(child, LCA, taxonomy):
print(f'UniParc ({len(uniparc_to_uniref100)})...')
log_ups.debug('lookup non-representative UniParc member sequences: %s', len(uniparc_to_uniref100))
db_updates = 0
with xopen(str(uniparc_path), mode='rt') as fh_uniparc, alive_bar() as bar:
with xopen(str(uniparc_path), mode='rt', threads=2) as fh_uniparc, alive_bar() as bar:
for record in SeqIO.parse(fh_uniparc, 'fasta'):
uniparc_id = record.id
uniref100_id = uniparc_to_uniref100.get(uniparc_id, None)
Expand Down

0 comments on commit 555d52a

Please sign in to comment.