Skip to content

Commit

Permalink
1.8.6
Browse files Browse the repository at this point in the history
  • Loading branch information
mbaudis committed Jul 8, 2024
1 parent 64dcdb4 commit b0d7ac4
Show file tree
Hide file tree
Showing 26 changed files with 3,919 additions and 183 deletions.
1 change: 1 addition & 0 deletions housekeepers/frequencymapsCreator.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def frequencymaps_creator():

BYC.update({"BYC_FILTERS":[{"id":c_id}, {"id": "EDAM:operation_3961"}]})
BYC.update({"PAGINATED_STATUS": False})

prdbug(f'=> processing {c_id} with limit {BYC_PARS.get("limit")}')
RSS = ByconResultSets().datasetsResults()
pdb = ByconBundler().resultsets_frequencies_bundles(RSS)
Expand Down
95 changes: 95 additions & 0 deletions housekeepers/geosoftRetriever.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python3

import re, json, requests, yaml
from copy import deepcopy
from progress.bar import Bar
import sys, datetime

from bycon import *

loc_path = path.dirname( path.abspath(__file__) )
services_lib_path = path.join( loc_path, pardir, "services", "lib" )
services_tmp_path = path.join( loc_path, pardir, "tmp" )
sys.path.append( services_lib_path )
from bycon_bundler import ByconBundler
from datatable_utils import import_datatable_dict_line
from file_utils import read_tsv_to_dictlist, write_log
"""
"""

################################################################################
################################################################################
################################################################################

def main():
    # CLI entry point; all work happens in geosoft_retriever().
    geosoft_retriever()

################################################################################

def geosoft_retriever():
    """Retrieve GEO SOFT "brief" records for GSM experiment ids referenced in
    the selected dataset's `analyses` collection and store the extracted
    `!Sample_title` value as `analysis_info.experiment_title`.

    Side effects:
        * updates documents in the `<dataset>.analyses` MongoDB collection
        * appends a per-GSM log file in the services tmp directory
    Exits when not exactly one dataset id was provided (`-d`).
    """
    initialize_bycon_service()
    if len(BYC["BYC_DATASET_IDS"]) != 1:
        print("No single existing dataset was provided with -d ...")
        exit()
    ds_id = BYC["BYC_DATASET_IDS"][0]
    geo_url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=brief&form=text&acc="
    log = []

    #----------------------- Read geo ids from database -----------------------#

    mongo_client = MongoClient(host=DB_MONGOHOST)
    ana_coll = mongo_client[ ds_id ][ "analyses" ]
    geo_gsms = ana_coll.distinct("analysis_info.experiment_id", {"analysis_info.experiment_id":{"$regex":"geo"}})
    data_no = len(geo_gsms)

    # progress bar is disabled (False) in test mode
    bar = Bar("Retrieving ", max = data_no, suffix='%(percent)d%%'+" of "+str(data_no) ) if not BYC["TEST_MODE"] else False
    up_no = 0

    nm = re.compile( r'!Sample_title = (.+?)$' )

    # TODO: More extraction; currently just legacy ID retrieval

    for gsm in geo_gsms:
        # BUG FIX: original called bar.next() unconditionally, which raised
        # AttributeError in TEST_MODE where bar is False
        if bar:
            bar.next()
        url = geo_url + gsm.replace("geo:", "")
        if not (ana := ana_coll.find_one({"analysis_info.experiment_id": gsm})):
            log.append(f'{gsm}\tnot found again in analyses')
            continue
        # analyses which already carry a title are skipped
        if len(ana["analysis_info"].get("experiment_title", "")) > 1:
            continue
        r = requests.get(url)
        if r.ok:
            for line in str(r.text).splitlines():
                # match once per line instead of twice (walrus binding)
                if (m := nm.match(str(line))):
                    name = m.group(1)
                    if len(name) > 0:
                        ana_coll.update_one({"_id": ana["_id"]}, {"$set":{"analysis_info.experiment_title": name}})
                        up_no += 1
                    else:
                        log.append(f'{gsm}\tno name extracted')
        else:
            log.append(f'{gsm}\terror')

    mongo_client.close()

    #----------------------------- Summary ------------------------------------#

    if not BYC["TEST_MODE"]:
        bar.finish()
        print(f'==> updated {up_no} analyses')

    write_log(log, path.join( services_tmp_path, "geosoft_retriever_gsm" ))


################################################################################
################################################################################
################################################################################

if __name__ == '__main__':
main()
62 changes: 46 additions & 16 deletions housekeepers/templateTablesCreator.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#!/usr/bin/env python3

from os import path, pardir

from os import path, pardir, system
from bycon import *

dir_path = path.dirname( path.relpath(__file__) )
pkg_path = path.join( dir_path, pardir )

services_lib_path = path.join( pkg_path, "services", "lib" )
sys.path.append( services_lib_path )
from service_helpers import generate_id

"""
This script uses the `datatable_definitions.yaml` from `bycon` to generate import
Expand All @@ -30,30 +30,59 @@ def templates_creator():
initialize_bycon_service()
dt_m = BYC["datatable_mappings"].get("definitions", {})
rsrc_p = path.join(pkg_path, "rsrc", "templates")
s_no = 0
proceed = input(f'Do you want to create biosample_id and analysis_id values?\nEnter a number; hit ENTER for no id values: ')
if re.match(r'^\d+?$', proceed):
s_no = int(proceed)

if s_no > 0:
pre = "pgx"
proceed = input(f'Do you want a prefix instead of "{pre}"?\nPrefix: ')
if re.match(r'^\w+?$', proceed):
pre = proceed

tdir = f'{isotoday()}-{pre}'
proceed = input(f'Do you want a specific directory name instead of "{tdir}"?\nDir name (no path, just directory...): ')
if re.match(r'^\w+?$', proceed):
tdir = proceed
rsrc_p = path.join(rsrc_p, tdir)

ids = []
for i in range(s_no):
rid = generate_id()
ids.append({
"biosample_id": f'{pre}bios-{rid}',
"analysis_id": f'{pre}ana-{rid}',
"individual_id": f'{pre}ind-{rid}'
})


all_cols = []

for t_t, t_d in dt_m.items():
entity_cols = []
table = []
for p_n, p_d in t_d["parameters"].items():
p_t = p_d.get("type", "string")
# print(f'{t_t}: {p_n} ({p_t})')
prefs = p_d.get("prefix_split")
if prefs:
for p in prefs:
for t in ("id", "label"):
h = f'{p_n}_{t}___{p}'
entity_cols.append(h)
if "variant" not in t_t.lower() and h not in all_cols:
all_cols.append(h)
else:
entity_cols.append(p_n)
if "variant" not in t_t.lower() and p_n not in all_cols:
all_cols.append(p_n)
entity_cols.append(p_n)
if "variant" not in t_t.lower() and p_n not in all_cols:
all_cols.append(p_n)

table.append("\t".join(entity_cols))

if "variant" not in t_t.lower():
for id_s in ids:
d_line = []
for p_n in entity_cols:
t_v = ""
if p_n in id_s:
t_v = id_s.get(p_n)
d_line.append(t_v)
table.append("\t".join(d_line))

f_p = path.join(rsrc_p, t_t+"_template.tsv")
f = open(f_p, "w")
f.write("\t".join(entity_cols)+"\n")
f.write("\n".join(table))
f.close()
print(f'===> Wrote {f_p}')

Expand All @@ -62,6 +91,7 @@ def templates_creator():
f.write("\t".join(all_cols)+"\n")
f.close()
print(f'===> Wrote {f_p}')
system(f'open {rsrc_p}')

################################################################################
################################################################################
Expand Down
122 changes: 122 additions & 0 deletions importers/ISCNdefuser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python3

import re, json, sys, datetime, requests, yaml
from copy import deepcopy
from os import path
from progress.bar import Bar
from tabulate import tabulate

from bycon import *

loc_path = path.dirname( path.abspath(__file__) )
services_lib_path = path.join( loc_path, pardir, "services", "lib" )
services_tmp_path = path.join( loc_path, pardir, "tmp" )
sys.path.append( services_lib_path )
from bycon_bundler import ByconBundler
from datatable_utils import import_datatable_dict_line
from service_helpers import generate_id
"""
"""

################################################################################
################################################################################
################################################################################

def main():
    # CLI entry point; all work happens in iscn_defuser().
    iscn_defuser()

################################################################################

def iscn_defuser():
    """Split per-sample ISCN fusion annotations into one variant row per
    fusion partner and write them to a tab-separated "defused" file.

    Input is a TSV (local `-i/--inputfile`, or the published Google sheet
    when none is given) with at least `biosample_id` and `iscn_fusions`
    columns; fusion partners inside one fusion are joined by `::`, multiple
    fusions per sample by `,`. Partners of one fusion share a generated
    `variant_fusion_id` so they can be re-associated downstream.

    Side effects: may download the online sheet into the services tmp
    directory; writes the defused TSV to `-o/--outputfile` (default:
    input name with `.tsv` replaced by `_defused.tsv`).
    """
    initialize_bycon_service()

    # sheet => export => publish to web => tsv, selected sheet
    g_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTrpE6-SQAT3jVavxzqQXwAs5ujr2lDJehYwKiFUWA-tEm2DyyGTwS1UcnJAYF5VZJs4SlojUtm-Rh7/pub?gid=726541419&single=true&output=tsv"
    argdefs = BYC.get("argument_definitions", {})
    cb_pat = re.compile( argdefs["cyto_bands"]["items"]["pattern"] )
    input_file = BYC_PARS.get("inputfile")
    output_file = BYC_PARS.get("outputfile")
    if not input_file:
        print("No inputfile file specified => pulling the online table ...")
        input_file = path.join( services_tmp_path, "iscntable.tsv" )
        print(f'... reading from {g_url}')
        r = requests.get(g_url)
        if r.ok:
            with open(input_file, 'wb') as f:
                f.write(r.content)
            print(f"Wrote file to {input_file}")
        else:
            print(f'Download failed: status code {r.status_code}\n{r.text}')
            exit()

    if not output_file:
        # BUG FIX: original used re.sub(".tsv", ...) where the unescaped "."
        # matches any character and the pattern is unanchored, so e.g.
        # "/tmp/a.tsv.files/x.tsv" would be mangled; escape and anchor it
        output_file = re.sub(r"\.tsv$", "_defused.tsv", input_file)

    #-------------------------- Read ISCN from file ---------------------------#

    vb = ByconBundler()
    iscndata = vb.read_pgx_file(input_file)
    for h in ["biosample_id", "iscn_fusions"]:
        if h not in iscndata.fieldnames:
            print(f'¡¡¡ "{h}" missing in header => giving up !!!')
            exit()

    #----------------------------- Summary ------------------------------------#

    variants = []
    v_s_id = "SO:0000806"       # Sequence Ontology id for "fusion"
    v_s_label = "fusion"
    chro_names = ChroNames()
    l_no = 0
    bios_ids = []               # guards against duplicate biosample lines
    for s in iscndata.data:
        l_no += 1
        if not (bs_id := s.get("biosample_id")):
            print(f'¡¡¡ no biosample_id value in line {l_no} => skipping !!!')
            continue
        if bs_id in bios_ids:
            print(f'¡¡¡ existing biosample_id {bs_id} value in line {l_no} => skipping !!!')
            continue
        bios_ids.append(bs_id)
        if not (cs_id := s.get("analysis_id")):
            cs_id = generate_id("pgxfcs")
        for f_v_s in s.get("iscn_fusions").strip().split(','):
            # all partners of one fusion share this generated id
            f_id = generate_id("fusionId")
            for i_v in f_v_s.strip().split('::'):
                if not cb_pat.match(i_v):
                    print(f'¡¡¡ {l_no} - {bs_id}: {i_v} looks strange => skipping !!!')
                    continue
                # NOTE(review): `error` from bands_from_cytobands() is
                # currently ignored; surface it if parsing problems show up
                cytoBands, chro, start, end, error = bands_from_cytobands(i_v)
                r_id = chro_names.refseq(chro)

                variants.append({
                    "biosample_id": bs_id,
                    "analysis_id": cs_id,
                    "sequence_id": r_id,
                    "start": str(start),
                    "end": str(end),
                    "variant_state_id": v_s_id,
                    "variant_state_label": v_s_label,
                    "variant_fusion_id": f_id
                })

    print(f'=> {len(bios_ids)} samples had a total of {len(variants)} variants')
    # context manager guarantees the handle is closed even on write errors
    with open(output_file, "w") as deff:
        deff.write(tabulate(variants, headers='keys', tablefmt="tsv", stralign=None, numalign=None))
    print(f'Wrote to {output_file}')

################################################################################
################################################################################
################################################################################

if __name__ == '__main__':
main()
6 changes: 3 additions & 3 deletions importers/ISCNsegmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def iscn_segmenter():

data_no = len(iscn_samples)
s_w_v_no = 0
print(f'=> The input file contains {data_no} items')
print(f'=> The input file contains {data_no} items')

pgxseg = open(output_file, "w")
pgxseg.write( "#meta=>biosample_count={}\n".format(iscn_no) )
Expand Down Expand Up @@ -105,8 +105,8 @@ def iscn_segmenter():
for v in v_instances:
pgxseg.write(pgxseg_variant_line(v)+"\n")

print("=> {} samples had variants".format(s_w_v_no))
print("Wrote to {}".format(output_file))
print(f'=> {s_w_v_no} samples had variants')
print(f'Wrote to {output_file}')

exit()

Expand Down
Loading

0 comments on commit b0d7ac4

Please sign in to comment.