Skip to content

Commit

Permalink
1.8.6
Browse files Browse the repository at this point in the history
  • Loading branch information
mbaudis committed Jul 8, 2024
1 parent 64dcdb4 commit b0d7ac4
Show file tree
Hide file tree
Showing 26 changed files with 3,919 additions and 183 deletions.
1 change: 1 addition & 0 deletions housekeepers/frequencymapsCreator.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def frequencymaps_creator():

BYC.update({"BYC_FILTERS":[{"id":c_id}, {"id": "EDAM:operation_3961"}]})
BYC.update({"PAGINATED_STATUS": False})

prdbug(f'=> processing {c_id} with limit {BYC_PARS.get("limit")}')
RSS = ByconResultSets().datasetsResults()
pdb = ByconBundler().resultsets_frequencies_bundles(RSS)
Expand Down
95 changes: 95 additions & 0 deletions housekeepers/geosoftRetriever.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python3

import re, json, requests, yaml
from copy import deepcopy
from progress.bar import Bar
import sys, datetime

from bycon import *

loc_path = path.dirname( path.abspath(__file__) )
services_lib_path = path.join( loc_path, pardir, "services", "lib" )
services_tmp_path = path.join( loc_path, pardir, "tmp" )
sys.path.append( services_lib_path )
from bycon_bundler import ByconBundler
from datatable_utils import import_datatable_dict_line
from file_utils import read_tsv_to_dictlist, write_log
"""
"""

################################################################################
################################################################################
################################################################################

def main():
    # CLI entry point; all work happens in geosoft_retriever().
    geosoft_retriever()

################################################################################

def geosoft_retriever():
    """Retrieve GEO SOFT "brief" records for GSM experiment ids referenced in
    the selected dataset's `analyses` collection and store the extracted
    `!Sample_title` value as `analysis_info.experiment_title`.

    Side effects:
        * updates documents in the `<dataset>.analyses` MongoDB collection
        * appends a per-GSM log file in the services tmp directory
    Exits when not exactly one dataset id was provided (`-d`).
    """
    initialize_bycon_service()
    if len(BYC["BYC_DATASET_IDS"]) != 1:
        print("No single existing dataset was provided with -d ...")
        exit()
    ds_id = BYC["BYC_DATASET_IDS"][0]
    geo_url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=brief&form=text&acc="
    log = []

    #----------------------- Read geo ids from database -----------------------#

    mongo_client = MongoClient(host=DB_MONGOHOST)
    ana_coll = mongo_client[ ds_id ][ "analyses" ]
    geo_gsms = ana_coll.distinct("analysis_info.experiment_id", {"analysis_info.experiment_id":{"$regex":"geo"}})
    data_no = len(geo_gsms)

    # progress bar is disabled (False) in test mode
    bar = Bar("Retrieving ", max = data_no, suffix='%(percent)d%%'+" of "+str(data_no) ) if not BYC["TEST_MODE"] else False
    up_no = 0

    nm = re.compile( r'!Sample_title = (.+?)$' )

    # TODO: More extraction; currently just legacy ID retrieval

    for gsm in geo_gsms:
        # BUG FIX: original called bar.next() unconditionally, which raised
        # AttributeError in TEST_MODE where bar is False
        if bar:
            bar.next()
        url = geo_url + gsm.replace("geo:", "")
        if not (ana := ana_coll.find_one({"analysis_info.experiment_id": gsm})):
            log.append(f'{gsm}\tnot found again in analyses')
            continue
        # analyses which already carry a title are skipped
        if len(ana["analysis_info"].get("experiment_title", "")) > 1:
            continue
        r = requests.get(url)
        if r.ok:
            for line in str(r.text).splitlines():
                # match once per line instead of twice (walrus binding)
                if (m := nm.match(str(line))):
                    name = m.group(1)
                    if len(name) > 0:
                        ana_coll.update_one({"_id": ana["_id"]}, {"$set":{"analysis_info.experiment_title": name}})
                        up_no += 1
                    else:
                        log.append(f'{gsm}\tno name extracted')
        else:
            log.append(f'{gsm}\terror')

    mongo_client.close()

    #----------------------------- Summary ------------------------------------#

    if not BYC["TEST_MODE"]:
        bar.finish()
        print(f'==> updated {up_no} analyses')

    write_log(log, path.join( services_tmp_path, "geosoft_retriever_gsm" ))


################################################################################
################################################################################
################################################################################

if __name__ == '__main__':
main()
62 changes: 46 additions & 16 deletions housekeepers/templateTablesCreator.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#!/usr/bin/env python3

from os import path, pardir

from os import path, pardir, system
from bycon import *

dir_path = path.dirname( path.relpath(__file__) )
pkg_path = path.join( dir_path, pardir )

services_lib_path = path.join( pkg_path, "services", "lib" )
sys.path.append( services_lib_path )
from service_helpers import generate_id

"""
This script uses the `datatable_definitions.yaml` from `bycon` to generate import
Expand All @@ -30,30 +30,59 @@ def templates_creator():
initialize_bycon_service()
dt_m = BYC["datatable_mappings"].get("definitions", {})
rsrc_p = path.join(pkg_path, "rsrc", "templates")
s_no = 0
proceed = input(f'Do you want to create biosample_id and analysis_id values?\nEnter a number; hit ENTER for no id values: ')
if re.match(r'^\d+?$', proceed):
s_no = int(proceed)

if s_no > 0:
pre = "pgx"
proceed = input(f'Do you want a prefix instead of "{pre}"?\nPrefix: ')
if re.match(r'^\w+?$', proceed):
pre = proceed

tdir = f'{isotoday()}-{pre}'
proceed = input(f'Do you want a specific directory name instead of "{tdir}"?\nDir name (no path, just directory...): ')
if re.match(r'^\w+?$', proceed):
tdir = proceed
rsrc_p = path.join(rsrc_p, tdir)

ids = []
for i in range(s_no):
rid = generate_id()
ids.append({
"biosample_id": f'{pre}bios-{rid}',
"analysis_id": f'{pre}ana-{rid}',
"individual_id": f'{pre}ind-{rid}'
})


all_cols = []

for t_t, t_d in dt_m.items():
entity_cols = []
table = []
for p_n, p_d in t_d["parameters"].items():
p_t = p_d.get("type", "string")
# print(f'{t_t}: {p_n} ({p_t})')
prefs = p_d.get("prefix_split")
if prefs:
for p in prefs:
for t in ("id", "label"):
h = f'{p_n}_{t}___{p}'
entity_cols.append(h)
if "variant" not in t_t.lower() and h not in all_cols:
all_cols.append(h)
else:
entity_cols.append(p_n)
if "variant" not in t_t.lower() and p_n not in all_cols:
all_cols.append(p_n)
entity_cols.append(p_n)
if "variant" not in t_t.lower() and p_n not in all_cols:
all_cols.append(p_n)

table.append("\t".join(entity_cols))

if "variant" not in t_t.lower():
for id_s in ids:
d_line = []
for p_n in entity_cols:
t_v = ""
if p_n in id_s:
t_v = id_s.get(p_n)
d_line.append(t_v)
table.append("\t".join(d_line))

f_p = path.join(rsrc_p, t_t+"_template.tsv")
f = open(f_p, "w")
f.write("\t".join(entity_cols)+"\n")
f.write("\n".join(table))
f.close()
print(f'===> Wrote {f_p}')

Expand All @@ -62,6 +91,7 @@ def templates_creator():
f.write("\t".join(all_cols)+"\n")
f.close()
print(f'===> Wrote {f_p}')
system(f'open {rsrc_p}')

################################################################################
################################################################################
Expand Down
122 changes: 122 additions & 0 deletions importers/ISCNdefuser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python3

import re, json, sys, datetime, requests, yaml
from copy import deepcopy
from os import path
from progress.bar import Bar
from tabulate import tabulate

from bycon import *

loc_path = path.dirname( path.abspath(__file__) )
services_lib_path = path.join( loc_path, pardir, "services", "lib" )
services_tmp_path = path.join( loc_path, pardir, "tmp" )
sys.path.append( services_lib_path )
from bycon_bundler import ByconBundler
from datatable_utils import import_datatable_dict_line
from service_helpers import generate_id
"""
"""

################################################################################
################################################################################
################################################################################

def main():
    # CLI entry point; all work happens in iscn_defuser().
    iscn_defuser()

################################################################################

def iscn_defuser():
    """Split per-sample ISCN fusion annotations into one variant row per
    fusion partner and write them to a tab-separated "defused" file.

    Input is a TSV (local `-i/--inputfile`, or the published Google sheet
    when none is given) with at least `biosample_id` and `iscn_fusions`
    columns; fusion partners inside one fusion are joined by `::`, multiple
    fusions per sample by `,`. Partners of one fusion share a generated
    `variant_fusion_id` so they can be re-associated downstream.

    Side effects: may download the online sheet into the services tmp
    directory; writes the defused TSV to `-o/--outputfile` (default:
    input name with `.tsv` replaced by `_defused.tsv`).
    """
    initialize_bycon_service()

    # sheet => export => publish to web => tsv, selected sheet
    g_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTrpE6-SQAT3jVavxzqQXwAs5ujr2lDJehYwKiFUWA-tEm2DyyGTwS1UcnJAYF5VZJs4SlojUtm-Rh7/pub?gid=726541419&single=true&output=tsv"
    argdefs = BYC.get("argument_definitions", {})
    cb_pat = re.compile( argdefs["cyto_bands"]["items"]["pattern"] )
    input_file = BYC_PARS.get("inputfile")
    output_file = BYC_PARS.get("outputfile")
    if not input_file:
        print("No inputfile file specified => pulling the online table ...")
        input_file = path.join( services_tmp_path, "iscntable.tsv" )
        print(f'... reading from {g_url}')
        r = requests.get(g_url)
        if r.ok:
            with open(input_file, 'wb') as f:
                f.write(r.content)
            print(f"Wrote file to {input_file}")
        else:
            print(f'Download failed: status code {r.status_code}\n{r.text}')
            exit()

    if not output_file:
        # BUG FIX: original used re.sub(".tsv", ...) where the unescaped "."
        # matches any character and the pattern is unanchored, so e.g.
        # "/tmp/a.tsv.files/x.tsv" would be mangled; escape and anchor it
        output_file = re.sub(r"\.tsv$", "_defused.tsv", input_file)

    #-------------------------- Read ISCN from file ---------------------------#

    vb = ByconBundler()
    iscndata = vb.read_pgx_file(input_file)
    for h in ["biosample_id", "iscn_fusions"]:
        if h not in iscndata.fieldnames:
            print(f'¡¡¡ "{h}" missing in header => giving up !!!')
            exit()

    #----------------------------- Summary ------------------------------------#

    variants = []
    v_s_id = "SO:0000806"       # Sequence Ontology id for "fusion"
    v_s_label = "fusion"
    chro_names = ChroNames()
    l_no = 0
    bios_ids = []               # guards against duplicate biosample lines
    for s in iscndata.data:
        l_no += 1
        if not (bs_id := s.get("biosample_id")):
            print(f'¡¡¡ no biosample_id value in line {l_no} => skipping !!!')
            continue
        if bs_id in bios_ids:
            print(f'¡¡¡ existing biosample_id {bs_id} value in line {l_no} => skipping !!!')
            continue
        bios_ids.append(bs_id)
        if not (cs_id := s.get("analysis_id")):
            cs_id = generate_id("pgxfcs")
        for f_v_s in s.get("iscn_fusions").strip().split(','):
            # all partners of one fusion share this generated id
            f_id = generate_id("fusionId")
            for i_v in f_v_s.strip().split('::'):
                if not cb_pat.match(i_v):
                    print(f'¡¡¡ {l_no} - {bs_id}: {i_v} looks strange => skipping !!!')
                    continue
                # NOTE(review): `error` from bands_from_cytobands() is
                # currently ignored; surface it if parsing problems show up
                cytoBands, chro, start, end, error = bands_from_cytobands(i_v)
                r_id = chro_names.refseq(chro)

                variants.append({
                    "biosample_id": bs_id,
                    "analysis_id": cs_id,
                    "sequence_id": r_id,
                    "start": str(start),
                    "end": str(end),
                    "variant_state_id": v_s_id,
                    "variant_state_label": v_s_label,
                    "variant_fusion_id": f_id
                })

    print(f'=> {len(bios_ids)} samples had a total of {len(variants)} variants')
    # context manager guarantees the handle is closed even on write errors
    with open(output_file, "w") as deff:
        deff.write(tabulate(variants, headers='keys', tablefmt="tsv", stralign=None, numalign=None))
    print(f'Wrote to {output_file}')

################################################################################
################################################################################
################################################################################

if __name__ == '__main__':
main()
6 changes: 3 additions & 3 deletions importers/ISCNsegmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def iscn_segmenter():

data_no = len(iscn_samples)
s_w_v_no = 0
print(f'=> The input file contains {data_no} items')
print(f'=> The input file contains {data_no} items')

pgxseg = open(output_file, "w")
pgxseg.write( "#meta=>biosample_count={}\n".format(iscn_no) )
Expand Down Expand Up @@ -105,8 +105,8 @@ def iscn_segmenter():
for v in v_instances:
pgxseg.write(pgxseg_variant_line(v)+"\n")

print("=> {} samples had variants".format(s_w_v_no))
print("Wrote to {}".format(output_file))
print(f'=> {s_w_v_no} samples had variants')
print(f'Wrote to {output_file}')

exit()

Expand Down
Loading

0 comments on commit b0d7ac4

Please sign in to comment.