1
1
"""Process GWAS Catalog summary statistics in batch job."""
2
2
3
- import os
4
- from ot_orchestration .utils import GCSIOManager
5
- from ot_orchestration .types import Manifest_Object
3
+ from ot_orchestration .utils import IOManager , GWASCatalogPipelineManifest
6
4
import logging
7
5
import subprocess
8
6
import click
7
+ import os
8
+ import sys
9
9
10
+ MANIFEST_PATH_ENV_VAR = "MANIFEST_PATH"
10
11
11
- def harmonise (manifest : Manifest_Object ) -> Manifest_Object :
12
+
13
+ def harmonise_step (
14
+ manifest : GWASCatalogPipelineManifest ,
15
+ ) -> GWASCatalogPipelineManifest :
12
16
"""Run Harmonisation."""
13
17
raw_path = manifest ["rawPath" ]
14
18
harmonised_path = manifest ["harmonisedPath" ]
15
19
study_id = manifest ["studyId" ]
20
+ manifest_path = manifest ["manifestPath" ]
21
+ pass_harmonisation = manifest ["passHarmonisation" ]
22
+ logging .info ("Running %s for %s" , "harmonisation" , study_id )
23
+
16
24
command = [
17
- "poetry" ,
18
- "run" ,
19
25
"gentropy" ,
20
26
"step=gwas_catalog_sumstat_preprocess" ,
21
27
f'step.raw_sumstats_path="{ raw_path } "' ,
@@ -26,34 +32,38 @@ def harmonise(manifest: Manifest_Object) -> Manifest_Object:
26
32
"+step.session.extended_spark_conf={spark.kryoserializer.buffer.max:'500m'}" ,
27
33
"+step.session.extended_spark_conf={spark.driver.maxResultSize:'5g'}" ,
28
34
]
29
- if GCSIOManager ().exists (harmonised_path ):
30
- logging .info ("Harmonisation result exists for %s. Skipping" , study_id )
31
- manifest ["passHarmonisation" ] = True
35
+ if IOManager ().resolve (harmonised_path ).exists ():
36
+ if not pass_harmonisation :
37
+ logging .info ("Harmonisation result exists for %s. Skipping" , study_id )
38
+ manifest ["passHarmonisation" ] = True
32
39
return manifest
33
40
41
+ logging .info ("Running command %s" , " " .join (command ))
42
+ command = ["echo" , "RUNNING!" ]
34
43
result = subprocess .run (args = command , capture_output = True )
44
+ logging .info (result )
35
45
if result .returncode != 0 :
36
46
logging .error ("Harmonisation for study %s failed!" , study_id )
37
47
error_msg = result .stderr .decode ()
38
48
logging .error (error_msg )
39
49
manifest ["passHarmonisation" ] = False
40
- logging .info ("Dumping manifest to %s" , manifest [ "manifestPath" ] )
41
- GCSIOManager ().dump (manifest [ "manifestPath" ], manifest )
42
- exit (1 )
50
+ logging .info ("Dumping manifest to %s" , manifest_path )
51
+ IOManager ().resolve ( manifest_path ). dump (manifest )
52
+ sys . exit (1 )
43
53
44
54
logging .info ("Harmonisation for study %s succeded!" , study_id )
45
55
manifest ["passHarmonisation" ] = True
46
56
return manifest
47
57
48
58
49
- def qc (manifest : Manifest_Object ) -> Manifest_Object :
59
+ def qc_step (manifest : GWASCatalogPipelineManifest ) -> GWASCatalogPipelineManifest :
50
60
"""Run QC."""
51
61
harmonised_path = manifest ["harmonisedPath" ]
52
62
qc_path = manifest ["qcPath" ]
53
63
study_id = manifest ["studyId" ]
64
+ manifest_path = manifest ["manifestPath" ]
65
+
54
66
command = [
55
- "poetry" ,
56
- "run" ,
57
67
"gentropy" ,
58
68
"step=summary_statistics_qc" ,
59
69
f'step.gwas_path="{ harmonised_path } "' ,
@@ -65,9 +75,8 @@ def qc(manifest: Manifest_Object) -> Manifest_Object:
65
75
"+step.session.extended_spark_conf={spark.kryoserializer.buffer.max:'500m'}" ,
66
76
"+step.session.extended_spark_conf={spark.driver.maxResultSize:'5g'}" ,
67
77
]
68
- result_exists = GCSIOManager ().exists (qc_path )
69
- logging .info ("Result exists: %s" , result_exists )
70
- if GCSIOManager ().exists (qc_path ):
78
+ result_exists = IOManager ().resolve (qc_path ).exists ()
79
+ if result_exists :
71
80
logging .info ("QC result exists for %s. Skipping" , study_id )
72
81
manifest ["passQC" ] = True
73
82
return manifest
@@ -78,76 +87,51 @@ def qc(manifest: Manifest_Object) -> Manifest_Object:
78
87
error_msg = result .stderr .decode ()
79
88
logging .error (error_msg )
80
89
manifest ["passQC" ] = False
81
- logging .info ("Dumping manifest to %s" , manifest [ "manifestPath" ] )
82
- GCSIOManager ().dump (manifest [ "manifestPath" ], manifest )
90
+ logging .info ("Dumping manifest to %s" , manifest_path )
91
+ IOManager ().resolve ( manifest_path ). dump (manifest )
83
92
exit (1 )
84
93
85
94
logging .info ("QC for study %s succeded!" , study_id )
86
95
manifest ["passQC" ] = True
87
96
return manifest
88
97
89
98
90
def qc_consolidation_step(
    manifest: GWASCatalogPipelineManifest,
) -> GWASCatalogPipelineManifest:
    """Check if sumstats pass qc thresholds.

    Currently a pass-through placeholder: no thresholds are evaluated and no
    manifest field is read or written.

    Args:
        manifest: Pipeline manifest of the study being processed.

    Returns:
        The same manifest object, unchanged.
    """
    return manifest
92
104
93
105
94
def clump_step(manifest: GWASCatalogPipelineManifest) -> GWASCatalogPipelineManifest:
    """Run Clumping.

    Currently a pass-through placeholder: no clumping command is executed and
    no manifest field is read or written.

    Args:
        manifest: Pipeline manifest of the study being processed.

    Returns:
        The same manifest object, unchanged.
    """
    return manifest
128
109
129
110
130
111
@click.command()
def gwas_catalog_pipeline():
    """Run gwas catalog processing of summary statistics in batch.

    This includes harmonisation, QC and clumping.
    This command requires setting the `MANIFEST_PATH` in the
    environment. The variable should be the reference to the path with the
    manifest file.

    The manifest is loaded from that path, passed through the harmonisation,
    QC, QC consolidation and clumping steps in that order, and the resulting
    manifest is written back to the same path.

    Raises:
        SystemExit: With status 1 when the `MANIFEST_PATH` environment
            variable is unset or empty.
    """
    logging.debug("Reading MANIFEST_PATH env variable")
    manifest_path = os.getenv(MANIFEST_PATH_ENV_VAR)
    # An empty string is treated the same as a missing variable.
    if not manifest_path:
        logging.error("MANIFEST_PATH environment variable is missing")
        sys.exit(1)
    logging.debug("MANIFEST_PATH: %s", manifest_path)
    manifest = GWASCatalogPipelineManifest.from_file(manifest_path)
    logging.debug("MANIFEST: %s", manifest)
    # for now dummy implementation of the pipeline processing order
    manifest = harmonise_step(manifest)
    manifest = qc_step(manifest)
    manifest = qc_consolidation_step(manifest)
    manifest = clump_step(manifest)

    # Persist the final state of the manifest back to where it was read from.
    IOManager().resolve(manifest_path).dump(manifest)
135
+
136
+
137
+ __all__ = ["gwas_catalog_pipeline" ]
0 commit comments