From e797799035d31a2b0add057c4e776e1d3ef55b85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Thu, 15 Jun 2023 19:16:48 +0100 Subject: [PATCH 1/6] feat: add multiple_testing_correction.py --- analysis/README.md | 7 + analysis/python/compare_baseline.ipynb | 657 ++++++++++++++---- .../python/multiple_testing_correction.py | 40 ++ 3 files changed, 580 insertions(+), 124 deletions(-) create mode 100644 analysis/python/multiple_testing_correction.py diff --git a/analysis/README.md b/analysis/README.md index 3f6b776..971c730 100644 --- a/analysis/README.md +++ b/analysis/README.md @@ -24,5 +24,12 @@ gcloud dataproc jobs submit pyspark \ --region=europe-west1 \ --project=open-targets-eu-dev \ analysis/python/enrichments.py -- config.yml --stratify-therapeutic-area non_oncology + +# Adjust the p values for multiple testing using the Benjamini-Hochberg procedure by providing the path to the file with the results of the enrichment analysis +gcloud dataproc jobs submit pyspark \ + --cluster=il-big-stop-reasons \ + --region=europe-west1 \ + --project=open-targets-eu-dev \ + analysis/python/multiple_testing_correction.py -- "gs://ot-team/irene/stop_reasons/predictions_aggregations_non_oncology" ``` - Visualization of the results. \ No newline at end of file diff --git a/analysis/python/compare_baseline.ipynb b/analysis/python/compare_baseline.ipynb index aa48c69..3cc22e8 100644 --- a/analysis/python/compare_baseline.ipynb +++ b/analysis/python/compare_baseline.ipynb @@ -14,22 +14,29 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 80, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+----------+----------+---------+-------------+\n", - "| prediction| comparison|comparisonType|predictionType| a|predictionTotal|comparisonTotal| total| b| c| d| or_result| lower_ci| upper_ci| pvalue|\n", - "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+----------+----------+---------+-------------+\n", - "| Safety_Sideeffects|affected_pathway| byDatatype| reason| 456| 2732| 44764|413172| 2276|44308|366132| 1.6555722| 1.4964652|1.8315957|2.1668734E-20|\n", - "|Insufficient_Enro...|affected_pathway| byDatatype| reason|2178| 18439| 44764|413172|16261|42586|352147| 1.1075613| 1.057939| 1.159511| 1.5909267E-5|\n", - "| Negative|affected_pathway| byDatatype| reason| 573| 5576| 44764|413172| 5003|44191|363405| 0.9418488| 0.863369|1.0274624| 0.18574768|\n", - "|Business_Administ...|affected_pathway| byDatatype| reason|1312| 11630| 44764|413172|10318|43452|358090| 1.0479021|0.98855203|1.1108155| 0.11557128|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 226| 2653| 44764|413172| 2427|44538|365981|0.76518506|0.66744256|0.8772413| 7.778494E-5|\n", - "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+----------+----------+---------+-------------+\n", + "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n", + "| prediction| comparison|comparisonType|predictionType| a|predictionTotal|comparisonTotal| total| b| c| d|or_result| lower_ci| upper_ci| pvalue|\n", + "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n", + "|Insufficient_Enro...|affected_pathway| byDatatype| reason|2291| 18439| 46211|413172|16148|43920|350813|1.1332343| 1.083513|1.1852372| 6.968556E-8|\n", + "| Negative|affected_pathway| byDatatype| reason| 615| 5576| 46211|413172| 4961|45596|362000|0.9842099|0.9045937|1.0708333| 0.7321643|\n", + "|Business_Administ...|affected_pathway| byDatatype| reason|1317| 11630| 46211|413172|10313|44894|356648|1.0145005|0.9571549|1.0752817| 0.62243146|\n", + "| Invalid_Reason|affected_pathway| byDatatype| reason| 422| 2577| 46211|413172| 2155|45789|364806|1.5601487|1.4049416| 1.732502|2.1676067E-15|\n", + "| Study_Design|affected_pathway| byDatatype| reason| 405| 3166| 46211|413172| 2761|45806|364200| 1.166289|1.0503099| 1.295075| 0.0046313014|\n", + "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n", "only showing top 5 rows\n", "\n", "+--------------------+----------+--------------+--------------+----+---------------+---------------+------+----+------+------+----------+----------+----------+-------------+\n", @@ -56,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 81, "metadata": {}, "outputs": [ { @@ -66,26 +73,26 @@ "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n", "| prediction| comparison|comparisonType|predictionType|or_result_observed|lower_ci_observed|upper_ci_observed|pvalue_observed|or_result_permuted|lower_ci_permuted|upper_ci_permuted|pvalue_permuted|\n", "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.7569264| 0.6600593| 0.8680094| 4.1372376E-5|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.7477239| 0.64919406| 0.86120784| 3.0507868E-5|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.72785324| 0.6332734| 0.83655864| 3.588353E-6|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.7667227| 0.670729| 0.87645483| 6.3741165E-5|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.66826046| 0.57837003| 0.7721216| 9.3277785E-9|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.76111776| 0.66259146| 0.87429476| 7.314266E-5|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.7052045| 0.613212| 0.8109974| 3.5039437E-7|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.7261657| 0.63306683| 0.8329557| 2.3254256E-6|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.76518506| 0.66744256| 0.8772413| 7.778494E-5|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 0.99938464| 0.91808873| 1.0878792| 1.0|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 1.00379| 0.9203073| 1.0948457| 0.92929024|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 0.9924268| 0.9114749| 1.0805684| 0.879578|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 0.9880351| 0.9086467| 1.0743597| 0.7985208|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 0.9493068| 0.87049305| 1.0352563| 0.2506197|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 1.0490831| 0.9641333| 1.1415178| 0.26321337|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 1.0264277| 0.9443765| 1.115608| 0.53365433|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 1.0213302| 0.9400097| 1.1096858| 0.6086663|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 0.9418488| 0.863369| 1.0274624| 0.18574768|\n", - "|Business_Administ...|affected_pathway| byDatatype| reason| 1.0135593| 0.9628919| 1.066893| 0.60830927| 1.019448| 0.9612728| 1.0811441| 0.51626134|\n", - "|Business_Administ...|affected_pathway| byDatatype| reason| 1.0135593| 0.9628919| 1.066893| 0.60830927| 1.0313885| 0.97135615| 1.095131| 0.314432|\n", + "|Insufficient_Enro...|affected_pathway| byDatatype| reason| 1.101286| 1.0580636| 1.1462741| 2.9514772E-6| 1.1332343| 1.083513| 1.1852372| 6.968556E-8|\n", + "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 0.9842099| 0.9045937| 1.0708333| 0.7321643|\n", + "|Business_Administ...|affected_pathway| byDatatype| reason| 1.0135593| 0.9628919| 1.066893| 0.60830927| 1.0145005| 0.9571549| 1.0752817| 0.62243146|\n", + "| Invalid_Reason|affected_pathway| byDatatype| reason| 1.42308| 1.2919756| 1.5674883| 4.5850663E-12| 1.5601487| 1.4049416| 1.732502| 2.1676067E-15|\n", + "| Study_Design|affected_pathway| byDatatype| reason| 1.1850995| 1.080571| 1.2997394| 3.9830338E-4| 1.166289| 1.0503099| 1.295075| 0.0046313014|\n", + "| Safety_Sideeffects|affected_pathway| byDatatype| reason| 1.6450619| 1.5032661| 1.8002325| 5.306468E-25| 1.5841677| 1.4315631| 1.75304| 2.4105537E-17|\n", + "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.82520795| 0.72417057| 0.9403422| 0.0036548723|\n", + "| Study_Staff_Moved|affected_pathway| byDatatype| reason| 1.0708259| 0.9271349| 1.2367867| 0.3469289| 1.1133336| 0.94795865| 1.3075589| 0.19965|\n", + "| No_Context|affected_pathway| byDatatype| reason| 0.685027| 0.5684092| 0.8255707| 4.0033254E-5| 0.82053685| 0.6722922| 1.0014704| 0.053746197|\n", + "| Negative| literature| byDatatype| reason| 0.8895437| 0.84362316| 0.93796384| 1.4934009E-5| 0.9543413| 0.90287566| 1.0087405| 0.10023123|\n", + "|Business_Administ...| literature| byDatatype| reason| 0.96253645| 0.92767036| 0.998713| 0.043106694| 1.0164348| 0.97820574| 1.0561578| 0.40482116|\n", + "|Insufficient_Enro...| literature| byDatatype| reason| 0.9052195| 0.87882197| 0.9324099| 4.2594296E-11| 1.0470023| 1.0154496| 1.0795352| 0.0033233627|\n", + "| Invalid_Reason| literature| byDatatype| reason| 1.0314267| 0.9545476| 1.1144975| 0.44084844| 1.2810183| 1.1842574| 1.3856851| 9.200067E-10|\n", + "| Covid19| literature| byDatatype| reason| 0.8594809| 0.7210249| 1.0245243| 0.09864084| 0.5638818| 0.45956767| 0.69187355| 1.1856974E-8|\n", + "| Study_Design| literature| byDatatype| reason| 0.948647| 0.88451016| 1.0174345| 0.14340925| 1.0617356| 0.98768425| 1.141339| 0.105820864|\n", + "| Safety_Sideeffects| literature| byDatatype| reason| 1.3222082| 1.2257016| 1.4263134| 4.225119E-13| 1.4366288| 1.3318783| 1.549618| 1.605722E-20|\n", + "| Logistics_Resources| literature| byDatatype| reason| 0.76919395| 0.7120554| 0.83091766| 2.2282516E-11| 0.80852926| 0.74461913| 0.87792474| 3.4295346E-7|\n", + "| Study_Staff_Moved| literature| byDatatype| reason| 0.9657753| 0.86894673| 1.0733938| 0.5355322| 0.937642| 0.8390834| 1.0477773| 0.26166993|\n", + "| Another_Study| literature| byDatatype| reason| 0.71158177| 0.64844126| 0.78087044| 5.357373E-13| 0.76391065| 0.69150585| 0.8438966| 7.8315104E-8|\n", + "| Negative| rna_expression| byDatatype| reason| 0.9926047| 0.92027634| 1.0706177| 0.8624524| 1.0168192| 0.9524448| 1.0855448| 0.6150149|\n", "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n", "only showing top 20 rows\n", "\n" @@ -115,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 82, "metadata": {}, "outputs": [ { @@ -125,11 +132,11 @@ "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n", "| prediction| comparison|comparisonType|predictionType|or_result_permuted|lower_ci_permuted|upper_ci_permuted|pvalue_permuted|\n", "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n", - "| Safety_Sideeffects|affected_pathway| byDatatype| reason| 1.6555722| 1.4964652| 1.8315957| 2.1668734E-20|\n", - "|Insufficient_Enro...|affected_pathway| byDatatype| reason| 1.1075613| 1.057939| 1.159511| 1.5909267E-5|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.9418488| 0.863369| 1.0274624| 0.18574768|\n", - "|Business_Administ...|affected_pathway| byDatatype| reason| 1.0479021| 0.98855203| 1.1108155| 0.11557128|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.76518506| 0.66744256| 0.8772413| 7.778494E-5|\n", + "|Insufficient_Enro...|affected_pathway| byDatatype| reason| 1.1332343| 1.083513| 1.1852372| 6.968556E-8|\n", + "| Negative|affected_pathway| byDatatype| reason| 0.9842099| 0.9045937| 1.0708333| 0.7321643|\n", + "|Business_Administ...|affected_pathway| byDatatype| reason| 1.0145005| 0.9571549| 1.0752817| 0.62243146|\n", + "| Invalid_Reason|affected_pathway| byDatatype| reason| 1.5601487| 1.4049416| 1.732502| 2.1676067E-15|\n", + "| Study_Design|affected_pathway| byDatatype| reason| 1.166289| 1.0503099| 1.295075| 0.0046313014|\n", "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n", "only showing top 5 rows\n", "\n" @@ -142,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 83, "metadata": {}, "outputs": [ { @@ -165,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 84, "metadata": {}, "outputs": [ { @@ -174,17 +181,9 @@ "100" ] }, - "execution_count": 7, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23/06/09 14:34:33 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 2438016 ms exceeds timeout 120000 ms\n", - "23/06/09 14:34:33 WARN SparkContext: Killing executors is not supported by current scheduler.\n" - ] } ], "source": [ @@ -207,37 +206,44 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 91, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "+--------------------+----------------+------------------------------+----------------+---------------+---------------+------------------+\n", - "| prediction| comparison|permuted_greater_than_observed|pvalue_empirical|pvalue_observed|pvalue_permuted|or_result_observed|\n", - "+--------------------+----------------+------------------------------+----------------+---------------+---------------+------------------+\n", - "| Safety_Sideeffects| literature| 1| 0.01| 4.225119E-13| 5.7093783E-13| 1.3222082|\n", - "| Safety_Sideeffects| literature| 1| 0.01| 4.225119E-13| 3.3565586E-17| 1.3222082|\n", - "| Safety_Sideeffects| literature| 1| 0.01| 4.225119E-13| 2.538905E-23| 1.3222082|\n", - "| Safety_Sideeffects| literature| 1| 0.01| 4.225119E-13| 1.9547876E-20| 1.3222082|\n", - "| Safety_Sideeffects| literature| 1| 0.01| 4.225119E-13| 3.9431886E-19| 1.3222082|\n", - "| Safety_Sideeffects| literature| 1| 0.01| 4.225119E-13| 4.9306253E-21| 1.3222082|\n", - "| Safety_Sideeffects| literature| 1| 0.01| 4.225119E-13| 5.2965625E-20| 1.3222082|\n", - "| Safety_Sideeffects| literature| 1| 0.01| 4.225119E-13| 7.503974E-21| 1.3222082|\n", - "| Safety_Sideeffects| literature| 1| 0.01| 4.225119E-13| 1.3370162E-20| 1.3222082|\n", - "|Business_Administ...|somatic_mutation| 2| 0.02| 0.5010471| 0.48032835| 1.0204865|\n", - "|Business_Administ...|somatic_mutation| 2| 0.02| 0.5010471| 0.4706476| 1.0204865|\n", - "|Business_Administ...|somatic_mutation| 2| 0.02| 0.5010471| 0.15366814| 1.0204865|\n", - "|Business_Administ...|somatic_mutation| 2| 0.02| 0.5010471| 0.040951073| 1.0204865|\n", - "|Business_Administ...|somatic_mutation| 2| 0.02| 0.5010471| 0.886372| 1.0204865|\n", - "|Business_Administ...|somatic_mutation| 2| 0.02| 0.5010471| 0.5447119| 1.0204865|\n", - "|Business_Administ...|somatic_mutation| 2| 0.02| 0.5010471| 0.051056553| 1.0204865|\n", - "|Business_Administ...|somatic_mutation| 2| 0.02| 0.5010471| 0.3950442| 1.0204865|\n", - "|Business_Administ...|somatic_mutation| 2| 0.02| 0.5010471| 0.48515812| 1.0204865|\n", - "| Another_Study|affected_pathway| 4| 0.04| 0.62539124| 0.24647969| 0.96532875|\n", - "| Another_Study|affected_pathway| 4| 0.04| 0.62539124| 0.64555955| 0.96532875|\n", - "+--------------------+----------------+------------------------------+----------------+---------------+---------------+------------------+\n", + "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n", + "| prediction| comparison|permuted_greater_than_observed|pvalue_empirical|pvalue_observed|or_result_observed|\n", + "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n", + "| Safety_Sideeffects| literature| 2| 0.02| 4.225119E-13| 1.3222082|\n", + "|Business_Administ...| somatic_mutation| 22| 0.22| 0.5010471| 1.0204865|\n", + "| Another_Study| affected_pathway| 47| 0.47| 0.62539124| 0.96532875|\n", + "| Logistics_Resources| somatic_mutation| 82| 0.82| 0.13673647| 0.90625197|\n", + "|Business_Administ...| rna_expression| 59| 0.59| 0.37541625| 0.976|\n", + "| Negative| somatic_mutation| 100| 1.0| 6.9009767E-9| 1.2699835|\n", + "|Business_Administ...| known_drug| 0| 0.0| 1.0| 0.028963346|\n", + "| Invalid_Reason|genetic_association| 97| 0.97| 0.024117803| 0.83866763|\n", + "| Regulatory| rna_expression| 100| 1.0| 1.3482296E-9| 0.54306966|\n", + "|Insufficient_Enro...| affected_pathway| 34| 0.34| 2.9514772E-6| 1.101286|\n", + "| No_Context| affected_pathway| 97| 0.97| 4.0033254E-5| 0.685027|\n", + "|Insufficient_Enro...|genetic_association| 43| 0.43| 0.0| 0.60888106|\n", + "| Invalid_Reason| known_drug| 0| 0.0| 1.0| 0.006276258|\n", + "| Interim_Analysis| literature| 9| 0.09| 0.10653863| 0.7150094|\n", + "| Invalid_Reason| animal_model| 87| 0.87| 0.065790154| 0.8683092|\n", + "|Insufficient_Enro...| animal_model| 93| 0.93| 6.694221E-29| 0.71514577|\n", + "|Business_Administ...| affected_pathway| 28| 0.28| 0.60830927| 1.0135593|\n", + "| Study_Design| literature| 84| 0.84| 0.14340925| 0.948647|\n", + "| Interim_Analysis| rna_expression| 55| 0.55| 0.042635407| 0.46051452|\n", + "| Success| literature| 1| 0.01| 0.69708806| 1.2096671|\n", + "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n", "only showing top 20 rows\n", "\n" ] @@ -247,7 +253,8 @@ "corrected_combined = (\n", " calculate_empirical_pvalue(combined)\n", " # .select(\"permuted_greater_than_observed\").distinct()\n", - " .join(combined.select(\"prediction\", \"comparison\", \"pvalue_observed\", \"pvalue_permuted\", \"or_result_observed\"), on=[\"prediction\", \"comparison\"], how=\"inner\")\n", + " .join(combined.select(\"prediction\", \"comparison\", \"pvalue_observed\", \"or_result_observed\"), on=[\"prediction\", \"comparison\"], how=\"inner\")\n", + " .distinct()\n", ")\n", "\n", "corrected_combined.show()" @@ -255,72 +262,66 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 92, "metadata": {}, "outputs": [ { - "name": "stdout", + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "corrected_combined.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stderr", "output_type": "stream", "text": [ - "The following packages are already present in the pyproject.toml and will be skipped:\n", - "\n", - " • \u001b[36mmatplotlib\u001b[39m\n", - "\n", - "If you want to update it to the latest compatible version, you can use `poetry update package`.\n", - "If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.\n", - "\n", - "Nothing to add.\n", - "The following packages are already present in the pyproject.toml and will be skipped:\n", - "\n", - " • \u001b[36mplotly.express\u001b[39m\n", - "\n", - "If you want to update it to the latest compatible version, you can use `poetry update package`.\n", - "If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.\n", - "\n", - "Nothing to add.\n", - "The following packages are already present in the pyproject.toml and will be skipped:\n", - "\n", - " • \u001b[36mnbformat\u001b[39m\n", - "\n", - "If you want to update it to the latest compatible version, you can use `poetry update package`.\n", - "If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.\n", - "\n", - "Nothing to add.\n", - "Using version \u001b[39;1m^1.1.0\u001b[39;22m for \u001b[36mchart-studio\u001b[39m\n", - "\n", - "\u001b[34mUpdating dependencies\u001b[39m\n", - "\u001b[2K\u001b[34mResolving dependencies...\u001b[39m \u001b[39;2m(4.1s)\u001b[39;22m\n", - "\n", - "\u001b[34mWriting lock file\u001b[39m\n", - "\n", - "\u001b[39;1mPackage operations\u001b[39;22m: \u001b[34m2\u001b[39m installs, \u001b[34m0\u001b[39m updates, \u001b[34m0\u001b[39m removals\n", - "\n", - " \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mretrying\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.3.4\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mPending...\u001b[39m\n", - "\u001b[1A\u001b[0J \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mretrying\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.3.4\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mDownloading...\u001b[39m \u001b[39;1m0%\u001b[39;22m\n", - "\u001b[1A\u001b[0J \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mretrying\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.3.4\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mDownloading...\u001b[39m \u001b[39;1m100%\u001b[39;22m\n", - "\u001b[1A\u001b[0J \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mretrying\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.3.4\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mInstalling...\u001b[39m\n", - "\u001b[1A\u001b[0J \u001b[32;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mretrying\u001b[39m\u001b[39m (\u001b[39m\u001b[32m1.3.4\u001b[39m\u001b[39m)\u001b[39m\n", - " \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mchart-studio\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.1.0\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mPending...\u001b[39m\n", - "\u001b[1A\u001b[0J \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mchart-studio\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.1.0\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mDownloading...\u001b[39m \u001b[39;1m0%\u001b[39;22m\n", - "\u001b[1A\u001b[0J \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mchart-studio\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.1.0\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mDownloading...\u001b[39m \u001b[39;1m100%\u001b[39;22m\n", - "\u001b[1A\u001b[0J \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mchart-studio\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.1.0\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mInstalling...\u001b[39m\n", - "\u001b[1A\u001b[0J \u001b[32;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mchart-studio\u001b[39m\u001b[39m (\u001b[39m\u001b[32m1.1.0\u001b[39m\u001b[39m)\u001b[39m\n", - "\n", - "\u001b[31;1mCould not find a matching version of package chart_studio.plotly\u001b[39;22m\n" + " \r" ] + }, + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" } ], + "source": [ + "corrected_combined.select(\"prediction\", \"comparison\").distinct().count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!poetry add matplotlib\n", "!poetry add plotly.express\n", - "!poetry add nbformat\n", "!poetry add chart_studio\n", "!poetry add chart_studio.plotly\n" ] }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 95, "metadata": {}, "outputs": [ { @@ -329,7 +330,7 @@ "'https://plotly.com/~irenelopezs/63/'" ] }, - "execution_count": 79, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } @@ -347,9 +348,16 @@ "\n", "pdf = corrected_combined.toPandas()\n", "pdf['or_result_observed'] = pdf['or_result_observed'].astype(str)\n", + "pdf['significance'] = 'Remains Significant'\n", + "pdf.loc[(pdf['pvalue_observed'] < 0.05) & (pdf['pvalue_empirical'] >= 0.05), 'significance'] = 'No Longer Significant'\n", + "pdf.loc[(pdf['pvalue_observed'] >= 0.05) & (pdf['pvalue_empirical'] < 0.05), 'significance'] = 'Now Significant'\n", + "pdf.loc[(pdf['pvalue_observed'] >= 0.05) & (pdf['pvalue_empirical'] >= 0.05), 'significance'] = 'Never Significant'\n", + "\n", "\n", "fig = px.scatter(pdf, x=\"pvalue_observed\", y=\"pvalue_empirical\",\n", " hover_data=[\"comparison\", \"prediction\", \"or_result_observed\"],\n", + " color='significance',\n", + " color_discrete_sequence=[\"green\", \"red\", \"blue\", \"black\"],\n", " title=\"Comparison of Observed and Empirical P-Values\")\n", "fig.add_shape(type=\"line\", \n", " x0=pdf['pvalue_observed'].min(), x1=pdf['pvalue_observed'].max(), \n", @@ -364,6 +372,407 @@ "\n", "py.plot(fig, filename=\"stop_reasons_pvalue_scatter\", auto_open=True)\n" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## P value correction for false discovery rate" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
predictioncomparisoncomparisonTypepredictionTypeapredictionTotalcomparisonTotaltotalbcdor_resultlower_ciupper_cipvalue
0Insufficient_EnrollmenteuropepmcbyDatasourcereason86151843920284941317298241942342004990.9052190.8788220.9324104.259430e-11
1Business_AdministrativeeuropepmcbyDatasourcereason56021163020284941317260281972472042950.9625360.9276700.9987134.310669e-02
2NegativeeuropepmcbyDatasourcereason2577557620284941317229992002722073240.8895440.8436230.9379641.493401e-05
3Invalid_ReasoneuropepmcbyDatasourcereason1285257720284941317212922015642090311.0314270.9545481.1144984.408484e-01
4Covid19europepmcbyDatasourcereason2285032028494131722752026212100480.8594810.7210251.0245249.864084e-02
\n", + "
" + ], + "text/plain": [ + " prediction comparison comparisonType predictionType a \\\n", + "0 Insufficient_Enrollment europepmc byDatasource reason 8615 \n", + "1 Business_Administrative europepmc byDatasource reason 5602 \n", + "2 Negative europepmc byDatasource reason 2577 \n", + "3 Invalid_Reason europepmc byDatasource reason 1285 \n", + "4 Covid19 europepmc byDatasource reason 228 \n", + "\n", + " predictionTotal comparisonTotal total b c d or_result \\\n", + "0 18439 202849 413172 9824 194234 200499 0.905219 \n", + "1 11630 202849 413172 6028 197247 204295 0.962536 \n", + "2 5576 202849 413172 2999 200272 207324 0.889544 \n", + "3 2577 202849 413172 1292 201564 209031 1.031427 \n", + "4 503 202849 413172 275 202621 210048 0.859481 \n", + "\n", + " lower_ci upper_ci pvalue \n", + "0 0.878822 0.932410 4.259430e-11 \n", + "1 0.927670 0.998713 4.310669e-02 \n", + "2 0.843623 0.937964 1.493401e-05 \n", + "3 0.954548 1.114498 4.408484e-01 \n", + "4 0.721025 1.024524 9.864084e-02 " + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "observed = spark.read.parquet(\"/Users/irenelopez/MEGAsync/EBI/repos/stopReasons/data/predictions_aggregations_all\").toPandas()\n", + "\n", + "observed.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
predictioncomparisoncomparisonTypepredictionTypeapredictionTotalcomparisonTotaltotalbcdor_resultlower_ciupper_cipvaluerejectspvalue_corrected
0Insufficient_EnrollmenteuropepmcbyDatasourcereason86151843920284941317298241942342004990.9052190.8788220.9324104.259430e-11True1.723030e-10
1Business_AdministrativeeuropepmcbyDatasourcereason56021163020284941317260281972472042950.9625360.9276700.9987134.310669e-02False7.268878e-02
2NegativeeuropepmcbyDatasourcereason2577557620284941317229992002722073240.8895440.8436230.9379641.493401e-05True4.078030e-05
3Invalid_ReasoneuropepmcbyDatasourcereason1285257720284941317212922015642090311.0314270.9545481.1144984.408484e-01False5.564568e-01
4Covid19europepmcbyDatasourcereason2285032028494131722752026212100480.8594810.7210251.0245249.864084e-02False1.523175e-01
\n", + "
" + ], + "text/plain": [ + " prediction comparison comparisonType predictionType a \\\n", + "0 Insufficient_Enrollment europepmc byDatasource reason 8615 \n", + "1 Business_Administrative europepmc byDatasource reason 5602 \n", + "2 Negative europepmc byDatasource reason 2577 \n", + "3 Invalid_Reason europepmc byDatasource reason 1285 \n", + "4 Covid19 europepmc byDatasource reason 228 \n", + "\n", + " predictionTotal comparisonTotal total b c d or_result \\\n", + "0 18439 202849 413172 9824 194234 200499 0.905219 \n", + "1 11630 202849 413172 6028 197247 204295 0.962536 \n", + "2 5576 202849 413172 2999 200272 207324 0.889544 \n", + "3 2577 202849 413172 1292 201564 209031 1.031427 \n", + "4 503 202849 413172 275 202621 210048 0.859481 \n", + "\n", + " lower_ci upper_ci pvalue rejects pvalue_corrected \n", + "0 0.878822 0.932410 4.259430e-11 True 1.723030e-10 \n", + "1 0.927670 0.998713 4.310669e-02 False 7.268878e-02 \n", + "2 0.843623 0.937964 1.493401e-05 True 4.078030e-05 \n", + "3 0.954548 1.114498 4.408484e-01 False 5.564568e-01 \n", + "4 0.721025 1.024524 9.864084e-02 False 1.523175e-01 " + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from statsmodels.stats.multitest import fdrcorrection\n", + "\n", + "pvals = observed['pvalue'].values\n", + "\n", + "rejected, pvals_corrected = fdrcorrection(pvals, alpha=0.05, method=\"indep\", is_sorted=False)\n", + "observed['rejects'] = rejected\n", + "observed[\"pvalue_corrected\"] = pvals_corrected\n", + "\n", + "observed.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "observed[\"significance\"] = \"Remains Significant\"\n", + "observed.loc[(pdf[\"pvalue\"] < 0.05) & (pdf[\"pvalue_corrected\"] >= 0.05), \"significance\"] = \"No Longer Significant\"\n", + "observed.loc[(pdf[\"pvalue_observed\"] >= 0.05) & (pdf[\"pvalue_corrected\"] < 0.05), \"significance\"] = \"Now Significant\"\n", + "observed.loc[(pdf[\"pvalue_observed\"] >= 0.05) & (pdf[\"pvalue_corrected\"] >= 0.05), \"significance\"] = \"Never Significant\"\n", + "\n", + "\n", + "fig = px.scatter(pdf, x=\"pvalue_observed\", y=\"pvalue_empirical\",\n", + " hover_data=[\"comparison\", \"prediction\", \"or_result_observed\"],\n", + " color=\"significance\",\n", + " color_discrete_sequence=[\"green\", \"red\", \"blue\", \"black\"],\n", + " title=\"Comparison of Observed and Empirical P-Values\")\n", + "fig.add_shape(type=\"line\", \n", + " x0=pdf[\"pvalue_observed\"].min(), x1=pdf[\"pvalue_observed\"].max(), \n", + " y0=0.05, y1=0.05,\n", + " line=dict(color=\"Red\", dash=\"dash\"))\n", + "\n", + "fig.add_shape(type=\"line\", \n", + " x0=0.05, x1=0.05,\n", + " y0=pdf[\"pvalue_empirical\"].min(), y1=pdf[\"pvalue_empirical\"].max(), \n", + " line=dict(color=\"Red\", dash=\"dash\"))\n", + "fig.update_layout(xaxis_title=\"Observed P-Values\", yaxis_title=\"Empirical P-Values\")\n", + "\n", + "py.plot(fig, filename=\"stop_reasons_pvalue_scatter\", auto_open=True)" + ] } ], "metadata": { diff --git a/analysis/python/multiple_testing_correction.py b/analysis/python/multiple_testing_correction.py new file mode 100644 index 0000000..54be37d --- /dev/null +++ b/analysis/python/multiple_testing_correction.py @@ -0,0 +1,40 @@ +"""Applying Benjamini/Hochberg correction to account for false discovery rate.""" + +import typer +from pyspark.sql import SparkSession +from statsmodels.stats.multitest import fdrcorrection + + +def apply_fdr_correction(df): + """Apply Benjamini/Hochberg correction to account for false discovery rate.""" + pvals = df["pvalue"].values + _, pvals_corrected = fdrcorrection( + pvals, alpha=0.05, method="indep", is_sorted=False + ) + df["pvalue_corrected"] = pvals_corrected + return df + + +def main(data_path): + """Apply Benjamini/Hochberg correction to account for false discovery rate in every predictionType/comparisonType pair. + + Args: + data_path (str): Path to the parquet file containing the results of the enrichment analysis. + """ + spark = SparkSession.builder.getOrCreate() + enrichments = spark.read.parquet(data_path).toPandas() + + enrichments = enrichments.groupby(["predictionType", "comparisonType"]).apply( + apply_fdr_correction + ) + output_data_path = f"{data_path}_corrected" + ( + spark.createDataFrame(enrichments) + .write.mode("overwrite") + .parquet(output_data_path) + ) + print(f"Adjusted data saved to {output_data_path}.") + + +if __name__ == "__main__": + typer.run(main) From 88c7b94b5f0a355de8b10e90e52c63fe9a9d5a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Thu, 15 Jun 2023 19:40:07 +0100 Subject: [PATCH 2/6] feat: add plot_pvalue_scatter --- .../python/multiple_testing_correction.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/analysis/python/multiple_testing_correction.py b/analysis/python/multiple_testing_correction.py index 54be37d..79fe22e 100644 --- a/analysis/python/multiple_testing_correction.py +++ b/analysis/python/multiple_testing_correction.py @@ -15,6 +15,66 @@ def apply_fdr_correction(df): return df +def plot_pvalue_scatter(df, figure_name, username, api_key): + """Plot a scatter plot of the observed p-values vs. the adjusted p-values after multiple testing correction. + + The plot is saved to the plotly cloud and can be accessed via the link in the terminal output. + + Args: + df (pandas.DataFrame): DataFrame containing the results of the enrichment analysis. + figure_name (str): Name of the plotly figure. + username (str): Username for the plotly account. + api_key (str): API key for the plotly account. + """ + import chart_studio + import chart_studio.plotly as py + import plotly.express as px + + chart_studio.tools.set_credentials_file(username=username, api_key=api_key) + + df["significance"] = "Remains Significant" + df.loc[ + (df["pvalue"] < 0.05) & (df["pvalue_corrected"] >= 0.05), "significance" + ] = "No Longer Significant" + df.loc[ + (df["pvalue"] >= 0.05) & (df["pvalue_corrected"] < 0.05), "significance" + ] = "Now Significant" + df.loc[ + (df["pvalue"] >= 0.05) & (df["pvalue_corrected"] >= 0.05), "significance" + ] = "Never Significant" + + fig = px.scatter( + df, + x="pvalue", + y="pvalue_corrected", + hover_data=["comparison", "prediction", "or_result"], + color="significance", + color_discrete_sequence=["green", "red", "blue", "black"], + log_x=True, + log_y=True, + title="Comparison of df and Empirical P-Values", + ) + fig.add_shape( + type="line", + x0=df["pvalue"].min(), + x1=df["pvalue"].max(), + y0=0.05, + y1=0.05, + line=dict(color="Red", dash="dash"), + ) + + fig.add_shape( + type="line", + x0=0.05, + x1=0.05, + y0=df["pvalue_corrected"].min(), + y1=df["pvalue_corrected"].max(), + line=dict(color="Red", dash="dash"), + ) + fig.update_layout(xaxis_title="Observed P-Values", yaxis_title="Corrected P-Values") + py.plot(fig, filename=figure_name, auto_open=True) + + def main(data_path): """Apply Benjamini/Hochberg correction to account for false discovery rate in every predictionType/comparisonType pair. From ffadc5b8134de2a6179d3c21908089097bbac560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Thu, 15 Jun 2023 19:43:52 +0100 Subject: [PATCH 3/6] fix: remove compare_baseline.ipynb from git tracking --- analysis/python/compare_baseline.ipynb | 800 ------------------------- 1 file changed, 800 deletions(-) delete mode 100644 analysis/python/compare_baseline.ipynb diff --git a/analysis/python/compare_baseline.ipynb b/analysis/python/compare_baseline.ipynb deleted file mode 100644 index 3cc22e8..0000000 --- a/analysis/python/compare_baseline.ipynb +++ /dev/null @@ -1,800 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql import SparkSession, Window\n", - "import pyspark.sql.functions as F\n", - "\n", - "spark = SparkSession.builder.getOrCreate()" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n", - "| prediction| comparison|comparisonType|predictionType| a|predictionTotal|comparisonTotal| total| b| c| d|or_result| lower_ci| upper_ci| pvalue|\n", - "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n", - "|Insufficient_Enro...|affected_pathway| byDatatype| reason|2291| 18439| 46211|413172|16148|43920|350813|1.1332343| 1.083513|1.1852372| 6.968556E-8|\n", - "| Negative|affected_pathway| byDatatype| reason| 615| 5576| 46211|413172| 4961|45596|362000|0.9842099|0.9045937|1.0708333| 0.7321643|\n", - "|Business_Administ...|affected_pathway| byDatatype| reason|1317| 11630| 46211|413172|10313|44894|356648|1.0145005|0.9571549|1.0752817| 0.62243146|\n", - "| Invalid_Reason|affected_pathway| byDatatype| reason| 422| 2577| 46211|413172| 2155|45789|364806|1.5601487|1.4049416| 1.732502|2.1676067E-15|\n", - "| Study_Design|affected_pathway| byDatatype| reason| 405| 3166| 46211|413172| 2761|45806|364200| 1.166289|1.0503099| 1.295075| 0.0046313014|\n", - "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n", - "only showing top 5 rows\n", - "\n", - "+--------------------+----------+--------------+--------------+----+---------------+---------------+------+----+------+------+----------+----------+----------+-------------+\n", - "| prediction|comparison|comparisonType|predictionType| a|predictionTotal|comparisonTotal| total| b| c| d| or_result| lower_ci| upper_ci| pvalue|\n", - "+--------------------+----------+--------------+--------------+----+---------------+---------------+------+----+------+------+----------+----------+----------+-------------+\n", - "|Insufficient_Enro...| europepmc| byDatasource| reason|8615| 18439| 202849|413172|9824|194234|200499| 0.9052195|0.87882197| 0.9324099|4.2594296E-11|\n", - "|Business_Administ...| europepmc| byDatasource| reason|5602| 11630| 202849|413172|6028|197247|204295|0.96253645|0.92767036| 0.998713| 0.043106694|\n", - "| Negative| europepmc| byDatasource| reason|2577| 5576| 202849|413172|2999|200272|207324| 0.8895437|0.84362316|0.93796384| 1.4934009E-5|\n", - "| Invalid_Reason| europepmc| byDatasource| reason|1285| 2577| 202849|413172|1292|201564|209031| 1.0314267| 0.9545476| 1.1144975| 0.44084844|\n", - "| Covid19| europepmc| byDatasource| reason| 228| 503| 202849|413172| 275|202621|210048| 0.8594809| 0.7210249| 1.0245243| 0.09864084|\n", - "+--------------------+----------+--------------+--------------+----+---------------+---------------+------+----+------+------+----------+----------+----------+-------------+\n", - "only showing top 5 rows\n", - "\n" - ] - } - ], - "source": [ - "permuted = spark.read.parquet(\"/Users/irenelopez/MEGAsync/EBI/repos/stopReasons/data/baseline_predictions_aggregations/*\")\n", - "observed = spark.read.parquet(\"/Users/irenelopez/MEGAsync/EBI/repos/stopReasons/data/predictions_aggregations_all\")\n", - "\n", - "permuted.show(5)\n", - "observed.show(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n", - "| prediction| comparison|comparisonType|predictionType|or_result_observed|lower_ci_observed|upper_ci_observed|pvalue_observed|or_result_permuted|lower_ci_permuted|upper_ci_permuted|pvalue_permuted|\n", - "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n", - "|Insufficient_Enro...|affected_pathway| byDatatype| reason| 1.101286| 1.0580636| 1.1462741| 2.9514772E-6| 1.1332343| 1.083513| 1.1852372| 6.968556E-8|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304| 0.9842099| 0.9045937| 1.0708333| 0.7321643|\n", - "|Business_Administ...|affected_pathway| byDatatype| reason| 1.0135593| 0.9628919| 1.066893| 0.60830927| 1.0145005| 0.9571549| 1.0752817| 0.62243146|\n", - "| Invalid_Reason|affected_pathway| byDatatype| reason| 1.42308| 1.2919756| 1.5674883| 4.5850663E-12| 1.5601487| 1.4049416| 1.732502| 2.1676067E-15|\n", - "| Study_Design|affected_pathway| byDatatype| reason| 1.1850995| 1.080571| 1.2997394| 3.9830338E-4| 1.166289| 1.0503099| 1.295075| 0.0046313014|\n", - "| Safety_Sideeffects|affected_pathway| byDatatype| reason| 1.6450619| 1.5032661| 1.8002325| 5.306468E-25| 1.5841677| 1.4315631| 1.75304| 2.4105537E-17|\n", - "| Logistics_Resources|affected_pathway| byDatatype| reason| 0.78194696| 0.695937| 0.8785868| 2.4237832E-5| 0.82520795| 0.72417057| 0.9403422| 0.0036548723|\n", - "| Study_Staff_Moved|affected_pathway| byDatatype| reason| 1.0708259| 0.9271349| 1.2367867| 0.3469289| 1.1133336| 0.94795865| 1.3075589| 0.19965|\n", - "| No_Context|affected_pathway| byDatatype| reason| 0.685027| 0.5684092| 0.8255707| 4.0033254E-5| 0.82053685| 0.6722922| 1.0014704| 0.053746197|\n", - "| Negative| literature| byDatatype| reason| 0.8895437| 0.84362316| 0.93796384| 1.4934009E-5| 0.9543413| 0.90287566| 1.0087405| 0.10023123|\n", - "|Business_Administ...| literature| byDatatype| reason| 0.96253645| 0.92767036| 0.998713| 0.043106694| 1.0164348| 0.97820574| 1.0561578| 0.40482116|\n", - "|Insufficient_Enro...| literature| byDatatype| reason| 0.9052195| 0.87882197| 0.9324099| 4.2594296E-11| 1.0470023| 1.0154496| 1.0795352| 0.0033233627|\n", - "| Invalid_Reason| literature| byDatatype| reason| 1.0314267| 0.9545476| 1.1144975| 0.44084844| 1.2810183| 1.1842574| 1.3856851| 9.200067E-10|\n", - "| Covid19| literature| byDatatype| reason| 0.8594809| 0.7210249| 1.0245243| 0.09864084| 0.5638818| 0.45956767| 0.69187355| 1.1856974E-8|\n", - "| Study_Design| literature| byDatatype| reason| 0.948647| 0.88451016| 1.0174345| 0.14340925| 1.0617356| 0.98768425| 1.141339| 0.105820864|\n", - "| Safety_Sideeffects| literature| byDatatype| reason| 1.3222082| 1.2257016| 1.4263134| 4.225119E-13| 1.4366288| 1.3318783| 1.549618| 1.605722E-20|\n", - "| Logistics_Resources| literature| byDatatype| reason| 0.76919395| 0.7120554| 0.83091766| 2.2282516E-11| 0.80852926| 0.74461913| 0.87792474| 3.4295346E-7|\n", - "| Study_Staff_Moved| literature| byDatatype| reason| 0.9657753| 0.86894673| 1.0733938| 0.5355322| 0.937642| 0.8390834| 1.0477773| 0.26166993|\n", - "| Another_Study| literature| byDatatype| reason| 0.71158177| 0.64844126| 0.78087044| 5.357373E-13| 0.76391065| 0.69150585| 0.8438966| 7.8315104E-8|\n", - "| Negative| rna_expression| byDatatype| reason| 0.9926047| 0.92027634| 1.0706177| 0.8624524| 1.0168192| 0.9524448| 1.0855448| 0.6150149|\n", - "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], - "source": [ - "# reshape data before joining\n", - "\n", - "common_cols = [\"prediction\", \"comparison\", \"comparisonType\", \"predictionType\"]\n", - "results_cols = [\"or_result\", \"lower_ci\", \"upper_ci\", \"pvalue\"]\n", - "\n", - "\n", - "permuted = permuted.select(\n", - " common_cols + [F.col(c).alias(f\"{c}_permuted\") for c in results_cols]\n", - ")\n", - "\n", - "observed = observed.select(\n", - " common_cols + [F.col(c).alias(f\"{c}_observed\") for c in results_cols]\n", - ")\n", - "\n", - "combined = observed.join(permuted, on=common_cols, how=\"inner\")\n", - "\n", - "combined.show()\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n", - "| prediction| comparison|comparisonType|predictionType|or_result_permuted|lower_ci_permuted|upper_ci_permuted|pvalue_permuted|\n", - "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n", - "|Insufficient_Enro...|affected_pathway| byDatatype| reason| 1.1332343| 1.083513| 1.1852372| 6.968556E-8|\n", - "| Negative|affected_pathway| byDatatype| reason| 0.9842099| 0.9045937| 1.0708333| 0.7321643|\n", - "|Business_Administ...|affected_pathway| byDatatype| reason| 1.0145005| 0.9571549| 1.0752817| 0.62243146|\n", - "| Invalid_Reason|affected_pathway| byDatatype| reason| 1.5601487| 1.4049416| 1.732502| 2.1676067E-15|\n", - "| Study_Design|affected_pathway| byDatatype| reason| 1.166289| 1.0503099| 1.295075| 0.0046313014|\n", - "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n", - "only showing top 5 rows\n", - "\n" - ] - } - ], - "source": [ - "permuted.show(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n", - "|prediction| comparison|comparisonType|predictionType|or_result_observed|lower_ci_observed|upper_ci_observed|pvalue_observed|\n", - "+----------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n", - "| Negative|affected_pathway| byDatatype| reason| 0.99127966| 0.92052126| 1.0674771| 0.83585304|\n", - "| Negative|affected_pathway| byDatatype| metareason| 0.99127966| 0.92052126| 1.0674771| 0.83585304|\n", - "+----------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n", - "\n" - ] - } - ], - "source": [ - "observed.filter(F.col(\"prediction\") == \"Negative\").filter(F.col(\"comparison\") == \"affected_pathway\").show(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "100" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "combined.select(\"prediction\", \"comparison\").distinct().count()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_empirical_pvalue(df):\n", - " return(\n", - " df.withColumn(\"permuted_greater_than_observed\", F.when(F.col(\"pvalue_permuted\") > F.col(\"pvalue_observed\"), F.lit(1)).otherwise(F.lit(0)))\n", - " .groupBy(\"comparison\", \"prediction\").agg(F.sum(\"permuted_greater_than_observed\").alias(\"permuted_greater_than_observed\"))\n", - " .withColumn(\"pvalue_empirical\", F.col(\"permuted_greater_than_observed\") / 100)\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 91, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n", - "| prediction| comparison|permuted_greater_than_observed|pvalue_empirical|pvalue_observed|or_result_observed|\n", - "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n", - "| Safety_Sideeffects| literature| 2| 0.02| 4.225119E-13| 1.3222082|\n", - "|Business_Administ...| somatic_mutation| 22| 0.22| 0.5010471| 1.0204865|\n", - "| Another_Study| affected_pathway| 47| 0.47| 0.62539124| 0.96532875|\n", - "| Logistics_Resources| somatic_mutation| 82| 0.82| 0.13673647| 0.90625197|\n", - "|Business_Administ...| rna_expression| 59| 0.59| 0.37541625| 0.976|\n", - "| Negative| somatic_mutation| 100| 1.0| 6.9009767E-9| 1.2699835|\n", - "|Business_Administ...| known_drug| 0| 0.0| 1.0| 0.028963346|\n", - "| Invalid_Reason|genetic_association| 97| 0.97| 0.024117803| 0.83866763|\n", - "| Regulatory| rna_expression| 100| 1.0| 1.3482296E-9| 0.54306966|\n", - "|Insufficient_Enro...| affected_pathway| 34| 0.34| 2.9514772E-6| 1.101286|\n", - "| No_Context| affected_pathway| 97| 0.97| 4.0033254E-5| 0.685027|\n", - "|Insufficient_Enro...|genetic_association| 43| 0.43| 0.0| 0.60888106|\n", - "| Invalid_Reason| known_drug| 0| 0.0| 1.0| 0.006276258|\n", - "| Interim_Analysis| literature| 9| 0.09| 0.10653863| 0.7150094|\n", - "| Invalid_Reason| animal_model| 87| 0.87| 0.065790154| 0.8683092|\n", - "|Insufficient_Enro...| animal_model| 93| 0.93| 6.694221E-29| 0.71514577|\n", - "|Business_Administ...| affected_pathway| 28| 0.28| 0.60830927| 1.0135593|\n", - "| Study_Design| literature| 84| 0.84| 0.14340925| 0.948647|\n", - "| Interim_Analysis| rna_expression| 55| 0.55| 0.042635407| 0.46051452|\n", - "| Success| literature| 1| 0.01| 0.69708806| 1.2096671|\n", - "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], - "source": [ - "corrected_combined = (\n", - " calculate_empirical_pvalue(combined)\n", - " # .select(\"permuted_greater_than_observed\").distinct()\n", - " .join(combined.select(\"prediction\", \"comparison\", \"pvalue_observed\", \"or_result_observed\"), on=[\"prediction\", \"comparison\"], how=\"inner\")\n", - " .distinct()\n", - ")\n", - "\n", - "corrected_combined.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "100" - ] - }, - "execution_count": 92, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "corrected_combined.count()" - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "data": { - "text/plain": [ - "100" - ] - }, - "execution_count": 90, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "corrected_combined.select(\"prediction\", \"comparison\").distinct().count()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!poetry add matplotlib\n", - "!poetry add plotly.express\n", - "!poetry add chart_studio\n", - "!poetry add chart_studio.plotly\n" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'https://plotly.com/~irenelopezs/63/'" - ] - }, - "execution_count": 95, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import chart_studio\n", - "import chart_studio.plotly as py\n", - "import plotly.express as px\n", - "\n", - "\n", - "username = 'irenelopezs'\n", - "api_key = 'RaERO2wbBPVpLGiNNL1y'\n", - "\n", - "chart_studio.tools.set_credentials_file(username=username, api_key=api_key)\n", - "\n", - "pdf = corrected_combined.toPandas()\n", - "pdf['or_result_observed'] = pdf['or_result_observed'].astype(str)\n", - "pdf['significance'] = 'Remains Significant'\n", - "pdf.loc[(pdf['pvalue_observed'] < 0.05) & (pdf['pvalue_empirical'] >= 0.05), 'significance'] = 'No Longer Significant'\n", - "pdf.loc[(pdf['pvalue_observed'] >= 0.05) & (pdf['pvalue_empirical'] < 0.05), 'significance'] = 'Now Significant'\n", - "pdf.loc[(pdf['pvalue_observed'] >= 0.05) & (pdf['pvalue_empirical'] >= 0.05), 'significance'] = 'Never Significant'\n", - "\n", - "\n", - "fig = px.scatter(pdf, x=\"pvalue_observed\", y=\"pvalue_empirical\",\n", - " hover_data=[\"comparison\", \"prediction\", \"or_result_observed\"],\n", - " color='significance',\n", - " color_discrete_sequence=[\"green\", \"red\", \"blue\", \"black\"],\n", - " title=\"Comparison of Observed and Empirical P-Values\")\n", - "fig.add_shape(type=\"line\", \n", - " x0=pdf['pvalue_observed'].min(), x1=pdf['pvalue_observed'].max(), \n", - " y0=0.05, y1=0.05,\n", - " line=dict(color=\"Red\", dash=\"dash\"))\n", - "\n", - "fig.add_shape(type=\"line\", \n", - " x0=0.05, x1=0.05,\n", - " y0=pdf['pvalue_empirical'].min(), y1=pdf['pvalue_empirical'].max(), \n", - " line=dict(color=\"Red\", dash=\"dash\"))\n", - "fig.update_layout(xaxis_title=\"Observed P-Values\", yaxis_title=\"Empirical P-Values\")\n", - "\n", - "py.plot(fig, filename=\"stop_reasons_pvalue_scatter\", auto_open=True)\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## P value correction for false discovery rate" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predictioncomparisoncomparisonTypepredictionTypeapredictionTotalcomparisonTotaltotalbcdor_resultlower_ciupper_cipvalue
0Insufficient_EnrollmenteuropepmcbyDatasourcereason86151843920284941317298241942342004990.9052190.8788220.9324104.259430e-11
1Business_AdministrativeeuropepmcbyDatasourcereason56021163020284941317260281972472042950.9625360.9276700.9987134.310669e-02
2NegativeeuropepmcbyDatasourcereason2577557620284941317229992002722073240.8895440.8436230.9379641.493401e-05
3Invalid_ReasoneuropepmcbyDatasourcereason1285257720284941317212922015642090311.0314270.9545481.1144984.408484e-01
4Covid19europepmcbyDatasourcereason2285032028494131722752026212100480.8594810.7210251.0245249.864084e-02
\n", - "
" - ], - "text/plain": [ - " prediction comparison comparisonType predictionType a \\\n", - "0 Insufficient_Enrollment europepmc byDatasource reason 8615 \n", - "1 Business_Administrative europepmc byDatasource reason 5602 \n", - "2 Negative europepmc byDatasource reason 2577 \n", - "3 Invalid_Reason europepmc byDatasource reason 1285 \n", - "4 Covid19 europepmc byDatasource reason 228 \n", - "\n", - " predictionTotal comparisonTotal total b c d or_result \\\n", - "0 18439 202849 413172 9824 194234 200499 0.905219 \n", - "1 11630 202849 413172 6028 197247 204295 0.962536 \n", - "2 5576 202849 413172 2999 200272 207324 0.889544 \n", - "3 2577 202849 413172 1292 201564 209031 1.031427 \n", - "4 503 202849 413172 275 202621 210048 0.859481 \n", - "\n", - " lower_ci upper_ci pvalue \n", - "0 0.878822 0.932410 4.259430e-11 \n", - "1 0.927670 0.998713 4.310669e-02 \n", - "2 0.843623 0.937964 1.493401e-05 \n", - "3 0.954548 1.114498 4.408484e-01 \n", - "4 0.721025 1.024524 9.864084e-02 " - ] - }, - "execution_count": 97, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "observed = spark.read.parquet(\"/Users/irenelopez/MEGAsync/EBI/repos/stopReasons/data/predictions_aggregations_all\").toPandas()\n", - "\n", - "observed.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predictioncomparisoncomparisonTypepredictionTypeapredictionTotalcomparisonTotaltotalbcdor_resultlower_ciupper_cipvaluerejectspvalue_corrected
0Insufficient_EnrollmenteuropepmcbyDatasourcereason86151843920284941317298241942342004990.9052190.8788220.9324104.259430e-11True1.723030e-10
1Business_AdministrativeeuropepmcbyDatasourcereason56021163020284941317260281972472042950.9625360.9276700.9987134.310669e-02False7.268878e-02
2NegativeeuropepmcbyDatasourcereason2577557620284941317229992002722073240.8895440.8436230.9379641.493401e-05True4.078030e-05
3Invalid_ReasoneuropepmcbyDatasourcereason1285257720284941317212922015642090311.0314270.9545481.1144984.408484e-01False5.564568e-01
4Covid19europepmcbyDatasourcereason2285032028494131722752026212100480.8594810.7210251.0245249.864084e-02False1.523175e-01
\n", - "
" - ], - "text/plain": [ - " prediction comparison comparisonType predictionType a \\\n", - "0 Insufficient_Enrollment europepmc byDatasource reason 8615 \n", - "1 Business_Administrative europepmc byDatasource reason 5602 \n", - "2 Negative europepmc byDatasource reason 2577 \n", - "3 Invalid_Reason europepmc byDatasource reason 1285 \n", - "4 Covid19 europepmc byDatasource reason 228 \n", - "\n", - " predictionTotal comparisonTotal total b c d or_result \\\n", - "0 18439 202849 413172 9824 194234 200499 0.905219 \n", - "1 11630 202849 413172 6028 197247 204295 0.962536 \n", - "2 5576 202849 413172 2999 200272 207324 0.889544 \n", - "3 2577 202849 413172 1292 201564 209031 1.031427 \n", - "4 503 202849 413172 275 202621 210048 0.859481 \n", - "\n", - " lower_ci upper_ci pvalue rejects pvalue_corrected \n", - "0 0.878822 0.932410 4.259430e-11 True 1.723030e-10 \n", - "1 0.927670 0.998713 4.310669e-02 False 7.268878e-02 \n", - "2 0.843623 0.937964 1.493401e-05 True 4.078030e-05 \n", - "3 0.954548 1.114498 4.408484e-01 False 5.564568e-01 \n", - "4 0.721025 1.024524 9.864084e-02 False 1.523175e-01 " - ] - }, - "execution_count": 99, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from statsmodels.stats.multitest import fdrcorrection\n", - "\n", - "pvals = observed['pvalue'].values\n", - "\n", - "rejected, pvals_corrected = fdrcorrection(pvals, alpha=0.05, method=\"indep\", is_sorted=False)\n", - "observed['rejects'] = rejected\n", - "observed[\"pvalue_corrected\"] = pvals_corrected\n", - "\n", - "observed.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "observed[\"significance\"] = \"Remains Significant\"\n", - "observed.loc[(pdf[\"pvalue\"] < 0.05) & (pdf[\"pvalue_corrected\"] >= 0.05), \"significance\"] = \"No Longer Significant\"\n", - "observed.loc[(pdf[\"pvalue_observed\"] >= 0.05) & (pdf[\"pvalue_corrected\"] < 0.05), \"significance\"] = \"Now Significant\"\n", - "observed.loc[(pdf[\"pvalue_observed\"] >= 0.05) & (pdf[\"pvalue_corrected\"] >= 0.05), \"significance\"] = \"Never Significant\"\n", - "\n", - "\n", - "fig = px.scatter(pdf, x=\"pvalue_observed\", y=\"pvalue_empirical\",\n", - " hover_data=[\"comparison\", \"prediction\", \"or_result_observed\"],\n", - " color=\"significance\",\n", - " color_discrete_sequence=[\"green\", \"red\", \"blue\", \"black\"],\n", - " title=\"Comparison of Observed and Empirical P-Values\")\n", - "fig.add_shape(type=\"line\", \n", - " x0=pdf[\"pvalue_observed\"].min(), x1=pdf[\"pvalue_observed\"].max(), \n", - " y0=0.05, y1=0.05,\n", - " line=dict(color=\"Red\", dash=\"dash\"))\n", - "\n", - "fig.add_shape(type=\"line\", \n", - " x0=0.05, x1=0.05,\n", - " y0=pdf[\"pvalue_empirical\"].min(), y1=pdf[\"pvalue_empirical\"].max(), \n", - " line=dict(color=\"Red\", dash=\"dash\"))\n", - "fig.update_layout(xaxis_title=\"Observed P-Values\", yaxis_title=\"Empirical P-Values\")\n", - "\n", - "py.plot(fig, filename=\"stop_reasons_pvalue_scatter\", auto_open=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 95777aca0e434c0bbcd973ec6472ad0422dd7fb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Thu, 6 Jul 2023 12:14:11 +0100 Subject: [PATCH 4/6] chore: add data to gitignore --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6a1dcce..e1f3820 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,8 @@ derby.log .vscode/dryrun.log .vscode/targets.log .vscode/configurationCache.log -temp \ No newline at end of file +temp +data/baseline_predictions_aggregations +data/predictions_aggregations_* +data/chembl* +reports/* \ No newline at end of file From de7010b1150cd0e16ab41ce2835867321f8e018d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Sat, 9 Sep 2023 00:22:54 +0100 Subject: [PATCH 5/6] feat: redefine dataset of permuted associations --- analysis/README.md | 9 +- analysis/python/enrichments.py | 1 - analysis/python/enrichments_permuted.py | 474 ++++++++++++++++++++++++ config.yml | 1 + 4 files changed, 483 insertions(+), 2 deletions(-) create mode 100644 analysis/python/enrichments_permuted.py diff --git a/analysis/README.md b/analysis/README.md index 971c730..1aa4da6 100644 --- a/analysis/README.md +++ b/analysis/README.md @@ -6,7 +6,7 @@ This pipeline consists of the following steps: - Enrichment analysis. To launch the script as a Pyspark job on a cluster: ```bash -gcloud dataproc clusters create il-big-stop-reasons \ +gcloud dataproc clusters create il-big-stop-reasons-2 \ --image-version=2.1 \ --project=open-targets-eu-dev \ --region=europe-west1 \ @@ -25,6 +25,13 @@ gcloud dataproc jobs submit pyspark \ --project=open-targets-eu-dev \ analysis/python/enrichments.py -- config.yml --stratify-therapeutic-area non_oncology +# To determine a baseline between all sets of comparisons, run the analysis with a randomly generated set of associations +gcloud dataproc jobs submit pyspark \ + --cluster=il-big-stop-reasons \ + --files=config.yml \ + --region=europe-west1 \ + --project=open-targets-eu-dev + # Adjust the p values for multiple testing using the Benjamini-Hochberg procedure by providing the path to the file with the results of the enrichment analysis gcloud dataproc jobs submit pyspark \ --cluster=il-big-stop-reasons \ diff --git a/analysis/python/enrichments.py b/analysis/python/enrichments.py index c351f2f..b7ef9e6 100644 --- a/analysis/python/enrichments.py +++ b/analysis/python/enrichments.py @@ -2,7 +2,6 @@ from enum import Enum from functools import reduce -import pandas as pd import pyspark.sql.functions as F import typer from omegaconf import OmegaConf diff --git a/analysis/python/enrichments_permuted.py b/analysis/python/enrichments_permuted.py new file mode 100644 index 0000000..0ab3c38 --- /dev/null +++ b/analysis/python/enrichments_permuted.py @@ -0,0 +1,474 @@ +"""Analysis of the results on all clinical trials.""" +from enum import Enum +from functools import reduce + +import pyspark.sql.functions as F +import typer +from omegaconf import OmegaConf +from pyspark import SparkConf +from pyspark.sql import SparkSession +from pyspark.sql.types import FloatType, StringType, StructField, StructType +from pyspark.sql.window import Window +from scipy.stats import fisher_exact +from statsmodels.stats.contingency_tables import Table2x2 + +CLINVAR_VALIDS = [ + # ClinVar evidence we are interested + "affects", + "risk factor", + "pathogenic", + "likely pathogenic", + "protective", + "drug response", +] + +NON_NEUTRAL_PREDICTIONS = [ + # Stop predictions from Olesya + "Negative", + "Safety_Sideeffects", + "Success", + "Business_Administrative", + "Invalid_Reason", +] + +STOPPED_STATUS = ["Terminated", "Withdrawn", "Suspended"] + + +class ModeByTA(str, Enum): + """Mode of analysis by therapeutic area.""" + + ALL = "all" + ONCOLOGY = "oncology" + NON_ONCOLOGY = "non_oncology" + + +def expand_disease_index(disease): + """Expand disease index to include ancestors to account for differences in granularity in the mapping.""" + return ( + disease.select( + F.col("id").alias("diseaseId"), + F.explode("ancestors").alias("propagatedDiseaseId"), + ) + .union( + disease.select( + F.col("id").alias("diseaseId"), F.col("id").alias("propagatedDiseaseId") + ) + ) + .distinct() + ) + + +def prepare_l2g(evidence): + """Prepare l2g evidence with some arbitrary cut-offs.""" + return ( + evidence.filter(F.col("datasourceId") == "ot_genetics_portal") + .groupBy("targetId", "diseaseId") + .agg(F.max("score").alias("max_l2g")) + .withColumn( + "l2g_075", + F.when(F.col("max_l2g") > 0.75, "l2g_0.75"), + ) + .withColumn("l2g_05", F.when(F.col("max_l2g") > 0.5, "l2g_0.5")) + .withColumn( + "l2g_025", + F.when(F.col("max_l2g") > 0.25, "l2g_0.25"), + ) + .withColumn("l2g_01", F.when(F.col("max_l2g") > 0.1, "l2g_0.1")) + .withColumn( + "l2g_005", + F.when(F.col("max_l2g") > 0.05, "l2g_0.05"), + ) + ) + + +def extract_therapeutic_area(disease): + """Assigns the most severe therapeutic are to a disease.""" + taDf = spark.createDataFrame( + data=[ + ("MONDO_0045024", "cell proliferation disorder", "Oncology"), + ("EFO_0005741", "infectious disease", "Other"), + ("OTAR_0000014", "pregnancy or perinatal disease", "Other"), + ("EFO_0005932", "animal disease", "Other"), + ("MONDO_0024458", "disease of visual system", "Other"), + ("EFO_0000319", "cardiovascular disease", "Other"), + ("EFO_0009605", "pancreas disease", "Other"), + ("EFO_0010282", "gastrointestinal disease", "Other"), + ("OTAR_0000017", "reproductive system or breast disease", "Other"), + ("EFO_0010285", "integumentary system disease", "Other"), + ("EFO_0001379", "endocrine system disease", "Other"), + ("OTAR_0000010", "respiratory or thoracic disease", "Other"), + ("EFO_0009690", "urinary system disease", "Other"), + ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"), + ("MONDO_0021205", "disease of ear", "Other"), + ("EFO_0000540", "immune system disease", "Other"), + ("EFO_0005803", "hematologic disease", "Other"), + ("EFO_0000618", "nervous system disease", "Other"), + ("MONDO_0002025", "psychiatric disorder", "Other"), + ("OTAR_0000020", "nutritional or metabolic disease", "Other"), + ("OTAR_0000018", "genetic, familial or congenital disease", "Other"), + ("OTAR_0000009", "injury, poisoning or other complication", "Other"), + ("EFO_0000651", "phenotype", "Other"), + ("EFO_0001444", "measurement", "Other"), + ("GO_0008150", "biological process", "Other"), + ], + schema=StructType( + [ + StructField("taId", StringType(), True), + StructField("taLabel", StringType(), True), + StructField("taLabelSimple", StringType(), True), + ] + ), + ).withColumn("taRank", F.monotonically_increasing_id()) + wByDisease = Window.partitionBy("diseaseId") + return ( + disease.withColumn("taId", F.explode("therapeuticAreas")) + .select(F.col("id").alias("diseaseId"), "taId") + .join(taDf, on="taId", how="left") + .withColumn("minRank", F.min("taRank").over(wByDisease)) + .filter(F.col("taRank") == F.col("minRank")) + .drop("taRank", "minRank") + ) + + +def prepare_genetic_constraint(target): + """Prepare genetic constraint data for a given gene.""" + return target.withColumn("gc", F.explode("constraint.upperBin6")).select( + F.col("id").alias("targetId"), + F.col("gc").cast("string"), + ) + + +def prepare_pli(target): + """Prepare pLI (predicted loss of function intolerant) data for a given gene.""" + return ( + target.withColumn("gc", F.explode("constraint")) + .filter(F.col("gc.constraintType") == "lof") + .select(F.col("id").alias("targetId"), F.col("gc.score").alias("pLI")) + .withColumn( + "lof_tolerance", + F.when(F.col("pLI") > 0.9, F.lit("LoF intolerant")).otherwise( + F.when(F.col("pLI") < 0.1, F.lit("LoF tolerant")) + ), + ) + .drop("pLI") + ) + + +def extract_target_partners(interaction): + """Extract all partners of a given target.""" + allInteractions = ( + interaction.filter(F.col("sourceDatabase") == "intact") + .filter(F.col("scoring") > 0.42) + .filter(F.col("targetB").isNotNull()) + .select("targetA", "targetB") + .distinct() + ) + return ( + allInteractions.union( + allInteractions.select( + F.col("targetA").alias("targetB"), F.col("targetB").alias("targetA") + ) + ) + .distinct() + .groupBy("targetA") + .agg(F.count("targetB").alias("partners")) + .withColumn( + "partnersBin", + F.when(F.col("partners") > 20, F.lit("greaterThan20")) + .when( + (F.col("partners") > 10) & (F.col("partners") <= 20), + F.lit("from11to20"), + ) + .when( + (F.col("partners") > 0) & (F.col("partners") <= 10), F.lit("from1to10") + ) + .otherwise(F.lit("none")), + ) + .select(F.col("targetA").alias("targetId"), F.col("partnersBin")) + ) + + +def prepare_hpa_expression(hpa): + """Prepare HPA expression data for a given target.""" + return hpa.select( + F.col("Ensembl").alias("targetId"), + F.col("RNA tissue distribution").alias("rnaDistribution"), + F.col("RNA tissue specificity").alias("rnaSpecificity"), + ) + + +def prepare_associations(evidence, disease_ancestors): + """Prepare a pseudo-associations dataset that consists of propagating the ontology across the evidence dataset and extract the maximum score per data source.""" + return ( + # Cleaned evidence (exclude "benign" clinvar genetic evidence) + evidence.withColumn("evaValids", F.array([F.lit(x) for x in CLINVAR_VALIDS])) + .withColumn("evaFilter", F.arrays_overlap("evaValids", "clinicalSignificances")) + .filter((F.col("evaFilter").isNull()) | (F.col("evaFilter"))) + # pseudo-associations: ontology propagation + max datasource score + .join(disease_ancestors, on="diseaseId", how="left") + .drop("diseaseId") + .withColumnRenamed("propagatedDiseaseId", "diseaseId") + .select("targetId", "diseaseId", "datasourceId", "datatypeId") + .distinct() + ) + + +def prepare_associations_universe(target, disease): + """Creates a ficticial set of associations based on the combination of all possible targets and diseases.""" + return ( + target.selectExpr("id as targetId") + .crossJoin(disease.selectExpr("id as diseaseId")) + .select( + "*", + F.lit("random").alias("datasourceId"), + F.lit("random").alias("datatypeId"), + ) + ) + + +def prepare_comparisons_df() -> list: + """Return list of all comparisons to be used in the analysis.""" + comparisons = spark.createDataFrame( + data=[ + ("datatypeId", "byDatatype"), + ], + schema=StructType( + [ + StructField("comparison", StringType(), True), + StructField("comparisonType", StringType(), True), + ] + ), + ) + + predictions = spark.createDataFrame( + data=[ + ("reason", "reason"), + ], + schema=StructType( + [ + StructField("prediction", StringType(), True), + StructField("predictionType", StringType(), True), + ] + ), + ) + return comparisons.join(predictions, how="full").collect() + + +def aggregations( + df, comparisonColumn, comparisonType, predictionColumn, predictionType +): + """Aggregate data to compute enrichment analysis.""" + comparison_counts = df.groupBy(comparisonColumn).agg( + F.countDistinct("id").alias("comparisonTotal") + ) + prediction_counts = df.groupBy(predictionColumn).agg( + F.countDistinct("id").alias("predictionTotal") + ) + intersection_counts = df.groupBy(comparisonColumn, predictionColumn).agg( + F.countDistinct("id").alias("a") + ) + + return ( + df.filter( + F.col(comparisonColumn).isNotNull() & F.col(predictionColumn).isNotNull() + ) + .withColumn("comparisonType", F.lit(comparisonType)) + .withColumn("predictionType", F.lit(predictionType)) + .join(comparison_counts, on=comparisonColumn, how="left") + .join(prediction_counts, on=predictionColumn, how="left") + .join(intersection_counts, on=[comparisonColumn, predictionColumn], how="left") + .select( + F.col(predictionColumn).alias("prediction"), + F.col(comparisonColumn).alias("comparison"), + "comparisonType", + "predictionType", + "a", + "predictionTotal", + "comparisonTotal", + "total", + ) + .distinct() + ) + + +def _compute_fisher_and_or(a, b, c, d) -> tuple: + """Compute Fisher's exact test by shaping each row into a 2x2 contingency table.""" + table = [[a, b], [c, d]] + _, pvalue = fisher_exact(table) + table2x2 = Table2x2(table) + or_result = table2x2.oddsratio + lower_ci, upper_ci = table2x2.oddsratio_confint() + return (float(or_result), float(lower_ci), float(upper_ci), float(pvalue)) + + +def run_analysis( + conf, + seed, + stratify_therapeutic_area, +): + """Run enrichment analysis.""" + # Load raw datasets + evidence = spark.read.parquet(conf.default.input.evidence_path) + disease = spark.read.parquet(conf.default.input.disease_path).persist() + target = spark.read.parquet(conf.default.input.target_path).persist() + interaction = spark.read.parquet(conf.default.input.interactions_path) + hpa = spark.read.json(conf.default.input.hpa_path) + stop_predictions = ( + spark.read.csv(conf.default.input.predictions_freeze_path, header=True) + # add prediction metaclass + .withColumn( + "metareason", + F.when( + F.col("prediction").isin(NON_NEUTRAL_PREDICTIONS), F.col("prediction") + ).otherwise(F.lit("Neutral")), + ) + ).withColumnRenamed("prediction", "reason") + + # Prepare processed datasets + disease_ancestors = expand_disease_index(disease) + l2g = prepare_l2g(evidence) + disease_ta = extract_therapeutic_area(disease) + target_gc = prepare_genetic_constraint(target) + target_pli = prepare_pli(target) + partners = extract_target_partners(interaction) + hpa_expr = prepare_hpa_expression(hpa) + associations = prepare_associations(evidence, disease_ancestors) + universe_size = associations.select("targetId", "diseaseId").distinct().count() + random_associations = ( + prepare_associations_universe(target, disease) + .sample(fraction=1.0, seed=seed) + .limit(universe_size) + .withColumn("seed", F.lit(seed)) + ) + + # Prepare clinical information + clinical = ( + evidence.filter(F.col("sourceId") == "chembl") + .withColumn("urls", F.explode("urls")) + .withColumn( + "nctid", F.regexp_extract(F.col("urls.url"), "(.+)(id=%22)(.+)(%22)", 3) + ) + .withColumn( + "nctid", F.when(F.col("nctid") != "", F.col("nctid")).otherwise(None) + ) + .withColumn( + "stopStatus", + F.when( + F.col("clinicalStatus").isin(STOPPED_STATUS), F.col("clinicalStatus") + ), + ) + .withColumn( + "isStopped", + F.when(F.col("clinicalStatus").isin(STOPPED_STATUS), F.lit("stopped")), + ) + .withColumn( + "phase4", + F.when(F.col("clinicalPhase") == 4, F.lit("Phase IV")), + ) + .withColumn( + "phase3", + F.when(F.col("clinicalPhase") >= 3, F.lit("Phase III+")), + ) + .withColumn( + "phase2", + F.when(F.col("clinicalPhase") >= 2, F.lit("Phase II+")), + ) + .select( + "targetId", + "diseaseId", + "nctid", + "clinicalStatus", + "clinicalPhase", + "studyStartDate", + "stopStatus", + "isStopped", + "phase4", + "phase3", + "phase2", + ) + .distinct() + # Create ID + .withColumn("id", F.xxhash64("targetId", "diseaseId", "nctid")) + # Bring reason and metareason for stoppage + .join(stop_predictions, on="nctid", how="left") + # L2G cut-offs + .join(l2g, on=["targetId", "diseaseId"], how="left") + # Disease therapeutic area (only one by disease) + .join(disease_ta, on="diseaseId", how="left") + # Target genetic constraint + .join(target_gc, on="targetId", how="left") + # Target lof tolerance + .join(target_pli, on="targetId", how="left") + # Expression specificity + .join(hpa_expr, on="targetId", how="left") + # Physical interaction partners + .join(partners, on="targetId", how="left") + .withColumn("partnersBin", F.coalesce(F.col("partnersBin"), F.lit("none"))) + # Datasources and Datatypes + .join(random_associations, on=["targetId", "diseaseId"], how="left") + ) + # Run analysis split by oncology/non_oncology data + if stratify_therapeutic_area.value == "oncology": + clinical = clinical.filter(F.col("taLabelSimple") == "Oncology") + elif stratify_therapeutic_area.value == "non_oncology": + clinical = clinical.filter(F.col("taLabelSimple") != "Oncology") + # Define total number + uniqIds = clinical.select("id").distinct().count() + clinical = clinical.withColumn("total", F.lit(uniqIds)).persist() + + ## Compute aggregations + agg_setups = prepare_comparisons_df() + all_comparisons = [] + for row in agg_setups: + out = aggregations(clinical, *row) + all_comparisons.append(out) + + ## Compute Fisher's exact test and odds ratio + schema = StructType( + [ + StructField("or_result", FloatType(), nullable=False), + StructField("lower_ci", FloatType(), nullable=False), + StructField("upper_ci", FloatType(), nullable=False), + StructField("pvalue", FloatType(), nullable=False), + ] + ) + compute_fisher_and_or_udf = F.udf(_compute_fisher_and_or, schema) + return ( + reduce(lambda x, y: x.unionByName(y), all_comparisons) + .coalesce(200) + .withColumn("b", F.col("predictionTotal") - F.col("a")) + .withColumn("c", F.col("comparisonTotal") - F.col("a")) + .withColumn("d", F.col("total") - F.col("a") - F.col("b") - F.col("c")) + .withColumn("result", compute_fisher_and_or_udf("a", "b", "c", "d")) + .select("*", "result.*") + .drop("result") + ) + + +def main( + config, + stratify_therapeutic_area: ModeByTA = ModeByTA.ALL, +): + """Generate enrichment reports on 100 random permutations of the data.""" + conf = OmegaConf.load(config) + print("Therapeutic area analysis mode:", stratify_therapeutic_area.value) + print("Config", conf) + for i in range(100): + print("Running seed", i) + analysis_result = run_analysis( + conf=conf, seed=i, stratify_therapeutic_area=stratify_therapeutic_area + ) + analysis_result.write.parquet( + f"{conf.default.permuted_enrichments_template}/{stratify_therapeutic_area.value}_seed_{i}", + mode="overwrite", + ) + + +if __name__ == "__main__": + sparkConf = ( + SparkConf() + .set("spark.hadoop.fs.gs.requester.pays.mode", "AUTO") + .set("spark.hadoop.fs.gs.requester.pays.project.id", "open-targets-eu-dev") + ) + spark = SparkSession.builder.config(conf=sparkConf).getOrCreate() + typer.run(main) diff --git a/config.yml b/config.yml index 40a00f5..4d27959 100644 --- a/config.yml +++ b/config.yml @@ -1,6 +1,7 @@ default: spark_master: yarn enrichments_template: gs://ot-team/irene/stop_reasons/predictions_aggregations + permuted_enrichments_template: gs://ot-team/irene/stop_reasons/baseline_predictions_aggregations_08092023 input: # gcp disease_path: gs://open-targets-data-releases/22.04/output/etl/parquet/diseases/ From 3d7359eda4f94a2ac7e0c71f5c0ac2f10e8f8250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Wed, 13 Sep 2023 13:42:37 +0100 Subject: [PATCH 6/6] feat: add --- analysis/python/enrichments_stop_phase.py | 127 ++++++++++++++++++++++ config.yml | 1 + 2 files changed, 128 insertions(+) create mode 100644 analysis/python/enrichments_stop_phase.py diff --git a/analysis/python/enrichments_stop_phase.py b/analysis/python/enrichments_stop_phase.py new file mode 100644 index 0000000..5e301d8 --- /dev/null +++ b/analysis/python/enrichments_stop_phase.py @@ -0,0 +1,127 @@ +"""Functions to calculate ORs between the stop reason category of a clinical trial and its clinical phase. Results available at Supplementary Table 7.""" + +from functools import reduce + +import pyspark.sql.functions as f +import typer +from omegaconf import OmegaConf +from pyspark import SparkConf +from pyspark.sql import SparkSession +from pyspark.sql.types import FloatType, StringType, StructField, StructType + +from analysis.python.utils import _compute_fisher_and_or, aggregations + + +def prepare_comparisons_df() -> list: + """Return list of all comparisons to be used in the analysis. + In this case, we only want to look at the comparison between the stop reason category and the clinical phase. + """ + comparisons = spark.createDataFrame( + data=[ + ("reason", "reason"), + ], + schema=StructType( + [ + StructField("comparison", StringType(), True), + StructField("comparisonType", StringType(), True), + ] + ), + ) + + predictions = spark.createDataFrame( + data=[ + ("phase4", "clinical"), + ("phase3", "clinical"), + ("phase32", "clinical"), + ("phase2", "clinical"), + ("phase21", "clinical"), + ("phase1", "clinical"), + ("phase1early", "clinical"), + ("phaseOther", "clinical"), + ], + schema=StructType( + [ + StructField("prediction", StringType(), True), + StructField("predictionType", StringType(), True), + ] + ), + ) + return comparisons.join(predictions, how="full").collect() + + +def prepare_studies(studies): + """Processes the studies dataset to keep only the stopped ones in a suitable format to run enrichments on.""" + formatted_studies = ( + studies.filter(f.col("why_stopped").isNotNull()) + .withColumnRenamed("prediction", "reason") + .withColumn("id", f.monotonically_increasing_id()) + .withColumn("phase4", f.when(f.col("phase") == "Phase 4", f.lit("Phase 4"))) + .withColumn("phase3", f.when(f.col("phase") == "Phase 3", f.lit("Phase 3"))) + .withColumn( + "phase32", + f.when(f.col("phase") == "Phase 2/Phase 3", f.lit("Phase 2/Phase 3")), + ) + .withColumn("phase2", f.when(f.col("phase") == "Phase 2", f.lit("Phase 2"))) + .withColumn( + "phase21", + f.when(f.col("phase") == "Phase 1/Phase 2", f.lit("Phase 1/Phase 2")), + ) + .withColumn("phase1", f.when(f.col("phase") == "Phase 1", f.lit("Phase 1"))) + .withColumn( + "phase1early", + f.when(f.col("phase") == "Early Phase 1", f.lit("Early Phase 1")), + ) + .withColumn("phaseOther", f.when(f.col("phase") == "nan", f.lit("Other"))) + ) + total_cts = formatted_studies.count() + return formatted_studies.withColumn("total", f.lit(total_cts)) + + +def main(config): + """Functions to calculate ORs between the stop reason category of a clinical trial and its clinical phase.""" + # Load + conf = OmegaConf.load(config) + studies = spark.read.csv(conf.default.input.predictions_freeze_path, header=True) + + # Process + formatted_studies = prepare_studies(studies) + agg_setups = prepare_comparisons_df() + + all_comparisons = [] + for row in agg_setups: + out = aggregations(formatted_studies, *row) + all_comparisons.append(out) + + schema = StructType( + [ + StructField("or_result", FloatType(), nullable=False), + StructField("lower_ci", FloatType(), nullable=False), + StructField("upper_ci", FloatType(), nullable=False), + StructField("pvalue", FloatType(), nullable=False), + ] + ) + compute_fisher_and_or_udf = f.udf(_compute_fisher_and_or, schema) + results = ( + reduce(lambda x, y: x.unionByName(y), all_comparisons) + .coalesce(200) + .withColumn("b", f.col("predictionTotal") - f.col("a")) + .withColumn("c", f.col("comparisonTotal") - f.col("a")) + .withColumn("d", f.col("total") - f.col("a") - f.col("b") - f.col("c")) + .withColumn("result", compute_fisher_and_or_udf("a", "b", "c", "d")) + .select("*", "result.*") + .drop("result") + ) + results.write.parquet( + conf.default.stop_vs_phase_enrichments_template, + mode="overwrite", + ) + + +if __name__ == "__main__": + sparkConf = ( + SparkConf() + .set("spark.hadoop.fs.gs.requester.pays.mode", "AUTO") + .set("spark.hadoop.fs.gs.requester.pays.project.id", "open-targets-eu-dev") + ) + spark = SparkSession.builder.getOrCreate() + typer.run(main) diff --git a/config.yml b/config.yml index 4d27959..261453e 100644 --- a/config.yml +++ b/config.yml @@ -2,6 +2,7 @@ default: spark_master: yarn enrichments_template: gs://ot-team/irene/stop_reasons/predictions_aggregations permuted_enrichments_template: gs://ot-team/irene/stop_reasons/baseline_predictions_aggregations_08092023 + stop_vs_phase_enrichments_template: gs://ot-team/irene/stop_reasons/reason_vs_phase_aggregations input: # gcp disease_path: gs://open-targets-data-releases/22.04/output/etl/parquet/diseases/