From e797799035d31a2b0add057c4e776e1d3ef55b85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Thu, 15 Jun 2023 19:16:48 +0100
Subject: [PATCH 1/6] feat: add multiple_testing_correction.py

---
 analysis/README.md                            |   7 +
 analysis/python/compare_baseline.ipynb        | 657 ++++++++++++++----
 .../python/multiple_testing_correction.py     |  40 ++
 3 files changed, 580 insertions(+), 124 deletions(-)
 create mode 100644 analysis/python/multiple_testing_correction.py

diff --git a/analysis/README.md b/analysis/README.md
index 3f6b776..971c730 100644
--- a/analysis/README.md
+++ b/analysis/README.md
@@ -24,5 +24,12 @@ gcloud dataproc jobs submit pyspark  \
         --region=europe-west1 \
         --project=open-targets-eu-dev \
         analysis/python/enrichments.py -- config.yml --stratify-therapeutic-area non_oncology
+
+# Adjust the p values for multiple testing using the Benjamini-Hochberg procedure by providing the path to the file with the results of the enrichment analysis
+gcloud dataproc jobs submit pyspark  \
+        --cluster=il-big-stop-reasons \
+        --region=europe-west1 \
+        --project=open-targets-eu-dev \
+        analysis/python/multiple_testing_correction.py  -- "gs://ot-team/irene/stop_reasons/predictions_aggregations_non_oncology"
 ```
 - Visualization of the results.
\ No newline at end of file
diff --git a/analysis/python/compare_baseline.ipynb b/analysis/python/compare_baseline.ipynb
index aa48c69..3cc22e8 100644
--- a/analysis/python/compare_baseline.ipynb
+++ b/analysis/python/compare_baseline.ipynb
@@ -14,22 +14,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 80,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+----------+----------+---------+-------------+\n",
-      "|          prediction|      comparison|comparisonType|predictionType|   a|predictionTotal|comparisonTotal| total|    b|    c|     d| or_result|  lower_ci| upper_ci|       pvalue|\n",
-      "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+----------+----------+---------+-------------+\n",
-      "|  Safety_Sideeffects|affected_pathway|    byDatatype|        reason| 456|           2732|          44764|413172| 2276|44308|366132| 1.6555722| 1.4964652|1.8315957|2.1668734E-20|\n",
-      "|Insufficient_Enro...|affected_pathway|    byDatatype|        reason|2178|          18439|          44764|413172|16261|42586|352147| 1.1075613|  1.057939| 1.159511| 1.5909267E-5|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason| 573|           5576|          44764|413172| 5003|44191|363405| 0.9418488|  0.863369|1.0274624|   0.18574768|\n",
-      "|Business_Administ...|affected_pathway|    byDatatype|        reason|1312|          11630|          44764|413172|10318|43452|358090| 1.0479021|0.98855203|1.1108155|   0.11557128|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason| 226|           2653|          44764|413172| 2427|44538|365981|0.76518506|0.66744256|0.8772413|  7.778494E-5|\n",
-      "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+----------+----------+---------+-------------+\n",
+      "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n",
+      "|          prediction|      comparison|comparisonType|predictionType|   a|predictionTotal|comparisonTotal| total|    b|    c|     d|or_result| lower_ci| upper_ci|       pvalue|\n",
+      "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n",
+      "|Insufficient_Enro...|affected_pathway|    byDatatype|        reason|2291|          18439|          46211|413172|16148|43920|350813|1.1332343| 1.083513|1.1852372|  6.968556E-8|\n",
+      "|            Negative|affected_pathway|    byDatatype|        reason| 615|           5576|          46211|413172| 4961|45596|362000|0.9842099|0.9045937|1.0708333|    0.7321643|\n",
+      "|Business_Administ...|affected_pathway|    byDatatype|        reason|1317|          11630|          46211|413172|10313|44894|356648|1.0145005|0.9571549|1.0752817|   0.62243146|\n",
+      "|      Invalid_Reason|affected_pathway|    byDatatype|        reason| 422|           2577|          46211|413172| 2155|45789|364806|1.5601487|1.4049416| 1.732502|2.1676067E-15|\n",
+      "|        Study_Design|affected_pathway|    byDatatype|        reason| 405|           3166|          46211|413172| 2761|45806|364200| 1.166289|1.0503099| 1.295075| 0.0046313014|\n",
+      "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n",
       "only showing top 5 rows\n",
       "\n",
       "+--------------------+----------+--------------+--------------+----+---------------+---------------+------+----+------+------+----------+----------+----------+-------------+\n",
@@ -56,7 +63,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 81,
    "metadata": {},
    "outputs": [
     {
@@ -66,26 +73,26 @@
       "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n",
       "|          prediction|      comparison|comparisonType|predictionType|or_result_observed|lower_ci_observed|upper_ci_observed|pvalue_observed|or_result_permuted|lower_ci_permuted|upper_ci_permuted|pvalue_permuted|\n",
       "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|         0.7569264|        0.6600593|        0.8680094|   4.1372376E-5|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|         0.7477239|       0.64919406|       0.86120784|   3.0507868E-5|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|        0.72785324|        0.6332734|       0.83655864|    3.588353E-6|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|         0.7667227|         0.670729|       0.87645483|   6.3741165E-5|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|        0.66826046|       0.57837003|        0.7721216|   9.3277785E-9|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|        0.76111776|       0.66259146|       0.87429476|    7.314266E-5|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|         0.7052045|         0.613212|        0.8109974|   3.5039437E-7|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|         0.7261657|       0.63306683|        0.8329557|   2.3254256E-6|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|        0.76518506|       0.66744256|        0.8772413|    7.778494E-5|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|        0.99938464|       0.91808873|        1.0878792|            1.0|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|           1.00379|        0.9203073|        1.0948457|     0.92929024|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|         0.9924268|        0.9114749|        1.0805684|       0.879578|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|         0.9880351|        0.9086467|        1.0743597|      0.7985208|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|         0.9493068|       0.87049305|        1.0352563|      0.2506197|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|         1.0490831|        0.9641333|        1.1415178|     0.26321337|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|         1.0264277|        0.9443765|         1.115608|     0.53365433|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|         1.0213302|        0.9400097|        1.1096858|      0.6086663|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|         0.9418488|         0.863369|        1.0274624|     0.18574768|\n",
-      "|Business_Administ...|affected_pathway|    byDatatype|        reason|         1.0135593|        0.9628919|         1.066893|     0.60830927|          1.019448|        0.9612728|        1.0811441|     0.51626134|\n",
-      "|Business_Administ...|affected_pathway|    byDatatype|        reason|         1.0135593|        0.9628919|         1.066893|     0.60830927|         1.0313885|       0.97135615|         1.095131|       0.314432|\n",
+      "|Insufficient_Enro...|affected_pathway|    byDatatype|        reason|          1.101286|        1.0580636|        1.1462741|   2.9514772E-6|         1.1332343|         1.083513|        1.1852372|    6.968556E-8|\n",
+      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|         0.9842099|        0.9045937|        1.0708333|      0.7321643|\n",
+      "|Business_Administ...|affected_pathway|    byDatatype|        reason|         1.0135593|        0.9628919|         1.066893|     0.60830927|         1.0145005|        0.9571549|        1.0752817|     0.62243146|\n",
+      "|      Invalid_Reason|affected_pathway|    byDatatype|        reason|           1.42308|        1.2919756|        1.5674883|  4.5850663E-12|         1.5601487|        1.4049416|         1.732502|  2.1676067E-15|\n",
+      "|        Study_Design|affected_pathway|    byDatatype|        reason|         1.1850995|         1.080571|        1.2997394|   3.9830338E-4|          1.166289|        1.0503099|         1.295075|   0.0046313014|\n",
+      "|  Safety_Sideeffects|affected_pathway|    byDatatype|        reason|         1.6450619|        1.5032661|        1.8002325|   5.306468E-25|         1.5841677|        1.4315631|          1.75304|  2.4105537E-17|\n",
+      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|        0.82520795|       0.72417057|        0.9403422|   0.0036548723|\n",
+      "|   Study_Staff_Moved|affected_pathway|    byDatatype|        reason|         1.0708259|        0.9271349|        1.2367867|      0.3469289|         1.1133336|       0.94795865|        1.3075589|        0.19965|\n",
+      "|          No_Context|affected_pathway|    byDatatype|        reason|          0.685027|        0.5684092|        0.8255707|   4.0033254E-5|        0.82053685|        0.6722922|        1.0014704|    0.053746197|\n",
+      "|            Negative|      literature|    byDatatype|        reason|         0.8895437|       0.84362316|       0.93796384|   1.4934009E-5|         0.9543413|       0.90287566|        1.0087405|     0.10023123|\n",
+      "|Business_Administ...|      literature|    byDatatype|        reason|        0.96253645|       0.92767036|         0.998713|    0.043106694|         1.0164348|       0.97820574|        1.0561578|     0.40482116|\n",
+      "|Insufficient_Enro...|      literature|    byDatatype|        reason|         0.9052195|       0.87882197|        0.9324099|  4.2594296E-11|         1.0470023|        1.0154496|        1.0795352|   0.0033233627|\n",
+      "|      Invalid_Reason|      literature|    byDatatype|        reason|         1.0314267|        0.9545476|        1.1144975|     0.44084844|         1.2810183|        1.1842574|        1.3856851|   9.200067E-10|\n",
+      "|             Covid19|      literature|    byDatatype|        reason|         0.8594809|        0.7210249|        1.0245243|     0.09864084|         0.5638818|       0.45956767|       0.69187355|   1.1856974E-8|\n",
+      "|        Study_Design|      literature|    byDatatype|        reason|          0.948647|       0.88451016|        1.0174345|     0.14340925|         1.0617356|       0.98768425|         1.141339|    0.105820864|\n",
+      "|  Safety_Sideeffects|      literature|    byDatatype|        reason|         1.3222082|        1.2257016|        1.4263134|   4.225119E-13|         1.4366288|        1.3318783|         1.549618|   1.605722E-20|\n",
+      "| Logistics_Resources|      literature|    byDatatype|        reason|        0.76919395|        0.7120554|       0.83091766|  2.2282516E-11|        0.80852926|       0.74461913|       0.87792474|   3.4295346E-7|\n",
+      "|   Study_Staff_Moved|      literature|    byDatatype|        reason|         0.9657753|       0.86894673|        1.0733938|      0.5355322|          0.937642|        0.8390834|        1.0477773|     0.26166993|\n",
+      "|       Another_Study|      literature|    byDatatype|        reason|        0.71158177|       0.64844126|       0.78087044|   5.357373E-13|        0.76391065|       0.69150585|        0.8438966|   7.8315104E-8|\n",
+      "|            Negative|  rna_expression|    byDatatype|        reason|         0.9926047|       0.92027634|        1.0706177|      0.8624524|         1.0168192|        0.9524448|        1.0855448|      0.6150149|\n",
       "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n",
       "only showing top 20 rows\n",
       "\n"
@@ -115,7 +122,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 82,
    "metadata": {},
    "outputs": [
     {
@@ -125,11 +132,11 @@
       "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n",
       "|          prediction|      comparison|comparisonType|predictionType|or_result_permuted|lower_ci_permuted|upper_ci_permuted|pvalue_permuted|\n",
       "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n",
-      "|  Safety_Sideeffects|affected_pathway|    byDatatype|        reason|         1.6555722|        1.4964652|        1.8315957|  2.1668734E-20|\n",
-      "|Insufficient_Enro...|affected_pathway|    byDatatype|        reason|         1.1075613|         1.057939|         1.159511|   1.5909267E-5|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|         0.9418488|         0.863369|        1.0274624|     0.18574768|\n",
-      "|Business_Administ...|affected_pathway|    byDatatype|        reason|         1.0479021|       0.98855203|        1.1108155|     0.11557128|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.76518506|       0.66744256|        0.8772413|    7.778494E-5|\n",
+      "|Insufficient_Enro...|affected_pathway|    byDatatype|        reason|         1.1332343|         1.083513|        1.1852372|    6.968556E-8|\n",
+      "|            Negative|affected_pathway|    byDatatype|        reason|         0.9842099|        0.9045937|        1.0708333|      0.7321643|\n",
+      "|Business_Administ...|affected_pathway|    byDatatype|        reason|         1.0145005|        0.9571549|        1.0752817|     0.62243146|\n",
+      "|      Invalid_Reason|affected_pathway|    byDatatype|        reason|         1.5601487|        1.4049416|         1.732502|  2.1676067E-15|\n",
+      "|        Study_Design|affected_pathway|    byDatatype|        reason|          1.166289|        1.0503099|         1.295075|   0.0046313014|\n",
       "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n",
       "only showing top 5 rows\n",
       "\n"
@@ -142,7 +149,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 83,
    "metadata": {},
    "outputs": [
     {
@@ -165,7 +172,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 84,
    "metadata": {},
    "outputs": [
     {
@@ -174,17 +181,9 @@
        "100"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 84,
      "metadata": {},
      "output_type": "execute_result"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "23/06/09 14:34:33 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 2438016 ms exceeds timeout 120000 ms\n",
-      "23/06/09 14:34:33 WARN SparkContext: Killing executors is not supported by current scheduler.\n"
-     ]
     }
    ],
    "source": [
@@ -207,37 +206,44 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 91,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "+--------------------+----------------+------------------------------+----------------+---------------+---------------+------------------+\n",
-      "|          prediction|      comparison|permuted_greater_than_observed|pvalue_empirical|pvalue_observed|pvalue_permuted|or_result_observed|\n",
-      "+--------------------+----------------+------------------------------+----------------+---------------+---------------+------------------+\n",
-      "|  Safety_Sideeffects|      literature|                             1|            0.01|   4.225119E-13|  5.7093783E-13|         1.3222082|\n",
-      "|  Safety_Sideeffects|      literature|                             1|            0.01|   4.225119E-13|  3.3565586E-17|         1.3222082|\n",
-      "|  Safety_Sideeffects|      literature|                             1|            0.01|   4.225119E-13|   2.538905E-23|         1.3222082|\n",
-      "|  Safety_Sideeffects|      literature|                             1|            0.01|   4.225119E-13|  1.9547876E-20|         1.3222082|\n",
-      "|  Safety_Sideeffects|      literature|                             1|            0.01|   4.225119E-13|  3.9431886E-19|         1.3222082|\n",
-      "|  Safety_Sideeffects|      literature|                             1|            0.01|   4.225119E-13|  4.9306253E-21|         1.3222082|\n",
-      "|  Safety_Sideeffects|      literature|                             1|            0.01|   4.225119E-13|  5.2965625E-20|         1.3222082|\n",
-      "|  Safety_Sideeffects|      literature|                             1|            0.01|   4.225119E-13|   7.503974E-21|         1.3222082|\n",
-      "|  Safety_Sideeffects|      literature|                             1|            0.01|   4.225119E-13|  1.3370162E-20|         1.3222082|\n",
-      "|Business_Administ...|somatic_mutation|                             2|            0.02|      0.5010471|     0.48032835|         1.0204865|\n",
-      "|Business_Administ...|somatic_mutation|                             2|            0.02|      0.5010471|      0.4706476|         1.0204865|\n",
-      "|Business_Administ...|somatic_mutation|                             2|            0.02|      0.5010471|     0.15366814|         1.0204865|\n",
-      "|Business_Administ...|somatic_mutation|                             2|            0.02|      0.5010471|    0.040951073|         1.0204865|\n",
-      "|Business_Administ...|somatic_mutation|                             2|            0.02|      0.5010471|       0.886372|         1.0204865|\n",
-      "|Business_Administ...|somatic_mutation|                             2|            0.02|      0.5010471|      0.5447119|         1.0204865|\n",
-      "|Business_Administ...|somatic_mutation|                             2|            0.02|      0.5010471|    0.051056553|         1.0204865|\n",
-      "|Business_Administ...|somatic_mutation|                             2|            0.02|      0.5010471|      0.3950442|         1.0204865|\n",
-      "|Business_Administ...|somatic_mutation|                             2|            0.02|      0.5010471|     0.48515812|         1.0204865|\n",
-      "|       Another_Study|affected_pathway|                             4|            0.04|     0.62539124|     0.24647969|        0.96532875|\n",
-      "|       Another_Study|affected_pathway|                             4|            0.04|     0.62539124|     0.64555955|        0.96532875|\n",
-      "+--------------------+----------------+------------------------------+----------------+---------------+---------------+------------------+\n",
+      "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n",
+      "|          prediction|         comparison|permuted_greater_than_observed|pvalue_empirical|pvalue_observed|or_result_observed|\n",
+      "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n",
+      "|  Safety_Sideeffects|         literature|                             2|            0.02|   4.225119E-13|         1.3222082|\n",
+      "|Business_Administ...|   somatic_mutation|                            22|            0.22|      0.5010471|         1.0204865|\n",
+      "|       Another_Study|   affected_pathway|                            47|            0.47|     0.62539124|        0.96532875|\n",
+      "| Logistics_Resources|   somatic_mutation|                            82|            0.82|     0.13673647|        0.90625197|\n",
+      "|Business_Administ...|     rna_expression|                            59|            0.59|     0.37541625|             0.976|\n",
+      "|            Negative|   somatic_mutation|                           100|             1.0|   6.9009767E-9|         1.2699835|\n",
+      "|Business_Administ...|         known_drug|                             0|             0.0|            1.0|       0.028963346|\n",
+      "|      Invalid_Reason|genetic_association|                            97|            0.97|    0.024117803|        0.83866763|\n",
+      "|          Regulatory|     rna_expression|                           100|             1.0|   1.3482296E-9|        0.54306966|\n",
+      "|Insufficient_Enro...|   affected_pathway|                            34|            0.34|   2.9514772E-6|          1.101286|\n",
+      "|          No_Context|   affected_pathway|                            97|            0.97|   4.0033254E-5|          0.685027|\n",
+      "|Insufficient_Enro...|genetic_association|                            43|            0.43|            0.0|        0.60888106|\n",
+      "|      Invalid_Reason|         known_drug|                             0|             0.0|            1.0|       0.006276258|\n",
+      "|    Interim_Analysis|         literature|                             9|            0.09|     0.10653863|         0.7150094|\n",
+      "|      Invalid_Reason|       animal_model|                            87|            0.87|    0.065790154|         0.8683092|\n",
+      "|Insufficient_Enro...|       animal_model|                            93|            0.93|   6.694221E-29|        0.71514577|\n",
+      "|Business_Administ...|   affected_pathway|                            28|            0.28|     0.60830927|         1.0135593|\n",
+      "|        Study_Design|         literature|                            84|            0.84|     0.14340925|          0.948647|\n",
+      "|    Interim_Analysis|     rna_expression|                            55|            0.55|    0.042635407|        0.46051452|\n",
+      "|             Success|         literature|                             1|            0.01|     0.69708806|         1.2096671|\n",
+      "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n",
       "only showing top 20 rows\n",
       "\n"
      ]
@@ -247,7 +253,8 @@
     "corrected_combined = (\n",
     "    calculate_empirical_pvalue(combined)\n",
     "    # .select(\"permuted_greater_than_observed\").distinct()\n",
-    "    .join(combined.select(\"prediction\", \"comparison\", \"pvalue_observed\", \"pvalue_permuted\", \"or_result_observed\"), on=[\"prediction\", \"comparison\"], how=\"inner\")\n",
+    "    .join(combined.select(\"prediction\", \"comparison\", \"pvalue_observed\", \"or_result_observed\"), on=[\"prediction\", \"comparison\"], how=\"inner\")\n",
+    "    .distinct()\n",
     ")\n",
     "\n",
     "corrected_combined.show()"
@@ -255,72 +262,66 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 92,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
+     "data": {
+      "text/plain": [
+       "100"
+      ]
+     },
+     "execution_count": 92,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "corrected_combined.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "The following packages are already present in the pyproject.toml and will be skipped:\n",
-      "\n",
-      "  • \u001b[36mmatplotlib\u001b[39m\n",
-      "\n",
-      "If you want to update it to the latest compatible version, you can use `poetry update package`.\n",
-      "If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.\n",
-      "\n",
-      "Nothing to add.\n",
-      "The following packages are already present in the pyproject.toml and will be skipped:\n",
-      "\n",
-      "  • \u001b[36mplotly.express\u001b[39m\n",
-      "\n",
-      "If you want to update it to the latest compatible version, you can use `poetry update package`.\n",
-      "If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.\n",
-      "\n",
-      "Nothing to add.\n",
-      "The following packages are already present in the pyproject.toml and will be skipped:\n",
-      "\n",
-      "  • \u001b[36mnbformat\u001b[39m\n",
-      "\n",
-      "If you want to update it to the latest compatible version, you can use `poetry update package`.\n",
-      "If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.\n",
-      "\n",
-      "Nothing to add.\n",
-      "Using version \u001b[39;1m^1.1.0\u001b[39;22m for \u001b[36mchart-studio\u001b[39m\n",
-      "\n",
-      "\u001b[34mUpdating dependencies\u001b[39m\n",
-      "\u001b[2K\u001b[34mResolving dependencies...\u001b[39m \u001b[39;2m(4.1s)\u001b[39;22m\n",
-      "\n",
-      "\u001b[34mWriting lock file\u001b[39m\n",
-      "\n",
-      "\u001b[39;1mPackage operations\u001b[39;22m: \u001b[34m2\u001b[39m installs, \u001b[34m0\u001b[39m updates, \u001b[34m0\u001b[39m removals\n",
-      "\n",
-      "  \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mretrying\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.3.4\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mPending...\u001b[39m\n",
-      "\u001b[1A\u001b[0J  \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mretrying\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.3.4\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mDownloading...\u001b[39m \u001b[39;1m0%\u001b[39;22m\n",
-      "\u001b[1A\u001b[0J  \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mretrying\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.3.4\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mDownloading...\u001b[39m \u001b[39;1m100%\u001b[39;22m\n",
-      "\u001b[1A\u001b[0J  \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mretrying\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.3.4\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mInstalling...\u001b[39m\n",
-      "\u001b[1A\u001b[0J  \u001b[32;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mretrying\u001b[39m\u001b[39m (\u001b[39m\u001b[32m1.3.4\u001b[39m\u001b[39m)\u001b[39m\n",
-      "  \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mchart-studio\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.1.0\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mPending...\u001b[39m\n",
-      "\u001b[1A\u001b[0J  \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mchart-studio\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.1.0\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mDownloading...\u001b[39m \u001b[39;1m0%\u001b[39;22m\n",
-      "\u001b[1A\u001b[0J  \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mchart-studio\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.1.0\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mDownloading...\u001b[39m \u001b[39;1m100%\u001b[39;22m\n",
-      "\u001b[1A\u001b[0J  \u001b[34;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mchart-studio\u001b[39m\u001b[39m (\u001b[39m\u001b[39;1m1.1.0\u001b[39;22m\u001b[39m)\u001b[39m: \u001b[34mInstalling...\u001b[39m\n",
-      "\u001b[1A\u001b[0J  \u001b[32;1m•\u001b[39;22m \u001b[39mInstalling \u001b[39m\u001b[36mchart-studio\u001b[39m\u001b[39m (\u001b[39m\u001b[32m1.1.0\u001b[39m\u001b[39m)\u001b[39m\n",
-      "\n",
-      "\u001b[31;1mCould not find a matching version of package chart_studio.plotly\u001b[39;22m\n"
+      "                                                                                \r"
      ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "100"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
+   "source": [
+    "corrected_combined.select(\"prediction\", \"comparison\").distinct().count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "!poetry add matplotlib\n",
     "!poetry add plotly.express\n",
-    "!poetry add nbformat\n",
     "!poetry add chart_studio\n",
     "!poetry add chart_studio.plotly\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 95,
    "metadata": {},
    "outputs": [
     {
@@ -329,7 +330,7 @@
        "'https://plotly.com/~irenelopezs/63/'"
       ]
      },
-     "execution_count": 79,
+     "execution_count": 95,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -347,9 +348,16 @@
     "\n",
     "pdf = corrected_combined.toPandas()\n",
     "pdf['or_result_observed'] = pdf['or_result_observed'].astype(str)\n",
+    "pdf['significance'] = 'Remains Significant'\n",
+    "pdf.loc[(pdf['pvalue_observed'] < 0.05) & (pdf['pvalue_empirical'] >= 0.05), 'significance'] = 'No Longer Significant'\n",
+    "pdf.loc[(pdf['pvalue_observed'] >= 0.05) & (pdf['pvalue_empirical'] < 0.05), 'significance'] = 'Now Significant'\n",
+    "pdf.loc[(pdf['pvalue_observed'] >= 0.05) & (pdf['pvalue_empirical'] >= 0.05), 'significance'] = 'Never Significant'\n",
+    "\n",
     "\n",
     "fig = px.scatter(pdf, x=\"pvalue_observed\", y=\"pvalue_empirical\",\n",
     "                 hover_data=[\"comparison\", \"prediction\", \"or_result_observed\"],\n",
+    "                 color='significance',\n",
+    "                 color_discrete_sequence=[\"green\", \"red\", \"blue\", \"black\"],\n",
     "                 title=\"Comparison of Observed and Empirical P-Values\")\n",
     "fig.add_shape(type=\"line\", \n",
     "              x0=pdf['pvalue_observed'].min(), x1=pdf['pvalue_observed'].max(), \n",
@@ -364,6 +372,407 @@
     "\n",
     "py.plot(fig, filename=\"stop_reasons_pvalue_scatter\", auto_open=True)\n"
    ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## P value correction for false discovery rate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>prediction</th>\n",
+       "      <th>comparison</th>\n",
+       "      <th>comparisonType</th>\n",
+       "      <th>predictionType</th>\n",
+       "      <th>a</th>\n",
+       "      <th>predictionTotal</th>\n",
+       "      <th>comparisonTotal</th>\n",
+       "      <th>total</th>\n",
+       "      <th>b</th>\n",
+       "      <th>c</th>\n",
+       "      <th>d</th>\n",
+       "      <th>or_result</th>\n",
+       "      <th>lower_ci</th>\n",
+       "      <th>upper_ci</th>\n",
+       "      <th>pvalue</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Insufficient_Enrollment</td>\n",
+       "      <td>europepmc</td>\n",
+       "      <td>byDatasource</td>\n",
+       "      <td>reason</td>\n",
+       "      <td>8615</td>\n",
+       "      <td>18439</td>\n",
+       "      <td>202849</td>\n",
+       "      <td>413172</td>\n",
+       "      <td>9824</td>\n",
+       "      <td>194234</td>\n",
+       "      <td>200499</td>\n",
+       "      <td>0.905219</td>\n",
+       "      <td>0.878822</td>\n",
+       "      <td>0.932410</td>\n",
+       "      <td>4.259430e-11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Business_Administrative</td>\n",
+       "      <td>europepmc</td>\n",
+       "      <td>byDatasource</td>\n",
+       "      <td>reason</td>\n",
+       "      <td>5602</td>\n",
+       "      <td>11630</td>\n",
+       "      <td>202849</td>\n",
+       "      <td>413172</td>\n",
+       "      <td>6028</td>\n",
+       "      <td>197247</td>\n",
+       "      <td>204295</td>\n",
+       "      <td>0.962536</td>\n",
+       "      <td>0.927670</td>\n",
+       "      <td>0.998713</td>\n",
+       "      <td>4.310669e-02</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Negative</td>\n",
+       "      <td>europepmc</td>\n",
+       "      <td>byDatasource</td>\n",
+       "      <td>reason</td>\n",
+       "      <td>2577</td>\n",
+       "      <td>5576</td>\n",
+       "      <td>202849</td>\n",
+       "      <td>413172</td>\n",
+       "      <td>2999</td>\n",
+       "      <td>200272</td>\n",
+       "      <td>207324</td>\n",
+       "      <td>0.889544</td>\n",
+       "      <td>0.843623</td>\n",
+       "      <td>0.937964</td>\n",
+       "      <td>1.493401e-05</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Invalid_Reason</td>\n",
+       "      <td>europepmc</td>\n",
+       "      <td>byDatasource</td>\n",
+       "      <td>reason</td>\n",
+       "      <td>1285</td>\n",
+       "      <td>2577</td>\n",
+       "      <td>202849</td>\n",
+       "      <td>413172</td>\n",
+       "      <td>1292</td>\n",
+       "      <td>201564</td>\n",
+       "      <td>209031</td>\n",
+       "      <td>1.031427</td>\n",
+       "      <td>0.954548</td>\n",
+       "      <td>1.114498</td>\n",
+       "      <td>4.408484e-01</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Covid19</td>\n",
+       "      <td>europepmc</td>\n",
+       "      <td>byDatasource</td>\n",
+       "      <td>reason</td>\n",
+       "      <td>228</td>\n",
+       "      <td>503</td>\n",
+       "      <td>202849</td>\n",
+       "      <td>413172</td>\n",
+       "      <td>275</td>\n",
+       "      <td>202621</td>\n",
+       "      <td>210048</td>\n",
+       "      <td>0.859481</td>\n",
+       "      <td>0.721025</td>\n",
+       "      <td>1.024524</td>\n",
+       "      <td>9.864084e-02</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                prediction comparison comparisonType predictionType     a  \\\n",
+       "0  Insufficient_Enrollment  europepmc   byDatasource         reason  8615   \n",
+       "1  Business_Administrative  europepmc   byDatasource         reason  5602   \n",
+       "2                 Negative  europepmc   byDatasource         reason  2577   \n",
+       "3           Invalid_Reason  europepmc   byDatasource         reason  1285   \n",
+       "4                  Covid19  europepmc   byDatasource         reason   228   \n",
+       "\n",
+       "   predictionTotal  comparisonTotal   total     b       c       d  or_result  \\\n",
+       "0            18439           202849  413172  9824  194234  200499   0.905219   \n",
+       "1            11630           202849  413172  6028  197247  204295   0.962536   \n",
+       "2             5576           202849  413172  2999  200272  207324   0.889544   \n",
+       "3             2577           202849  413172  1292  201564  209031   1.031427   \n",
+       "4              503           202849  413172   275  202621  210048   0.859481   \n",
+       "\n",
+       "   lower_ci  upper_ci        pvalue  \n",
+       "0  0.878822  0.932410  4.259430e-11  \n",
+       "1  0.927670  0.998713  4.310669e-02  \n",
+       "2  0.843623  0.937964  1.493401e-05  \n",
+       "3  0.954548  1.114498  4.408484e-01  \n",
+       "4  0.721025  1.024524  9.864084e-02  "
+      ]
+     },
+     "execution_count": 97,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "observed = spark.read.parquet(\"/Users/irenelopez/MEGAsync/EBI/repos/stopReasons/data/predictions_aggregations_all\").toPandas()\n",
+    "\n",
+    "observed.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>prediction</th>\n",
+       "      <th>comparison</th>\n",
+       "      <th>comparisonType</th>\n",
+       "      <th>predictionType</th>\n",
+       "      <th>a</th>\n",
+       "      <th>predictionTotal</th>\n",
+       "      <th>comparisonTotal</th>\n",
+       "      <th>total</th>\n",
+       "      <th>b</th>\n",
+       "      <th>c</th>\n",
+       "      <th>d</th>\n",
+       "      <th>or_result</th>\n",
+       "      <th>lower_ci</th>\n",
+       "      <th>upper_ci</th>\n",
+       "      <th>pvalue</th>\n",
+       "      <th>rejects</th>\n",
+       "      <th>pvalue_corrected</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Insufficient_Enrollment</td>\n",
+       "      <td>europepmc</td>\n",
+       "      <td>byDatasource</td>\n",
+       "      <td>reason</td>\n",
+       "      <td>8615</td>\n",
+       "      <td>18439</td>\n",
+       "      <td>202849</td>\n",
+       "      <td>413172</td>\n",
+       "      <td>9824</td>\n",
+       "      <td>194234</td>\n",
+       "      <td>200499</td>\n",
+       "      <td>0.905219</td>\n",
+       "      <td>0.878822</td>\n",
+       "      <td>0.932410</td>\n",
+       "      <td>4.259430e-11</td>\n",
+       "      <td>True</td>\n",
+       "      <td>1.723030e-10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Business_Administrative</td>\n",
+       "      <td>europepmc</td>\n",
+       "      <td>byDatasource</td>\n",
+       "      <td>reason</td>\n",
+       "      <td>5602</td>\n",
+       "      <td>11630</td>\n",
+       "      <td>202849</td>\n",
+       "      <td>413172</td>\n",
+       "      <td>6028</td>\n",
+       "      <td>197247</td>\n",
+       "      <td>204295</td>\n",
+       "      <td>0.962536</td>\n",
+       "      <td>0.927670</td>\n",
+       "      <td>0.998713</td>\n",
+       "      <td>4.310669e-02</td>\n",
+       "      <td>False</td>\n",
+       "      <td>7.268878e-02</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Negative</td>\n",
+       "      <td>europepmc</td>\n",
+       "      <td>byDatasource</td>\n",
+       "      <td>reason</td>\n",
+       "      <td>2577</td>\n",
+       "      <td>5576</td>\n",
+       "      <td>202849</td>\n",
+       "      <td>413172</td>\n",
+       "      <td>2999</td>\n",
+       "      <td>200272</td>\n",
+       "      <td>207324</td>\n",
+       "      <td>0.889544</td>\n",
+       "      <td>0.843623</td>\n",
+       "      <td>0.937964</td>\n",
+       "      <td>1.493401e-05</td>\n",
+       "      <td>True</td>\n",
+       "      <td>4.078030e-05</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Invalid_Reason</td>\n",
+       "      <td>europepmc</td>\n",
+       "      <td>byDatasource</td>\n",
+       "      <td>reason</td>\n",
+       "      <td>1285</td>\n",
+       "      <td>2577</td>\n",
+       "      <td>202849</td>\n",
+       "      <td>413172</td>\n",
+       "      <td>1292</td>\n",
+       "      <td>201564</td>\n",
+       "      <td>209031</td>\n",
+       "      <td>1.031427</td>\n",
+       "      <td>0.954548</td>\n",
+       "      <td>1.114498</td>\n",
+       "      <td>4.408484e-01</td>\n",
+       "      <td>False</td>\n",
+       "      <td>5.564568e-01</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Covid19</td>\n",
+       "      <td>europepmc</td>\n",
+       "      <td>byDatasource</td>\n",
+       "      <td>reason</td>\n",
+       "      <td>228</td>\n",
+       "      <td>503</td>\n",
+       "      <td>202849</td>\n",
+       "      <td>413172</td>\n",
+       "      <td>275</td>\n",
+       "      <td>202621</td>\n",
+       "      <td>210048</td>\n",
+       "      <td>0.859481</td>\n",
+       "      <td>0.721025</td>\n",
+       "      <td>1.024524</td>\n",
+       "      <td>9.864084e-02</td>\n",
+       "      <td>False</td>\n",
+       "      <td>1.523175e-01</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                prediction comparison comparisonType predictionType     a  \\\n",
+       "0  Insufficient_Enrollment  europepmc   byDatasource         reason  8615   \n",
+       "1  Business_Administrative  europepmc   byDatasource         reason  5602   \n",
+       "2                 Negative  europepmc   byDatasource         reason  2577   \n",
+       "3           Invalid_Reason  europepmc   byDatasource         reason  1285   \n",
+       "4                  Covid19  europepmc   byDatasource         reason   228   \n",
+       "\n",
+       "   predictionTotal  comparisonTotal   total     b       c       d  or_result  \\\n",
+       "0            18439           202849  413172  9824  194234  200499   0.905219   \n",
+       "1            11630           202849  413172  6028  197247  204295   0.962536   \n",
+       "2             5576           202849  413172  2999  200272  207324   0.889544   \n",
+       "3             2577           202849  413172  1292  201564  209031   1.031427   \n",
+       "4              503           202849  413172   275  202621  210048   0.859481   \n",
+       "\n",
+       "   lower_ci  upper_ci        pvalue  rejects  pvalue_corrected  \n",
+       "0  0.878822  0.932410  4.259430e-11     True      1.723030e-10  \n",
+       "1  0.927670  0.998713  4.310669e-02    False      7.268878e-02  \n",
+       "2  0.843623  0.937964  1.493401e-05     True      4.078030e-05  \n",
+       "3  0.954548  1.114498  4.408484e-01    False      5.564568e-01  \n",
+       "4  0.721025  1.024524  9.864084e-02    False      1.523175e-01  "
+      ]
+     },
+     "execution_count": 99,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from statsmodels.stats.multitest import fdrcorrection\n",
+    "\n",
+    "pvals = observed['pvalue'].values\n",
+    "\n",
+    "rejected, pvals_corrected = fdrcorrection(pvals, alpha=0.05, method=\"indep\", is_sorted=False)\n",
+    "observed['rejects'] = rejected\n",
+    "observed[\"pvalue_corrected\"] = pvals_corrected\n",
+    "\n",
+    "observed.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "observed[\"significance\"] = \"Remains Significant\"\n",
+    "observed.loc[(pdf[\"pvalue\"] < 0.05) & (pdf[\"pvalue_corrected\"] >= 0.05), \"significance\"] = \"No Longer Significant\"\n",
+    "observed.loc[(pdf[\"pvalue_observed\"] >= 0.05) & (pdf[\"pvalue_corrected\"] < 0.05), \"significance\"] = \"Now Significant\"\n",
+    "observed.loc[(pdf[\"pvalue_observed\"] >= 0.05) & (pdf[\"pvalue_corrected\"] >= 0.05), \"significance\"] = \"Never Significant\"\n",
+    "\n",
+    "\n",
+    "fig = px.scatter(pdf, x=\"pvalue_observed\", y=\"pvalue_empirical\",\n",
+    "                 hover_data=[\"comparison\", \"prediction\", \"or_result_observed\"],\n",
+    "                 color=\"significance\",\n",
+    "                 color_discrete_sequence=[\"green\", \"red\", \"blue\", \"black\"],\n",
+    "                 title=\"Comparison of Observed and Empirical P-Values\")\n",
+    "fig.add_shape(type=\"line\", \n",
+    "              x0=pdf[\"pvalue_observed\"].min(), x1=pdf[\"pvalue_observed\"].max(), \n",
+    "              y0=0.05, y1=0.05,\n",
+    "              line=dict(color=\"Red\", dash=\"dash\"))\n",
+    "\n",
+    "fig.add_shape(type=\"line\", \n",
+    "              x0=0.05, x1=0.05,\n",
+    "              y0=pdf[\"pvalue_empirical\"].min(), y1=pdf[\"pvalue_empirical\"].max(), \n",
+    "              line=dict(color=\"Red\", dash=\"dash\"))\n",
+    "fig.update_layout(xaxis_title=\"Observed P-Values\", yaxis_title=\"Empirical P-Values\")\n",
+    "\n",
+    "py.plot(fig, filename=\"stop_reasons_pvalue_scatter\", auto_open=True)"
+   ]
   }
  ],
  "metadata": {
diff --git a/analysis/python/multiple_testing_correction.py b/analysis/python/multiple_testing_correction.py
new file mode 100644
index 0000000..54be37d
--- /dev/null
+++ b/analysis/python/multiple_testing_correction.py
@@ -0,0 +1,40 @@
+"""Applying Benjamini/Hochberg correction to account for false discovery rate."""
+
+import typer
+from pyspark.sql import SparkSession
+from statsmodels.stats.multitest import fdrcorrection
+
+
+def apply_fdr_correction(df):
+    """Apply Benjamini/Hochberg correction to account for false discovery rate."""
+    pvals = df["pvalue"].values
+    _, pvals_corrected = fdrcorrection(
+        pvals, alpha=0.05, method="indep", is_sorted=False
+    )
+    df["pvalue_corrected"] = pvals_corrected
+    return df
+
+
+def main(data_path):
+    """Apply Benjamini/Hochberg correction to account for false discovery rate in every predictionType/comparisonType pair.
+
+    Args:
+        data_path (str): Path to the parquet file containing the results of the enrichment analysis.
+    """
+    spark = SparkSession.builder.getOrCreate()
+    enrichments = spark.read.parquet(data_path).toPandas()
+
+    enrichments = enrichments.groupby(["predictionType", "comparisonType"]).apply(
+        apply_fdr_correction
+    )
+    output_data_path = f"{data_path}_corrected"
+    (
+        spark.createDataFrame(enrichments)
+        .write.mode("overwrite")
+        .parquet(output_data_path)
+    )
+    print(f"Adjusted data saved to {output_data_path}.")
+
+
+if __name__ == "__main__":
+    typer.run(main)

From 88c7b94b5f0a355de8b10e90e52c63fe9a9d5a8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Thu, 15 Jun 2023 19:40:07 +0100
Subject: [PATCH 2/6] feat: add plot_pvalue_scatter

---
 .../python/multiple_testing_correction.py     | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/analysis/python/multiple_testing_correction.py b/analysis/python/multiple_testing_correction.py
index 54be37d..79fe22e 100644
--- a/analysis/python/multiple_testing_correction.py
+++ b/analysis/python/multiple_testing_correction.py
@@ -15,6 +15,66 @@ def apply_fdr_correction(df):
     return df
 
 
+def plot_pvalue_scatter(df, figure_name, username, api_key):
+    """Plot a scatter plot of the observed p-values vs. the adjusted p-values after multiple testing correction.
+
+    The plot is saved to the plotly cloud and can be accessed via the link in the terminal output.
+
+    Args:
+        df (pandas.DataFrame): DataFrame containing the results of the enrichment analysis.
+        figure_name (str): Name of the plotly figure.
+        username (str): Username for the plotly account.
+        api_key (str): API key for the plotly account.
+    """
+    import chart_studio
+    import chart_studio.plotly as py
+    import plotly.express as px
+
+    chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
+
+    df["significance"] = "Remains Significant"
+    df.loc[
+        (df["pvalue"] < 0.05) & (df["pvalue_corrected"] >= 0.05), "significance"
+    ] = "No Longer Significant"
+    df.loc[
+        (df["pvalue"] >= 0.05) & (df["pvalue_corrected"] < 0.05), "significance"
+    ] = "Now Significant"
+    df.loc[
+        (df["pvalue"] >= 0.05) & (df["pvalue_corrected"] >= 0.05), "significance"
+    ] = "Never Significant"
+
+    fig = px.scatter(
+        df,
+        x="pvalue",
+        y="pvalue_corrected",
+        hover_data=["comparison", "prediction", "or_result"],
+        color="significance",
+        color_discrete_sequence=["green", "red", "blue", "black"],
+        log_x=True,
+        log_y=True,
+        title="Comparison of df and Empirical P-Values",
+    )
+    fig.add_shape(
+        type="line",
+        x0=df["pvalue"].min(),
+        x1=df["pvalue"].max(),
+        y0=0.05,
+        y1=0.05,
+        line=dict(color="Red", dash="dash"),
+    )
+
+    fig.add_shape(
+        type="line",
+        x0=0.05,
+        x1=0.05,
+        y0=df["pvalue_corrected"].min(),
+        y1=df["pvalue_corrected"].max(),
+        line=dict(color="Red", dash="dash"),
+    )
+    fig.update_layout(xaxis_title="Observed P-Values", yaxis_title="Corrected P-Values")
+    py.plot(fig, filename=figure_name, auto_open=True)
+
+
 def main(data_path):
     """Apply Benjamini/Hochberg correction to account for false discovery rate in every predictionType/comparisonType pair.
 

From ffadc5b8134de2a6179d3c21908089097bbac560 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Thu, 15 Jun 2023 19:43:52 +0100
Subject: [PATCH 3/6] fix: remove compare_baseline.ipynb from git tracking

---
 analysis/python/compare_baseline.ipynb | 800 -------------------------
 1 file changed, 800 deletions(-)
 delete mode 100644 analysis/python/compare_baseline.ipynb

diff --git a/analysis/python/compare_baseline.ipynb b/analysis/python/compare_baseline.ipynb
deleted file mode 100644
index 3cc22e8..0000000
--- a/analysis/python/compare_baseline.ipynb
+++ /dev/null
@@ -1,800 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pyspark.sql import SparkSession, Window\n",
-    "import pyspark.sql.functions as F\n",
-    "\n",
-    "spark = SparkSession.builder.getOrCreate()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 80,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                                                                \r"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n",
-      "|          prediction|      comparison|comparisonType|predictionType|   a|predictionTotal|comparisonTotal| total|    b|    c|     d|or_result| lower_ci| upper_ci|       pvalue|\n",
-      "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n",
-      "|Insufficient_Enro...|affected_pathway|    byDatatype|        reason|2291|          18439|          46211|413172|16148|43920|350813|1.1332343| 1.083513|1.1852372|  6.968556E-8|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason| 615|           5576|          46211|413172| 4961|45596|362000|0.9842099|0.9045937|1.0708333|    0.7321643|\n",
-      "|Business_Administ...|affected_pathway|    byDatatype|        reason|1317|          11630|          46211|413172|10313|44894|356648|1.0145005|0.9571549|1.0752817|   0.62243146|\n",
-      "|      Invalid_Reason|affected_pathway|    byDatatype|        reason| 422|           2577|          46211|413172| 2155|45789|364806|1.5601487|1.4049416| 1.732502|2.1676067E-15|\n",
-      "|        Study_Design|affected_pathway|    byDatatype|        reason| 405|           3166|          46211|413172| 2761|45806|364200| 1.166289|1.0503099| 1.295075| 0.0046313014|\n",
-      "+--------------------+----------------+--------------+--------------+----+---------------+---------------+------+-----+-----+------+---------+---------+---------+-------------+\n",
-      "only showing top 5 rows\n",
-      "\n",
-      "+--------------------+----------+--------------+--------------+----+---------------+---------------+------+----+------+------+----------+----------+----------+-------------+\n",
-      "|          prediction|comparison|comparisonType|predictionType|   a|predictionTotal|comparisonTotal| total|   b|     c|     d| or_result|  lower_ci|  upper_ci|       pvalue|\n",
-      "+--------------------+----------+--------------+--------------+----+---------------+---------------+------+----+------+------+----------+----------+----------+-------------+\n",
-      "|Insufficient_Enro...| europepmc|  byDatasource|        reason|8615|          18439|         202849|413172|9824|194234|200499| 0.9052195|0.87882197| 0.9324099|4.2594296E-11|\n",
-      "|Business_Administ...| europepmc|  byDatasource|        reason|5602|          11630|         202849|413172|6028|197247|204295|0.96253645|0.92767036|  0.998713|  0.043106694|\n",
-      "|            Negative| europepmc|  byDatasource|        reason|2577|           5576|         202849|413172|2999|200272|207324| 0.8895437|0.84362316|0.93796384| 1.4934009E-5|\n",
-      "|      Invalid_Reason| europepmc|  byDatasource|        reason|1285|           2577|         202849|413172|1292|201564|209031| 1.0314267| 0.9545476| 1.1144975|   0.44084844|\n",
-      "|             Covid19| europepmc|  byDatasource|        reason| 228|            503|         202849|413172| 275|202621|210048| 0.8594809| 0.7210249| 1.0245243|   0.09864084|\n",
-      "+--------------------+----------+--------------+--------------+----+---------------+---------------+------+----+------+------+----------+----------+----------+-------------+\n",
-      "only showing top 5 rows\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "permuted = spark.read.parquet(\"/Users/irenelopez/MEGAsync/EBI/repos/stopReasons/data/baseline_predictions_aggregations/*\")\n",
-    "observed = spark.read.parquet(\"/Users/irenelopez/MEGAsync/EBI/repos/stopReasons/data/predictions_aggregations_all\")\n",
-    "\n",
-    "permuted.show(5)\n",
-    "observed.show(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 81,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n",
-      "|          prediction|      comparison|comparisonType|predictionType|or_result_observed|lower_ci_observed|upper_ci_observed|pvalue_observed|or_result_permuted|lower_ci_permuted|upper_ci_permuted|pvalue_permuted|\n",
-      "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n",
-      "|Insufficient_Enro...|affected_pathway|    byDatatype|        reason|          1.101286|        1.0580636|        1.1462741|   2.9514772E-6|         1.1332343|         1.083513|        1.1852372|    6.968556E-8|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|         0.9842099|        0.9045937|        1.0708333|      0.7321643|\n",
-      "|Business_Administ...|affected_pathway|    byDatatype|        reason|         1.0135593|        0.9628919|         1.066893|     0.60830927|         1.0145005|        0.9571549|        1.0752817|     0.62243146|\n",
-      "|      Invalid_Reason|affected_pathway|    byDatatype|        reason|           1.42308|        1.2919756|        1.5674883|  4.5850663E-12|         1.5601487|        1.4049416|         1.732502|  2.1676067E-15|\n",
-      "|        Study_Design|affected_pathway|    byDatatype|        reason|         1.1850995|         1.080571|        1.2997394|   3.9830338E-4|          1.166289|        1.0503099|         1.295075|   0.0046313014|\n",
-      "|  Safety_Sideeffects|affected_pathway|    byDatatype|        reason|         1.6450619|        1.5032661|        1.8002325|   5.306468E-25|         1.5841677|        1.4315631|          1.75304|  2.4105537E-17|\n",
-      "| Logistics_Resources|affected_pathway|    byDatatype|        reason|        0.78194696|         0.695937|        0.8785868|   2.4237832E-5|        0.82520795|       0.72417057|        0.9403422|   0.0036548723|\n",
-      "|   Study_Staff_Moved|affected_pathway|    byDatatype|        reason|         1.0708259|        0.9271349|        1.2367867|      0.3469289|         1.1133336|       0.94795865|        1.3075589|        0.19965|\n",
-      "|          No_Context|affected_pathway|    byDatatype|        reason|          0.685027|        0.5684092|        0.8255707|   4.0033254E-5|        0.82053685|        0.6722922|        1.0014704|    0.053746197|\n",
-      "|            Negative|      literature|    byDatatype|        reason|         0.8895437|       0.84362316|       0.93796384|   1.4934009E-5|         0.9543413|       0.90287566|        1.0087405|     0.10023123|\n",
-      "|Business_Administ...|      literature|    byDatatype|        reason|        0.96253645|       0.92767036|         0.998713|    0.043106694|         1.0164348|       0.97820574|        1.0561578|     0.40482116|\n",
-      "|Insufficient_Enro...|      literature|    byDatatype|        reason|         0.9052195|       0.87882197|        0.9324099|  4.2594296E-11|         1.0470023|        1.0154496|        1.0795352|   0.0033233627|\n",
-      "|      Invalid_Reason|      literature|    byDatatype|        reason|         1.0314267|        0.9545476|        1.1144975|     0.44084844|         1.2810183|        1.1842574|        1.3856851|   9.200067E-10|\n",
-      "|             Covid19|      literature|    byDatatype|        reason|         0.8594809|        0.7210249|        1.0245243|     0.09864084|         0.5638818|       0.45956767|       0.69187355|   1.1856974E-8|\n",
-      "|        Study_Design|      literature|    byDatatype|        reason|          0.948647|       0.88451016|        1.0174345|     0.14340925|         1.0617356|       0.98768425|         1.141339|    0.105820864|\n",
-      "|  Safety_Sideeffects|      literature|    byDatatype|        reason|         1.3222082|        1.2257016|        1.4263134|   4.225119E-13|         1.4366288|        1.3318783|         1.549618|   1.605722E-20|\n",
-      "| Logistics_Resources|      literature|    byDatatype|        reason|        0.76919395|        0.7120554|       0.83091766|  2.2282516E-11|        0.80852926|       0.74461913|       0.87792474|   3.4295346E-7|\n",
-      "|   Study_Staff_Moved|      literature|    byDatatype|        reason|         0.9657753|       0.86894673|        1.0733938|      0.5355322|          0.937642|        0.8390834|        1.0477773|     0.26166993|\n",
-      "|       Another_Study|      literature|    byDatatype|        reason|        0.71158177|       0.64844126|       0.78087044|   5.357373E-13|        0.76391065|       0.69150585|        0.8438966|   7.8315104E-8|\n",
-      "|            Negative|  rna_expression|    byDatatype|        reason|         0.9926047|       0.92027634|        1.0706177|      0.8624524|         1.0168192|        0.9524448|        1.0855448|      0.6150149|\n",
-      "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+------------------+-----------------+-----------------+---------------+\n",
-      "only showing top 20 rows\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "# reshape data before joining\n",
-    "\n",
-    "common_cols = [\"prediction\", \"comparison\", \"comparisonType\", \"predictionType\"]\n",
-    "results_cols  = [\"or_result\", \"lower_ci\", \"upper_ci\", \"pvalue\"]\n",
-    "\n",
-    "\n",
-    "permuted = permuted.select(\n",
-    "    common_cols + [F.col(c).alias(f\"{c}_permuted\") for c in results_cols]\n",
-    ")\n",
-    "\n",
-    "observed = observed.select(\n",
-    "    common_cols + [F.col(c).alias(f\"{c}_observed\") for c in results_cols]\n",
-    ")\n",
-    "\n",
-    "combined = observed.join(permuted, on=common_cols, how=\"inner\")\n",
-    "\n",
-    "combined.show()\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 82,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n",
-      "|          prediction|      comparison|comparisonType|predictionType|or_result_permuted|lower_ci_permuted|upper_ci_permuted|pvalue_permuted|\n",
-      "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n",
-      "|Insufficient_Enro...|affected_pathway|    byDatatype|        reason|         1.1332343|         1.083513|        1.1852372|    6.968556E-8|\n",
-      "|            Negative|affected_pathway|    byDatatype|        reason|         0.9842099|        0.9045937|        1.0708333|      0.7321643|\n",
-      "|Business_Administ...|affected_pathway|    byDatatype|        reason|         1.0145005|        0.9571549|        1.0752817|     0.62243146|\n",
-      "|      Invalid_Reason|affected_pathway|    byDatatype|        reason|         1.5601487|        1.4049416|         1.732502|  2.1676067E-15|\n",
-      "|        Study_Design|affected_pathway|    byDatatype|        reason|          1.166289|        1.0503099|         1.295075|   0.0046313014|\n",
-      "+--------------------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n",
-      "only showing top 5 rows\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "permuted.show(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 83,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "+----------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n",
-      "|prediction|      comparison|comparisonType|predictionType|or_result_observed|lower_ci_observed|upper_ci_observed|pvalue_observed|\n",
-      "+----------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n",
-      "|  Negative|affected_pathway|    byDatatype|        reason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|\n",
-      "|  Negative|affected_pathway|    byDatatype|    metareason|        0.99127966|       0.92052126|        1.0674771|     0.83585304|\n",
-      "+----------+----------------+--------------+--------------+------------------+-----------------+-----------------+---------------+\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "observed.filter(F.col(\"prediction\") == \"Negative\").filter(F.col(\"comparison\") == \"affected_pathway\").show(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 84,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "100"
-      ]
-     },
-     "execution_count": 84,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "combined.select(\"prediction\", \"comparison\").distinct().count()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def calculate_empirical_pvalue(df):\n",
-    "    return(\n",
-    "        df.withColumn(\"permuted_greater_than_observed\", F.when(F.col(\"pvalue_permuted\") > F.col(\"pvalue_observed\"), F.lit(1)).otherwise(F.lit(0)))\n",
-    "        .groupBy(\"comparison\", \"prediction\").agg(F.sum(\"permuted_greater_than_observed\").alias(\"permuted_greater_than_observed\"))\n",
-    "        .withColumn(\"pvalue_empirical\", F.col(\"permuted_greater_than_observed\") / 100)\n",
-    "    )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 91,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                                                                \r"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n",
-      "|          prediction|         comparison|permuted_greater_than_observed|pvalue_empirical|pvalue_observed|or_result_observed|\n",
-      "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n",
-      "|  Safety_Sideeffects|         literature|                             2|            0.02|   4.225119E-13|         1.3222082|\n",
-      "|Business_Administ...|   somatic_mutation|                            22|            0.22|      0.5010471|         1.0204865|\n",
-      "|       Another_Study|   affected_pathway|                            47|            0.47|     0.62539124|        0.96532875|\n",
-      "| Logistics_Resources|   somatic_mutation|                            82|            0.82|     0.13673647|        0.90625197|\n",
-      "|Business_Administ...|     rna_expression|                            59|            0.59|     0.37541625|             0.976|\n",
-      "|            Negative|   somatic_mutation|                           100|             1.0|   6.9009767E-9|         1.2699835|\n",
-      "|Business_Administ...|         known_drug|                             0|             0.0|            1.0|       0.028963346|\n",
-      "|      Invalid_Reason|genetic_association|                            97|            0.97|    0.024117803|        0.83866763|\n",
-      "|          Regulatory|     rna_expression|                           100|             1.0|   1.3482296E-9|        0.54306966|\n",
-      "|Insufficient_Enro...|   affected_pathway|                            34|            0.34|   2.9514772E-6|          1.101286|\n",
-      "|          No_Context|   affected_pathway|                            97|            0.97|   4.0033254E-5|          0.685027|\n",
-      "|Insufficient_Enro...|genetic_association|                            43|            0.43|            0.0|        0.60888106|\n",
-      "|      Invalid_Reason|         known_drug|                             0|             0.0|            1.0|       0.006276258|\n",
-      "|    Interim_Analysis|         literature|                             9|            0.09|     0.10653863|         0.7150094|\n",
-      "|      Invalid_Reason|       animal_model|                            87|            0.87|    0.065790154|         0.8683092|\n",
-      "|Insufficient_Enro...|       animal_model|                            93|            0.93|   6.694221E-29|        0.71514577|\n",
-      "|Business_Administ...|   affected_pathway|                            28|            0.28|     0.60830927|         1.0135593|\n",
-      "|        Study_Design|         literature|                            84|            0.84|     0.14340925|          0.948647|\n",
-      "|    Interim_Analysis|     rna_expression|                            55|            0.55|    0.042635407|        0.46051452|\n",
-      "|             Success|         literature|                             1|            0.01|     0.69708806|         1.2096671|\n",
-      "+--------------------+-------------------+------------------------------+----------------+---------------+------------------+\n",
-      "only showing top 20 rows\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "corrected_combined = (\n",
-    "    calculate_empirical_pvalue(combined)\n",
-    "    # .select(\"permuted_greater_than_observed\").distinct()\n",
-    "    .join(combined.select(\"prediction\", \"comparison\", \"pvalue_observed\", \"or_result_observed\"), on=[\"prediction\", \"comparison\"], how=\"inner\")\n",
-    "    .distinct()\n",
-    ")\n",
-    "\n",
-    "corrected_combined.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 92,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "100"
-      ]
-     },
-     "execution_count": 92,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "corrected_combined.count()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 90,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                                                                \r"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "100"
-      ]
-     },
-     "execution_count": 90,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "corrected_combined.select(\"prediction\", \"comparison\").distinct().count()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!poetry add matplotlib\n",
-    "!poetry add plotly.express\n",
-    "!poetry add chart_studio\n",
-    "!poetry add chart_studio.plotly\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 95,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'https://plotly.com/~irenelopezs/63/'"
-      ]
-     },
-     "execution_count": 95,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import chart_studio\n",
-    "import chart_studio.plotly as py\n",
-    "import plotly.express as px\n",
-    "\n",
-    "\n",
-    "username = 'irenelopezs'\n",
-    "api_key = 'RaERO2wbBPVpLGiNNL1y'\n",
-    "\n",
-    "chart_studio.tools.set_credentials_file(username=username, api_key=api_key)\n",
-    "\n",
-    "pdf = corrected_combined.toPandas()\n",
-    "pdf['or_result_observed'] = pdf['or_result_observed'].astype(str)\n",
-    "pdf['significance'] = 'Remains Significant'\n",
-    "pdf.loc[(pdf['pvalue_observed'] < 0.05) & (pdf['pvalue_empirical'] >= 0.05), 'significance'] = 'No Longer Significant'\n",
-    "pdf.loc[(pdf['pvalue_observed'] >= 0.05) & (pdf['pvalue_empirical'] < 0.05), 'significance'] = 'Now Significant'\n",
-    "pdf.loc[(pdf['pvalue_observed'] >= 0.05) & (pdf['pvalue_empirical'] >= 0.05), 'significance'] = 'Never Significant'\n",
-    "\n",
-    "\n",
-    "fig = px.scatter(pdf, x=\"pvalue_observed\", y=\"pvalue_empirical\",\n",
-    "                 hover_data=[\"comparison\", \"prediction\", \"or_result_observed\"],\n",
-    "                 color='significance',\n",
-    "                 color_discrete_sequence=[\"green\", \"red\", \"blue\", \"black\"],\n",
-    "                 title=\"Comparison of Observed and Empirical P-Values\")\n",
-    "fig.add_shape(type=\"line\", \n",
-    "              x0=pdf['pvalue_observed'].min(), x1=pdf['pvalue_observed'].max(), \n",
-    "              y0=0.05, y1=0.05,\n",
-    "              line=dict(color=\"Red\", dash=\"dash\"))\n",
-    "\n",
-    "fig.add_shape(type=\"line\", \n",
-    "              x0=0.05, x1=0.05,\n",
-    "              y0=pdf['pvalue_empirical'].min(), y1=pdf['pvalue_empirical'].max(), \n",
-    "              line=dict(color=\"Red\", dash=\"dash\"))\n",
-    "fig.update_layout(xaxis_title=\"Observed P-Values\", yaxis_title=\"Empirical P-Values\")\n",
-    "\n",
-    "py.plot(fig, filename=\"stop_reasons_pvalue_scatter\", auto_open=True)\n"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## P value correction for false discovery rate"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 97,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>prediction</th>\n",
-       "      <th>comparison</th>\n",
-       "      <th>comparisonType</th>\n",
-       "      <th>predictionType</th>\n",
-       "      <th>a</th>\n",
-       "      <th>predictionTotal</th>\n",
-       "      <th>comparisonTotal</th>\n",
-       "      <th>total</th>\n",
-       "      <th>b</th>\n",
-       "      <th>c</th>\n",
-       "      <th>d</th>\n",
-       "      <th>or_result</th>\n",
-       "      <th>lower_ci</th>\n",
-       "      <th>upper_ci</th>\n",
-       "      <th>pvalue</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Insufficient_Enrollment</td>\n",
-       "      <td>europepmc</td>\n",
-       "      <td>byDatasource</td>\n",
-       "      <td>reason</td>\n",
-       "      <td>8615</td>\n",
-       "      <td>18439</td>\n",
-       "      <td>202849</td>\n",
-       "      <td>413172</td>\n",
-       "      <td>9824</td>\n",
-       "      <td>194234</td>\n",
-       "      <td>200499</td>\n",
-       "      <td>0.905219</td>\n",
-       "      <td>0.878822</td>\n",
-       "      <td>0.932410</td>\n",
-       "      <td>4.259430e-11</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Business_Administrative</td>\n",
-       "      <td>europepmc</td>\n",
-       "      <td>byDatasource</td>\n",
-       "      <td>reason</td>\n",
-       "      <td>5602</td>\n",
-       "      <td>11630</td>\n",
-       "      <td>202849</td>\n",
-       "      <td>413172</td>\n",
-       "      <td>6028</td>\n",
-       "      <td>197247</td>\n",
-       "      <td>204295</td>\n",
-       "      <td>0.962536</td>\n",
-       "      <td>0.927670</td>\n",
-       "      <td>0.998713</td>\n",
-       "      <td>4.310669e-02</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Negative</td>\n",
-       "      <td>europepmc</td>\n",
-       "      <td>byDatasource</td>\n",
-       "      <td>reason</td>\n",
-       "      <td>2577</td>\n",
-       "      <td>5576</td>\n",
-       "      <td>202849</td>\n",
-       "      <td>413172</td>\n",
-       "      <td>2999</td>\n",
-       "      <td>200272</td>\n",
-       "      <td>207324</td>\n",
-       "      <td>0.889544</td>\n",
-       "      <td>0.843623</td>\n",
-       "      <td>0.937964</td>\n",
-       "      <td>1.493401e-05</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Invalid_Reason</td>\n",
-       "      <td>europepmc</td>\n",
-       "      <td>byDatasource</td>\n",
-       "      <td>reason</td>\n",
-       "      <td>1285</td>\n",
-       "      <td>2577</td>\n",
-       "      <td>202849</td>\n",
-       "      <td>413172</td>\n",
-       "      <td>1292</td>\n",
-       "      <td>201564</td>\n",
-       "      <td>209031</td>\n",
-       "      <td>1.031427</td>\n",
-       "      <td>0.954548</td>\n",
-       "      <td>1.114498</td>\n",
-       "      <td>4.408484e-01</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Covid19</td>\n",
-       "      <td>europepmc</td>\n",
-       "      <td>byDatasource</td>\n",
-       "      <td>reason</td>\n",
-       "      <td>228</td>\n",
-       "      <td>503</td>\n",
-       "      <td>202849</td>\n",
-       "      <td>413172</td>\n",
-       "      <td>275</td>\n",
-       "      <td>202621</td>\n",
-       "      <td>210048</td>\n",
-       "      <td>0.859481</td>\n",
-       "      <td>0.721025</td>\n",
-       "      <td>1.024524</td>\n",
-       "      <td>9.864084e-02</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                prediction comparison comparisonType predictionType     a  \\\n",
-       "0  Insufficient_Enrollment  europepmc   byDatasource         reason  8615   \n",
-       "1  Business_Administrative  europepmc   byDatasource         reason  5602   \n",
-       "2                 Negative  europepmc   byDatasource         reason  2577   \n",
-       "3           Invalid_Reason  europepmc   byDatasource         reason  1285   \n",
-       "4                  Covid19  europepmc   byDatasource         reason   228   \n",
-       "\n",
-       "   predictionTotal  comparisonTotal   total     b       c       d  or_result  \\\n",
-       "0            18439           202849  413172  9824  194234  200499   0.905219   \n",
-       "1            11630           202849  413172  6028  197247  204295   0.962536   \n",
-       "2             5576           202849  413172  2999  200272  207324   0.889544   \n",
-       "3             2577           202849  413172  1292  201564  209031   1.031427   \n",
-       "4              503           202849  413172   275  202621  210048   0.859481   \n",
-       "\n",
-       "   lower_ci  upper_ci        pvalue  \n",
-       "0  0.878822  0.932410  4.259430e-11  \n",
-       "1  0.927670  0.998713  4.310669e-02  \n",
-       "2  0.843623  0.937964  1.493401e-05  \n",
-       "3  0.954548  1.114498  4.408484e-01  \n",
-       "4  0.721025  1.024524  9.864084e-02  "
-      ]
-     },
-     "execution_count": 97,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "observed = spark.read.parquet(\"/Users/irenelopez/MEGAsync/EBI/repos/stopReasons/data/predictions_aggregations_all\").toPandas()\n",
-    "\n",
-    "observed.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 99,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>prediction</th>\n",
-       "      <th>comparison</th>\n",
-       "      <th>comparisonType</th>\n",
-       "      <th>predictionType</th>\n",
-       "      <th>a</th>\n",
-       "      <th>predictionTotal</th>\n",
-       "      <th>comparisonTotal</th>\n",
-       "      <th>total</th>\n",
-       "      <th>b</th>\n",
-       "      <th>c</th>\n",
-       "      <th>d</th>\n",
-       "      <th>or_result</th>\n",
-       "      <th>lower_ci</th>\n",
-       "      <th>upper_ci</th>\n",
-       "      <th>pvalue</th>\n",
-       "      <th>rejects</th>\n",
-       "      <th>pvalue_corrected</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Insufficient_Enrollment</td>\n",
-       "      <td>europepmc</td>\n",
-       "      <td>byDatasource</td>\n",
-       "      <td>reason</td>\n",
-       "      <td>8615</td>\n",
-       "      <td>18439</td>\n",
-       "      <td>202849</td>\n",
-       "      <td>413172</td>\n",
-       "      <td>9824</td>\n",
-       "      <td>194234</td>\n",
-       "      <td>200499</td>\n",
-       "      <td>0.905219</td>\n",
-       "      <td>0.878822</td>\n",
-       "      <td>0.932410</td>\n",
-       "      <td>4.259430e-11</td>\n",
-       "      <td>True</td>\n",
-       "      <td>1.723030e-10</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Business_Administrative</td>\n",
-       "      <td>europepmc</td>\n",
-       "      <td>byDatasource</td>\n",
-       "      <td>reason</td>\n",
-       "      <td>5602</td>\n",
-       "      <td>11630</td>\n",
-       "      <td>202849</td>\n",
-       "      <td>413172</td>\n",
-       "      <td>6028</td>\n",
-       "      <td>197247</td>\n",
-       "      <td>204295</td>\n",
-       "      <td>0.962536</td>\n",
-       "      <td>0.927670</td>\n",
-       "      <td>0.998713</td>\n",
-       "      <td>4.310669e-02</td>\n",
-       "      <td>False</td>\n",
-       "      <td>7.268878e-02</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Negative</td>\n",
-       "      <td>europepmc</td>\n",
-       "      <td>byDatasource</td>\n",
-       "      <td>reason</td>\n",
-       "      <td>2577</td>\n",
-       "      <td>5576</td>\n",
-       "      <td>202849</td>\n",
-       "      <td>413172</td>\n",
-       "      <td>2999</td>\n",
-       "      <td>200272</td>\n",
-       "      <td>207324</td>\n",
-       "      <td>0.889544</td>\n",
-       "      <td>0.843623</td>\n",
-       "      <td>0.937964</td>\n",
-       "      <td>1.493401e-05</td>\n",
-       "      <td>True</td>\n",
-       "      <td>4.078030e-05</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Invalid_Reason</td>\n",
-       "      <td>europepmc</td>\n",
-       "      <td>byDatasource</td>\n",
-       "      <td>reason</td>\n",
-       "      <td>1285</td>\n",
-       "      <td>2577</td>\n",
-       "      <td>202849</td>\n",
-       "      <td>413172</td>\n",
-       "      <td>1292</td>\n",
-       "      <td>201564</td>\n",
-       "      <td>209031</td>\n",
-       "      <td>1.031427</td>\n",
-       "      <td>0.954548</td>\n",
-       "      <td>1.114498</td>\n",
-       "      <td>4.408484e-01</td>\n",
-       "      <td>False</td>\n",
-       "      <td>5.564568e-01</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Covid19</td>\n",
-       "      <td>europepmc</td>\n",
-       "      <td>byDatasource</td>\n",
-       "      <td>reason</td>\n",
-       "      <td>228</td>\n",
-       "      <td>503</td>\n",
-       "      <td>202849</td>\n",
-       "      <td>413172</td>\n",
-       "      <td>275</td>\n",
-       "      <td>202621</td>\n",
-       "      <td>210048</td>\n",
-       "      <td>0.859481</td>\n",
-       "      <td>0.721025</td>\n",
-       "      <td>1.024524</td>\n",
-       "      <td>9.864084e-02</td>\n",
-       "      <td>False</td>\n",
-       "      <td>1.523175e-01</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                prediction comparison comparisonType predictionType     a  \\\n",
-       "0  Insufficient_Enrollment  europepmc   byDatasource         reason  8615   \n",
-       "1  Business_Administrative  europepmc   byDatasource         reason  5602   \n",
-       "2                 Negative  europepmc   byDatasource         reason  2577   \n",
-       "3           Invalid_Reason  europepmc   byDatasource         reason  1285   \n",
-       "4                  Covid19  europepmc   byDatasource         reason   228   \n",
-       "\n",
-       "   predictionTotal  comparisonTotal   total     b       c       d  or_result  \\\n",
-       "0            18439           202849  413172  9824  194234  200499   0.905219   \n",
-       "1            11630           202849  413172  6028  197247  204295   0.962536   \n",
-       "2             5576           202849  413172  2999  200272  207324   0.889544   \n",
-       "3             2577           202849  413172  1292  201564  209031   1.031427   \n",
-       "4              503           202849  413172   275  202621  210048   0.859481   \n",
-       "\n",
-       "   lower_ci  upper_ci        pvalue  rejects  pvalue_corrected  \n",
-       "0  0.878822  0.932410  4.259430e-11     True      1.723030e-10  \n",
-       "1  0.927670  0.998713  4.310669e-02    False      7.268878e-02  \n",
-       "2  0.843623  0.937964  1.493401e-05     True      4.078030e-05  \n",
-       "3  0.954548  1.114498  4.408484e-01    False      5.564568e-01  \n",
-       "4  0.721025  1.024524  9.864084e-02    False      1.523175e-01  "
-      ]
-     },
-     "execution_count": 99,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from statsmodels.stats.multitest import fdrcorrection\n",
-    "\n",
-    "pvals = observed['pvalue'].values\n",
-    "\n",
-    "rejected, pvals_corrected = fdrcorrection(pvals, alpha=0.05, method=\"indep\", is_sorted=False)\n",
-    "observed['rejects'] = rejected\n",
-    "observed[\"pvalue_corrected\"] = pvals_corrected\n",
-    "\n",
-    "observed.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "observed[\"significance\"] = \"Remains Significant\"\n",
-    "observed.loc[(pdf[\"pvalue\"] < 0.05) & (pdf[\"pvalue_corrected\"] >= 0.05), \"significance\"] = \"No Longer Significant\"\n",
-    "observed.loc[(pdf[\"pvalue_observed\"] >= 0.05) & (pdf[\"pvalue_corrected\"] < 0.05), \"significance\"] = \"Now Significant\"\n",
-    "observed.loc[(pdf[\"pvalue_observed\"] >= 0.05) & (pdf[\"pvalue_corrected\"] >= 0.05), \"significance\"] = \"Never Significant\"\n",
-    "\n",
-    "\n",
-    "fig = px.scatter(pdf, x=\"pvalue_observed\", y=\"pvalue_empirical\",\n",
-    "                 hover_data=[\"comparison\", \"prediction\", \"or_result_observed\"],\n",
-    "                 color=\"significance\",\n",
-    "                 color_discrete_sequence=[\"green\", \"red\", \"blue\", \"black\"],\n",
-    "                 title=\"Comparison of Observed and Empirical P-Values\")\n",
-    "fig.add_shape(type=\"line\", \n",
-    "              x0=pdf[\"pvalue_observed\"].min(), x1=pdf[\"pvalue_observed\"].max(), \n",
-    "              y0=0.05, y1=0.05,\n",
-    "              line=dict(color=\"Red\", dash=\"dash\"))\n",
-    "\n",
-    "fig.add_shape(type=\"line\", \n",
-    "              x0=0.05, x1=0.05,\n",
-    "              y0=pdf[\"pvalue_empirical\"].min(), y1=pdf[\"pvalue_empirical\"].max(), \n",
-    "              line=dict(color=\"Red\", dash=\"dash\"))\n",
-    "fig.update_layout(xaxis_title=\"Observed P-Values\", yaxis_title=\"Empirical P-Values\")\n",
-    "\n",
-    "py.plot(fig, filename=\"stop_reasons_pvalue_scatter\", auto_open=True)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.8"
-  },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

From 95777aca0e434c0bbcd973ec6472ad0422dd7fb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Thu, 6 Jul 2023 12:14:11 +0100
Subject: [PATCH 4/6] chore: add data to gitignore

---
 .gitignore | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 6a1dcce..e1f3820 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,8 @@ derby.log
 .vscode/dryrun.log
 .vscode/targets.log
 .vscode/configurationCache.log
-temp
\ No newline at end of file
+temp
+data/baseline_predictions_aggregations
+data/predictions_aggregations_*
+data/chembl*
+reports/*
\ No newline at end of file

From de7010b1150cd0e16ab41ce2835867321f8e018d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Sat, 9 Sep 2023 00:22:54 +0100
Subject: [PATCH 5/6] feat: redefine dataset of permuted associations

---
 analysis/README.md                      |   9 +-
 analysis/python/enrichments.py          |   1 -
 analysis/python/enrichments_permuted.py | 474 ++++++++++++++++++++++++
 config.yml                              |   1 +
 4 files changed, 483 insertions(+), 2 deletions(-)
 create mode 100644 analysis/python/enrichments_permuted.py

diff --git a/analysis/README.md b/analysis/README.md
index 971c730..1aa4da6 100644
--- a/analysis/README.md
+++ b/analysis/README.md
@@ -6,7 +6,7 @@ This pipeline consists of the following steps:
 - Enrichment analysis.
 To launch the script as a Pyspark job on a cluster:
 ```bash
-gcloud dataproc clusters create il-big-stop-reasons  \
+gcloud dataproc clusters create il-big-stop-reasons-2  \
         --image-version=2.1 \
         --project=open-targets-eu-dev \
         --region=europe-west1 \
@@ -25,6 +25,13 @@ gcloud dataproc jobs submit pyspark  \
         --project=open-targets-eu-dev \
         analysis/python/enrichments.py -- config.yml --stratify-therapeutic-area non_oncology
 
+# To determine a baseline between all sets of comparisons, run the analysis with a randomly generated set of associations
+gcloud dataproc jobs submit pyspark  \
+        --cluster=il-big-stop-reasons \
+        --files=config.yml \
+        --region=europe-west1 \
+        --project=open-targets-eu-dev
+
 # Adjust the p values for multiple testing using the Benjamini-Hochberg procedure by providing the path to the file with the results of the enrichment analysis
 gcloud dataproc jobs submit pyspark  \
         --cluster=il-big-stop-reasons \
diff --git a/analysis/python/enrichments.py b/analysis/python/enrichments.py
index c351f2f..b7ef9e6 100644
--- a/analysis/python/enrichments.py
+++ b/analysis/python/enrichments.py
@@ -2,7 +2,6 @@
 from enum import Enum
 from functools import reduce
 
-import pandas as pd
 import pyspark.sql.functions as F
 import typer
 from omegaconf import OmegaConf
diff --git a/analysis/python/enrichments_permuted.py b/analysis/python/enrichments_permuted.py
new file mode 100644
index 0000000..0ab3c38
--- /dev/null
+++ b/analysis/python/enrichments_permuted.py
@@ -0,0 +1,474 @@
+"""Analysis of the results on all clinical trials."""
+from enum import Enum
+from functools import reduce
+
+import pyspark.sql.functions as F
+import typer
+from omegaconf import OmegaConf
+from pyspark import SparkConf
+from pyspark.sql import SparkSession
+from pyspark.sql.types import FloatType, StringType, StructField, StructType
+from pyspark.sql.window import Window
+from scipy.stats import fisher_exact
+from statsmodels.stats.contingency_tables import Table2x2
+
+CLINVAR_VALIDS = [
+    # ClinVar evidence we are interested
+    "affects",
+    "risk factor",
+    "pathogenic",
+    "likely pathogenic",
+    "protective",
+    "drug response",
+]
+
+NON_NEUTRAL_PREDICTIONS = [
+    # Stop predictions from Olesya
+    "Negative",
+    "Safety_Sideeffects",
+    "Success",
+    "Business_Administrative",
+    "Invalid_Reason",
+]
+
+STOPPED_STATUS = ["Terminated", "Withdrawn", "Suspended"]
+
+
+class ModeByTA(str, Enum):
+    """Mode of analysis by therapeutic area."""
+
+    ALL = "all"
+    ONCOLOGY = "oncology"
+    NON_ONCOLOGY = "non_oncology"
+
+
+def expand_disease_index(disease):
+    """Expand disease index to include ancestors to account for differences in granularity in the mapping."""
+    return (
+        disease.select(
+            F.col("id").alias("diseaseId"),
+            F.explode("ancestors").alias("propagatedDiseaseId"),
+        )
+        .union(
+            disease.select(
+                F.col("id").alias("diseaseId"), F.col("id").alias("propagatedDiseaseId")
+            )
+        )
+        .distinct()
+    )
+
+
+def prepare_l2g(evidence):
+    """Prepare l2g evidence with some arbitrary cut-offs."""
+    return (
+        evidence.filter(F.col("datasourceId") == "ot_genetics_portal")
+        .groupBy("targetId", "diseaseId")
+        .agg(F.max("score").alias("max_l2g"))
+        .withColumn(
+            "l2g_075",
+            F.when(F.col("max_l2g") > 0.75, "l2g_0.75"),
+        )
+        .withColumn("l2g_05", F.when(F.col("max_l2g") > 0.5, "l2g_0.5"))
+        .withColumn(
+            "l2g_025",
+            F.when(F.col("max_l2g") > 0.25, "l2g_0.25"),
+        )
+        .withColumn("l2g_01", F.when(F.col("max_l2g") > 0.1, "l2g_0.1"))
+        .withColumn(
+            "l2g_005",
+            F.when(F.col("max_l2g") > 0.05, "l2g_0.05"),
+        )
+    )
+
+
+def extract_therapeutic_area(disease):
+    """Assigns the most severe therapeutic are to a disease."""
+    taDf = spark.createDataFrame(
+        data=[
+            ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
+            ("EFO_0005741", "infectious disease", "Other"),
+            ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
+            ("EFO_0005932", "animal disease", "Other"),
+            ("MONDO_0024458", "disease of visual system", "Other"),
+            ("EFO_0000319", "cardiovascular disease", "Other"),
+            ("EFO_0009605", "pancreas disease", "Other"),
+            ("EFO_0010282", "gastrointestinal disease", "Other"),
+            ("OTAR_0000017", "reproductive system or breast disease", "Other"),
+            ("EFO_0010285", "integumentary system disease", "Other"),
+            ("EFO_0001379", "endocrine system disease", "Other"),
+            ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
+            ("EFO_0009690", "urinary system disease", "Other"),
+            ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
+            ("MONDO_0021205", "disease of ear", "Other"),
+            ("EFO_0000540", "immune system disease", "Other"),
+            ("EFO_0005803", "hematologic disease", "Other"),
+            ("EFO_0000618", "nervous system disease", "Other"),
+            ("MONDO_0002025", "psychiatric disorder", "Other"),
+            ("OTAR_0000020", "nutritional or metabolic disease", "Other"),
+            ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
+            ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
+            ("EFO_0000651", "phenotype", "Other"),
+            ("EFO_0001444", "measurement", "Other"),
+            ("GO_0008150", "biological process", "Other"),
+        ],
+        schema=StructType(
+            [
+                StructField("taId", StringType(), True),
+                StructField("taLabel", StringType(), True),
+                StructField("taLabelSimple", StringType(), True),
+            ]
+        ),
+    ).withColumn("taRank", F.monotonically_increasing_id())
+    wByDisease = Window.partitionBy("diseaseId")
+    return (
+        disease.withColumn("taId", F.explode("therapeuticAreas"))
+        .select(F.col("id").alias("diseaseId"), "taId")
+        .join(taDf, on="taId", how="left")
+        .withColumn("minRank", F.min("taRank").over(wByDisease))
+        .filter(F.col("taRank") == F.col("minRank"))
+        .drop("taRank", "minRank")
+    )
+
+
+def prepare_genetic_constraint(target):
+    """Prepare genetic constraint data for a given gene."""
+    return target.withColumn("gc", F.explode("constraint.upperBin6")).select(
+        F.col("id").alias("targetId"),
+        F.col("gc").cast("string"),
+    )
+
+
+def prepare_pli(target):
+    """Prepare pLI (predicted loss of function intolerant) data for a given gene."""
+    return (
+        target.withColumn("gc", F.explode("constraint"))
+        .filter(F.col("gc.constraintType") == "lof")
+        .select(F.col("id").alias("targetId"), F.col("gc.score").alias("pLI"))
+        .withColumn(
+            "lof_tolerance",
+            F.when(F.col("pLI") > 0.9, F.lit("LoF intolerant")).otherwise(
+                F.when(F.col("pLI") < 0.1, F.lit("LoF tolerant"))
+            ),
+        )
+        .drop("pLI")
+    )
+
+
+def extract_target_partners(interaction):
+    """Extract all partners of a given target."""
+    allInteractions = (
+        interaction.filter(F.col("sourceDatabase") == "intact")
+        .filter(F.col("scoring") > 0.42)
+        .filter(F.col("targetB").isNotNull())
+        .select("targetA", "targetB")
+        .distinct()
+    )
+    return (
+        allInteractions.union(
+            allInteractions.select(
+                F.col("targetA").alias("targetB"), F.col("targetB").alias("targetA")
+            )
+        )
+        .distinct()
+        .groupBy("targetA")
+        .agg(F.count("targetB").alias("partners"))
+        .withColumn(
+            "partnersBin",
+            F.when(F.col("partners") > 20, F.lit("greaterThan20"))
+            .when(
+                (F.col("partners") > 10) & (F.col("partners") <= 20),
+                F.lit("from11to20"),
+            )
+            .when(
+                (F.col("partners") > 0) & (F.col("partners") <= 10), F.lit("from1to10")
+            )
+            .otherwise(F.lit("none")),
+        )
+        .select(F.col("targetA").alias("targetId"), F.col("partnersBin"))
+    )
+
+
+def prepare_hpa_expression(hpa):
+    """Prepare HPA expression data for a given target."""
+    return hpa.select(
+        F.col("Ensembl").alias("targetId"),
+        F.col("RNA tissue distribution").alias("rnaDistribution"),
+        F.col("RNA tissue specificity").alias("rnaSpecificity"),
+    )
+
+
+def prepare_associations(evidence, disease_ancestors):
+    """Prepare a pseudo-associations dataset that consists of propagating the ontology across the evidence dataset and extract the maximum score per data source."""
+    return (
+        # Cleaned evidence (exclude "benign" clinvar genetic evidence)
+        evidence.withColumn("evaValids", F.array([F.lit(x) for x in CLINVAR_VALIDS]))
+        .withColumn("evaFilter", F.arrays_overlap("evaValids", "clinicalSignificances"))
+        .filter((F.col("evaFilter").isNull()) | (F.col("evaFilter")))
+        # pseudo-associations: ontology propagation + max datasource score
+        .join(disease_ancestors, on="diseaseId", how="left")
+        .drop("diseaseId")
+        .withColumnRenamed("propagatedDiseaseId", "diseaseId")
+        .select("targetId", "diseaseId", "datasourceId", "datatypeId")
+        .distinct()
+    )
+
+
+def prepare_associations_universe(target, disease):
+    """Creates a ficticial set of associations based on the combination of all possible targets and diseases."""
+    return (
+        target.selectExpr("id as targetId")
+        .crossJoin(disease.selectExpr("id as diseaseId"))
+        .select(
+            "*",
+            F.lit("random").alias("datasourceId"),
+            F.lit("random").alias("datatypeId"),
+        )
+    )
+
+
+def prepare_comparisons_df() -> list:
+    """Return list of all comparisons to be used in the analysis."""
+    comparisons = spark.createDataFrame(
+        data=[
+            ("datatypeId", "byDatatype"),
+        ],
+        schema=StructType(
+            [
+                StructField("comparison", StringType(), True),
+                StructField("comparisonType", StringType(), True),
+            ]
+        ),
+    )
+
+    predictions = spark.createDataFrame(
+        data=[
+            ("reason", "reason"),
+        ],
+        schema=StructType(
+            [
+                StructField("prediction", StringType(), True),
+                StructField("predictionType", StringType(), True),
+            ]
+        ),
+    )
+    return comparisons.join(predictions, how="full").collect()
+
+
+def aggregations(
+    df, comparisonColumn, comparisonType, predictionColumn, predictionType
+):
+    """Aggregate data to compute enrichment analysis."""
+    comparison_counts = df.groupBy(comparisonColumn).agg(
+        F.countDistinct("id").alias("comparisonTotal")
+    )
+    prediction_counts = df.groupBy(predictionColumn).agg(
+        F.countDistinct("id").alias("predictionTotal")
+    )
+    intersection_counts = df.groupBy(comparisonColumn, predictionColumn).agg(
+        F.countDistinct("id").alias("a")
+    )
+
+    return (
+        df.filter(
+            F.col(comparisonColumn).isNotNull() & F.col(predictionColumn).isNotNull()
+        )
+        .withColumn("comparisonType", F.lit(comparisonType))
+        .withColumn("predictionType", F.lit(predictionType))
+        .join(comparison_counts, on=comparisonColumn, how="left")
+        .join(prediction_counts, on=predictionColumn, how="left")
+        .join(intersection_counts, on=[comparisonColumn, predictionColumn], how="left")
+        .select(
+            F.col(predictionColumn).alias("prediction"),
+            F.col(comparisonColumn).alias("comparison"),
+            "comparisonType",
+            "predictionType",
+            "a",
+            "predictionTotal",
+            "comparisonTotal",
+            "total",
+        )
+        .distinct()
+    )
+
+
+def _compute_fisher_and_or(a, b, c, d) -> tuple:
+    """Compute Fisher's exact test by shaping each row into a 2x2 contingency table."""
+    table = [[a, b], [c, d]]
+    _, pvalue = fisher_exact(table)
+    table2x2 = Table2x2(table)
+    or_result = table2x2.oddsratio
+    lower_ci, upper_ci = table2x2.oddsratio_confint()
+    return (float(or_result), float(lower_ci), float(upper_ci), float(pvalue))
+
+
+def run_analysis(
+    conf,
+    seed,
+    stratify_therapeutic_area,
+):
+    """Run enrichment analysis."""
+    # Load raw datasets
+    evidence = spark.read.parquet(conf.default.input.evidence_path)
+    disease = spark.read.parquet(conf.default.input.disease_path).persist()
+    target = spark.read.parquet(conf.default.input.target_path).persist()
+    interaction = spark.read.parquet(conf.default.input.interactions_path)
+    hpa = spark.read.json(conf.default.input.hpa_path)
+    stop_predictions = (
+        spark.read.csv(conf.default.input.predictions_freeze_path, header=True)
+        # add prediction metaclass
+        .withColumn(
+            "metareason",
+            F.when(
+                F.col("prediction").isin(NON_NEUTRAL_PREDICTIONS), F.col("prediction")
+            ).otherwise(F.lit("Neutral")),
+        )
+    ).withColumnRenamed("prediction", "reason")
+
+    # Prepare processed datasets
+    disease_ancestors = expand_disease_index(disease)
+    l2g = prepare_l2g(evidence)
+    disease_ta = extract_therapeutic_area(disease)
+    target_gc = prepare_genetic_constraint(target)
+    target_pli = prepare_pli(target)
+    partners = extract_target_partners(interaction)
+    hpa_expr = prepare_hpa_expression(hpa)
+    associations = prepare_associations(evidence, disease_ancestors)
+    universe_size = associations.select("targetId", "diseaseId").distinct().count()
+    random_associations = (
+        prepare_associations_universe(target, disease)
+        .sample(fraction=1.0, seed=seed)
+        .limit(universe_size)
+        .withColumn("seed", F.lit(seed))
+    )
+
+    # Prepare clinical information
+    clinical = (
+        evidence.filter(F.col("sourceId") == "chembl")
+        .withColumn("urls", F.explode("urls"))
+        .withColumn(
+            "nctid", F.regexp_extract(F.col("urls.url"), "(.+)(id=%22)(.+)(%22)", 3)
+        )
+        .withColumn(
+            "nctid", F.when(F.col("nctid") != "", F.col("nctid")).otherwise(None)
+        )
+        .withColumn(
+            "stopStatus",
+            F.when(
+                F.col("clinicalStatus").isin(STOPPED_STATUS), F.col("clinicalStatus")
+            ),
+        )
+        .withColumn(
+            "isStopped",
+            F.when(F.col("clinicalStatus").isin(STOPPED_STATUS), F.lit("stopped")),
+        )
+        .withColumn(
+            "phase4",
+            F.when(F.col("clinicalPhase") == 4, F.lit("Phase IV")),
+        )
+        .withColumn(
+            "phase3",
+            F.when(F.col("clinicalPhase") >= 3, F.lit("Phase III+")),
+        )
+        .withColumn(
+            "phase2",
+            F.when(F.col("clinicalPhase") >= 2, F.lit("Phase II+")),
+        )
+        .select(
+            "targetId",
+            "diseaseId",
+            "nctid",
+            "clinicalStatus",
+            "clinicalPhase",
+            "studyStartDate",
+            "stopStatus",
+            "isStopped",
+            "phase4",
+            "phase3",
+            "phase2",
+        )
+        .distinct()
+        # Create ID
+        .withColumn("id", F.xxhash64("targetId", "diseaseId", "nctid"))
+        # Bring reason and metareason for stoppage
+        .join(stop_predictions, on="nctid", how="left")
+        # L2G cut-offs
+        .join(l2g, on=["targetId", "diseaseId"], how="left")
+        # Disease therapeutic area (only one by disease)
+        .join(disease_ta, on="diseaseId", how="left")
+        # Target genetic constraint
+        .join(target_gc, on="targetId", how="left")
+        # Target lof tolerance
+        .join(target_pli, on="targetId", how="left")
+        # Expression specificity
+        .join(hpa_expr, on="targetId", how="left")
+        # Physical interaction partners
+        .join(partners, on="targetId", how="left")
+        .withColumn("partnersBin", F.coalesce(F.col("partnersBin"), F.lit("none")))
+        # Datasources and Datatypes
+        .join(random_associations, on=["targetId", "diseaseId"], how="left")
+    )
+    # Run analysis split by oncology/non_oncology data
+    if stratify_therapeutic_area.value == "oncology":
+        clinical = clinical.filter(F.col("taLabelSimple") == "Oncology")
+    elif stratify_therapeutic_area.value == "non_oncology":
+        clinical = clinical.filter(F.col("taLabelSimple") != "Oncology")
+    # Define total number
+    uniqIds = clinical.select("id").distinct().count()
+    clinical = clinical.withColumn("total", F.lit(uniqIds)).persist()
+
+    ## Compute aggregations
+    agg_setups = prepare_comparisons_df()
+    all_comparisons = []
+    for row in agg_setups:
+        out = aggregations(clinical, *row)
+        all_comparisons.append(out)
+
+    ## Compute Fisher's exact test and odds ratio
+    schema = StructType(
+        [
+            StructField("or_result", FloatType(), nullable=False),
+            StructField("lower_ci", FloatType(), nullable=False),
+            StructField("upper_ci", FloatType(), nullable=False),
+            StructField("pvalue", FloatType(), nullable=False),
+        ]
+    )
+    compute_fisher_and_or_udf = F.udf(_compute_fisher_and_or, schema)
+    return (
+        reduce(lambda x, y: x.unionByName(y), all_comparisons)
+        .coalesce(200)
+        .withColumn("b", F.col("predictionTotal") - F.col("a"))
+        .withColumn("c", F.col("comparisonTotal") - F.col("a"))
+        .withColumn("d", F.col("total") - F.col("a") - F.col("b") - F.col("c"))
+        .withColumn("result", compute_fisher_and_or_udf("a", "b", "c", "d"))
+        .select("*", "result.*")
+        .drop("result")
+    )
+
+
+def main(
+    config,
+    stratify_therapeutic_area: ModeByTA = ModeByTA.ALL,
+):
+    """Generate enrichment reports on 100 random permutations of the data."""
+    conf = OmegaConf.load(config)
+    print("Therapeutic area analysis mode:", stratify_therapeutic_area.value)
+    print("Config", conf)
+    for i in range(100):
+        print("Running seed", i)
+        analysis_result = run_analysis(
+            conf=conf, seed=i, stratify_therapeutic_area=stratify_therapeutic_area
+        )
+        analysis_result.write.parquet(
+            f"{conf.default.permuted_enrichments_template}/{stratify_therapeutic_area.value}_seed_{i}",
+            mode="overwrite",
+        )
+
+
+if __name__ == "__main__":
+    sparkConf = (
+        SparkConf()
+        .set("spark.hadoop.fs.gs.requester.pays.mode", "AUTO")
+        .set("spark.hadoop.fs.gs.requester.pays.project.id", "open-targets-eu-dev")
+    )
+    spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
+    typer.run(main)
diff --git a/config.yml b/config.yml
index 40a00f5..4d27959 100644
--- a/config.yml
+++ b/config.yml
@@ -1,6 +1,7 @@
 default:
   spark_master: yarn
   enrichments_template: gs://ot-team/irene/stop_reasons/predictions_aggregations
+  permuted_enrichments_template: gs://ot-team/irene/stop_reasons/baseline_predictions_aggregations_08092023
   input:
     # gcp
     disease_path: gs://open-targets-data-releases/22.04/output/etl/parquet/diseases/

From 3d7359eda4f94a2ac7e0c71f5c0ac2f10e8f8250 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Wed, 13 Sep 2023 13:42:37 +0100
Subject: [PATCH 6/6] feat: add

---
 analysis/python/enrichments_stop_phase.py | 127 ++++++++++++++++++++++
 config.yml                                |   1 +
 2 files changed, 128 insertions(+)
 create mode 100644 analysis/python/enrichments_stop_phase.py

diff --git a/analysis/python/enrichments_stop_phase.py b/analysis/python/enrichments_stop_phase.py
new file mode 100644
index 0000000..5e301d8
--- /dev/null
+++ b/analysis/python/enrichments_stop_phase.py
@@ -0,0 +1,127 @@
+"""Functions to calculate ORs between the stop reason category of a clinical trial and its clinical phase. Results available at Supplementary Table 7."""
+
+from functools import reduce
+
+import pyspark.sql.functions as f
+import typer
+from omegaconf import OmegaConf
+from pyspark import SparkConf
+from pyspark.sql import SparkSession
+from pyspark.sql.types import FloatType, StringType, StructField, StructType
+
+from analysis.python.utils import _compute_fisher_and_or, aggregations
+
+
+def prepare_comparisons_df() -> list:
+    """Return list of all comparisons to be used in the analysis.
+    In this case, we only want to look at the comparison between the stop reason category and the clinical phase.
+    """
+    comparisons = spark.createDataFrame(
+        data=[
+            ("reason", "reason"),
+        ],
+        schema=StructType(
+            [
+                StructField("comparison", StringType(), True),
+                StructField("comparisonType", StringType(), True),
+            ]
+        ),
+    )
+
+    predictions = spark.createDataFrame(
+        data=[
+            ("phase4", "clinical"),
+            ("phase3", "clinical"),
+            ("phase32", "clinical"),
+            ("phase2", "clinical"),
+            ("phase21", "clinical"),
+            ("phase1", "clinical"),
+            ("phase1early", "clinical"),
+            ("phaseOther", "clinical"),
+        ],
+        schema=StructType(
+            [
+                StructField("prediction", StringType(), True),
+                StructField("predictionType", StringType(), True),
+            ]
+        ),
+    )
+    return comparisons.join(predictions, how="full").collect()
+
+
+def prepare_studies(studies):
+    """Processes the studies dataset to keep only the stopped ones in a suitable format to run enrichments on."""
+    formatted_studies = (
+        studies.filter(f.col("why_stopped").isNotNull())
+        .withColumnRenamed("prediction", "reason")
+        .withColumn("id", f.monotonically_increasing_id())
+        .withColumn("phase4", f.when(f.col("phase") == "Phase 4", f.lit("Phase 4")))
+        .withColumn("phase3", f.when(f.col("phase") == "Phase 3", f.lit("Phase 3")))
+        .withColumn(
+            "phase32",
+            f.when(f.col("phase") == "Phase 2/Phase 3", f.lit("Phase 2/Phase 3")),
+        )
+        .withColumn("phase2", f.when(f.col("phase") == "Phase 2", f.lit("Phase 2")))
+        .withColumn(
+            "phase21",
+            f.when(f.col("phase") == "Phase 1/Phase 2", f.lit("Phase 1/Phase 2")),
+        )
+        .withColumn("phase1", f.when(f.col("phase") == "Phase 1", f.lit("Phase 1")))
+        .withColumn(
+            "phase1early",
+            f.when(f.col("phase") == "Early Phase 1", f.lit("Early Phase 1")),
+        )
+        .withColumn("phaseOther", f.when(f.col("phase") == "nan", f.lit("Other")))
+    )
+    total_cts = formatted_studies.count()
+    return formatted_studies.withColumn("total", f.lit(total_cts))
+
+
+def main(config):
+    """Functions to calculate ORs between the stop reason category of a clinical trial and its clinical phase."""
+    # Load
+    conf = OmegaConf.load(config)
+    studies = spark.read.csv(conf.default.input.predictions_freeze_path, header=True)
+
+    # Process
+    formatted_studies = prepare_studies(studies)
+    agg_setups = prepare_comparisons_df()
+
+    all_comparisons = []
+    for row in agg_setups:
+        out = aggregations(formatted_studies, *row)
+        all_comparisons.append(out)
+
+    schema = StructType(
+        [
+            StructField("or_result", FloatType(), nullable=False),
+            StructField("lower_ci", FloatType(), nullable=False),
+            StructField("upper_ci", FloatType(), nullable=False),
+            StructField("pvalue", FloatType(), nullable=False),
+        ]
+    )
+    compute_fisher_and_or_udf = f.udf(_compute_fisher_and_or, schema)
+    results = (
+        reduce(lambda x, y: x.unionByName(y), all_comparisons)
+        .coalesce(200)
+        .withColumn("b", f.col("predictionTotal") - f.col("a"))
+        .withColumn("c", f.col("comparisonTotal") - f.col("a"))
+        .withColumn("d", f.col("total") - f.col("a") - f.col("b") - f.col("c"))
+        .withColumn("result", compute_fisher_and_or_udf("a", "b", "c", "d"))
+        .select("*", "result.*")
+        .drop("result")
+    )
+    results.write.parquet(
+        conf.default.stop_vs_phase_enrichments_template,
+        mode="overwrite",
+    )
+
+
+if __name__ == "__main__":
+    sparkConf = (
+        SparkConf()
+        .set("spark.hadoop.fs.gs.requester.pays.mode", "AUTO")
+        .set("spark.hadoop.fs.gs.requester.pays.project.id", "open-targets-eu-dev")
+    )
+    spark = SparkSession.builder.getOrCreate()
+    typer.run(main)
diff --git a/config.yml b/config.yml
index 4d27959..261453e 100644
--- a/config.yml
+++ b/config.yml
@@ -2,6 +2,7 @@ default:
   spark_master: yarn
   enrichments_template: gs://ot-team/irene/stop_reasons/predictions_aggregations
   permuted_enrichments_template: gs://ot-team/irene/stop_reasons/baseline_predictions_aggregations_08092023
+  stop_vs_phase_enrichments_template: gs://ot-team/irene/stop_reasons/reason_vs_phase_aggregations
   input:
     # gcp
     disease_path: gs://open-targets-data-releases/22.04/output/etl/parquet/diseases/