Merge branch 'develop' into develop_ehds2_cql

samply · Nov 22, 2024 · f77cece · f77cece
2 parents 28a88b1 + 49be488
commit f77cece
Show file tree

Hide file tree

Showing 20 changed files with 430 additions and 41 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,20 @@
+# Samply.Focus v0.8.0 2024-11-04
+
+In this release, we support four types of SQL queries for Exliquid and Organoids.
+
+## Major changes
+* Allowlist of SQL queries
+
+
+# Samply.Focus v0.7.0 2024-09-24
+
+In this release, we are extending the supported data backends beyond CQL-enabled FHIR stores. We now support PostgreSQL as well. Usage instructions are included in the Readme.
+
+## Major changes
+* PostgreSQL support added
+
+
+
 # Focus -- 2023-02-08
 
 This is the initial release of Focus, a task distribution application designed for working with Samply.Beam. Currently, only Samply.Blaze is supported as an endpoint, but other endpoints can easily be integrated.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "focus"
-version = "0.6.0"
+version = "0.9.0"
 edition = "2021"
 license = "Apache-2.0"
 
@@ -11,34 +11,37 @@ base64 = "0.22.1"
 reqwest = { version = "0.12", default-features = false, features = ["json", "default-tls"] }
 serde = { version = "1.0.152", features = ["serde_derive"] }
 serde_json = "1.0"
-thiserror = "1.0.38"
+thiserror = "2.0.3"
 chrono = "0.4.31"
 indexmap = "2.1.0"
-tokio = { version = "1.25.0", default_features = false, features = ["signal", "rt-multi-thread", "macros"] }
+tokio = { version = "1.25.0", default-features = false, features = ["signal", "rt-multi-thread", "macros"] }
 beam-lib = { git = "https://github.com/samply/beam", branch = "develop", features = ["http-util"] }
-laplace_rs = {git = "https://github.com/samply/laplace-rs.git", tag = "v0.3.0" }
+laplace_rs = {git = "https://github.com/samply/laplace-rs.git", tag = "v0.4.0" }
 uuid = "1.8.0"
 rand = { default-features = false, version = "0.8.5" }
 futures-util = { version = "0.3", default-features = false, features = ["std"] }
-sqlx = { version = "0.7.4", features = [ "runtime-tokio", "postgres", "macros", "chrono"] } 
-sqlx-pgrow-serde = "0.2.0"
 tryhard = "0.5"
 
 # Logging
-tracing = { version = "0.1.37", default_features = false }
-tracing-subscriber = { version = "0.3.11", default_features = false, features = ["env-filter", "ansi"] }
+tracing = { version = "0.1.37", default-features = false }
+tracing-subscriber = { version = "0.3.11", default-features = false, features = ["env-filter", "ansi"] }
 
 # Global variables
 once_cell = "1.18"
 
 # Command Line Interface
-clap = { version = "4", default_features = false, features = ["std", "env", "derive", "help", "color"] }
+clap = { version = "4", default-features = false, features = ["std", "env", "derive", "help", "color"] }
+
+# Query via SQL (both crates are optional and are switched on by the `query-sql` feature)
+sqlx = { version = "0.8.2", features = ["runtime-tokio", "postgres", "macros", "chrono", "rust_decimal", "uuid"], optional = true }
+kurtbuilds_sqlx_serde = { version = "0.3.2", features = ["json", "decimal", "chrono", "uuid"], optional = true }
 
 
 [features]
 default = []
 bbmri = []
-dktk = []
+dktk = ["query-sql"]  # building with `dktk` also enables `query-sql`
+query-sql = ["dep:sqlx", "dep:kurtbuilds_sqlx_serde"]  # activates the two optional SQL dependencies
 
 [dev-dependencies]
 pretty_assertions = "1.4.0"

diff --git a/README.md b/README.md
@@ -51,10 +51,19 @@ PROJECTS_NO_OBFUSCATION = "exliquid;dktk_supervisors;exporter;ehds2" # Projects
 QUERIES_TO_CACHE = "queries_to_cache.conf" # The path to a file containing base64 encoded queries whose results are to be cached. If not set, no results are cached
 PROVIDER = "name" #EUCAIM provider name
 PROVIDER_ICON = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABAQMAAAAl21bKAAAAA1BMVEUAAACnej3aAAAAAXRSTlMAQObYZgAAAApJREFUCNdjYAAAAAIAAeIhvDMAAAAASUVORK5CYII=" # Base64 encoded EUCAIM provider icon
-AUTH_HEADER = "ApiKey XXXX" #Authorization header
+AUTH_HEADER = "ApiKey XXXX" #Authorization header; if the endpoint type is Blaze or BlazeAndSql, this header is used for the Exporter target application, and the syntax is AUTH_HEADER = "XXXX" where "XXXX" is the API key
+```
+
+To enable Postgres querying, use a Docker image built with the "dktk" feature and set the following variable (not needed otherwise):
+```bash
 POSTGRES_CONNECTION_STRING = "postgresql://postgres:Test.123@localhost:5432/postgres" # Postgres connection string
 ```
 
+Additionally, when using Postgres, this optional variable can be set:
+```bash
+MAX_DB_ATTEMPTS = "8" # Max number of attempts to connect to the database; default value: 8
+```
+
 Obfuscating zero counts is by default switched off. To enable obfuscating zero counts, set the env. variable `OBFUSCATE_ZERO`. 
 
 Optionally, you can provide the `TLS_CA_CERTIFICATES_DIR` environment variable to add additional trusted certificates, e.g., if you have a TLS-terminating proxy server in place. The application respects the `HTTP_PROXY`, `HTTPS_PROXY`, `ALL_PROXY`, `NO_PROXY`, and their respective lowercase equivalents.
@@ -81,9 +90,9 @@ Creating a sample task containing an abstract syntax tree (AST) query using curl
 curl -v -X POST -H "Content-Type: application/json" --data '{"id":"7fffefff-ffef-fcff-feef-feffffffffff","from":"app1.proxy1.broker","to":["app1.proxy1.broker"],"ttl":"10s","failure_strategy":{"retry":{"backoff_millisecs":1000,"max_tries":5}},"metadata":{"project":"bbmri"},"body":"eyJsYW5nIjoiYXN0IiwicGF5bG9hZCI6ImV5SmhjM1FpT25zaWIzQmxjbUZ1WkNJNklrOVNJaXdpWTJocGJHUnlaVzRpT2x0N0ltOXdaWEpoYm1RaU9pSkJUa1FpTENKamFHbHNaSEpsYmlJNlczc2liM0JsY21GdVpDSTZJazlTSWl3aVkyaHBiR1J5Wlc0aU9sdDdJbXRsZVNJNkltZGxibVJsY2lJc0luUjVjR1VpT2lKRlVWVkJURk1pTENKemVYTjBaVzBpT2lJaUxDSjJZV3gxWlNJNkltMWhiR1VpZlN4N0ltdGxlU0k2SW1kbGJtUmxjaUlzSW5SNWNHVWlPaUpGVVZWQlRGTWlMQ0p6ZVhOMFpXMGlPaUlpTENKMllXeDFaU0k2SW1abGJXRnNaU0o5WFgxZGZWMTlMQ0pwWkNJNkltRTJaakZqWTJZekxXVmlaakV0TkRJMFppMDVaRFk1TFRSbE5XUXhNelZtTWpNME1DSjkifQ=="}' -H "Authorization: ApiKey app1.proxy1.broker App1Secret" http://localhost:8081/v1/tasks
 ```
 
-Creating a sample SQL task for a `SELECT_TABLES` query using curl:
+Creating a sample SQL task for a `SELECT_TEST` query using curl:
 ```bash
- curl -v -X POST -H "Content-Type: application/json" --data '{"id":"7fffefff-ffef-fcff-feef-feffffffffff","from":"app1.proxy1.broker","to":["app1.proxy1.broker"],"ttl":"10s","failure_strategy":{"retry":{"backoff_millisecs":1000,"max_tries":5}},"metadata":{"project":"exliquid"},"body":"eyJwYXlsb2FkIjoiU0VMRUNUX1RBQkxFUyJ9"}' -H "Authorization: ApiKey app1.proxy1.broker App1Secret" http://localhost:8081/v1/tasks
+ curl -v -X POST -H "Content-Type: application/json" --data '{"id":"7fffefff-ffef-fcff-feef-feffffffffff","from":"app1.proxy1.broker","to":["app1.proxy1.broker"],"ttl":"10s","failure_strategy":{"retry":{"backoff_millisecs":1000,"max_tries":5}},"metadata":{"project":"exliquid"},"body":"eyJwYXlsb2FkIjoiU0VMRUNUX1RFU1QifQ=="}' -H "Authorization: ApiKey app1.proxy1.broker App1Secret" http://localhost:8081/v1/tasks
  ```
 
 Creating a sample [Exporter](https://github.com/samply/exporter) "execute" task containing an Exporter query using curl:

diff --git a/resources/cql/DHKI_STRAT_ENCOUNTER_STRATIFIER b/resources/cql/DHKI_STRAT_ENCOUNTER_STRATIFIER
@@ -0,0 +1,5 @@
+define Encounter: // all Encounter resources when InInitialPopulation (defined outside this snippet) holds, otherwise an empty list
+if InInitialPopulation then [Encounter] else {} as List<Encounter>
+
+define function Departments(encounter FHIR.Encounter): // first identifier value issued under the hki-department system
+encounter.identifier.where(system = 'http://dktk.dkfz.de/fhir/sid/hki-department').value.first()
diff --git a/resources/cql/DHKI_STRAT_MEDICATION_STRATIFIER b/resources/cql/DHKI_STRAT_MEDICATION_STRATIFIER
@@ -0,0 +1,5 @@
+define MedicationStatement: // all MedicationStatement resources when InInitialPopulation (defined outside this snippet) holds, otherwise an empty list
+if InInitialPopulation then [MedicationStatement] else {} as List <MedicationStatement>
+
+define function AppliedMedications(medication FHIR.MedicationStatement): // last code among the statement's medication codings
+medication.medication.coding.code.last()
diff --git a/resources/cql/DHKI_STRAT_SPECIMEN_STRATIFIER b/resources/cql/DHKI_STRAT_SPECIMEN_STRATIFIER
@@ -0,0 +1,8 @@
+define Specimen: // all Specimen resources when InInitialPopulation (defined outside this snippet) holds, otherwise an empty list
+if InInitialPopulation then [Specimen] else {} as List<Specimen>
+
+define function SampleType(specimen FHIR.Specimen): // first type code taken from the BBMRI SampleMaterialType code system
+specimen.type.coding.where(system = 'https://fhir.bbmri.de/CodeSystem/SampleMaterialType').code.first()
+
+define function SampleSubtype(specimen FHIR.Specimen): // free-text part of the specimen type
+specimen.type.text.first()
diff --git a/resources/cql/DKTK_STRAT_AGE_STRATIFIER b/resources/cql/DKTK_STRAT_AGE_STRATIFIER
@@ -4,5 +4,12 @@ from [Condition] C
 where C.extension.where(url='http://hl7.org/fhir/StructureDefinition/condition-related').empty() and C.onset is not null
 sort by date from onset asc)
 
+// Earliest Condition of the patient that has a recorded onset, whether or not it is a
+// primary diagnosis. AgeClass uses it when PrimaryDiagnosis has no onset.
+// Conditions without an onset are excluded, as in PrimaryDiagnosis: nulls sort first in
+// an ascending sort, so without the filter First(...) could return an onset-less Condition.
+define FirstDiagnosis:
+First(
+from [Condition] C
+where C.onset is not null
+sort by date from onset asc)
+
 define AgeClass:
-if (PrimaryDiagnosis.onset is null) then 'unknown' else ToString((AgeInYearsAt(FHIRHelpers.ToDateTime(PrimaryDiagnosis.onset)) div 10) * 10)
+// Age at diagnosis rounded down to the decade ('0', '10', '20', ...), as a string.
+// Uses the onset of PrimaryDiagnosis when present, otherwise the onset of FirstDiagnosis.
+// Yields 'unknown' when neither carries an onset, as the previous version did.
+if (PrimaryDiagnosis.onset is not null)
+then ToString((AgeInYearsAt(FHIRHelpers.ToDateTime(PrimaryDiagnosis.onset)) div 10) * 10)
+else if (FirstDiagnosis.onset is not null)
+then ToString((AgeInYearsAt(FHIRHelpers.ToDateTime(FirstDiagnosis.onset)) div 10) * 10)
+else 'unknown'
diff --git a/resources/cql/DKTK_STRAT_GENETIC_VARIANT b/resources/cql/DKTK_STRAT_GENETIC_VARIANT
@@ -0,0 +1,5 @@
+define GeneticVariantCount: // Observations coded LOINC 69548-6 when InInitialPopulation (defined outside this snippet) holds, otherwise an empty list
+if InInitialPopulation then [Observation: Code '69548-6' from loinc] else {} as List <Observation>
+
+define GeneticVariantCode: // first result over the LOINC 69548-6 Observations: first coding code of the components coded LOINC 48018-6. NOTE(review): not gated on InInitialPopulation, unlike GeneticVariantCount - confirm intended
+First (from [Observation: Code '69548-6' from loinc] O return O.component.where(code.coding contains Code '48018-6' from loinc).value.coding.code.first())
diff --git a/resources/sql/EXLIQUID_SAMPLE_3LEVELS b/resources/sql/EXLIQUID_SAMPLE_3LEVELS
@@ -0,0 +1,43 @@
+/*
+Exliquid query for sites with 'legacy' exliquid specimen documentation (3 level hierarchy versus 'virtual' mother sample).
+For current expected documentation see: https://wiki.verbis.dkfz.de/pages/viewpage.action?pageId=294716167.
+
+Result: one row per (icd10_code, diag_desc, s_mother_type) with
+  patient_count = number of distinct patients,
+  sample_count  = number of distinct mother samples
+that have at least one aliquot with a remaining quantity > 0.
+*/
+-- t: sample material type code per specimen id
+with t as (
+	select 
+		(s.resource ->> 'id')::text s_id,
+		(s_coding ->> 'code')::text sample_type
+	from specimen s, jsonb_array_elements(s.resource -> 'type' -> 'coding') as s_coding
+	where s_coding ->> 'system' = 'https://fhir.bbmri.de/CodeSystem/SampleMaterialType'
+),
+-- t2: aliquot -> aliquot group -> mother sample chain, resolved through the first 'parent' reference;
+-- only aliquots with a remaining quantity > 0 are kept
+t2 as (
+	SELECT 
+		s_ali.resource ->> 'id' s_ali_id, 	
+		sample_type_ali.sample_type as s_ali_type,
+		(s_ali.resource -> 'container' -> 0 -> 'specimenQuantity' ->> 'value')::float s_ali_amountrest,
+		s_ali_grp.resource ->> 'id' s_ali_grp_id, 	
+		sample_type_ali_grp.sample_type as s_ali_grp_type,
+		(s_ali_grp.resource -> 'container' -> 0 -> 'specimenQuantity' ->> 'value')::float s_ali_grp_amountrest,
+		s_mother.resource ->> 'id' s_mother_id, 
+		sample_type_mother.sample_type as s_mother_type,
+		(s_mother.resource -> 'container' -> 0 -> 'specimenQuantity' ->> 'value')::float s_mother_amountrest,
+		s_mother.resource -> 'subject' ->> 'reference' as patient_id
+	FROM specimen s_ali
+	JOIN specimen s_ali_grp ON (s_ali.resource->'parent'->0->>'reference')::text = (s_ali_grp.resource->>'resourceType')::text || '/' || (s_ali_grp.resource->>'id')::text
+	JOIN specimen s_mother ON (s_ali_grp.resource->'parent'->0->>'reference')::text = (s_mother.resource->>'resourceType')::text || '/' || (s_mother.resource->>'id')::text
+	join t as sample_type_ali on s_ali.resource ->> 'id' = sample_type_ali.s_id
+	join t as sample_type_ali_grp on s_ali_grp.resource ->> 'id' = sample_type_ali_grp.s_id
+	join t as sample_type_mother on s_mother.resource ->> 'id' = sample_type_mother.s_id
+	where (s_ali.resource -> 'container' -> 0 -> 'specimenQuantity' ->> 'value')::float > 0
+),
+-- t3: one row per (patient, diagnosis, mother sample). s_mother_id is carried through so that
+-- samples stay countable after DISTINCT has collapsed the aliquot-level rows.
+t3 as (
+select distinct 
+	t2.patient_id,
+	c.resource -> 'code' -> 'coding' -> 0 ->> 'code' icd10_code,
+	c.resource -> 'code' ->> 'text' diag_desc,
+	t2.s_mother_type,
+	t2.s_mother_id
+from t2
+join condition c on t2.patient_id = c.resource -> 'subject' ->> 'reference'
+)
+-- patient_id must not be a grouping key: with it, every group holds exactly one patient,
+-- patient_count is always 1 and the result degenerates into per-patient rows.
+-- NOTE(review): sample_count counts mother samples (matching s_mother_type) - confirm that
+-- mother samples rather than aliquots are the intended unit.
+select icd10_code, diag_desc, count(distinct patient_id) patient_count, s_mother_type, count(distinct s_mother_id) sample_count
+from t3
+group by icd10_code, diag_desc, s_mother_type;
diff --git a/resources/sql/SELECT_TABLES b/resources/sql/SELECT_TABLES
diff --git a/resources/sql/SELECT_TEST b/resources/sql/SELECT_TEST
@@ -0,0 +1 @@
+-- Connectivity test: returns a single row of constants (integer, quoted text, decimal) plus the current date.
+SELECT
+    10 AS VALUE,
+    quote_literal('Hello Rustaceans') AS GREETING,
+    4.7 AS FLOATY,
+    CURRENT_DATE AS TODAY;
diff --git a/resources/sql/SIORGP_PUBLIC_MAIN b/resources/sql/SIORGP_PUBLIC_MAIN
@@ -0,0 +1,78 @@
+/*
+SIorgP MetPredict project
+The approach chosen here is to minimize the number of tasks generated and thus network traffic via Beam
+=> one large query that returns the most necessary fields over multiple smaller queries
+*/
+with t as ( -- one row per Observation component: patient reference, CRF code, component code, component value
+select
+o.resource->'subject'->>'reference' as pat_ref,
+o.resource->'code'->'coding'->0->>'code' as crf,
+  component->'code'->'coding'->0->>'code' AS code,
+  COALESCE( -- first non-null of the four value representations read below, as text
+      component->'valueCodeableConcept'->'coding'->0->>'code',
+      component->>'valueDateTime',
+      component->'valueQuantity'->>'value',
+      component->>'valueString'
+    ) AS value
+FROM 
+  observation o ,
+  jsonb_array_elements(o.resource->'component') AS component
+where o.resource->'code'->'coding'->0->>'code' like 'SIOrgP%' -- only Observations whose first code starts with 'SIOrgP'
+),
+t2 AS ( -- patient pseudonyms from 'Visite 1' forms, widened with per-patient attributes; the joins can multiply rows, hence DISTINCT in every count below
+select t.value as pat_pseudonym,
+	-- t.crf,
+	p.resource->>'gender' as gender,
+	p.resource->>'birthDate' as birth_date,
+	t5.value as organoid_id,	
+	t2.value as location_primary_tumor,
+	t7.value as location_primary_tumor_precise,
+	t3.value as therapy,
+	t4.value as metastases_therapy,
+	t6.value::integer as age_at_enrollment
+from t -- t2..t7 below are aliases of self-joins on t, matched by patient reference and component code
+left join t t2 on t.pat_ref = t2.pat_ref and t2.code='SIOP_LOCALISATION_PRIMARY_TUMOR' 
+left join t t3 on t.pat_ref = t3.pat_ref and t3.code='SIOP_NEOADJ_T_RECTAL_CARCINOMA' 
+left join t t4 on t.pat_ref = t4.pat_ref and t4.code='SIOP_NEOADJ_CTX_MET'
+left join t t5 on t.pat_ref = t5.pat_ref and t5.code like 'SIOP_SAMPLE_M0%_PSEUDONYM' -- NOTE(review): '_' in LIKE matches any single character, not only a literal underscore
+left join t t6 on t.pat_ref = t6.pat_ref and t6.code='SIOP_AGE_STUDY_ENROLLMENT'
+left join t t7 on t.pat_ref = t7.pat_ref and t7.code='SIOP_LOCALISATION_PRIMARY_TUMOR_COLON'
+left join patient p on t.pat_ref = 'Patient/' || (p.resource->>'id')::text
+where t.crf like 'SIOrgP - MetPredict - Visite 1%' and t.code = 'SIOP_PATIENT_PSEUDONYM'
+),
+t8 as ( -- number of distinct organoid ids per patient pseudonym
+  select pat_pseudonym, count(distinct organoid_id) n_organoids
+  from t2 
+  group by pat_pseudonym
+)
+-- patients having <= 3 organoids
+select 'MetPredict' as project, 'pat_pdos_leq_3' as field, (select count(distinct pat_pseudonym) from t8 where n_organoids <= 3) as value
+union
+-- patients having 4 organoids
+select 'MetPredict' as project, 'pat_pdos_4' as field, (select count(distinct pat_pseudonym) from t8 where n_organoids = 4) as value
+union
+-- patients having 5 organoids
+select 'MetPredict' as project, 'pat_pdos_5' as field, (select count(distinct pat_pseudonym) from t8 where n_organoids = 5) as value
+union
+-- patients having > 5 organoids
+select 'MetPredict' as project, 'pat_pdos_gt_5' as field, (select count(distinct pat_pseudonym) from t8 where n_organoids > 5) as value
+union
+-- the total number of patients
+select 'MetPredict' as project, 'n_patients' as field, (select count(distinct pat_pseudonym) from t2) as value
+union
+-- the total number of organoids
+select 'MetPredict' as project, 'n_organoids' as field, (select count(distinct organoid_id) from t2) as value
+union
+select 'MetPredict' as project, 'gender_male' as field, (select count(distinct pat_pseudonym) from t2 where gender = 'male') as value -- patient counts by gender
+union 
+select 'MetPredict' as project, 'gender_female' as field, (select count(distinct pat_pseudonym) from t2 where gender = 'female') as value
+union 
+select 'MetPredict' as project, '<=30' as field, (select count(distinct pat_pseudonym) from t2 where age_at_enrollment <= 30) as value -- patient counts by age-at-enrollment bucket
+union 
+select 'MetPredict' as project, '31-40' as field, (select count(distinct pat_pseudonym) from t2 where age_at_enrollment >= 31 and age_at_enrollment <= 40) as value
+union 
+select 'MetPredict' as project, '41-50' as field, (select count(distinct pat_pseudonym) from t2 where age_at_enrollment >= 41 and age_at_enrollment <= 50) as value
+union 
+select 'MetPredict' as project, '51-60' as field, (select count(distinct pat_pseudonym) from t2 where age_at_enrollment >= 51 and age_at_enrollment <= 60) as value
+union 
+select 'MetPredict' as project, '>=61' as field, (select count(distinct pat_pseudonym) from t2 where age_at_enrollment >= 61) as value;
diff --git a/resources/sql/SIORGP_PUBLIC_NPAT b/resources/sql/SIORGP_PUBLIC_NPAT
@@ -0,0 +1,5 @@
+-- Test query: counts the distinct patient resources referenced by at least one
+-- Observation whose first code starts with 'SIOrgP - MetPredict - Visite 1'.
+SELECT count(DISTINCT pat.resource)
+FROM observation obs
+JOIN patient pat
+  ON obs.resource -> 'subject' ->> 'reference' = 'Patient/' || (pat.resource ->> 'id')::text
+WHERE obs.resource -> 'code' -> 'coding' -> 0 ->> 'code' LIKE 'SIOrgP - MetPredict - Visite 1%';
diff --git a/resources/sql/SIORGP_PUBLIC_NVISIT2B b/resources/sql/SIORGP_PUBLIC_NVISIT2B
@@ -0,0 +1,4 @@
+-- Test query: counts the Observations whose first code starts with
+-- 'SIOrgP - MetPredict - Visite 2b' (a lower bound for the number of expected organoids).
+SELECT count(obs)
+FROM observation obs
+WHERE obs.resource -> 'code' -> 'coding' -> 0 ->> 'code' LIKE 'SIOrgP - MetPredict - Visite 2b%';
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		SELECT 10 AS VALUE, quote_literal('Hello Rustaceans') AS GREETING, 4.7 as FLOATY, CURRENT_DATE AS TODAY;