add dev mode
vemonet committed Apr 12, 2024
1 parent 625ded4 commit cc3baa8
Showing 12 changed files with 65 additions and 46 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -124,6 +124,10 @@ pnpm dev
> docker compose up
> ```
> [!NOTE]
>
> In development, the user requesting a DCR is added as data owner of all the cohort datasets requested (so they can provision the data themselves, and to avoid spamming the dataset owners with emails while developing).
### 🧹 Code formatting and linting
Automatically format Python code with ruff and black, and TypeScript code with prettier:
2 changes: 1 addition & 1 deletion backend/pyproject.toml
@@ -75,7 +75,7 @@ features = [
post-install-commands = []

[tool.hatch.envs.default.scripts]
dev = "uvicorn src.main:app --port 3000 --reload {args}"
dev = "DEV_MODE=true uvicorn src.main:app --port 3000 --reload {args}"
fmt = [
"black src",
"ruff src --fix",
Expand Down
2 changes: 0 additions & 2 deletions backend/src/auth.py
@@ -128,8 +128,6 @@ async def auth_callback(code: str) -> RedirectResponse:
# blockchain_addrs[id_payload["email"]] = "0x1234567890"
# json.dump(blockchain_addrs, settings.data_folder / "blockchain_addresses.json")



# Reuse expiration time from decentriq Auth0 access token
exp_timestamp = access_payload["exp"]
jwt_token = create_access_token(
Expand Down
1 change: 1 addition & 0 deletions backend/src/config.py
@@ -28,6 +28,7 @@ class Settings:
admins: str = field(default_factory=lambda: os.getenv("ADMINS", ""))

data_folder: str = field(default_factory=lambda: os.getenv("DATA_FOLDER", "../data"))
dev_mode: bool = field(default_factory=lambda: os.getenv("DEV_MODE", "false").lower() == "true")

@property
def redirect_uri(self) -> str:
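The hunk above wires a `DEV_MODE` environment variable into `settings.dev_mode`. A minimal, self-contained sketch of that pattern, assuming `Settings` is a standard dataclass (as the `field(default_factory=...)` usage suggests): the variable is parsed once at startup into a typed boolean, and the rest of the backend branches on the attribute instead of re-reading the environment.

```python
import os
from dataclasses import dataclass, field


@dataclass
class Settings:
    # "true"/"false" (any casing) in the environment becomes a real bool
    dev_mode: bool = field(default_factory=lambda: os.getenv("DEV_MODE", "false").lower() == "true")


settings = Settings()

# e.g. DEV_MODE=true uvicorn src.main:app --port 3000 --reload
if settings.dev_mode:
    print("dev mode enabled")
```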
47 changes: 24 additions & 23 deletions backend/src/decentriq.py
@@ -8,8 +8,8 @@
AnalyticsDcrBuilder,
Column,
FormatType,
PythonComputeNodeDefinition,
PreviewComputeNodeDefinition,
PythonComputeNodeDefinition,
TableDataNodeDefinition,
)
from fastapi import APIRouter, Depends, HTTPException
@@ -58,7 +58,9 @@ def create_provision_dcr(user: Any, cohort: Cohort) -> dict[str, Any]:
data_node_id = cohort.cohort_id.replace(" ", "-")
# builder.add_node_definition(RawDataNodeDefinition(name=data_node_id, is_required=True))
# TODO: providing schema is broken in new SDK
builder.add_node_definition(TableDataNodeDefinition(name=data_node_id, columns=get_cohort_schema(cohort), is_required=True))
builder.add_node_definition(
TableDataNodeDefinition(name=data_node_id, columns=get_cohort_schema(cohort), is_required=True)
)

builder.add_participant(
user["email"],
@@ -99,9 +101,7 @@ def pandas_script_merge_cohorts(merged_cohorts: dict[str, list[str]], all_cohort
df_name = f"df_{cohort_id}"
vars_mapped = [f"'{var}'" for var in vars_requested]
dfs_to_merge.append(df_name)
merge_script += (
f"{df_name} = pd.DataFrame({cohort_id})[{vars_mapped}]\n"
)
merge_script += f"{df_name} = pd.DataFrame({cohort_id})[{vars_mapped}]\n"
# Assuming all dataframes have a common column for merging
merge_script += f"merged_df = pd.concat([{', '.join(dfs_to_merge)}], ignore_index=True)\n"
return merge_script
@@ -164,24 +164,32 @@ async def create_compute_dcr(
for cohort_id, cohort in selected_cohorts.items():
# Create data node for cohort
data_node_id = cohort_id.replace(" ", "-")
builder.add_node_definition(TableDataNodeDefinition(name=data_node_id, columns=get_cohort_schema(cohort), is_required=True))
builder.add_node_definition(
TableDataNodeDefinition(name=data_node_id, columns=get_cohort_schema(cohort), is_required=True)
)
data_nodes.append(data_node_id)

if cohort.airlock:
# Add airlock node to make it easy to access a small part of the dataset
preview_node_id = f"preview-{data_node_id}"
builder.add_node_definition(PreviewComputeNodeDefinition(
name=preview_node_id,
dependency=data_node_id,
quota_bytes=1048576,  # 1MB
))
builder.add_node_definition(
PreviewComputeNodeDefinition(
name=preview_node_id,
dependency=data_node_id,
quota_bytes=1048576,  # 1MB
)
)
preview_nodes.append(preview_node_id)

# Add data owners to provision the data
for owner in cohort.cohort_email:
if owner not in participants:
participants[owner] = {"data_owner_of": set(), "analyst_of": set()}
participants[owner]["data_owner_of"].add(data_node_id)
# Add data owners to provision the data (in dev we don't add them, to avoid unnecessary emails)
if not settings.dev_mode:
for owner in cohort.cohort_email:
if owner not in participants:
participants[owner] = {"data_owner_of": set(), "analyst_of": set()}
participants[owner]["data_owner_of"].add(data_node_id)
else:
# In dev_mode the requester is added as data owner instead
participants[user["email"]]["data_owner_of"].add(data_node_id)

# Add pandas preparation script
pandas_script = "import pandas as pd\nimport decentriq_util\n\n"
@@ -207,20 +215,13 @@ async def create_compute_dcr(
builder.add_node_definition(
PythonComputeNodeDefinition(name=f"prepare-{cohort_id}", script=pandas_script, dependencies=[data_node_id])
)
# builder.add_participant(user["email"], analyst_of=[f"prepare-{cohort_id}"])

# Add the requester as analyst of prepare script
participants[user["email"]]["analyst_of"].add(f"prepare-{cohort_id}")

# Add user permissions for previews
for prev_node in preview_nodes:
participants[user["email"]]["analyst_of"].add(prev_node)

# TODO: we add the DQ admin as data owner just for testing for now
# Will need to be removed when the workflow has been tested
for data_node in data_nodes:
participants[user["email"]]["data_owner_of"].add(data_node)

for p_email, p_perm in participants.items():
builder.add_participant(
p_email,
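To see the permission bookkeeping from the decentriq.py hunks above in isolation, here is a hypothetical, trimmed-down sketch (the `participants` shape and the dev-mode branch mirror the diff; `Cohort`, the DCR builder, and all Decentriq calls are stubbed out as plain data): each email maps to the node sets it may own or analyze, and in dev mode the requester is registered as data owner instead of the real cohort owners, so no provisioning emails go out.

```python
def build_participants(
    requester_email: str,
    cohort_owners: dict[str, list[str]],  # cohort_id -> data owner emails
    dev_mode: bool,
) -> dict[str, dict[str, set[str]]]:
    # Each participant email maps to the data nodes it owns and the compute nodes it can run
    participants: dict[str, dict[str, set[str]]] = {
        requester_email: {"data_owner_of": set(), "analyst_of": set()}
    }
    for cohort_id, owners in cohort_owners.items():
        data_node_id = cohort_id.replace(" ", "-")
        if not dev_mode:
            # Production: the real cohort owners provision their own data
            for owner in owners:
                participants.setdefault(owner, {"data_owner_of": set(), "analyst_of": set()})
                participants[owner]["data_owner_of"].add(data_node_id)
        else:
            # Dev mode: the requester provisions everything, owners are not contacted
            participants[requester_email]["data_owner_of"].add(data_node_id)
        # The requester is analyst of the preparation script for every cohort
        participants[requester_email]["analyst_of"].add(f"prepare-{cohort_id}")
    return participants


print(build_participants("dev@example.com", {"cohort one": ["owner@example.com"]}, dev_mode=True))
```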
1 change: 0 additions & 1 deletion backend/src/explore.py
@@ -1,7 +1,6 @@
import os
from typing import Any

import requests
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import FileResponse

39 changes: 27 additions & 12 deletions backend/src/upload.py
@@ -165,6 +165,7 @@ def parse_categorical_string(s: str) -> list[dict[str, str]]:
# Old multiple column format for concept codes:
ID_COLUMNS_NAMESPACES = {"ICD-10": "icd10", "SNOMED-CT": "snomedct", "ATC-DDD": "atc", "LOINC": "loinc"}


def get_id_from_multi_columns(row):
"""Old format to pass concepts URIs from the ID provided in the various columns of the data dictionary (ICD-10, ATC, SNOMED...)"""
uris_list = []
@@ -185,6 +186,7 @@ def to_camelcase(s: str) -> str:
s = sub(r"(_|-)+", " ", s).title().replace(" ", "")
return "".join([s[0].lower(), s[1:]])


def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset:
"""Parse the cohort dictionary uploaded as excel or CSV spreadsheet, and load it to the triplestore"""
# print(f"Loading dictionary {dict_path}")
@@ -195,6 +197,8 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset:
detail="Only CSV files are supported. Please convert your file to CSV and try again.",
)
try:
# Record all errors and raise them at the end
errors = []
df = pd.read_csv(dict_path)
df = df.dropna(how="all")
df = df.fillna("")
@@ -212,16 +216,11 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset:
# Try to get IDs from old format multiple columns
df["concept_id"] = df.apply(lambda row: get_id_from_multi_columns(row), axis=1)

if " " in df["concept_id"]:
errors.append(f"Row {i+2} is using multiple concepts codes for a variable: {df['concept_id']}")

cohort_uri = get_cohort_uri(cohort_id)
g = init_graph()
g.add((cohort_uri, RDF.type, ICARE.Cohort, cohort_uri))
g.add((cohort_uri, DC.identifier, Literal(cohort_id), cohort_uri))

# Record all errors and raise them at the end
errors = []
for i, row in df.iterrows():
# Check if required columns are present
if not row["VARIABLE NAME"] or not row["VARIABLE LABEL"] or not row["VAR TYPE"] or not row["COUNT"]:
@@ -244,7 +243,7 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset:

# Get categories code if provided
categories_codes = []
if "Categorical Value Concept Code" in row and row["Categorical Value Concept Code"]:
if row.get("Categorical Value Concept Code"):
categories_codes = row["Categorical Value Concept Code"].split(",")
# Add properties
for column, value in row.items():
@@ -277,11 +276,15 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset:
if categories_codes:
cat_code_uri = converter.expand(str(categories_codes[index]).strip())
if not cat_code_uri:
errors.append(f"Row {i+2} for variable `{row['VARIABLE NAME']}` the category concept code provided for `{categories_codes[index]}` is not valid. Use one of snomedct:, icd10:, atc: or loinc: prefixes.")
errors.append(
f"Row {i+2} for variable `{row['VARIABLE NAME']}` the category concept code provided for `{categories_codes[index]}` is not valid. Use one of snomedct:, icd10:, atc: or loinc: prefixes."
)
else:
g.add((cat_uri, ICARE.conceptId, URIRef(cat_code_uri), cohort_uri))
except Exception as e:
errors.append(f"Row {i+2} for variable `{row['VARIABLE NAME']}` the {len(categories_codes)} category concept codes are not matching with {len(row['categories'])} categories provided.")
except Exception:
errors.append(
f"Row {i+2} for variable `{row['VARIABLE NAME']}` the {len(categories_codes)} category concept codes are not matching with {len(row['categories'])} categories provided."
)
# print(g.serialize(format="turtle"))
# Print all errors at once
if len(errors) > 0:
@@ -358,9 +361,18 @@ async def upload_cohort(
try:
g = load_cohort_dict_file(metadata_path, cohort_id)
# Airlock preview setting goes to mapping graph because it is defined in the explorer UI
delete_existing_triples(get_cohort_mapping_uri(cohort_id), f"<{get_cohort_uri(cohort_id)!s}>", "icare:previewEnabled")
g.add((get_cohort_uri(cohort_id), ICARE.previewEnabled, Literal(str(airlock).lower(), datatype=XSD.boolean), get_cohort_mapping_uri(cohort_id)))
g.add(
(
get_cohort_uri(cohort_id),
ICARE.previewEnabled,
Literal(str(airlock).lower(), datatype=XSD.boolean),
get_cohort_mapping_uri(cohort_id),
)
)
# Delete previous graph for this file from triplestore
delete_existing_triples(
get_cohort_mapping_uri(cohort_id), f"<{get_cohort_uri(cohort_id)!s}>", "icare:previewEnabled"
)
delete_existing_triples(get_cohort_uri(cohort_id))
publish_graph_to_endpoint(g)
except Exception as e:
@@ -371,7 +383,10 @@ async def upload_cohort(
try:
dcr_data = create_provision_dcr(user, cohorts_dict.get(cohort_id))
except Exception as e:
raise HTTPException(status_code=422, detail=f"The cohort was properly created in the Cohort Explorer, but there was an issue when uploading to Decentriq: {e}")
raise HTTPException(
status_code=422,
detail=f"The cohort was properly created in the Cohort Explorer, but there was an issue when uploading to Decentriq: {e}",
)
# print(dcr_data)
# Save data file
if cohort_data:
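The upload.py hunks above move `load_cohort_dict_file` to a collect-then-raise validation style: problems are appended to an `errors` list while iterating the rows, and a single exception carrying all messages is raised at the end, so an uploader sees every issue in one response. A minimal sketch of that pattern with simplified checks (the column names come from the diff; a plain `ValueError` stands in for the `HTTPException` used in the endpoint):

```python
def validate_rows(rows: list[dict[str, str]]) -> None:
    # Record all errors and raise them at the end, instead of failing on the first one
    errors: list[str] = []
    for i, row in enumerate(rows):
        # i + 2 accounts for the header row and 1-based spreadsheet numbering
        if not row.get("VARIABLE NAME"):
            errors.append(f"Row {i + 2} is missing a VARIABLE NAME")
        if not row.get("VAR TYPE"):
            errors.append(f"Row {i + 2} is missing a VAR TYPE")
    if errors:
        raise ValueError("\n\n".join(errors))


validate_rows([{"VARIABLE NAME": "age", "VAR TYPE": "INT"}])  # passes silently
```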
3 changes: 3 additions & 0 deletions backend/src/utils.py
@@ -104,12 +104,15 @@ def run_query(query: str) -> dict[str, Any]:
def get_value(key: str, row: dict[str, Any]) -> str | None:
return str(row[key]["value"]) if key in row and row[key]["value"] else None


def get_int_value(key: str, row: dict[str, Any]) -> int | None:
return int(row[key]["value"]) if key in row and row[key]["value"] else None


def get_bool_value(key: str, row: dict[str, Any]) -> bool:
return str(row[key]["value"]).lower() == "true" if key in row and row[key]["value"] else False


def get_curie_value(key: str, row: dict[str, Any]) -> int | None:
return converter.compress(get_value(key, row)) if get_value(key, row) else None

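The `get_*_value` helpers in the utils.py context above all guard the same way: read `row[key]["value"]` from a SPARQL JSON-results binding only when it is present and non-empty. A small usage sketch, with the helper bodies copied from the diff and a hypothetical result row (the CURIE `converter` helper is omitted):

```python
from typing import Any


def get_value(key: str, row: dict[str, Any]) -> str | None:
    return str(row[key]["value"]) if key in row and row[key]["value"] else None


def get_int_value(key: str, row: dict[str, Any]) -> int | None:
    return int(row[key]["value"]) if key in row and row[key]["value"] else None


def get_bool_value(key: str, row: dict[str, Any]) -> bool:
    return str(row[key]["value"]).lower() == "true" if key in row and row[key]["value"] else False


# One binding from a SPARQL SELECT response in JSON results format
row = {"count": {"type": "literal", "value": "42"}, "airlock": {"type": "literal", "value": "true"}}
print(get_int_value("count", row))    # 42
print(get_bool_value("airlock", row))  # True
print(get_value("missing", row))      # None
```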
2 changes: 2 additions & 0 deletions docker-compose.yml
@@ -11,6 +11,8 @@ services:
volumes:
- ./backend:/app
entrypoint: uvicorn src.main:app --host 0.0.0.0 --port 80 --reload --log-level debug
environment:
- DEV_MODE=true
ports:
- 3000:80

4 changes: 1 addition & 3 deletions frontend/src/pages/cohorts.tsx
@@ -125,9 +125,7 @@ export default function CohortsList() {
✉️ {email}
</span>
))}
{cohortData.airlock && (
<span className="badge badge-outline mx-1">🔎 Data preview</span>
)}
{cohortData.airlock && <span className="badge badge-outline mx-1">🔎 Data preview</span>}
</div>
</div>
<div className="collapse-content">
4 changes: 1 addition & 3 deletions frontend/src/pages/index.tsx
@@ -60,9 +60,7 @@ export default function Home() {
-&gt;
</span>
</h2>
<p className={`m-0 max-w-[30ch] text-sm opacity-50`}>
View the documentation and source code on GitHub
</p>
<p className={`m-0 max-w-[30ch] text-sm opacity-50`}>View the documentation and source code on GitHub</p>
</Link>
</div>
</main>
2 changes: 1 addition & 1 deletion frontend/src/pages/upload.tsx
@@ -167,7 +167,7 @@ export default function UploadPage() {
id="enableAirlock"
type="checkbox"
checked={enableAirlock}
onChange={(e) => setEnableAirlock(e.target.checked)}
onChange={e => setEnableAirlock(e.target.checked)}
className="checkbox"
/>
Enable data preview (airlock)