diff --git a/concepts/Readme.md b/concepts/Readme.md
new file mode 100644
index 0000000..46c603a
--- /dev/null
+++ b/concepts/Readme.md
@@ -0,0 +1,26 @@
+# MIMIC-IV Concepts
+
+## Generating the concepts in PostgreSQL (*nix/Mac OS X)
+
+Analogously to [MIMIC-III Concepts](https://github.com/MIT-LCP/mimic-code/tree/master/concepts), the SQL scripts here are written in BigQuery's Standard SQL syntax, so the following changes are necessary to make them compatible with PostgreSQL:
+
+* create postgres functions which emulate BigQuery functions (identical to MIMIC-III)
+* modify SQL scripts for incompatible syntax
+* run the modified SQL scripts and direct the output into tables in the PostgreSQL database
+
+This can be done as follows (again, analogously to [MIMIC-III](https://github.com/MIT-LCP/mimic-code/tree/master/concepts)):
+
+1. Open a terminal in the `concepts` folder.
+2. Run [postgres-functions.sql](postgres-functions.sql).
+    * e.g. `psql -f postgres-functions.sql`
+    * This script creates functions which emulate BigQuery syntax.
+3. Run [postgres_make_concepts.sh](postgres_make_concepts.sh).
+    * e.g. `bash postgres_make_concepts.sh`
+    * This file runs the scripts after applying a few regular expressions which convert table references and date calculations appropriately.
+    * This file generates all concepts on the `mimic_derived` schema.
+
+The main differences from MIMIC-III are slightly different regular expressions and a loop similar to [make_concepts.sh](make_concepts.sh). Also, `perl` is now used to apply regular expressions, so you may need to install it.
+
+### Known Problems
+
+* [postgres_make_concepts.sh](postgres_make_concepts.sh) fails for [icustays_hourly.sh](demographics/icustays_hourly.sh) due to `INTERVAL CAST(hr AS INT64) HOUR`, which cannot easily be changed into a PostgreSQL-compatible expression.
diff --git a/concepts/postgres-functions.sql b/concepts/postgres-functions.sql
new file mode 100644
index 0000000..e3be8da
--- /dev/null
+++ b/concepts/postgres-functions.sql
@@ -0,0 +1,157 @@
+-- Functions TODO:
+-- FROM table CROSS JOIN UNNEST(table.column) AS col -> ???? (see icustay-hours)
+-- ???(column) -> PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY column) (not sure how to do median in BQ)
+
+SET search_path TO public;
+
+CREATE OR REPLACE FUNCTION REGEXP_EXTRACT(str TEXT, pattern TEXT) RETURNS TEXT AS $$
+BEGIN
+RETURN substring(str from pattern);
+END; $$
+LANGUAGE PLPGSQL;
+
+CREATE OR REPLACE FUNCTION REGEXP_CONTAINS(str TEXT, pattern TEXT) RETURNS BOOL AS $$
+BEGIN
+RETURN str ~ pattern;
+END; $$
+LANGUAGE PLPGSQL;
+
+-- alias generate_series with generate_array
+CREATE OR REPLACE FUNCTION GENERATE_ARRAY(i INTEGER, j INTEGER)
+RETURNS setof INTEGER language sql as $$
+  SELECT GENERATE_SERIES(i, j)
+$$;
+
+-- datetime functions
+CREATE OR REPLACE FUNCTION DATETIME(dt DATE) RETURNS TIMESTAMP(3) AS $$
+BEGIN
+RETURN CAST(dt AS TIMESTAMP(3));
+END; $$
+LANGUAGE PLPGSQL;
+
+CREATE OR REPLACE FUNCTION DATETIME(year INTEGER, month INTEGER, day INTEGER, hour INTEGER, minute INTEGER, second INTEGER) RETURNS TIMESTAMP(3) AS $$
+BEGIN
+RETURN TO_TIMESTAMP(
+  TO_CHAR(year, '0000') || TO_CHAR(month, '00') || TO_CHAR(day, '00') || TO_CHAR(hour, '00') || TO_CHAR(minute, '00') || TO_CHAR(second, '00'),
+  'yyyymmddHH24MISS'
+);
+END; $$
+LANGUAGE PLPGSQL;
+
+-- overload allowing string input
+
+-- DATETIME_ADD(datetime, INTERVAL 'n' DATEPART) -> datetime + INTERVAL 'n' DATEPART
+-- note: in bigquery, `INTERVAL 1 YEAR` is a valid interval
+-- but in postgres, it must be `INTERVAL '1' YEAR`
+CREATE OR REPLACE FUNCTION DATETIME_ADD(datetime_val TIMESTAMP(3), intvl INTERVAL) RETURNS TIMESTAMP(3) AS $$
+BEGIN
+RETURN datetime_val + intvl;
+END; $$
+LANGUAGE PLPGSQL;
+
+-- DATETIME_SUB(datetime, INTERVAL 'n' DATEPART) -> datetime - INTERVAL 'n' DATEPART
+CREATE OR REPLACE FUNCTION DATETIME_SUB(datetime_val TIMESTAMP(3), intvl INTERVAL) RETURNS TIMESTAMP(3) AS $$
+BEGIN
+RETURN datetime_val - intvl;
+END; $$
+LANGUAGE PLPGSQL;
+
+-- TODO:
+-- DATETIME_TRUNC(datetime, PART) -> DATE_TRUNC('datepart', datetime)
+
+-- below requires a regex to convert datepart from primitive to a string
+-- i.e. encapsulate it in single quotes
+CREATE OR REPLACE FUNCTION DATETIME_DIFF(endtime TIMESTAMP(3), starttime TIMESTAMP(3), datepart TEXT) RETURNS NUMERIC AS $$
+BEGIN
+RETURN
+  EXTRACT(EPOCH FROM endtime - starttime) /
+  CASE
+    WHEN datepart = 'SECOND' THEN 1.0
+    WHEN datepart = 'MINUTE' THEN 60.0
+    WHEN datepart = 'HOUR' THEN 3600.0
+    WHEN datepart = 'DAY' THEN 24*3600.0
+    WHEN datepart = 'YEAR' THEN 365.242*24*3600.0
+  ELSE NULL END;
+END; $$
+LANGUAGE PLPGSQL;
+
+-- BigQuery has a custom data type, PART
+-- It's difficult to replicate this in postgresql, which recognizes the PART as a column name,
+-- unless it is within an EXTRACT() function.
+
+CREATE OR REPLACE FUNCTION BIGQUERY_FORMAT_TO_PSQL(format_str VARCHAR(255)) RETURNS TEXT AS $$
+BEGIN
+RETURN
+  -- use replace to convert BigQuery string format to postgres string format
+  -- only handles a few cases since we don't extensively use this function
+  REPLACE(
+    REPLACE(
+      REPLACE(
+        REPLACE(
+          REPLACE(
+            REPLACE(
+              format_str
+              , '%S', 'SS'
+            )
+            , '%M', 'MI'
+          )
+          , '%H', 'HH24'
+        )
+        , '%d', 'dd'
+      )
+      , '%m', 'mm'
+    )
+    , '%Y', 'yyyy'
+  )
+;
+END; $$
+LANGUAGE PLPGSQL;
+
+
+CREATE OR REPLACE FUNCTION FORMAT_DATE(format_str VARCHAR(255), datetime_val TIMESTAMP(3)) RETURNS TEXT AS $$
+BEGIN
+RETURN TO_CHAR(
+  datetime_val,
+  -- translate the BigQuery format string into a postgres format string
+  -- only the elements known to BIGQUERY_FORMAT_TO_PSQL are converted
+  BIGQUERY_FORMAT_TO_PSQL(format_str)
+);
+END; $$
+LANGUAGE PLPGSQL;
+
+
+CREATE OR REPLACE FUNCTION PARSE_DATE(format_str VARCHAR(255), string_val VARCHAR(255)) RETURNS DATE AS $$
+BEGIN
+RETURN TO_DATE(
+  string_val,
+  -- translate the BigQuery format string into a postgres format string
+  -- only the elements known to BIGQUERY_FORMAT_TO_PSQL are converted
+  BIGQUERY_FORMAT_TO_PSQL(format_str)
+);
+END; $$
+LANGUAGE PLPGSQL;
+
+CREATE OR REPLACE FUNCTION FORMAT_DATETIME(format_str VARCHAR(255), datetime_val TIMESTAMP(3)) RETURNS TEXT AS $$
+BEGIN
+RETURN TO_CHAR(
+  datetime_val,
+  -- translate the BigQuery format string into a postgres format string
+  -- only the elements known to BIGQUERY_FORMAT_TO_PSQL are converted
+  BIGQUERY_FORMAT_TO_PSQL(format_str)
+);
+END; $$
+LANGUAGE PLPGSQL;
+
+
+CREATE OR REPLACE FUNCTION PARSE_DATETIME(format_str VARCHAR(255), string_val VARCHAR(255)) RETURNS TIMESTAMP(3) AS $$
+BEGIN
+RETURN TO_TIMESTAMP(
+  string_val,
+  -- translate the BigQuery format string into a postgres format string
+  -- only the elements known to BIGQUERY_FORMAT_TO_PSQL are converted
+  BIGQUERY_FORMAT_TO_PSQL(format_str)
+);
+END; $$
+LANGUAGE PLPGSQL;
+
+
diff --git a/concepts/postgres_make_concepts.sh b/concepts/postgres_make_concepts.sh
new file mode 100755
index 0000000..eea132c
--- /dev/null
+++ b/concepts/postgres_make_concepts.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# Builds the MIMIC-IV concept tables in PostgreSQL from the BigQuery SQL files
+# found in the concept subfolders.
+#
+# Usage: run from the `concepts` folder, e.g. `bash postgres_make_concepts.sh`.
+# Be sure to run postgres-functions.sql first, as the concepts rely on those function definitions.
+# Note that this may take a large amount of time and hard drive space.
+# The exit status is non-zero if any concept could not be generated.
+
+# a pipeline must fail if any of its stages (cat, perl, psql) fails
+set -o pipefail
+
+# String replacements are necessary for some queries. They are Perl
+# substitutions: sed has no lazy quantifiers, so `.+?` is greedy there and a
+# line holding several matches would only be converted partially.
+# Quote the date part, e.g. "DATETIME_DIFF(a, b, HOUR)" to "DATETIME_DIFF(a, b, 'HOUR')"
+# (\x27 is a single quote, which cannot be written inside a single-quoted shell string)
+export REGEX_DATETIME_DIFF='s/DATETIME_DIFF\((.+?),\s?(.+?),\s?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATETIME_DIFF($1, $2, \x27$3\x27)/g'
+# Remove the BigQuery project and backticks, e.g. "`physionet-data.mimic_icu.icustays`" to "mimic_icu.icustays"
+export REGEX_SCHEMA='s/`physionet-data\.(mimic_core|mimic_icu|mimic_derived|mimic_hosp)\.([^`]+)`/$1.$2/g'
+# Add necessary quotes to INTERVAL, e.g. "INTERVAL 24 hour" to "INTERVAL '24' hour"
+export REGEX_INTERVAL='s/interval\s([0-9]+)\s(hour|day|month|year)/INTERVAL \x27$1\x27 $2/gi'
+# Add numeric cast to ROUND(), e.g. "ROUND(1.234, 2)" to "ROUND( CAST(1.234 as numeric), 2)" over 0 or 3 lines
+export PERL_REGEX_ROUND='s/round\((.*|.*[\n]?.*[\n]?.*[\n]?.*)(\, \d\))/ROUND\( CAST\($1 as numeric\)$2/gi'
+# connection arguments for psql, split on whitespace
+export CONNSTR='-d mimic'
+
+# this is set as the search_path variable for psql
+# a search path of "public,mimic_icu" resolves unqualified names (such as the
+# functions from postgres-functions.sql) in the public and mimic_icu schemas;
+# the concept tables themselves are created in the TARGET_DATASET schema
+export PSQL_PREAMBLE='SET search_path TO public,mimic_icu'
+export TARGET_DATASET='mimic_derived'
+
+# Generates one concept table.
+#   $1 - path of the BigQuery SQL file holding the query
+#   $2 - name of the table to (re)create in TARGET_DATASET
+# Returns non-zero if the file is unreadable or perl/psql fails.
+make_concept() {
+  local sql_file=$1
+  local tbl=$2
+  local -a psql_args
+
+  # check before running anything, so that a missing query file does not
+  # drop an existing table
+  if [[ ! -r "${sql_file}" ]]; then
+    echo "Cannot read ${sql_file}" >&2
+    return 1
+  fi
+  # CONNSTR holds several psql arguments in one string
+  read -r -a psql_args <<< "${CONNSTR}"
+
+  echo "Generating ${TARGET_DATASET}.${tbl}"
+  # -0777 makes perl read the whole input at once, which the multi-line ROUND
+  # expression needs; ON_ERROR_STOP makes psql exit non-zero on SQL errors
+  { echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.${tbl}; CREATE TABLE ${TARGET_DATASET}.${tbl} AS "; cat "${sql_file}"; } \
+    | perl -0777 -pe "${REGEX_DATETIME_DIFF}; ${REGEX_SCHEMA}; ${REGEX_INTERVAL}; ${PERL_REGEX_ROUND}" \
+    | psql -v ON_ERROR_STOP=1 "${psql_args[@]}"
+}
+
+echo ''
+echo '==='
+echo 'Beginning to create tables for MIMIC database.'
+echo 'Any notices of the form "NOTICE: TABLE "XXXXXX" does not exist" can be ignored.'
+echo 'The scripts drop tables before creating them, and these notices indicate nothing existed prior to creating the table.'
+echo '==='
+echo ''
+
+failed=()
+
+# generate tables in subfolders
+# for d in demographics measurement medication treatment firstday score;
+for d in firstday; do
+  echo "${d}"
+  for sql_path in "${d}"/*.sql; do
+    # the pattern is left unexpanded when the folder has no SQL files
+    [[ -e "${sql_path}" ]] || continue
+    # table name is file name minus folder and extension
+    tbl="${sql_path##*/}"
+    tbl="${tbl%.sql}"
+
+    # skip first_day_sofa as it depends on other firstday queries
+    if [[ "${tbl}" == "first_day_sofa" ]]; then
+      continue
+    fi
+    make_concept "${sql_path}" "${tbl}" || failed+=("${tbl}")
+  done
+done
+
+# generate first_day_sofa table last
+make_concept firstday/first_day_sofa.sql first_day_sofa || failed+=("first_day_sofa")
+
+if (( ${#failed[@]} > 0 )); then
+  echo "Failed to generate: ${failed[*]}" >&2
+  exit 1
+fi