Skip to content
This repository was archived by the owner on Aug 17, 2021. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions concepts/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# MIMIC-IV Concepts

## Generating the concepts in PostgreSQL (*nix/Mac OS X)

Analogously to [MIMIC-III Concepts](https://github.com/MIT-LCP/mimic-code/tree/master/concepts), the SQL scripts here are written in BigQuery's Standard SQL syntax, so that the following changes are necessary to make them compaible with PostgreSQL:

* create postgres functions which emulate BigQuery functions (identical to MIMIC-III)
* modify SQL scripts for incompatible syntax
* run the modified SQL scripts and direct the output into tables in the PostgreSQL database

This can be done as follows (again, analogously to [MIMIC-III](https://github.com/MIT-LCP/mimic-code/tree/master/concepts):

1. Open a terminal in the `concepts` folder.
2. Run [postgres-functions.sql](postgres-functions.sql).
* e.g. `psql -f postgres-functions.sql`
* This script creates functions which emulate BigQuery syntax.
3. Run [postgres_make_concepts.sh](postgres_make_concepts.sh).
* e.g. `bash postgres_make_concepts.sh`
* This file runs the scripts after applying a few regular expressions which convert table references and date calculations appropriately.
* This file generates all concepts on the `public` schema.

The main difference to MIMIC-III are different slightly different regular expressions and a loop similar to [make_concepts.sh](make_concepts.sh). Also, one of them uses `perl` now, which might be necessary to install.

### Known Problems

* [postgres_make_concepts.sh](postgres_make_concepts.sh) fails for [icustays_hourly.sh](demographics/icustays_hourly.sh) due to `INTERVAL CAST(hr AS INT64) HOUR)', which cannot easily be changed into a PostrgeSQL compatible expression.
157 changes: 157 additions & 0 deletions concepts/postgres-functions.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
-- Functions TODO:
-- FROM table CROSS JOIN UNNEST(table.column) AS col -> ???? (see icustay-hours)
-- ???(column) -> PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY column) (not sure how to do median in BQ)

SET search_path TO public;

CREATE OR REPLACE FUNCTION REGEXP_EXTRACT(str TEXT, pattern TEXT) RETURNS TEXT AS $$
BEGIN
RETURN substring(str from pattern);
END; $$
LANGUAGE PLPGSQL;

CREATE OR REPLACE FUNCTION REGEXP_CONTAINS(str TEXT, pattern TEXT) RETURNS BOOL AS $$
BEGIN
RETURN str ~ pattern;
END; $$
LANGUAGE PLPGSQL;

-- alias generate_series with generate_array
CREATE OR REPLACE FUNCTION GENERATE_ARRAY(i INTEGER, j INTEGER)
RETURNS setof INTEGER language sql as $$
SELECT GENERATE_SERIES(i, j)
$$;

-- datetime functions
CREATE OR REPLACE FUNCTION DATETIME(dt DATE) RETURNS TIMESTAMP(3) AS $$
BEGIN
RETURN CAST(dt AS TIMESTAMP(3));
END; $$
LANGUAGE PLPGSQL;

CREATE OR REPLACE FUNCTION DATETIME(year INTEGER, month INTEGER, day INTEGER, hour INTEGER, minute INTEGER, second INTEGER) RETURNS TIMESTAMP(3) AS $$
BEGIN
RETURN TO_TIMESTAMP(
TO_CHAR(year, '0000') || TO_CHAR(month, '00') || TO_CHAR(day, '00') || TO_CHAR(hour, '00') || TO_CHAR(minute, '00') || TO_CHAR(second, '00'),
'yyyymmddHH24MISS'
);
END; $$
LANGUAGE PLPGSQL;

-- overload allowing string input

-- DATETIME_ADD(datetime, INTERVAL 'n' DATEPART) -> datetime + INTERVAL 'n' DATEPART
-- note: in bigquery, `INTERVAL 1 YEAR` is a valid interval
-- but in postgres, it must be `INTERVAL '1' YEAR`
CREATE OR REPLACE FUNCTION DATETIME_ADD(datetime_val TIMESTAMP(3), intvl INTERVAL) RETURNS TIMESTAMP(3) AS $$
BEGIN
RETURN datetime_val + intvl;
END; $$
LANGUAGE PLPGSQL;

-- DATETIME_SUB(datetime, INTERVAL 'n' DATEPART) -> datetime - INTERVAL 'n' DATEPART
CREATE OR REPLACE FUNCTION DATETIME_SUB(datetime_val TIMESTAMP(3), intvl INTERVAL) RETURNS TIMESTAMP(3) AS $$
BEGIN
RETURN datetime_val - intvl;
END; $$
LANGUAGE PLPGSQL;

-- TODO:
-- DATETIME_TRUNC(datetime, PART) -> DATE_TRUNC('datepart', datetime)

-- below requires a regex to convert datepart from primitive to a string
-- i.e. encapsulate it in single quotes
CREATE OR REPLACE FUNCTION DATETIME_DIFF(endtime TIMESTAMP(3), starttime TIMESTAMP(3), datepart TEXT) RETURNS NUMERIC AS $$
BEGIN
RETURN
EXTRACT(EPOCH FROM endtime - starttime) /
CASE
WHEN datepart = 'SECOND' THEN 1.0
WHEN datepart = 'MINUTE' THEN 60.0
WHEN datepart = 'HOUR' THEN 3600.0
WHEN datepart = 'DAY' THEN 24*3600.0
WHEN datepart = 'YEAR' THEN 365.242*24*3600.0
ELSE NULL END;
END; $$
LANGUAGE PLPGSQL;

-- BigQuery has a custom data type, PART
-- It's difficult to replicate this in postgresql, which recognizes the PART as a column name,
-- unless it is within an EXTRACT() function.

CREATE OR REPLACE FUNCTION BIGQUERY_FORMAT_TO_PSQL(format_str VARCHAR(255)) RETURNS TEXT AS $$
BEGIN
RETURN
-- use replace to convert BigQuery string format to postgres string format
-- only handles a few cases since we don't extensively use this function
REPLACE(
REPLACE(
REPLACE(
REPLACE(
REPLACE(
REPLACE(
format_str
, '%S', 'SS'
)
, '%M', 'MI'
)
, '%H', 'HH24'
)
, '%d', 'dd'
)
, '%m', 'mm'
)
, '%Y', 'yyyy'
)
;
END; $$
LANGUAGE PLPGSQL;


CREATE OR REPLACE FUNCTION FORMAT_DATE(format_str VARCHAR(255), datetime_val TIMESTAMP(3)) RETURNS TEXT AS $$
BEGIN
RETURN TO_CHAR(
datetime_val,
-- use replace to convert BigQuery string format to postgres string format
-- only handles a few cases since we don't extensively use this function
BIGQUERY_FORMAT_TO_PSQL(format_str)
);
END; $$
LANGUAGE PLPGSQL;


CREATE OR REPLACE FUNCTION PARSE_DATE(format_str VARCHAR(255), string_val VARCHAR(255)) RETURNS DATE AS $$
BEGIN
RETURN TO_DATE(
string_val,
-- use replace to convert BigQuery string format to postgres string format
-- only handles a few cases since we don't extensively use this function
BIGQUERY_FORMAT_TO_PSQL(format_str)
);
END; $$
LANGUAGE PLPGSQL;

CREATE OR REPLACE FUNCTION FORMAT_DATETIME(format_str VARCHAR(255), datetime_val TIMESTAMP(3)) RETURNS TEXT AS $$
BEGIN
RETURN TO_CHAR(
datetime_val,
-- use replace to convert BigQuery string format to postgres string format
-- only handles a few cases since we don't extensively use this function
BIGQUERY_FORMAT_TO_PSQL(format_str)
);
END; $$
LANGUAGE PLPGSQL;


CREATE OR REPLACE FUNCTION PARSE_DATETIME(format_str VARCHAR(255), string_val VARCHAR(255)) RETURNS TIMESTAMP(3) AS $$
BEGIN
RETURN TO_TIMESTAMP(
string_val,
-- use replace to convert BigQuery string format to postgres string format
-- only handles a few cases since we don't extensively use this function
BIGQUERY_FORMAT_TO_PSQL(format_str)
);
END; $$
LANGUAGE PLPGSQL;


54 changes: 54 additions & 0 deletions concepts/postgres_make_concepts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash
# This file makes tables for the concepts in this subfolder.
# Be sure to run postgres-functions.sql first, as the concepts rely on those function definitions.
# Note that this may take a large amount of time and hard drive space.

# string replacements are necessary for some queries
export REGEX_DATETIME_DIFF="s/DATETIME_DIFF\((.+?),\s?(.+?),\s?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATETIME_DIFF(\1, \2, '\3')/g"
export REGEX_SCHEMA='s/`physionet-data.(mimic_core|mimic_icu|mimic_derived|mimic_hosp).(.+?)`/\1.\2/g'
# Add necessary quotes to INTERVAL, e.g. "INTERVAL 5 hour" to "INTERVAL '5' hour"
export REGEX_INTERVAL="s/interval\s([[:digit:]])\s(hour|day|month|year)/INTERVAL '\1' \2/gI"
# Add numeric cast to ROUND(), e.g. "ROUND(1.234, 2)" to "ROUND( CAST(1.234 as numeric), 2)" over 0 or 3 lines
export PERL_REGEX_ROUND='s/round\((.*|.*[\n]?.*[\n]?.*[\n]?.*)(\, \d\))/ROUND\( CAST\($1 as numeric\)$2/gi'
export CONNSTR='-d mimic'

# this is set as the search_path variable for psql
# a search path of "public,mimic_icu" will search both public and mimic_icu
# schemas for data, but will create tables on the public schema
export PSQL_PREAMBLE='SET search_path TO public,mimic_icu'
export TARGET_DATASET='mimic_derived'

echo ''
echo '==='
echo 'Beginning to create tables for MIMIC database.'
echo 'Any notices of the form "NOTICE: TABLE "XXXXXX" does not exist" can be ignored.'
echo 'The scripts drop views before creating them, and these notices indicate nothing existed prior to creating the view.'
echo '==='
echo ''

# generate tables in subfolders
# for d in demographics measurement medication treatment firstday score;
for d in firstday;
do
for fn in `ls $d`;
do
echo "${d}"
# only run SQL queries
if [[ "${fn: -4}" == ".sql" ]]; then
# table name is file name minus extension
tbl="${fn::-4}"

# skip first_day_sofa as it depends on other firstday queries
if [[ "${tbl}" == "first_day_sofa" ]]; then
continue
fi
echo "Generating ${TARGET_DATASET}.${tbl}"
{ echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.${tbl}; CREATE TABLE ${TARGET_DATASET}.${tbl} AS "; cat "${d}/${fn}";} | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | perl -0777 -pe "${PERL_REGEX_ROUND}" | psql ${CONNSTR}
fi
done
done


# generate first_day_sofa table last
echo "Generating ${TARGET_DATASET}.first_day_sofa"
{ echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.first_day_sofa; CREATE TABLE ${TARGET_DATASET}.first_day_sofa AS "; cat firstday/first_day_sofa.sql;} | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | perl -0777 -pe "${PERL_REGEX_ROUND}" | psql ${CONNSTR}