Display machine information:
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.29 R6_2.5.1 jsonlite_1.7.2 magrittr_2.0.1
## [5] evaluate_0.14 stringi_1.7.6 rlang_1.0.1 cli_3.1.0
## [9] rstudioapi_0.13 jquerylib_0.1.4 bslib_0.3.1 rmarkdown_2.11
## [13] tools_3.6.0 stringr_1.4.0 xfun_0.29 yaml_2.2.1
## [17] fastmap_1.1.0 compiler_3.6.0 htmltools_0.5.2 knitr_1.37
## [21] sass_0.4.0
Load database libraries and the tidyverse frontend:
suppressPackageStartupMessages(library(dbplyr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
library(DBI)
library(bigrquery)
Below is an outline of steps.
# path to the service account token
if (Sys.info()[["nodename"]] == "biostat-203b-teaching-server") {
# on teaching server
satoken <- "/mnt/mimiciv/1.0/biostat-203b-2022winter-3fdc2392ac39.json"
} else {
# on my own Mac laptop
satoken <- "/Users/huazhou/Documents/github.com/ucla-biostat-203b/2022winter/hw/hw3/biostat-203b-2022winter-3fdc2392ac39.json"
}
# BigQuery authentication using service account
bq_auth(
path = satoken,
email = "mimiciv-bigquery@biostat-203b-2022winter.iam.gserviceaccount.com",
scopes = c("https://www.googleapis.com/auth/bigquery",
"https://www.googleapis.com/auth/cloud-platform")
)
mimic4_v1_0_203b
in GCP (Google Cloud Platform), using the billing account biostat-203b-2022winter
.# Connect to the BigQuery database `biostat-203b-2022winter.mimic4_v1_0_203b`
con <- dbConnect(
bigrquery::bigquery(),
project = "biostat-203b-2022winter",
dataset = "mimic4_v1_0_203b",
billing = "biostat-203b-2022winter"
)
con
## <BigQueryConnection>
## Dataset: biostat-203b-2022winter.mimic4_v1_0_203b
## Billing: biostat-203b-2022winter
mimic4_v1_0_203b
database.dbListTables(con)
## [1] "admissions" "chartevents" "d_items" "d_labitems" "icustays"
## [6] "labevents" "patients"
icustays
table.# full ICU stays table
icustays_tble <- tbl(con, "icustays") %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT *
## FROM `icustays`
## # Source: table<icustays> [?? x 8]
## # Database: BigQueryConnection
## subject_id hadm_id stay_id first_careunit last_careunit
## <int> <int> <int> <chr> <chr>
## 1 12776735 20817525 34547665 Neuro Stepdown Neuro Stepdown
## 2 16256226 20013290 39289362 Neuro Stepdown Neuro Stepdown
## 3 12974563 29618057 32563675 Neuro Stepdown Neuro Stepdown
## 4 14609218 20606189 34947848 Neuro Stepdown Neuro Stepdown
## 5 12687112 26132667 37445058 Neuro Stepdown Neuro Stepdown
## 6 18190935 24712081 30056748 Neuro Stepdown Neuro Stepdown
## 7 10404210 22880512 30254828 Neuro Stepdown Neuro Stepdown
## 8 12552973 21320643 30261549 Neuro Stepdown Neuro Stepdown
## 9 14303051 26559537 30401486 Neuro Stepdown Neuro Stepdown
## 10 12347959 22779073 30503984 Neuro Stepdown Neuro Stepdown
## intime outtime los
## <dttm> <dttm> <dbl>
## 1 2200-07-12 00:33:00 2200-07-13 16:44:40 1.67
## 2 2150-12-20 16:09:08 2150-12-21 14:58:40 0.951
## 3 2138-11-13 23:30:01 2138-11-15 16:25:19 1.71
## 4 2174-06-28 21:13:00 2174-07-05 17:01:32 6.83
## 5 2162-05-31 18:08:45 2162-06-04 10:16:13 3.67
## 6 2115-12-08 09:39:45 2115-12-12 02:36:52 3.71
## 7 2161-03-03 18:30:00 2161-03-05 18:10:31 1.99
## 8 2135-09-21 18:12:03 2135-09-23 12:56:46 1.78
## 9 2175-11-20 01:02:00 2175-11-21 16:06:17 1.63
## 10 2140-07-11 17:54:14 2140-07-19 13:01:05 7.80
## # … with more rows
We only keep the first ICU stay. Following code is kind of a hack, using the summarise_all(min)
function. It seems that slice_min()
, slice_head()
, distinct(, .keep_all = TRUE)
don’t work with dbplyr
+bigrquery
at the moment.
icustays_tble <- icustays_tble %>%
select(subject_id, intime) %>%
group_by(subject_id) %>%
summarise_all(min) %>%
left_join(icustays_tble, by = c("subject_id", "intime")) %>%
show_query() %>%
print(width = Inf)
## Warning: Missing values are always removed in SQL.
## Use `MIN(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
## <SQL>
## SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
##
## # Source: lazy query [?? x 8]
## # Database: BigQueryConnection
## subject_id intime hadm_id stay_id first_careunit
## <int> <dttm> <int> <int> <chr>
## 1 12776735 2200-07-12 00:33:00 20817525 34547665 Neuro Stepdown
## 2 16256226 2150-12-20 16:09:08 20013290 39289362 Neuro Stepdown
## 3 12974563 2138-11-13 23:30:01 29618057 32563675 Neuro Stepdown
## 4 14609218 2174-06-28 21:13:00 20606189 34947848 Neuro Stepdown
## 5 12687112 2162-05-31 18:08:45 26132667 37445058 Neuro Stepdown
## 6 18190935 2115-11-12 23:23:32 21137829 34338402 Neuro Stepdown
## 7 10404210 2161-03-03 18:30:00 22880512 30254828 Neuro Stepdown
## 8 12552973 2135-09-21 18:12:03 21320643 30261549 Neuro Stepdown
## 9 14303051 2175-11-20 01:02:00 26559537 30401486 Neuro Stepdown
## 10 12347959 2140-07-11 17:54:14 22779073 30503984 Neuro Stepdown
## last_careunit outtime los
## <chr> <dttm> <dbl>
## 1 Neuro Stepdown 2200-07-13 16:44:40 1.67
## 2 Neuro Stepdown 2150-12-21 14:58:40 0.951
## 3 Neuro Stepdown 2138-11-15 16:25:19 1.71
## 4 Neuro Stepdown 2174-07-05 17:01:32 6.83
## 5 Neuro Stepdown 2162-06-04 10:16:13 3.67
## 6 Neuro Stepdown 2115-11-15 18:06:26 2.78
## 7 Neuro Stepdown 2161-03-05 18:10:31 1.99
## 8 Neuro Stepdown 2135-09-23 12:56:46 1.78
## 9 Neuro Stepdown 2175-11-21 16:06:17 1.63
## 10 Neuro Stepdown 2140-07-19 13:01:05 7.80
## # … with more rows
admissions
table and only keep the patients who have a match in icustays_tble
(according to subject_id
and hadm_id
).Solution: Here is my code.
# import admissions table and only keep the patients for the first ICU stay
admissions_tble <- tbl(con, "admissions") %>%
semi_join(icustays_tble, by = c("subject_id", "hadm_id")) %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT * FROM `admissions` AS `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `RHS`
## WHERE (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`hadm_id` = `RHS`.`hadm_id`)
## )
## # Source: lazy query [?? x 15]
## # Database: BigQueryConnection
## subject_id hadm_id admittime dischtime
## <int> <int> <dttm> <dttm>
## 1 13946340 20423027 2181-03-16 15:50:00 2181-03-24 03:03:00
## 2 16297597 25146493 2127-02-07 00:39:00 2127-02-16 17:55:00
## 3 10902586 25215925 2155-02-12 17:54:00 2155-02-19 16:30:00
## 4 16423390 22555943 2138-01-07 14:22:00 2138-01-31 21:35:00
## 5 11990000 25805051 2183-01-29 14:01:00 2183-02-06 15:24:00
## 6 13546016 22408998 2119-05-26 07:57:00 2119-05-30 15:59:00
## 7 13205038 24285099 2161-11-11 21:43:00 2161-12-11 10:42:00
## 8 16731711 23567649 2110-10-25 18:56:00 2110-11-08 11:35:00
## 9 16333496 25737846 2175-07-22 15:45:00 2175-07-31 06:58:00
## 10 11407769 26218529 2188-06-25 04:11:00 2188-06-27 10:30:00
## deathtime admission_type admission_location
## <dttm> <chr> <chr>
## 1 2181-03-24 03:03:00 URGENT TRANSFER FROM HOSPITAL
## 2 NA URGENT TRANSFER FROM HOSPITAL
## 3 NA URGENT TRANSFER FROM HOSPITAL
## 4 2138-01-31 21:35:00 URGENT TRANSFER FROM HOSPITAL
## 5 NA URGENT TRANSFER FROM HOSPITAL
## 6 NA URGENT TRANSFER FROM HOSPITAL
## 7 NA URGENT TRANSFER FROM HOSPITAL
## 8 2110-11-08 11:35:00 URGENT TRANSFER FROM HOSPITAL
## 9 2175-07-31 06:58:00 URGENT TRANSFER FROM HOSPITAL
## 10 2188-06-27 10:30:00 URGENT TRANSFER FROM HOSPITAL
## discharge_location insurance language marital_status ethnicity
## <chr> <chr> <chr> <chr> <chr>
## 1 DIED Medicare ENGLISH <NA> WHITE
## 2 HOME HEALTH CARE Medicare ENGLISH <NA> UNKNOWN
## 3 HOME HEALTH CARE Other ENGLISH <NA> UNKNOWN
## 4 DIED Other ENGLISH <NA> UNKNOWN
## 5 CHRONIC/LONG TERM ACUTE CARE Medicare ENGLISH <NA> WHITE
## 6 SKILLED NURSING FACILITY Medicare ENGLISH <NA> UNKNOWN
## 7 REHAB Other ? <NA> UNKNOWN
## 8 DIED Medicare ENGLISH <NA> UNKNOWN
## 9 DIED Medicare ENGLISH <NA> UNKNOWN
## 10 DIED Other ENGLISH <NA> UNKNOWN
## edregtime edouttime hospital_expire_flag
## <dttm> <dttm> <int>
## 1 NA NA 1
## 2 NA NA 0
## 3 NA NA 0
## 4 NA NA 1
## 5 NA NA 0
## 6 NA NA 0
## 7 NA NA 0
## 8 NA NA 1
## 9 NA NA 1
## 10 2188-06-25 02:28:00 2188-06-25 05:25:00 1
## # … with more rows
patients
table and only keep the patients who have a match in icustays_tble
(according to subject_id
).Solution: Here is my code.
# import patients table and only keep the patients for the first ICU stay
patients_tble <- tbl(con, "patients") %>%
semi_join(icustays_tble, by = c("subject_id")) %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT * FROM `patients` AS `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `RHS`
## WHERE (`LHS`.`subject_id` = `RHS`.`subject_id`)
## )
## # Source: lazy query [?? x 6]
## # Database: BigQueryConnection
## subject_id gender anchor_age anchor_year anchor_year_group dod
## <int> <chr> <int> <int> <chr> <date>
## 1 19137716 F 18 2111 2008 - 2010 NA
## 2 19077387 F 18 2112 2017 - 2019 NA
## 3 11155072 M 18 2113 2011 - 2013 NA
## 4 17945455 F 18 2113 2014 - 2016 NA
## 5 17689255 F 18 2113 2017 - 2019 NA
## 6 10894885 F 18 2114 2008 - 2010 NA
## 7 15175657 M 18 2114 2014 - 2016 NA
## 8 17600374 F 18 2118 2014 - 2016 NA
## 9 16501289 M 18 2118 2017 - 2019 NA
## 10 10141487 F 18 2119 2017 - 2019 NA
## # … with more rows
labevents
table and retrieve a subset that only contain subjects who appear in icustays_tble
and the lab items listed in HW2.Solution: Here is my code. I ended up not using labels on app (inputting them directly in ui
).
# full labevents table
labevents_tble <- tbl(con, "labevents") %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT *
## FROM `labevents`
## # Source: table<labevents> [?? x 15]
## # Database: BigQueryConnection
## labevent_id subject_id hadm_id specimen_id itemid charttime
## <int> <int> <int> <int> <int> <dttm>
## 1 2692 10000764 27897940 9550105 51248 2132-10-17 05:59:00
## 2 47208 10003299 NA 69855539 51516 2178-12-10 18:05:00
## 3 47512 10003299 29323205 34579755 50983 2181-10-23 05:20:00
## 4 55877 10003700 NA 22046762 51214 2165-04-24 03:40:00
## 5 57003 10004235 24181354 7200252 50813 2196-02-24 20:54:00
## 6 57106 10004235 24181354 36231870 50813 2196-02-25 04:50:00
## 7 57424 10004235 24181354 20870289 50818 2196-02-26 18:10:00
## 8 58753 10004322 22602599 6039016 50934 2134-10-01 06:19:00
## 9 73432 10005464 NA 56095848 51491 2165-06-28 07:05:00
## 10 83043 10006029 NA 27974406 50954 2169-01-22 12:00:00
## storetime value valuenum valueuom ref_range_lower ref_range_upper
## <dttm> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2132-10-17 08:09:00 32.3 32.3 pg 27 32
## 2 2178-12-10 18:49:00 1 1 #/hpf 0 5
## 3 2181-10-23 06:49:00 145 145 mEq/L 133 145
## 4 2165-04-24 04:18:00 268 268 mg/dL 150 400
## 5 2196-02-24 20:56:00 3.2 3.2 mmol/L 0.5 2
## 6 2196-02-25 04:56:00 2.9 2.9 mmol/L 0.5 2
## 7 2196-02-26 18:13:00 38 38 mm Hg 35 45
## 8 2134-10-01 08:23:00 18 18 <NA> NA NA
## 9 2165-06-28 07:58:00 6.5 6.5 units 5 8
## 10 2169-01-22 14:09:00 226 226 IU/L 94 250
## flag priority comments
## <chr> <chr> <chr>
## 1 abnormal ROUTINE <NA>
## 2 <NA> STAT <NA>
## 3 <NA> ROUTINE <NA>
## 4 <NA> STAT <NA>
## 5 abnormal <NA> <NA>
## 6 abnormal <NA> <NA>
## 7 <NA> <NA> <NA>
## 8 <NA> ROUTINE <NA>
## 9 <NA> STAT <NA>
## 10 <NA> STAT <NA>
## # … with more rows
d_labitems_tble <- tbl(con, "d_labitems") %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT *
## FROM `d_labitems`
## # Source: table<d_labitems> [?? x 5]
## # Database: BigQueryConnection
## itemid label fluid category loinc_code
## <int> <chr> <chr> <chr> <chr>
## 1 52021 Abz Blood Blood Gas <NA>
## 2 52022 Albumin, Blood Blood Blood Gas <NA>
## 3 50801 Alveolar-arterial Gradient Blood Blood Gas <NA>
## 4 52023 Assist/Control Blood Blood Gas <NA>
## 5 50802 Base Excess Blood Blood Gas <NA>
## 6 50803 Calculated Bicarbonate, Whole Blood Blood Blood Gas <NA>
## 7 50804 Calculated Total CO2 Blood Blood Gas <NA>
## 8 50805 Carboxyhemoglobin Blood Blood Gas <NA>
## 9 50806 Chloride, Whole Blood Blood Blood Gas <NA>
## 10 52434 Chloride, Whole Blood Blood Blood Gas <NA>
## # … with more rows
choice_lab <- c(50912, 50971, 50983, 50902, 50882,
51221, 51301, 50931, 50960, 50893)
labevents_tble <- labevents_tble %>%
# filter out 10 lab items
select(subject_id, itemid, charttime, valuenum) %>%
filter(itemid %in% choice_lab) %>%
semi_join(icustays_tble, by = c("subject_id")) %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT * FROM (SELECT *
## FROM (SELECT `subject_id`, `itemid`, `charttime`, `valuenum`
## FROM `labevents`) `q01`
## WHERE (`itemid` IN (50912.0, 50971.0, 50983.0, 50902.0, 50882.0, 51221.0, 51301.0, 50931.0, 50960.0, 50893.0))) `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `RHS`
## WHERE (`LHS`.`subject_id` = `RHS`.`subject_id`)
## )
## # Source: lazy query [?? x 4]
## # Database: BigQueryConnection
## subject_id itemid charttime valuenum
## <int> <int> <dttm> <dbl>
## 1 10004720 50983 2186-11-13 14:29:00 135
## 2 10007677 50912 2121-06-16 06:59:00 0.9
## 3 10019003 50893 2153-04-02 06:34:00 7.9
## 4 10019003 51301 2153-09-22 14:30:00 124.
## 5 10023708 50912 2144-12-06 09:00:00 0.8
## 6 10031358 50983 2159-10-26 06:44:00 137
## 7 10035168 50971 2144-08-08 10:11:00 4.2
## 8 10035168 50912 2145-03-17 09:40:00 1
## 9 10035168 50971 2145-06-18 10:46:00 4.5
## 10 10035168 50902 2145-12-25 02:23:00 115
## # … with more rows
Only keep the first lab measurements during ICU stay and pivot lab items to become variables/columns.
Solution: Here is my code. I avoided an error in BigQuery by adding lab
at the beginning of the itemid
, using names_prefix
.
labevents_tble <- labevents_tble %>%
# keep only lab measurements between intime and outtime
left_join(select(icustays_tble, subject_id, intime, outtime),
by = c("subject_id")) %>%
filter(charttime >= intime, charttime <= outtime) %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT *
## FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `itemid`, `charttime`, `valuenum`, `intime`, `outtime`
## FROM (SELECT * FROM (SELECT *
## FROM (SELECT `subject_id`, `itemid`, `charttime`, `valuenum`
## FROM `labevents`) `q01`
## WHERE (`itemid` IN (50912.0, 50971.0, 50983.0, 50902.0, 50882.0, 51221.0, 51301.0, 50931.0, 50960.0, 50893.0))) `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `RHS`
## WHERE (`LHS`.`subject_id` = `RHS`.`subject_id`)
## )) `LHS`
## LEFT JOIN (SELECT `subject_id`, `intime`, `outtime`
## FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `q01`) `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id`)
## ) `q02`
## WHERE ((`charttime` >= `intime`) AND (`charttime` <= `outtime`))
## # Source: lazy query [?? x 6]
## # Database: BigQueryConnection
## subject_id itemid charttime valuenum intime
## <int> <int> <dttm> <dbl> <dttm>
## 1 10043122 51221 2167-09-16 01:45:00 24.7 2167-09-15 04:33:44
## 2 10135398 50960 2153-10-07 07:00:00 2 2153-09-25 23:39:00
## 3 10220107 50983 2203-07-10 15:07:00 137 2203-07-08 17:18:02
## 4 10222191 50893 2185-07-05 12:08:00 7.1 2185-06-26 04:01:00
## 5 10222191 50882 2185-08-22 01:11:00 22 2185-06-26 04:01:00
## 6 10254097 50902 2138-11-06 00:05:00 106 2138-11-01 07:44:00
## 7 10297708 50971 2136-08-11 02:28:00 4 2136-08-08 10:17:34
## 8 10308342 50893 2162-11-14 15:22:00 8.7 2162-11-12 12:05:29
## 9 10331862 50983 2173-11-06 04:39:00 137 2173-11-03 19:44:00
## 10 10377337 51301 2126-03-17 04:20:00 6.5 2126-03-17 03:51:32
## outtime
## <dttm>
## 1 2167-09-18 19:32:40
## 2 2153-10-08 22:22:14
## 3 2203-07-13 14:54:35
## 4 2185-09-02 00:54:46
## 5 2185-09-02 00:54:46
## 6 2138-11-07 23:21:55
## 7 2136-09-10 15:15:33
## 8 2162-11-30 17:57:28
## 9 2173-11-07 15:13:05
## 10 2126-03-18 16:46:07
## # … with more rows
labevents_tble <- labevents_tble %>%
# sort charttime in ascending order by subject_id and item_id
select(subject_id, itemid, charttime) %>%
group_by(subject_id, itemid) %>%
summarise_all(min) %>%
left_join(labevents_tble, by = c("subject_id", "itemid", "charttime")) %>%
# keep only 11 columns and spread itemid and valuenum
select(-charttime, -intime, -outtime) %>%
# spread valuenum by itemid
pivot_wider(names_from = itemid,
values_from = valuenum,
names_prefix = "lab") %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT `subject_id`, MAX(IF(`itemid` = 50960, `valuenum`, NULL)) AS `lab50960`, MAX(IF(`itemid` = 51221, `valuenum`, NULL)) AS `lab51221`, MAX(IF(`itemid` = 50971, `valuenum`, NULL)) AS `lab50971`, MAX(IF(`itemid` = 50983, `valuenum`, NULL)) AS `lab50983`, MAX(IF(`itemid` = 51301, `valuenum`, NULL)) AS `lab51301`, MAX(IF(`itemid` = 50882, `valuenum`, NULL)) AS `lab50882`, MAX(IF(`itemid` = 50893, `valuenum`, NULL)) AS `lab50893`, MAX(IF(`itemid` = 50902, `valuenum`, NULL)) AS `lab50902`, MAX(IF(`itemid` = 50912, `valuenum`, NULL)) AS `lab50912`, MAX(IF(`itemid` = 50931, `valuenum`, NULL)) AS `lab50931`
## FROM (SELECT `subject_id`, `itemid`, `valuenum`
## FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`itemid` AS `itemid`, `LHS`.`charttime` AS `charttime`, `valuenum`, `intime`, `outtime`
## FROM (SELECT `subject_id`, `itemid`, MIN(`charttime`) AS `charttime`
## FROM (SELECT `subject_id`, `itemid`, `charttime`
## FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `itemid`, `charttime`, `valuenum`, `intime`, `outtime`
## FROM (SELECT * FROM (SELECT *
## FROM (SELECT `subject_id`, `itemid`, `charttime`, `valuenum`
## FROM `labevents`) `q01`
## WHERE (`itemid` IN (50912.0, 50971.0, 50983.0, 50902.0, 50882.0, 51221.0, 51301.0, 50931.0, 50960.0, 50893.0))) `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `RHS`
## WHERE (`LHS`.`subject_id` = `RHS`.`subject_id`)
## )) `LHS`
## LEFT JOIN (SELECT `subject_id`, `intime`, `outtime`
## FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `q01`) `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id`)
## ) `q02`
## WHERE ((`charttime` >= `intime`) AND (`charttime` <= `outtime`))) `q03`
## GROUP BY `subject_id`, `itemid`) `LHS`
## LEFT JOIN (SELECT *
## FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `itemid`, `charttime`, `valuenum`, `intime`, `outtime`
## FROM (SELECT * FROM (SELECT *
## FROM (SELECT `subject_id`, `itemid`, `charttime`, `valuenum`
## FROM `labevents`) `q01`
## WHERE (`itemid` IN (50912.0, 50971.0, 50983.0, 50902.0, 50882.0, 51221.0, 51301.0, 50931.0, 50960.0, 50893.0))) `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `RHS`
## WHERE (`LHS`.`subject_id` = `RHS`.`subject_id`)
## )) `LHS`
## LEFT JOIN (SELECT `subject_id`, `intime`, `outtime`
## FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `q01`) `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id`)
## ) `q02`
## WHERE ((`charttime` >= `intime`) AND (`charttime` <= `outtime`))) `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`itemid` = `RHS`.`itemid` AND `LHS`.`charttime` = `RHS`.`charttime`)
## ) `q03`) `q04`
## GROUP BY `subject_id`
## # Source: lazy query [?? x 11]
## # Database: BigQueryConnection
## # Groups: subject_id
## subject_id lab50960 lab51221 lab50971 lab50983 lab51301 lab50882 lab50893
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10035631 1.9 28.9 4 129 45.6 23 7.9
## 2 10038933 1.4 37.6 3.5 143 13.7 22 8
## 3 10222191 1.8 49.9 6.4 131 19.8 12 3.7
## 4 10256906 2.3 37.9 4.3 137 7.5 24 9.3
## 5 10258162 1.7 28.6 4.4 138 1.4 16 6.5
## 6 10292289 1.7 24.3 4.9 138 6.6 25 9.3
## 7 10297774 1.8 25 3.4 133 8.1 23 8.7
## 8 10308342 2.3 29 3.6 143 8.4 22 9.9
## 9 10442603 1.4 32.6 3.9 134 7.2 17 7.4
## 10 10450519 1.5 26.7 4.5 141 12.1 27 8.4
## lab50902 lab50912 lab50931
## <dbl> <dbl> <dbl>
## 1 92 1 204
## 2 110 0.7 163
## 3 101 5.2 148
## 4 104 1 158
## 5 118 0.7 126
## 6 102 0.9 205
## 7 93 2.3 128
## 8 111 4.6 149
## 9 86 2.7 87
## 10 103 2.5 106
## # … with more rows
chartevents
table and retrieve a subset that only contain subjects who appear in icustays_tble
and the chart events listed in HW2.Solution: Here is my code, based on the same idea as Q1-6.
# full chartevents table
chartevents_tble <- tbl(con, "chartevents") %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT *
## FROM `chartevents`
## # Source: table<chartevents> [?? x 10]
## # Database: BigQueryConnection
## subject_id hadm_id stay_id charttime storetime itemid
## <int> <int> <int> <dttm> <dttm> <int>
## 1 19189222 25044031 35017358 2188-03-25 01:00:00 2188-03-25 01:05:00 223791
## 2 10030753 29738545 39753951 2198-08-26 05:34:00 2198-08-26 06:31:00 227467
## 3 10092175 27558426 39615949 2145-10-10 07:33:00 2145-10-10 07:34:00 220058
## 4 10114694 22418467 35032951 2163-03-28 23:08:00 2163-03-28 23:10:00 226062
## 5 10129254 29023346 35006975 2189-01-15 11:00:00 2189-01-15 11:58:00 220293
## 6 10311764 21484017 33257561 2136-12-13 05:00:00 2136-12-13 05:38:00 224639
## 7 10388400 23303089 34606617 2137-07-17 03:04:00 2137-07-17 03:04:00 229247
## 8 10397277 20053627 34996407 2182-07-31 18:24:00 2182-07-31 18:51:00 227468
## 9 10420013 29217083 39721603 2165-02-03 04:00:00 2165-02-03 05:21:00 227443
## 10 10439110 26127653 37943127 2144-10-15 08:00:00 2144-10-15 07:42:00 224700
## value valuenum valueuom warning
## <chr> <dbl> <chr> <int>
## 1 Moderate to Severe. 7 <NA> 0
## 2 0.9 0.9 <NA> 0
## 3 160 160 mmHg 0
## 4 53 53 mmHg 1
## 5 22 22 L/min 0
## 6 91 91 kg 0
## 7 73 73 mmHg 0
## 8 203 203 mg/dL 1
## 9 11 11 mEq/L 1
## 10 8 8 cmH2O 0
## # … with more rows
d_items_tble <- tbl(con, "d_items") %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT *
## FROM `d_items`
## # Source: table<d_items> [?? x 9]
## # Database: BigQueryConnection
## itemid label abbreviation linksto
## <int> <chr> <chr> <chr>
## 1 223933 Heat Sounds Heat Sounds chartevents
## 2 223934 Dorsal PedPulse R Dorsal PedPulse R chartevents
## 3 223935 PostTib. Pulses R PostTib Pulses R chartevents
## 4 223936 Radial Pulse R Radial Pulse R chartevents
## 5 223938 Ulnar Pulse R Ulnar Pulse R chartevents
## 6 223939 Brachial Pulse R Brachial Pulse R chartevents
## 7 223940 Femoral Pulse R Femoral Pulse R chartevents
## 8 223941 Popliteal Pulse R Popliteal Pulse R chartevents
## 9 223943 Dorsal PedPulse L Dorsal PedPulse L chartevents
## 10 223944 Brachial Pulse L Brachial Pulse L chartevents
## category unitname param_type lownormalvalue highnormalvalue
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Cardiovascular (Pulses) <NA> Text NA NA
## 2 Cardiovascular (Pulses) <NA> Text NA NA
## 3 Cardiovascular (Pulses) <NA> Text NA NA
## 4 Cardiovascular (Pulses) <NA> Text NA NA
## 5 Cardiovascular (Pulses) <NA> Text NA NA
## 6 Cardiovascular (Pulses) <NA> Text NA NA
## 7 Cardiovascular (Pulses) <NA> Text NA NA
## 8 Cardiovascular (Pulses) <NA> Text NA NA
## 9 Cardiovascular (Pulses) <NA> Text NA NA
## 10 Cardiovascular (Pulses) <NA> Text NA NA
## # … with more rows
choice_chart <- c(220045, 220181, 220179, 223761, 220210)
chartevents_tble <- chartevents_tble %>%
# only containing these items for the patients in icustays_tble
select(stay_id, itemid, charttime, valuenum) %>%
# only containing these items for the patients in icustays_tble
filter(itemid %in% choice_chart) %>%
semi_join(icustays_tble, by = c("stay_id")) %>%
# # add label in d_items
# left_join(select(d_items_tble, itemid, label), by = c("itemid")) %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT * FROM (SELECT *
## FROM (SELECT `stay_id`, `itemid`, `charttime`, `valuenum`
## FROM `chartevents`) `q01`
## WHERE (`itemid` IN (220045.0, 220181.0, 220179.0, 223761.0, 220210.0))) `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `RHS`
## WHERE (`LHS`.`stay_id` = `RHS`.`stay_id`)
## )
## # Source: lazy query [?? x 4]
## # Database: BigQueryConnection
## stay_id itemid charttime valuenum
## <int> <int> <dttm> <dbl>
## 1 38197705 220045 2116-12-04 02:00:00 112
## 2 36686004 220045 2149-04-01 08:00:00 70
## 3 37753812 220045 2112-12-22 20:00:00 67
## 4 31462065 220045 2117-08-11 18:10:00 65
## 5 31556155 220045 2190-08-14 09:00:00 108
## 6 30767577 220045 2132-08-07 16:26:00 118
## 7 39637267 220045 2134-06-30 12:00:00 74
## 8 32551329 220045 2145-11-10 13:00:00 104
## 9 32826165 220045 2118-08-07 07:00:00 55
## 10 39025103 220045 2114-04-04 23:00:00 94
## # … with more rows
Only keep the first chart events during ICU stay and pivot chart events to become variables/columns.
Solution: Here is my code, based on the same idea as Q1-6.
chartevents_tble <- chartevents_tble %>%
# add intime & outtime from icustays_tble to find the first vital measurement
left_join(select(icustays_tble, stay_id, intime, outtime),
by = c("stay_id")) %>%
# keep only vital measurements between intime and outtime
filter(charttime >= intime, charttime <= outtime) %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT *
## FROM (SELECT `LHS`.`stay_id` AS `stay_id`, `itemid`, `charttime`, `valuenum`, `intime`, `outtime`
## FROM (SELECT * FROM (SELECT *
## FROM (SELECT `stay_id`, `itemid`, `charttime`, `valuenum`
## FROM `chartevents`) `q01`
## WHERE (`itemid` IN (220045.0, 220181.0, 220179.0, 223761.0, 220210.0))) `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `RHS`
## WHERE (`LHS`.`stay_id` = `RHS`.`stay_id`)
## )) `LHS`
## LEFT JOIN (SELECT `stay_id`, `intime`, `outtime`
## FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `q01`) `RHS`
## ON (`LHS`.`stay_id` = `RHS`.`stay_id`)
## ) `q02`
## WHERE ((`charttime` >= `intime`) AND (`charttime` <= `outtime`))
## # Source: lazy query [?? x 6]
## # Database: BigQueryConnection
## stay_id itemid charttime valuenum intime
## <int> <int> <dttm> <dbl> <dttm>
## 1 33215621 220045 2142-09-12 13:04:00 82 2142-09-11 09:04:02
## 2 39381813 220045 2156-07-20 02:00:00 61 2156-07-19 02:36:00
## 3 38119087 220045 2178-05-26 18:00:00 92 2178-05-25 11:20:00
## 4 34067416 220045 2186-12-17 02:00:00 86 2186-12-16 09:43:27
## 5 39620927 220045 2158-06-01 11:00:00 84 2158-05-29 19:58:00
## 6 34183391 220045 2188-09-23 00:52:00 48 2188-09-17 16:27:00
## 7 31271512 220045 2146-03-22 22:00:00 70 2146-03-17 09:22:42
## 8 34756858 220045 2181-11-19 10:00:00 62 2181-11-09 09:15:29
## 9 38563408 220045 2155-02-13 23:00:00 102 2155-02-13 11:13:20
## 10 31637478 220045 2151-11-10 09:00:00 114 2151-11-09 00:01:00
## outtime
## <dttm>
## 1 2142-09-12 18:43:34
## 2 2156-07-21 15:50:42
## 3 2178-06-04 17:11:19
## 4 2186-12-17 21:14:28
## 5 2158-06-02 02:37:35
## 6 2188-09-28 23:17:42
## 7 2146-04-07 17:05:58
## 8 2181-11-19 16:19:12
## 9 2155-02-19 15:35:38
## 10 2151-11-10 15:59:09
## # … with more rows
chartevents_tble <- chartevents_tble %>%
select(stay_id, itemid, charttime) %>%
# sort charttime in ascending order by subject_id and item_id
group_by(stay_id, itemid) %>%
# keep only the first vital measurement by group
summarise_all(min) %>%
left_join(chartevents_tble, by = c("stay_id", "itemid", "charttime")) %>%
# restrict columns and spread label and valuenum
select(-charttime, -intime, -outtime) %>%
# spread valuenum by itemid
pivot_wider(names_from = itemid,
values_from = valuenum,
names_prefix = "chart") %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT `stay_id`, MAX(IF(`itemid` = 223761, `valuenum`, NULL)) AS `chart223761`, MAX(IF(`itemid` = 220179, `valuenum`, NULL)) AS `chart220179`, MAX(IF(`itemid` = 220181, `valuenum`, NULL)) AS `chart220181`, MAX(IF(`itemid` = 220210, `valuenum`, NULL)) AS `chart220210`, MAX(IF(`itemid` = 220045, `valuenum`, NULL)) AS `chart220045`
## FROM (SELECT `stay_id`, `itemid`, `valuenum`
## FROM (SELECT `LHS`.`stay_id` AS `stay_id`, `LHS`.`itemid` AS `itemid`, `LHS`.`charttime` AS `charttime`, `valuenum`, `intime`, `outtime`
## FROM (SELECT `stay_id`, `itemid`, MIN(`charttime`) AS `charttime`
## FROM (SELECT `stay_id`, `itemid`, `charttime`
## FROM (SELECT `LHS`.`stay_id` AS `stay_id`, `itemid`, `charttime`, `valuenum`, `intime`, `outtime`
## FROM (SELECT * FROM (SELECT *
## FROM (SELECT `stay_id`, `itemid`, `charttime`, `valuenum`
## FROM `chartevents`) `q01`
## WHERE (`itemid` IN (220045.0, 220181.0, 220179.0, 223761.0, 220210.0))) `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `RHS`
## WHERE (`LHS`.`stay_id` = `RHS`.`stay_id`)
## )) `LHS`
## LEFT JOIN (SELECT `stay_id`, `intime`, `outtime`
## FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `q01`) `RHS`
## ON (`LHS`.`stay_id` = `RHS`.`stay_id`)
## ) `q02`
## WHERE ((`charttime` >= `intime`) AND (`charttime` <= `outtime`))) `q03`
## GROUP BY `stay_id`, `itemid`) `LHS`
## LEFT JOIN (SELECT *
## FROM (SELECT `LHS`.`stay_id` AS `stay_id`, `itemid`, `charttime`, `valuenum`, `intime`, `outtime`
## FROM (SELECT * FROM (SELECT *
## FROM (SELECT `stay_id`, `itemid`, `charttime`, `valuenum`
## FROM `chartevents`) `q01`
## WHERE (`itemid` IN (220045.0, 220181.0, 220179.0, 223761.0, 220210.0))) `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `RHS`
## WHERE (`LHS`.`stay_id` = `RHS`.`stay_id`)
## )) `LHS`
## LEFT JOIN (SELECT `stay_id`, `intime`, `outtime`
## FROM (SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
## ) `q01`) `RHS`
## ON (`LHS`.`stay_id` = `RHS`.`stay_id`)
## ) `q02`
## WHERE ((`charttime` >= `intime`) AND (`charttime` <= `outtime`))) `RHS`
## ON (`LHS`.`stay_id` = `RHS`.`stay_id` AND `LHS`.`itemid` = `RHS`.`itemid` AND `LHS`.`charttime` = `RHS`.`charttime`)
## ) `q03`) `q04`
## GROUP BY `stay_id`
## # Source: lazy query [?? x 6]
## # Database: BigQueryConnection
## # Groups: stay_id
## stay_id chart223761 chart220179 chart220181 chart220210 chart220045
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 34765738 98.6 97 55 19 72
## 2 31267603 NA NA NA 26 89
## 3 33047448 98.8 124 73 24 79
## 4 33302878 97.9 131 72 10 80
## 5 34796548 98.1 135 75 19 80
## 6 35856543 98 163 112 27 68
## 7 36315389 98.7 133 68 20 77
## 8 31089994 97.3 93 63 29 112
## 9 36570011 97.8 143 90 27 102
## 10 39556306 97.6 81 54 10 70
## # … with more rows
%>%
to perform following data wrangling steps: (i) start with the icustays_tble
for the first ICU stay of each unique patient, (ii) merge in admissions and patients tables, (iii) keep adults only (age at admission >= 18), (iv) merge in the labevents and chartevents tables, (v) create an indicator for 30-day mortality, (vi) save the final tibble to an icu_cohort.rds
R data file in the mimiciv_shiny
folder.Solution: Here is my code. The difference from HW2 is:
dod
instead of deathtime
due to limitations in BigQuery# make a directory mimiciv_shiny
if (!dir.exists("mimiciv_shiny")) {
dir.create("mimiciv_shiny")
}
# (i) start with the `icustays_tble` for the first ICU stay
icu_cohort <- icustays_tble %>%
# (ii) merge in admissions and patients tables
left_join(admissions_tble, by = c("subject_id", "hadm_id")) %>%
left_join(patients_tble, by = c("subject_id")) %>%
# (iii) keep adults only (age at admission >= 18)
mutate(age_hadm = anchor_age + year(admittime) - anchor_year) %>%
filter(age_hadm >= 18) %>%
# (iv) merge in the labevents and chartevents tables
left_join(labevents_tble, by = c("subject_id")) %>%
left_join(chartevents_tble, by = c("stay_id")) %>%
# (v) create an indicator for 30-day mortality
mutate(thirty_day_mort =
ifelse(is.na(dod), FALSE,
DATE_DIFF(date(dod), date(admittime), DAY) <= 30)) %>%
# (vi) save the final tibble to an `icu_cohort.rds` in `mimiciv_shiny`
collect() %>%
write_rds("mimiciv_shiny/icu_cohort.rds") %>%
# show_query() %>%
print(width = Inf)
## # A tibble: 53,150 × 43
## subject_id intime hadm_id stay_id first_careunit
## <int> <dttm> <int> <int> <chr>
## 1 12776735 2200-07-12 00:33:00 20817525 34547665 Neuro Stepdown
## 2 16256226 2150-12-20 16:09:08 20013290 39289362 Neuro Stepdown
## 3 12974563 2138-11-13 23:30:01 29618057 32563675 Neuro Stepdown
## 4 14609218 2174-06-28 21:13:00 20606189 34947848 Neuro Stepdown
## 5 12687112 2162-05-31 18:08:45 26132667 37445058 Neuro Stepdown
## 6 18190935 2115-11-12 23:23:32 21137829 34338402 Neuro Stepdown
## 7 10404210 2161-03-03 18:30:00 22880512 30254828 Neuro Stepdown
## 8 12552973 2135-09-21 18:12:03 21320643 30261549 Neuro Stepdown
## 9 14303051 2175-11-20 01:02:00 26559537 30401486 Neuro Stepdown
## 10 12347959 2140-07-11 17:54:14 22779073 30503984 Neuro Stepdown
## last_careunit outtime los admittime
## <chr> <dttm> <dbl> <dttm>
## 1 Neuro Stepdown 2200-07-13 16:44:40 1.67 2200-07-11 22:46:00
## 2 Neuro Stepdown 2150-12-21 14:58:40 0.951 2150-12-20 03:00:00
## 3 Neuro Stepdown 2138-11-15 16:25:19 1.71 2138-11-13 01:07:00
## 4 Neuro Stepdown 2174-07-05 17:01:32 6.83 2174-06-28 20:40:00
## 5 Neuro Stepdown 2162-06-04 10:16:13 3.67 2162-05-31 15:36:00
## 6 Neuro Stepdown 2115-11-15 18:06:26 2.78 2115-11-02 18:38:00
## 7 Neuro Stepdown 2161-03-05 18:10:31 1.99 2161-03-03 17:45:00
## 8 Neuro Stepdown 2135-09-23 12:56:46 1.78 2135-09-21 02:12:00
## 9 Neuro Stepdown 2175-11-21 16:06:17 1.63 2175-11-20 00:08:00
## 10 Neuro Stepdown 2140-07-19 13:01:05 7.80 2140-07-11 17:52:00
## dischtime deathtime admission_type
## <dttm> <dttm> <chr>
## 1 2200-07-19 12:00:00 NA OBSERVATION ADMIT
## 2 2150-12-21 14:50:00 NA SURGICAL SAME DAY ADMISSION
## 3 2138-11-15 15:53:00 NA OBSERVATION ADMIT
## 4 2174-07-05 16:45:00 NA OBSERVATION ADMIT
## 5 2162-06-04 10:16:00 NA OBSERVATION ADMIT
## 6 2115-11-26 14:30:00 NA OBSERVATION ADMIT
## 7 2161-03-05 18:10:00 NA OBSERVATION ADMIT
## 8 2135-09-23 12:56:00 NA SURGICAL SAME DAY ADMISSION
## 9 2175-11-21 15:30:00 NA OBSERVATION ADMIT
## 10 2140-07-19 13:00:00 NA URGENT
## admission_location discharge_location insurance language
## <chr> <chr> <chr> <chr>
## 1 EMERGENCY ROOM SKILLED NURSING FACILITY Medicare ENGLISH
## 2 PHYSICIAN REFERRAL HOME Other ENGLISH
## 3 EMERGENCY ROOM HOME HEALTH CARE Other ENGLISH
## 4 EMERGENCY ROOM REHAB Other ENGLISH
## 5 PHYSICIAN REFERRAL HOME Other ENGLISH
## 6 TRANSFER FROM HOSPITAL CHRONIC/LONG TERM ACUTE CARE Other ENGLISH
## 7 EMERGENCY ROOM HOME Other ENGLISH
## 8 PHYSICIAN REFERRAL HOME Other ENGLISH
## 9 EMERGENCY ROOM HOME Other ENGLISH
## 10 TRANSFER FROM HOSPITAL REHAB Other ENGLISH
## marital_status ethnicity edregtime edouttime
## <chr> <chr> <dttm> <dttm>
## 1 MARRIED OTHER 2200-07-11 15:27:00 2200-07-12 00:33:00
## 2 SINGLE OTHER NA NA
## 3 WIDOWED WHITE 2138-11-12 17:26:00 2138-11-13 20:21:00
## 4 SINGLE WHITE 2174-06-28 18:46:00 2174-06-28 21:13:00
## 5 SINGLE BLACK/AFRICAN AMERICAN 2162-05-31 09:36:00 2162-05-31 19:59:00
## 6 DIVORCED WHITE 2115-11-02 13:59:00 2115-11-02 22:17:00
## 7 MARRIED HISPANIC/LATINO 2161-03-03 13:39:00 2161-03-03 18:30:00
## 8 MARRIED WHITE NA NA
## 9 SINGLE WHITE 2175-11-19 10:42:00 2175-11-20 01:02:00
## 10 MARRIED WHITE NA NA
## hospital_expire_flag gender anchor_age anchor_year anchor_year_group
## <int> <chr> <int> <int> <chr>
## 1 0 M 72 2192 2008 - 2010
## 2 0 F 49 2150 2017 - 2019
## 3 0 F 72 2138 2014 - 2016
## 4 0 F 69 2174 2017 - 2019
## 5 0 M 57 2156 2011 - 2013
## 6 0 F 57 2115 2017 - 2019
## 7 0 F 46 2161 2017 - 2019
## 8 0 F 48 2131 2011 - 2013
## 9 0 F 24 2175 2014 - 2016
## 10 0 F 49 2140 2017 - 2019
## dod age_hadm lab50960 lab51221 lab50971 lab50983 lab51301 lab50882
## <date> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NA 80 1.5 30.6 3.3 130 7.8 22
## 2 NA 49 1.8 30.5 4 141 3.5 21
## 3 NA 72 1.9 41.6 4.4 139 10.6 16
## 4 NA 69 1.7 36 4.6 134 12.1 23
## 5 NA 63 1.4 30 5 143 5.4 19
## 6 2115-12-11 57 1.6 31.5 4.3 138 9.2 26
## 7 NA 46 2.2 34.1 4.1 138 3.6 26
## 8 NA 52 1.9 37.1 4 145 14.7 22
## 9 NA 24 2 34.7 4.3 139 7.1 24
## 10 NA 49 1.8 39.6 3.8 137 20.1 20
## lab50893 lab50902 lab50912 lab50931 chart223761 chart220179 chart220181
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8.2 97 1 113 102. 98 86
## 2 8.1 108 0.5 99 98.4 111 72
## 3 8.9 104 1 78 98.7 183 107
## 4 9.2 98 0.5 106 98.3 162 99
## 5 8.4 105 8.2 127 98 165 95
## 6 9.2 98 0.4 135 98.6 132 97
## 7 9 104 0.8 99 98.4 114 79
## 8 8.3 106 0.8 157 98.4 144 99
## 9 9.2 105 0.8 85 98.6 120 85
## 10 9.6 101 0.7 98 99.5 140 117
## chart220210 chart220045 thirty_day_mort
## <dbl> <dbl> <lgl>
## 1 16 78 FALSE
## 2 5 91 FALSE
## 3 16 67 FALSE
## 4 12 74 FALSE
## 5 26 95 FALSE
## 6 23 96 FALSE
## 7 18 89 FALSE
## 8 23 71 FALSE
## 9 17 67 FALSE
## 10 12 72 FALSE
## # … with 53,140 more rows
# check the number of patients who died within 30 days
icu_cohort %>%
select(admittime, dod, thirty_day_mort) %>%
filter(thirty_day_mort == TRUE) %>%
summarise(n = n()) %>%
print(width = Inf)
## # A tibble: 1 × 1
## n
## <int>
## 1 5436
Comment: I got 5,436 patients who died withn 30 days. If changing age_hadm >= 18
to > 18
, the number will be 5,435.
Close database connection and clear workspace.
dbDisconnect(con)
rm(list = ls())
Develop a Shiny app for exploring the ICU cohort data created in Q1. The app should reside in the mimiciv_shiny
folder. The app should provide easy access to the graphical and numerical summaries of variables (demographics, lab measurements, vitals) in the ICU cohort.
Solution: My app (app.R) was saved in the mimiciv_shiny
folder. I made two types of summaries: using 30-day mortality as an indicator (like HW2) and not. For numerical variables, I removed extreme values to make the graphs readable.