Display machine information for reproducibility:
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.29 R6_2.5.1 jsonlite_1.7.2 magrittr_2.0.1
## [5] evaluate_0.14 stringi_1.7.6 rlang_1.0.1 cli_3.1.0
## [9] rstudioapi_0.13 jquerylib_0.1.4 bslib_0.3.1 rmarkdown_2.11
## [13] tools_3.6.0 stringr_1.4.0 xfun_0.29 yaml_2.2.1
## [17] fastmap_1.1.0 compiler_3.6.0 htmltools_0.5.2 knitr_1.37
## [21] sass_0.4.0
knitr::opts_chunk$set(echo = TRUE, cache = TRUE, cache.lazy = FALSE)
library(tidyverse)
library(data.table)
library(lubridate)
os <- sessionInfo()$running
if (str_detect(os, "Linux")) {
mimic_path <- "/mnt/mimiciv/1.0"
} else if (str_detect(os, "macOS")) {
mimic_path <- "/Users/tomokiokuno/mimic-iv-1.0"
}
In this exercise, we use tidyverse (ggpot2, dplyr, etc) to explore the MIMIC-IV data introduced in homework 1 and to build a cohort of ICU stays.
# tree -s -L 2 /Users/huazhou/Documents/Box\ Sync/MIMIC/mimic-iv-1.0
system(str_c("tree -s -L 2 ", shQuote(mimic_path)), intern = TRUE)
<<<<<<< HEAD
## [1] "/mnt/mimiciv/1.0"
## [2] "├── [ 2356] biostat-203b-2022winter-3fdc2392ac39.json"
## [3] "├── [ 4096] core"
## [4] "│  ├── [ 17208966] admissions.csv.gz"
## [5] "│  ├── [ 606] index.html"
## [6] "│  ├── [ 2955582] patients.csv.gz"
## [7] "│  └── [ 53014503] transfers.csv.gz"
## [8] "├── [ 4096] hosp"
## [9] "│  ├── [ 430049] d_hcpcs.csv.gz"
## [10] "│  ├── [ 29531152] diagnoses_icd.csv.gz"
## [11] "│  ├── [ 863239] d_icd_diagnoses.csv.gz"
## [12] "│  ├── [ 579998] d_icd_procedures.csv.gz"
## [13] "│  ├── [ 14898] d_labitems.csv.gz"
## [14] "│  ├── [ 11684062] drgcodes.csv.gz"
## [15] "│  ├── [ 515763427] emar.csv.gz"
## [16] "│  ├── [ 476252563] emar_detail.csv.gz"
## [17] "│  ├── [ 2098831] hcpcsevents.csv.gz"
## [18] "│  ├── [ 2325] index.html"
## [19] "│  ├── [ 2091865786] labevents.csv.gz"
## [20] "│  ├── [ 171624288] labevents_filtered_itemid.csv.gz"
## [21] "│  ├── [ 99133381] microbiologyevents.csv.gz"
## [22] "│  ├── [ 422874088] pharmacy.csv.gz"
## [23] "│  ├── [ 501381155] poe.csv.gz"
## [24] "│  ├── [ 24020923] poe_detail.csv.gz"
## [25] "│  ├── [ 367041717] prescriptions.csv.gz"
## [26] "│  ├── [ 7750325] procedures_icd.csv.gz"
## [27] "│  └── [ 9565293] services.csv.gz"
## [28] "├── [ 4096] icu"
## [29] "│  ├── [ 2350783547] chartevents.csv.gz"
## [30] "│  ├── [ 110272408] chartevents_filtered_itemid.csv.gz"
## [31] "│  ├── [ 43296273] datetimeevents.csv.gz"
## [32] "│  ├── [ 55917] d_items.csv.gz"
## [33] "│  ├── [ 2848628] icustays.csv.gz"
## [34] "│  ├── [ 1103] index.html"
## [35] "│  ├── [ 352443512] inputevents.csv.gz"
## [36] "│  ├── [ 37095672] outputevents.csv.gz"
## [37] "│  └── [ 20567368] procedureevents.csv.gz"
## [38] "├── [ 797] index.html"
## [39] "├── [ 2518] LICENSE.txt"
## [40] "└── [ 2459] SHA256SUMS.txt"
## [41] ""
## [42] "3 directories, 36 files"
=======
## [1] "/mnt/mimiciv/1.0"
## [2] "├── [ 2356] biostat-203b-2022winter-3fdc2392ac39.json"
## [3] "├── [ 439] client_secret_34349235653-sidv2drmetcimi1fba4tj024aqth8qmr.apps.googleusercontent.com.json"
## [4] "├── [ 4096] core"
## [5] "│  ├── [ 17208966] admissions.csv.gz"
## [6] "│  ├── [ 606] index.html"
## [7] "│  ├── [ 2955582] patients.csv.gz"
## [8] "│  └── [ 53014503] transfers.csv.gz"
## [9] "├── [ 4096] hosp"
## [10] "│  ├── [ 430049] d_hcpcs.csv.gz"
## [11] "│  ├── [ 29531152] diagnoses_icd.csv.gz"
## [12] "│  ├── [ 863239] d_icd_diagnoses.csv.gz"
## [13] "│  ├── [ 579998] d_icd_procedures.csv.gz"
## [14] "│  ├── [ 14898] d_labitems.csv.gz"
## [15] "│  ├── [ 11684062] drgcodes.csv.gz"
## [16] "│  ├── [ 515763427] emar.csv.gz"
## [17] "│  ├── [ 476252563] emar_detail.csv.gz"
## [18] "│  ├── [ 2098831] hcpcsevents.csv.gz"
## [19] "│  ├── [ 2325] index.html"
## [20] "│  ├── [ 2091865786] labevents.csv.gz"
## [21] "│  ├── [ 171624288] labevents_filtered_itemid.csv.gz"
## [22] "│  ├── [ 99133381] microbiologyevents.csv.gz"
## [23] "│  ├── [ 422874088] pharmacy.csv.gz"
## [24] "│  ├── [ 501381155] poe.csv.gz"
## [25] "│  ├── [ 24020923] poe_detail.csv.gz"
## [26] "│  ├── [ 367041717] prescriptions.csv.gz"
## [27] "│  ├── [ 7750325] procedures_icd.csv.gz"
## [28] "│  └── [ 9565293] services.csv.gz"
## [29] "├── [ 4096] icu"
## [30] "│  ├── [ 2350783547] chartevents.csv.gz"
## [31] "│  ├── [ 110272408] chartevents_filtered_itemid.csv.gz"
## [32] "│  ├── [ 43296273] datetimeevents.csv.gz"
## [33] "│  ├── [ 55917] d_items.csv.gz"
## [34] "│  ├── [ 2848628] icustays.csv.gz"
## [35] "│  ├── [ 1103] index.html"
## [36] "│  ├── [ 352443512] inputevents.csv.gz"
## [37] "│  ├── [ 37095672] outputevents.csv.gz"
## [38] "│  └── [ 20567368] procedureevents.csv.gz"
## [39] "├── [ 797] index.html"
## [40] "├── [ 2518] LICENSE.txt"
## [41] "└── [ 2459] SHA256SUMS.txt"
## [42] ""
## [43] "3 directories, 37 files"
>>>>>>> develop
read.csv
(base R) vs read_csv
(tidyverse) vs fread
(data.table)There are quite a few utilities in R for reading plain text data files. Let us test the speed of reading a moderate sized compressed csv file, admissions.csv.gz
, by three programs: read.csv
in base R, read_csv
in tidyverse, and fread
in the popular data.table package.
Which function is fastest? Is there difference in the (default) parsed data types? (Hint: R function system.time
measures run times.)
For later questions, we stick to the tidyverse.
Solution: I used system.time
to measure run times for the three functions. Looking at user time, CPU time charged for the execution of user instructions, fread
was the fastest function.
system.time(tmp_r <- read.csv(str_c(mimic_path,"/core/admissions.csv.gz")))
## user system elapsed
<<<<<<< HEAD
## 38.665 0.178 38.848
=======
## 39.562 0.210 39.781
>>>>>>> develop
system.time(tmp_t <- read_csv(str_c(mimic_path,"/core/admissions.csv.gz")))
## Rows: 523740 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): admission_type, admission_location, discharge_location, insurance,...
## dbl (3): subject_id, hadm_id, hospital_expire_flag
## dttm (5): admittime, dischtime, deathtime, edregtime, edouttime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## user system elapsed
<<<<<<< HEAD
## 3.216 0.143 2.924
system.time(tmp_f <- fread(str_c(mimic_path,"/core/admissions.csv.gz")))
## user system elapsed
## 1.516 0.112 0.964
=======
## 3.289 0.159 2.457
system.time(tmp_f <- fread(str_c(mimic_path,"/core/admissions.csv.gz")))
## user system elapsed
## 1.206 0.107 1.041
>>>>>>> develop
Solution: Next, I used str
to confirm data types for the three functions. Obviously, there are differences between them. For instance, both read.csv
and fread
read subject_id
as integer
while read_csv
read this as double (numeric)
. Also, both read_csv
and fread
imported the time as POSIXct
, but read.csv
imported as charactor
.
str(tmp_r)
## 'data.frame': 523740 obs. of 15 variables:
## $ subject_id : int 14679932 15585972 11989120 17817079 15078341 19124609 17301855 17991012 16865435 13693648 ...
## $ hadm_id : int 21038362 24941086 21965160 24709883 23272159 20517215 29732723 24298836 23216961 21640725 ...
## $ admittime : Factor w/ 514435 levels "2105-10-04 17:26:00",..: 162283 62357 210124 330971 56261 351331 166789 430755 456181 3345 ...
## $ dischtime : Factor w/ 507789 levels "2105-10-12 11:11:00",..: 160158 61694 207314 326516 55632 346651 164592 424886 449948 3316 ...
## $ deathtime : Factor w/ 9337 levels "","2110-01-25 09:40:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ admission_type : Factor w/ 9 levels "AMBULATORY OBSERVATION",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ admission_location : Factor w/ 12 levels "","AMBULATORY SURGERY TRANSFER",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ discharge_location : Factor w/ 14 levels "","ACUTE HOSPITAL",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ insurance : Factor w/ 3 levels "Medicaid","Medicare",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ language : Factor w/ 2 levels "?","ENGLISH": 2 2 2 2 2 2 2 2 2 2 ...
## $ marital_status : Factor w/ 5 levels "","DIVORCED",..: 4 1 1 1 1 1 1 1 1 1 ...
## $ ethnicity : Factor w/ 8 levels "AMERICAN INDIAN/ALASKA NATIVE",..: 7 8 7 5 3 7 8 8 8 8 ...
## $ edregtime : Factor w/ 310256 levels "","2109-08-31 02:46:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ edouttime : Factor w/ 310322 levels "","2109-08-31 07:51:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ hospital_expire_flag: int 0 0 0 0 0 0 0 0 0 0 ...
str(tmp_t)
## spec_tbl_df [523,740 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ subject_id : num [1:523740] 14679932 15585972 11989120 17817079 15078341 ...
## $ hadm_id : num [1:523740] 21038362 24941086 21965160 24709883 23272159 ...
## $ admittime : POSIXct[1:523740], format: "2139-09-26 14:16:00" "2123-10-07 23:56:00" ...
## $ dischtime : POSIXct[1:523740], format: "2139-09-28 11:30:00" "2123-10-12 11:22:00" ...
## $ deathtime : POSIXct[1:523740], format: NA NA ...
## $ admission_type : chr [1:523740] "ELECTIVE" "ELECTIVE" "ELECTIVE" "ELECTIVE" ...
## $ admission_location : chr [1:523740] NA NA NA NA ...
## $ discharge_location : chr [1:523740] "HOME" "HOME" "HOME" "HOME" ...
## $ insurance : chr [1:523740] "Other" "Other" "Other" "Other" ...
## $ language : chr [1:523740] "ENGLISH" "ENGLISH" "ENGLISH" "ENGLISH" ...
## $ marital_status : chr [1:523740] "SINGLE" NA NA NA ...
## $ ethnicity : chr [1:523740] "UNKNOWN" "WHITE" "UNKNOWN" "OTHER" ...
## $ edregtime : POSIXct[1:523740], format: NA NA ...
## $ edouttime : POSIXct[1:523740], format: NA NA ...
## $ hospital_expire_flag: num [1:523740] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. subject_id = col_double(),
## .. hadm_id = col_double(),
## .. admittime = col_datetime(format = ""),
## .. dischtime = col_datetime(format = ""),
## .. deathtime = col_datetime(format = ""),
## .. admission_type = col_character(),
## .. admission_location = col_character(),
## .. discharge_location = col_character(),
## .. insurance = col_character(),
## .. language = col_character(),
## .. marital_status = col_character(),
## .. ethnicity = col_character(),
## .. edregtime = col_datetime(format = ""),
## .. edouttime = col_datetime(format = ""),
## .. hospital_expire_flag = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(tmp_f)
## Classes 'data.table' and 'data.frame': 523740 obs. of 15 variables:
## $ subject_id : int 14679932 15585972 11989120 17817079 15078341 19124609 17301855 17991012 16865435 13693648 ...
## $ hadm_id : int 21038362 24941086 21965160 24709883 23272159 20517215 29732723 24298836 23216961 21640725 ...
## $ admittime : POSIXct, format: "2139-09-26 14:16:00" "2123-10-07 23:56:00" ...
## $ dischtime : POSIXct, format: "2139-09-28 11:30:00" "2123-10-12 11:22:00" ...
## $ deathtime : POSIXct, format: NA NA ...
## $ admission_type : chr "ELECTIVE" "ELECTIVE" "ELECTIVE" "ELECTIVE" ...
## $ admission_location : chr "" "" "" "" ...
## $ discharge_location : chr "HOME" "HOME" "HOME" "HOME" ...
## $ insurance : chr "Other" "Other" "Other" "Other" ...
## $ language : chr "ENGLISH" "ENGLISH" "ENGLISH" "ENGLISH" ...
## $ marital_status : chr "SINGLE" "" "" "" ...
## $ ethnicity : chr "UNKNOWN" "WHITE" "UNKNOWN" "OTHER" ...
## $ edregtime : POSIXct, format: NA NA ...
## $ edouttime : POSIXct, format: NA NA ...
## $ hospital_expire_flag: int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
icustays.csv.gz
(https://mimic.mit.edu/docs/iv/modules/icu/icustays/) contains data about Intensive Care Units (ICU) stays. The first 10 lines are
system(
str_c(
"zcat < ",
shQuote(str_c(mimic_path, "/icu/icustays.csv.gz")),
" | head"
),
intern = TRUE
)
## [1] "subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los"
## [2] "17867402,24528534,31793211,Trauma SICU (TSICU),Trauma SICU (TSICU),2154-03-03 04:11:00,2154-03-04 18:16:56,1.5874537037037035"
## [3] "14435996,28960964,31983544,Trauma SICU (TSICU),Trauma SICU (TSICU),2150-06-19 17:57:00,2150-06-22 18:33:54,3.025625"
## [4] "17609946,27385897,33183475,Trauma SICU (TSICU),Trauma SICU (TSICU),2138-02-05 18:54:00,2138-02-15 12:42:05,9.741724537037038"
## [5] "18966770,23483021,34131444,Trauma SICU (TSICU),Trauma SICU (TSICU),2123-10-25 10:35:00,2123-10-25 18:59:47,0.35054398148148147"
## [6] "12776735,20817525,34547665,Neuro Stepdown,Neuro Stepdown,2200-07-12 00:33:00,2200-07-13 16:44:40,1.6747685185185184"
## [7] "10215159,24283593,34569476,Trauma SICU (TSICU),Trauma SICU (TSICU),2124-09-20 15:05:29,2124-09-21 22:06:58,1.2926967592592593"
## [8] "14489052,26516390,35056286,Trauma SICU (TSICU),Trauma SICU (TSICU),2118-10-26 10:33:56,2118-10-26 20:28:10,0.4126620370370371"
## [9] "15914763,28906020,36909804,Trauma SICU (TSICU),Trauma SICU (TSICU),2176-12-14 12:00:00,2176-12-17 11:47:01,2.9909837962962964"
## [10] "16256226,20013290,39289362,Neuro Stepdown,Neuro Stepdown,2150-12-20 16:09:08,2150-12-21 14:58:40,0.9510648148148149"
icustatys.csv.gz
as a tibble icustays_tble
.Solution: I used read_csv
to import this file as a tibble and sorted it by subject_id
and hadm_id
.
# imported as a master file for work efficiency
icustays_master <- read_csv(str_c(mimic_path, "/icu/icustays.csv.gz"))
## Rows: 76540 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): first_careunit, last_careunit
## dbl (4): subject_id, hadm_id, stay_id, los
## dttm (2): intime, outtime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
icustays_tble <- icustays_master %>%
arrange(subject_id, hadm_id) %>%
print(width = Inf)
## # A tibble: 76,540 × 8
## subject_id hadm_id stay_id first_careunit
## <dbl> <dbl> <dbl> <chr>
## 1 10000032 29079034 39553978 Medical Intensive Care Unit (MICU)
## 2 10000980 26913865 39765666 Medical Intensive Care Unit (MICU)
## 3 10001217 24597018 37067082 Surgical Intensive Care Unit (SICU)
## 4 10001217 27703517 34592300 Surgical Intensive Care Unit (SICU)
## 5 10001725 25563031 31205490 Medical/Surgical Intensive Care Unit (MICU/SICU)
## 6 10001884 26184834 37510196 Medical Intensive Care Unit (MICU)
## 7 10002013 23581541 39060235 Cardiac Vascular Intensive Care Unit (CVICU)
## 8 10002155 20345487 32358465 Medical Intensive Care Unit (MICU)
## 9 10002155 23822395 33685454 Coronary Care Unit (CCU)
## 10 10002155 28994087 31090461 Medical/Surgical Intensive Care Unit (MICU/SICU)
## last_careunit intime
## <chr> <dttm>
## 1 Medical Intensive Care Unit (MICU) 2180-07-23 14:00:00
## 2 Medical Intensive Care Unit (MICU) 2189-06-27 08:42:00
## 3 Surgical Intensive Care Unit (SICU) 2157-11-20 19:18:02
## 4 Surgical Intensive Care Unit (SICU) 2157-12-19 15:42:24
## 5 Medical/Surgical Intensive Care Unit (MICU/SICU) 2110-04-11 15:52:22
## 6 Medical Intensive Care Unit (MICU) 2131-01-11 04:20:05
## 7 Cardiac Vascular Intensive Care Unit (CVICU) 2160-05-18 10:00:53
## 8 Medical Intensive Care Unit (MICU) 2131-03-09 21:33:00
## 9 Coronary Care Unit (CCU) 2129-08-04 12:45:00
## 10 Medical/Surgical Intensive Care Unit (MICU/SICU) 2130-09-24 00:50:00
## outtime los
## <dttm> <dbl>
## 1 2180-07-23 23:50:47 0.410
## 2 2189-06-27 20:38:27 0.498
## 3 2157-11-21 22:08:00 1.12
## 4 2157-12-20 14:27:41 0.948
## 5 2110-04-12 23:59:56 1.34
## 6 2131-01-20 08:27:30 9.17
## 7 2160-05-19 17:33:33 1.31
## 8 2131-03-10 18:09:21 0.859
## 9 2129-08-10 17:02:38 6.18
## 10 2130-09-27 22:13:41 3.89
## # … with 76,530 more rows
subject_id
? Can a subject_id
have multiple ICU stays?Solution: Aggregation of icustays_tble
by subject_id
yielded 53,150 unique subject_id
’s, which is less than the total number of rows (76,540). Thus, subject_id
can have multiple ICU stays.
nrow(distinct(icustays_tble, subject_id))
## [1] 53150
subject_id
, let’s only keep the first ICU stay in the tibble icustays_tble
.Solution: I used arrange
to sort intime
in ascending order, and then removed the duplicates subject_id
. This method should be faster than using filter
and min
. As a result, the new table has 53,150 rows.
icustays_tble <- icustays_tble %>%
arrange(subject_id, intime) %>%
distinct(subject_id, .keep_all = TRUE) %>%
print(width = Inf)
## # A tibble: 53,150 × 8
## subject_id hadm_id stay_id first_careunit
## <dbl> <dbl> <dbl> <chr>
## 1 10000032 29079034 39553978 Medical Intensive Care Unit (MICU)
## 2 10000980 26913865 39765666 Medical Intensive Care Unit (MICU)
## 3 10001217 24597018 37067082 Surgical Intensive Care Unit (SICU)
## 4 10001725 25563031 31205490 Medical/Surgical Intensive Care Unit (MICU/SICU)
## 5 10001884 26184834 37510196 Medical Intensive Care Unit (MICU)
## 6 10002013 23581541 39060235 Cardiac Vascular Intensive Care Unit (CVICU)
## 7 10002155 23822395 33685454 Coronary Care Unit (CCU)
## 8 10002223 22494570 39638202 Trauma SICU (TSICU)
## 9 10002348 22725460 32610785 Neuro Intermediate
## 10 10002428 28662225 33987268 Medical Intensive Care Unit (MICU)
## last_careunit intime
## <chr> <dttm>
## 1 Medical Intensive Care Unit (MICU) 2180-07-23 14:00:00
## 2 Medical Intensive Care Unit (MICU) 2189-06-27 08:42:00
## 3 Surgical Intensive Care Unit (SICU) 2157-11-20 19:18:02
## 4 Medical/Surgical Intensive Care Unit (MICU/SICU) 2110-04-11 15:52:22
## 5 Medical Intensive Care Unit (MICU) 2131-01-11 04:20:05
## 6 Cardiac Vascular Intensive Care Unit (CVICU) 2160-05-18 10:00:53
## 7 Coronary Care Unit (CCU) 2129-08-04 12:45:00
## 8 Trauma SICU (TSICU) 2158-01-15 08:01:49
## 9 Neuro Intermediate 2112-11-30 23:24:00
## 10 Medical Intensive Care Unit (MICU) 2156-04-12 16:24:18
## outtime los
## <dttm> <dbl>
## 1 2180-07-23 23:50:47 0.410
## 2 2189-06-27 20:38:27 0.498
## 3 2157-11-21 22:08:00 1.12
## 4 2110-04-12 23:59:56 1.34
## 5 2131-01-20 08:27:30 9.17
## 6 2160-05-19 17:33:33 1.31
## 7 2129-08-10 17:02:38 6.18
## 8 2158-01-16 15:19:24 1.30
## 9 2112-12-10 18:25:13 9.79
## 10 2156-04-17 15:57:08 4.98
## # … with 53,140 more rows
admission
dataInformation of the patients admitted into hospital is available in admissions.csv.gz
. See https://mimic.mit.edu/docs/iv/modules/core/admissions/ for details of each field in this file. The first 10 lines are
system(
str_c(
"zcat < ",
shQuote(str_c(mimic_path, "/core/admissions.csv.gz")),
" | head"
),
intern = TRUE
)
## [1] "subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag"
## [2] "14679932,21038362,2139-09-26 14:16:00,2139-09-28 11:30:00,,ELECTIVE,,HOME,Other,ENGLISH,SINGLE,UNKNOWN,,,0"
## [3] "15585972,24941086,2123-10-07 23:56:00,2123-10-12 11:22:00,,ELECTIVE,,HOME,Other,ENGLISH,,WHITE,,,0"
## [4] "11989120,21965160,2147-01-14 09:00:00,2147-01-17 14:25:00,,ELECTIVE,,HOME,Other,ENGLISH,,UNKNOWN,,,0"
## [5] "17817079,24709883,2165-12-27 17:33:00,2165-12-31 21:18:00,,ELECTIVE,,HOME,Other,ENGLISH,,OTHER,,,0"
## [6] "15078341,23272159,2122-08-28 08:48:00,2122-08-30 12:32:00,,ELECTIVE,,HOME,Other,ENGLISH,,BLACK/AFRICAN AMERICAN,,,0"
## [7] "19124609,20517215,2169-03-14 12:44:00,2169-03-20 19:15:00,,ELECTIVE,,HOME,Other,ENGLISH,,UNKNOWN,,,0"
## [8] "17301855,29732723,2140-06-06 14:23:00,2140-06-08 14:25:00,,ELECTIVE,,HOME,Other,ENGLISH,,WHITE,,,0"
## [9] "17991012,24298836,2181-07-10 20:28:00,2181-07-12 15:49:00,,ELECTIVE,,HOME,Other,ENGLISH,,WHITE,,,0"
## [10] "16865435,23216961,2185-07-19 02:12:00,2185-07-21 11:50:00,,ELECTIVE,,HOME,Other,ENGLISH,,WHITE,,,0"
admissions.csv.gz
as a tibble admissions_tble
.Solution: I used read_csv
to import this file as a tibble and obtained a 523,740 × 15 table.
admissions_master <- read_csv(str_c(mimic_path,"/core/admissions.csv.gz"))
## Rows: 523740 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): admission_type, admission_location, discharge_location, insurance,...
## dbl (3): subject_id, hadm_id, hospital_expire_flag
## dttm (5): admittime, dischtime, deathtime, edregtime, edouttime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
admissions_tble <- admissions_master %>%
print(width = Inf)
## # A tibble: 523,740 × 15
## subject_id hadm_id admittime dischtime
## <dbl> <dbl> <dttm> <dttm>
## 1 14679932 21038362 2139-09-26 14:16:00 2139-09-28 11:30:00
## 2 15585972 24941086 2123-10-07 23:56:00 2123-10-12 11:22:00
## 3 11989120 21965160 2147-01-14 09:00:00 2147-01-17 14:25:00
## 4 17817079 24709883 2165-12-27 17:33:00 2165-12-31 21:18:00
## 5 15078341 23272159 2122-08-28 08:48:00 2122-08-30 12:32:00
## 6 19124609 20517215 2169-03-14 12:44:00 2169-03-20 19:15:00
## 7 17301855 29732723 2140-06-06 14:23:00 2140-06-08 14:25:00
## 8 17991012 24298836 2181-07-10 20:28:00 2181-07-12 15:49:00
## 9 16865435 23216961 2185-07-19 02:12:00 2185-07-21 11:50:00
## 10 13693648 21640725 2111-01-30 23:43:00 2111-02-02 13:03:00
## deathtime admission_type admission_location discharge_location
## <dttm> <chr> <chr> <chr>
## 1 NA ELECTIVE <NA> HOME
## 2 NA ELECTIVE <NA> HOME
## 3 NA ELECTIVE <NA> HOME
## 4 NA ELECTIVE <NA> HOME
## 5 NA ELECTIVE <NA> HOME
## 6 NA ELECTIVE <NA> HOME
## 7 NA ELECTIVE <NA> HOME
## 8 NA ELECTIVE <NA> HOME
## 9 NA ELECTIVE <NA> HOME
## 10 NA ELECTIVE <NA> HOME
## insurance language marital_status ethnicity edregtime
## <chr> <chr> <chr> <chr> <dttm>
## 1 Other ENGLISH SINGLE UNKNOWN NA
## 2 Other ENGLISH <NA> WHITE NA
## 3 Other ENGLISH <NA> UNKNOWN NA
## 4 Other ENGLISH <NA> OTHER NA
## 5 Other ENGLISH <NA> BLACK/AFRICAN AMERICAN NA
## 6 Other ENGLISH <NA> UNKNOWN NA
## 7 Other ENGLISH <NA> WHITE NA
## 8 Other ENGLISH <NA> WHITE NA
## 9 Other ENGLISH <NA> WHITE NA
## 10 Other ENGLISH <NA> WHITE NA
## edouttime hospital_expire_flag
## <dttm> <dbl>
## 1 NA 0
## 2 NA 0
## 3 NA 0
## 4 NA 0
## 5 NA 0
## 6 NA 0
## 7 NA 0
## 8 NA 0
## 9 NA 0
## 10 NA 0
## # … with 523,730 more rows
icustays_tble
according to subject_id
and hadm_id
.Solution: I used semi_join
to keep only the rows of admissions_tble
with the same subject_id
and hadm_id
as icustays_tble
.
admissions_tble <- admissions_tble %>%
arrange(subject_id, hadm_id) %>%
semi_join(icustays_tble, by = c("subject_id", "hadm_id")) %>%
print(width = Inf)
## # A tibble: 53,150 × 15
## subject_id hadm_id admittime dischtime
## <dbl> <dbl> <dttm> <dttm>
## 1 10000032 29079034 2180-07-23 12:35:00 2180-07-25 17:55:00
## 2 10000980 26913865 2189-06-27 07:38:00 2189-07-03 03:00:00
## 3 10001217 24597018 2157-11-18 22:56:00 2157-11-25 18:00:00
## 4 10001725 25563031 2110-04-11 15:08:00 2110-04-14 15:00:00
## 5 10001884 26184834 2131-01-07 20:39:00 2131-01-20 05:15:00
## 6 10002013 23581541 2160-05-18 07:45:00 2160-05-23 13:30:00
## 7 10002155 23822395 2129-08-04 12:44:00 2129-08-18 16:53:00
## 8 10002223 22494570 2158-01-15 08:00:00 2158-01-20 19:29:00
## 9 10002348 22725460 2112-11-30 22:22:00 2112-12-10 17:56:00
## 10 10002428 28662225 2156-04-12 14:16:00 2156-04-29 16:26:00
## deathtime admission_type admission_location
## <dttm> <chr> <chr>
## 1 NA EW EMER. EMERGENCY ROOM
## 2 NA EW EMER. EMERGENCY ROOM
## 3 NA EW EMER. EMERGENCY ROOM
## 4 NA EW EMER. PACU
## 5 2131-01-20 05:15:00 OBSERVATION ADMIT EMERGENCY ROOM
## 6 NA SURGICAL SAME DAY ADMISSION PHYSICIAN REFERRAL
## 7 NA EW EMER. PROCEDURE SITE
## 8 NA EW EMER. EMERGENCY ROOM
## 9 NA OBSERVATION ADMIT TRANSFER FROM HOSPITAL
## 10 NA EW EMER. EMERGENCY ROOM
## discharge_location insurance language marital_status
## <chr> <chr> <chr> <chr>
## 1 HOME Medicaid ENGLISH WIDOWED
## 2 HOME HEALTH CARE Medicare ENGLISH MARRIED
## 3 HOME HEALTH CARE Other ? MARRIED
## 4 HOME Other ENGLISH MARRIED
## 5 DIED Medicare ENGLISH MARRIED
## 6 HOME HEALTH CARE Medicare ENGLISH SINGLE
## 7 CHRONIC/LONG TERM ACUTE CARE Other ENGLISH MARRIED
## 8 HOME Other ENGLISH <NA>
## 9 HOME HEALTH CARE Medicare ENGLISH SINGLE
## 10 SKILLED NURSING FACILITY Medicare ENGLISH WIDOWED
## ethnicity edregtime edouttime
## <chr> <dttm> <dttm>
## 1 WHITE 2180-07-23 05:54:00 2180-07-23 14:00:00
## 2 BLACK/AFRICAN AMERICAN 2189-06-27 06:25:00 2189-06-27 08:42:00
## 3 WHITE 2157-11-18 17:38:00 2157-11-19 01:24:00
## 4 WHITE NA NA
## 5 BLACK/AFRICAN AMERICAN 2131-01-07 13:36:00 2131-01-07 22:13:00
## 6 OTHER NA NA
## 7 WHITE 2129-08-04 11:00:00 2129-08-04 12:35:00
## 8 UNABLE TO OBTAIN 2158-01-15 06:49:00 2158-01-15 07:36:00
## 9 WHITE 2112-11-30 15:08:00 2112-11-30 23:24:00
## 10 WHITE 2156-04-12 09:56:00 2156-04-12 17:11:00
## hospital_expire_flag
## <dbl>
## 1 0
## 2 0
## 3 0
## 4 0
## 5 1
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## # … with 53,140 more rows
Solution: I chose bar plots to show the distributions:
ggplot(data = admissions_tble) +
geom_bar(mapping = aes(x = year(admittime))) +
labs(title = "Distribution of admission time by year") +
labs(x = "Admission year")
ggplot(data = admissions_tble) +
stat_count(mapping = aes(x = lubridate::month(admittime, label = T))) +
labs(title = "Distribution of admission time by month") +
labs(x = "Admission month")
ggplot(data = admissions_tble) +
stat_count(mapping = aes(x = mday(admittime))) +
labs(title = "Distribution of admission time by month and day") +
labs(x = "Admission month day")
ggplot(data = admissions_tble) +
stat_count(mapping = aes(x = lubridate::wday(admittime, label = T))) +
labs(title = "Distribution of admission time by week day") +
labs(x = "Admission week day")
ggplot(data = admissions_tble) +
stat_count(mapping = aes(x = hour(admittime))) +
labs(title = "Distribution of admission time by hour") +
labs(x = "Admission hour")
Solution: From the above graph, 0 AM and 7 AM appear to be unusual since the frequencies are higher than other times despite a late night and an early morning. That may be caused by some operation of the hospital.
patients
dataPatient information is available in patients.csv.gz
. See https://mimic.mit.edu/docs/iv/modules/core/patients/ for details of each field in this file. The first 10 lines are
system(
str_c(
"zcat < ",
shQuote(str_c(mimic_path, "/core/patients.csv.gz")),
" | head"
),
intern = TRUE
)
## [1] "subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod"
## [2] "10000048,F,23,2126,2008 - 2010,"
## [3] "10002723,F,0,2128,2017 - 2019,"
## [4] "10003939,M,0,2184,2008 - 2010,"
## [5] "10004222,M,0,2161,2014 - 2016,"
## [6] "10005325,F,0,2154,2011 - 2013,"
## [7] "10007338,F,0,2153,2017 - 2019,"
## [8] "10008101,M,0,2142,2008 - 2010,"
## [9] "10009872,F,0,2168,2014 - 2016,"
## [10] "10011333,F,0,2132,2014 - 2016,"
patients.csv.gz
(https://mimic.mit.edu/docs/iv/modules/core/patients/) as a tibble patients_tble
and only keep the patients who have a match in icustays_tble
(according to subject_id
).Solution: I used read_csv
to import this file as a tibble. The subset contains 53,150 rows.
patients_master <- read_csv(str_c(mimic_path,"/core/patients.csv.gz"),
show_col_types = FALSE)
patients_tble <- patients_master %>%
arrange(subject_id) %>%
semi_join(icustays_tble, by = c("subject_id")) %>%
print(width = Inf)
## # A tibble: 53,150 × 6
## subject_id gender anchor_age anchor_year anchor_year_group dod
## <dbl> <chr> <dbl> <dbl> <chr> <date>
## 1 10000032 F 52 2180 2014 - 2016 NA
## 2 10000980 F 73 2186 2008 - 2010 NA
## 3 10001217 F 55 2157 2011 - 2013 NA
## 4 10001725 F 46 2110 2011 - 2013 NA
## 5 10001884 F 68 2122 2008 - 2010 2131-01-20
## 6 10002013 F 53 2156 2008 - 2010 NA
## 7 10002155 F 80 2128 2008 - 2010 2131-03-10
## 8 10002223 M 21 2158 2008 - 2010 NA
## 9 10002348 F 77 2112 2017 - 2019 NA
## 10 10002428 F 80 2155 2011 - 2013 NA
## # … with 53,140 more rows
gender
and anchor_age
, and explain any patterns you see.Solution: I used a box plot and a bar plot to visualize the distribution of the two variables.
For the box plot:
For the bar plot:
p1 <- ggplot(data = patients_tble, mapping = aes(x = gender, y = anchor_age)) +
stat_boxplot(geom = "errorbar", width = 0.2) +
geom_boxplot() +
labs(title = "Distributions for anchor age by gender") +
labs(x = "Gender", y = "Anchor age (years)")
p2 <- ggplot(data = patients_tble) +
geom_bar(mapping = aes(x = anchor_age, fill = gender)) +
labs(title = "") +
labs(x = "Anchor age (years)", fill = "Gender")
max(patients_tble$anchor_age)
## [1] 91
gridExtra::grid.arrange(p1, p2, nrow = 1)
labevents.csv.gz
(https://mimic.mit.edu/docs/iv/modules/hosp/labevents/) contains all laboratory measurements for patients. The first 10 lines are
system(
str_c(
"zcat < ",
shQuote(str_c(mimic_path, "/hosp/labevents.csv.gz")),
" | head"
),
intern = TRUE
)
## [1] "labevent_id,subject_id,hadm_id,specimen_id,itemid,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments"
## [2] "670,10000048,,6448755,51484,2126-11-22 19:20:00,2126-11-22 20:07:00,150,150,mg/dL,,,,STAT,"
## [3] "673,10000048,,6448755,51491,2126-11-22 19:20:00,2126-11-22 20:07:00,6.5,6.5,units,5,8,,STAT,"
## [4] "675,10000048,,6448755,51498,2126-11-22 19:20:00,2126-11-22 20:07:00,1.029,1.029, ,1.001,1.035,,STAT,"
## [5] "683,10000048,,82729055,50861,2126-11-22 20:45:00,2126-11-23 00:55:00,39,39,IU/L,0,40,,STAT,"
## [6] "684,10000048,,82729055,50862,2126-11-22 20:45:00,2126-11-23 00:55:00,4.7,4.7,g/dL,3.4,4.8,,STAT,"
## [7] "685,10000048,,82729055,50863,2126-11-22 20:45:00,2126-11-23 00:55:00,45,45,IU/L,39,117,,STAT,"
## [8] "686,10000048,,82729055,50868,2126-11-22 20:45:00,2126-11-22 21:32:00,13,13,mEq/L,8,20,,STAT,"
## [9] "687,10000048,,82729055,50878,2126-11-22 20:45:00,2126-11-23 00:55:00,28,28,IU/L,0,40,,STAT,"
## [10] "688,10000048,,82729055,50882,2126-11-22 20:45:00,2126-11-22 21:32:00,26,26,mEq/L,22,32,,STAT,"
system(
str_c(
"zcat < ",
shQuote(str_c(mimic_path, "/hosp/d_labitems.csv.gz")),
" | head"
),
intern = TRUE
)
## [1] "itemid,label,fluid,category,loinc_code"
## [2] "51905, ,Other Body Fluid,Chemistry,"
## [3] "51532,11-Deoxycorticosterone,Blood,Chemistry,"
## [4] "51957,17-Hydroxycorticosteroids,Urine,Chemistry,"
## [5] "51958,\"17-Ketosteroids, Urine\",Urine,Chemistry,"
## [6] "52068,24 Hr,Blood,Hematology,"
## [7] "51066,24 hr Calcium,Urine,Chemistry,"
## [8] "51067,24 hr Creatinine,Urine,Chemistry,"
## [9] "51068,24 hr Protein,Urine,Chemistry,"
## [10] "50853,25-OH Vitamin D,Blood,Chemistry,"
labevents.csv.gz
.Solution: There are 122,103,667 rows in labevents.csv.gz
.
system(
str_c(
"zcat < ",
shQuote(str_c(mimic_path, "/hosp/labevents.csv.gz")),
" | tail -n +2 | wc -l"
),
intern = TRUE
)
## [1] "122103667"
labevents.csv.gz
only containing these items for the patients in icustays_tble
as a tibble labevents_tble
.Hint: labevents.csv.gz
is a data file too big to be read in by the read_csv
function in its default setting. Utilize the col_select
and lazy
options in the read_csv
function to reduce the memory burden.
Solution: I imported this and used left_join
to add label
in d_labitems.csv.gz
as a new variable. The subset has 16,698,462 rows.
labevents_master <-
read_csv(str_c(mimic_path,"/hosp/labevents_filtered_itemid.csv.gz"),
col_types = cols_only(subject_id = col_double(),
itemid = col_double(),
charttime = col_datetime(),
valuenum = col_double()),
lazy = TRUE)
d_labitems_tble <- read_csv(str_c(mimic_path,"/hosp/d_labitems.csv.gz"))
## Rows: 1630 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): label, fluid, category, loinc_code
## dbl (1): itemid
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# item list we plan to use (but we do not need this anymore)
choice_lab <- c("50912", "50971", "50983", "50902", "50882",
"51221", "51301", "50931", "50960", "50893")
labevents_tble <- labevents_master
labevents_tble <- labevents_tble %>%
arrange(subject_id, itemid) %>%
# only contain these items for the patients in icustays_tble
semi_join(icustays_tble, by = c("subject_id")) %>%
# add label in d_labitems
left_join(select(d_labitems_tble, itemid, label), by = c("itemid")) %>%
print(width = Inf)
## # A tibble: 16,698,462 × 5
## subject_id itemid charttime valuenum label
## <dbl> <dbl> <dttm> <dbl> <chr>
## 1 10000032 50882 2180-03-23 11:51:00 27 Bicarbonate
## 2 10000032 50882 2180-05-06 22:25:00 27 Bicarbonate
## 3 10000032 50882 2180-05-07 05:05:00 28 Bicarbonate
## 4 10000032 50882 2180-06-03 12:00:00 28 Bicarbonate
## 5 10000032 50882 2180-06-03 12:00:00 29 Bicarbonate
## 6 10000032 50882 2180-06-22 11:15:00 26 Bicarbonate
## 7 10000032 50882 2180-06-26 16:10:00 26 Bicarbonate
## 8 10000032 50882 2180-06-27 05:10:00 25 Bicarbonate
## 9 10000032 50882 2180-07-23 06:39:00 25 Bicarbonate
## 10 10000032 50882 2180-07-23 21:45:00 21 Bicarbonate
## # … with 16,698,452 more rows
labevents_tble
to the first lab measurement during the ICU stay.Solution: The following code made labevents_tble
51,623 rows. I used select
to keep subject_id
and ten lab measurements, or only 11 columns.
if (file.exists("labevents_tble.rds")) {
labevents_tble <- read_rds("labevents_tble.rds")
} else {
labevents_tble <- labevents_tble %>%
<<<<<<< HEAD
# add intime from icustays_tble to find the first lab measurement
left_join(select(icustays_tble, subject_id, intime, outtime),
by = c("subject_id")) %>%
# keep only lab measurements after intime
=======
# add intime and outtime from icustays_tble to find the first lab measurement
left_join(select(icustays_tble, subject_id, intime, outtime),
by = c("subject_id")) %>%
# keep only lab measurements between intime and outtime
>>>>>>> develop
filter(charttime >= intime, charttime <= outtime) %>%
# sort charttime in ascending order by subject_id and item_id
group_by(subject_id, itemid) %>%
arrange(charttime, .by_group = TRUE) %>%
# keep only the first lab measurement by group
slice_head(n = 1) %>%
# keep only 11 columns and spread label and valuenum
ungroup() %>%
select(-c(itemid, charttime, intime, outtime)) %>%
pivot_wider(names_from = label, values_from = valuenum) %>%
# avoid space in column name
rename(Calcium = "Calcium, Total", WBC = "White Blood Cells") %>%
print(width = Inf) %>%
write_rds("labevents_tble.rds")
}
## # A tibble: 51,623 × 11
## subject_id Bicarbonate Calcium Chloride Creatinine Glucose Magnesium
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10000032 21 9.3 102 0.5 115 2.3
## 2 10001217 23 8.2 104 0.4 113 1.9
## 3 10001725 24 9.1 106 0.8 146 1.5
## 4 10001884 33 9.8 96 1.1 148 2.2
## 5 10002013 23 NA 109 1.1 98 2.1
## 6 10002155 25 8.8 106 0.9 95 2
## 7 10002223 26 8.7 102 0.8 88 2
## 8 10002348 23 9.4 107 0.8 127 1.6
## 9 10002428 20 7.4 98 0.8 99 1.4
## 10 10002430 22 9.2 105 2.2 128 2.3
## Potassium Sodium Hematocrit WBC
## <dbl> <dbl> <dbl> <dbl>
## 1 4.7 132 NA NA
## 2 3.6 138 33.6 19
## 3 3.9 140 39.1 17
## 4 4 136 36 18.4
## 5 4 140 28.6 18.2
## 6 4.5 139 37.9 5.5
## 7 3.9 138 32.5 10.1
## 8 4.8 142 39.3 4.3
## 9 3.9 129 28.7 22.4
## 10 3.8 144 36.6 10.3
## # … with 51,613 more rows
Solution: For numeric, I used summary
to output summary statistics, implying that some appear to have missing ant extreme values. Taking this into account, for graphics, I removed values below the 2.5th and above the 97.5th percentile and then displayed these distributions in the following box plots. The distributions for Creatinine, Glucose, and White Blood Cells (WBC) are far from normal distribution, unlike the others.
# numerical summary
summary(labevents_tble[-1])
## Bicarbonate Calcium Chloride Creatinine
## Min. : 2.00 Min. : 0.000 Min. : 58.0 Min. : 0.000
## 1st Qu.:21.00 1st Qu.: 7.900 1st Qu.:101.0 1st Qu.: 0.700
## Median :23.00 Median : 8.400 Median :105.0 Median : 0.900
## Mean :22.96 Mean : 8.349 Mean :104.5 Mean : 1.311
## 3rd Qu.:25.00 3rd Qu.: 8.800 3rd Qu.:108.0 3rd Qu.: 1.300
## Max. :49.00 Max. :43.000 Max. :153.0 Max. :36.900
## NA's :278 NA's :2810 NA's :238 NA's :257
## Glucose Magnesium Potassium Sodium
## Min. : 7.0 Min. : 0.000 Min. : 0.800 Min. : 92.0
## 1st Qu.: 103.0 1st Qu.: 1.700 1st Qu.: 3.700 1st Qu.:136.0
## Median : 124.0 Median : 2.000 Median : 4.100 Median :139.0
## Mean : 141.4 Mean : 2.001 Mean : 4.178 Mean :138.5
## 3rd Qu.: 156.0 3rd Qu.: 2.200 3rd Qu.: 4.500 3rd Qu.:141.0
## Max. :2440.0 Max. :47.000 Max. :13.000 Max. :180.0
## NA's :354 NA's :1340 NA's :264 NA's :232
## Hematocrit WBC
## Min. : 4.30 Min. : 0.00
## 1st Qu.:28.10 1st Qu.: 7.60
## Median :32.60 Median : 10.60
## Mean :32.74 Mean : 12.07
## 3rd Qu.:37.30 3rd Qu.: 14.50
## Max. :68.60 Max. :572.50
## NA's :504 NA's :627
# graphical summary
labevents_tble %>%
select(2:11) %>%
gather() %>%
group_by(key) %>%
filter(value > quantile(value, 0.025, na.rm = TRUE)
& value < quantile(value, 0.975, na.rm = TRUE)) %>%
ungroup %>%
ggplot() +
geom_boxplot(mapping = aes(y = value)) +
facet_wrap(~key, scales = "free_y") +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
chartevents.csv.gz
(https://mimic.mit.edu/docs/iv/modules/icu/chartevents/) contains all the charted data available for a patient. During their ICU stay, the primary repository of a patient’s information is their electronic chart. The itemid
variable indicates a single measurement type in the database. The value
variable is the value measured for itemid
. The first 10 lines of chartevents.csv.gz
are
system(
str_c(
"zcat < ",
shQuote(str_c(mimic_path, "/icu/chartevents.csv.gz")),
" | head"),
intern = TRUE
)
## [1] "subject_id,hadm_id,stay_id,charttime,storetime,itemid,value,valuenum,valueuom,warning"
## [2] "10003700,28623837,30600691,2165-04-24 05:10:00,2165-04-24 05:11:00,228236,0,0,,0"
## [3] "10003700,28623837,30600691,2165-04-24 05:12:00,2165-04-24 05:14:00,225067,0,0,,0"
## [4] "10003700,28623837,30600691,2165-04-24 05:12:00,2165-04-24 05:14:00,225070,1,1,,0"
## [5] "10003700,28623837,30600691,2165-04-24 05:12:00,2165-04-24 05:14:00,225076,1,1,,0"
## [6] "10003700,28623837,30600691,2165-04-24 05:12:00,2165-04-24 05:14:00,225078,1,1,,0"
## [7] "10003700,28623837,30600691,2165-04-24 05:12:00,2165-04-24 05:14:00,225086,1,1,,0"
## [8] "10003700,28623837,30600691,2165-04-24 05:12:00,2165-04-24 05:14:00,225091,1,1,,0"
## [9] "10003700,28623837,30600691,2165-04-24 05:12:00,2165-04-24 05:14:00,225103,1,1,,0"
## [10] "10003700,28623837,30600691,2165-04-24 05:12:00,2165-04-24 05:14:00,225106,1,1,,0"
d_items.csv.gz
(https://mimic.mit.edu/docs/iv/modules/icu/d_items/) is the dictionary for the itemid
in chartevents.csv.gz
.
system(
str_c(
"zcat < ",
shQuote(str_c(mimic_path, "/icu/d_items.csv.gz")),
" | head"),
intern = TRUE
)
## [1] "itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue"
## [2] "220003,ICU Admission date,ICU Admission date,datetimeevents,ADT,,Date and time,,"
## [3] "220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,"
## [4] "220046,Heart rate Alarm - High,HR Alarm - High,chartevents,Alarms,bpm,Numeric,,"
## [5] "220047,Heart Rate Alarm - Low,HR Alarm - Low,chartevents,Alarms,bpm,Numeric,,"
## [6] "220048,Heart Rhythm,Heart Rhythm,chartevents,Routine Vital Signs,,Text,,"
## [7] "220050,Arterial Blood Pressure systolic,ABPs,chartevents,Routine Vital Signs,mmHg,Numeric,90,140"
## [8] "220051,Arterial Blood Pressure diastolic,ABPd,chartevents,Routine Vital Signs,mmHg,Numeric,60,90"
## [9] "220052,Arterial Blood Pressure mean,ABPm,chartevents,Routine Vital Signs,mmHg,Numeric,,"
## [10] "220056,Arterial Blood Pressure Alarm - Low,ABP Alarm - Low,chartevents,Alarms,mmHg,Numeric,,"
chartevents.csv.gz
only containing these items for the patients in icustays_tble
as a tibble chartevents_tble
.Solution: Importing this and adding label
in d_items.csv.gz
as a new column created a tibble with 23,679,058 × 6. The subset retained all the rows of the original.
chartevents_master <-
read_csv(str_c(mimic_path,"/icu/chartevents_filtered_itemid.csv.gz"),
col_types = cols_only(subject_id = col_double(),
hadm_id = col_double(),
itemid = col_double(),
itemid = col_double(),
charttime = col_datetime(),
valuenum = col_double()),
lazy = TRUE)
d_items_tble <- read_csv(str_c(mimic_path,"/icu/d_items.csv.gz"))
## Rows: 3861 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): label, abbreviation, linksto, category, unitname, param_type
## dbl (3): itemid, lownormalvalue, highnormalvalue
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# item list we plan to use (but we do not need this anymore)
choice2 <- c("220045", "220181", "220179", "223761", "220210")
chartevents_tble <- chartevents_master
chartevents_tble <- chartevents_tble %>%
# only containing these items for the patients in icustays_tble
semi_join(icustays_tble, by = c("subject_id")) %>%
# add label in d_items
left_join(select(d_items_tble, itemid, label), by = c("itemid")) %>%
print(width = Inf)
## # A tibble: 23,679,058 × 6
## subject_id hadm_id charttime itemid valuenum
## <dbl> <dbl> <dttm> <dbl> <dbl>
## 1 10003700 28623837 2165-04-24 05:28:00 220179 152
## 2 10003700 28623837 2165-04-24 05:28:00 220181 110
## 3 10003700 28623837 2165-04-24 05:30:00 220045 65
## 4 10003700 28623837 2165-04-24 05:30:00 220210 14
## 5 10003700 28623837 2165-04-24 05:38:00 223761 97.6
## 6 10003700 28623837 2165-04-24 06:00:00 220045 56
## 7 10003700 28623837 2165-04-24 06:00:00 220179 126
## 8 10003700 28623837 2165-04-24 06:00:00 220181 88
## 9 10003700 28623837 2165-04-24 06:00:00 220210 14
## 10 10003700 28623837 2165-04-24 06:09:00 220045 55
## label
## <chr>
## 1 Non Invasive Blood Pressure systolic
## 2 Non Invasive Blood Pressure mean
## 3 Heart Rate
## 4 Respiratory Rate
## 5 Temperature Fahrenheit
## 6 Heart Rate
## 7 Non Invasive Blood Pressure systolic
## 8 Non Invasive Blood Pressure mean
## 9 Respiratory Rate
## 10 Heart Rate
## # … with 23,679,048 more rows
chartevents_tble
to the first vital measurement during the ICU stay.Solution: I ran almost the same code as Q5-3 and obtained a tibble with 53,136 × 6, which consists of only subject_id
and five vital measurements.
# file.remove("chartevents_tble.rds")
if (file.exists("chartevents_tble.rds")) {
chartevents_tble <- read_rds("chartevents_tble.rds")
} else {
chartevents_tble <- chartevents_tble %>%
<<<<<<< HEAD
# add intime from icustays_tble to find the first vital measurement
left_join(select(icustays_tble, subject_id, intime, outtime),
by = c("subject_id")) %>%
# keep only vital measurements after intime
=======
# add intime and outtime from icustays_tble to find the first vital measurement
left_join(select(icustays_tble, subject_id, intime, outtime),
by = c("subject_id")) %>%
# keep only vital measurements between intime and outtime
>>>>>>> develop
filter(charttime >= intime, charttime <= outtime) %>%
# sort charttime in ascending order by subject_id and item_id
group_by(subject_id, itemid) %>%
arrange(charttime, .by_group = TRUE) %>%
# keep only the first vital measurement by group
slice_head(n = 1) %>%
# restrict columns and spread label and valuenum
ungroup() %>%
select(c(subject_id, label, valuenum)) %>%
pivot_wider(names_from = label, values_from = valuenum) %>%
# avoid space in column name
rename(HR = "Heart Rate", RR = "Respiratory Rate",
Mean_BP = "Non Invasive Blood Pressure systolic",
Systolic_BP = "Non Invasive Blood Pressure mean",
BT = "Temperature Fahrenheit") %>%
print(width = Inf) %>%
write_rds("chartevents_tble.rds")
}
## # A tibble: 53,136 × 6
## subject_id HR Mean_BP Systolic_BP RR BT
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10000032 91 84 56 24 98.7
## 2 10000980 77 150 92 23 98
## 3 10001217 86 151 104 18 98.5
## 4 10001725 55 73 59 19 97.7
## 5 10001884 38 180 46 10 98.1
## 6 10002013 80 104 77 14 97.2
## 7 10002155 68 126 78 18 95.9
## 8 10002223 100 123 80 29 98.6
## 9 10002348 72 129 85 14 97.9
## 10 10002428 124 87 53 25 103.
## # … with 53,126 more rows
Solution: As with Q5-4, I output summary statistics, which indicates some extreme values exist. Then I used box plots that omitted values outside from the 2.5th to 97.5th percentile to make the distributions more visible. All distributions seem to be approximately close to normal.
# numerical summary
summary(chartevents_tble[-1])
## HR Mean_BP Systolic_BP RR
## Min. : 0.00 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 74.00 1st Qu.: 106.0 1st Qu.: 70.00 1st Qu.: 15.00
## Median : 85.00 Median : 122.0 Median : 81.00 Median : 18.00
## Mean : 87.47 Mean : 123.8 Mean : 87.42 Mean : 18.69
## 3rd Qu.: 99.00 3rd Qu.: 139.0 3rd Qu.: 93.00 3rd Qu.: 22.00
## Max. :941.00 Max. :12262.0 Max. :140119.00 Max. :180.00
## NA's :1 NA's :669 NA's :690 NA's :48
## BT
## Min. : 0.00
## 1st Qu.: 97.60
## Median : 98.10
## Mean : 98.03
## 3rd Qu.: 98.70
## Max. :106.00
## NA's :940
# graphical summary
chartevents_tble %>%
select(2:6) %>%
gather() %>%
group_by(key) %>%
filter(value > quantile(value, 0.025, na.rm = TRUE)
& value < quantile(value, 0.975, na.rm = TRUE)) %>%
ungroup %>%
ggplot() +
geom_boxplot(mapping = aes(y = value)) +
facet_wrap(~key, scales = "free_y") +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
Let us create a tibble mimic_icu_cohort
for all ICU stays, where rows are
and columns contain at least following variables
icustays.csv.gz
admission.csv.gz
patients.csv.gz
thirty_day_mort
whether the patient died within 30 days of hospital admission (30 day mortality)Solution: Here is the code to create mimic_icu_cohort
with 53,065 rows and 43 columns. I set thirty_day_mort
as TRUE
if a person died within 30 days; FALSE
otherwise. I thought first that patients who haven’t died were in another category, though.
# file.remove("mimic_icu_cohort.rds")
if (file.exists("mimic_icu_cohort.rds")) {
mimic_icu_cohort <- read_rds("mimic_icu_cohort.rds")
} else {
mimic_icu_cohort <- icustays_tble %>%
left_join(admissions_tble, by = c("subject_id", "hadm_id")) %>%
left_join(patients_tble, by = c("subject_id")) %>%
left_join(labevents_tble, by = c("subject_id")) %>%
left_join(chartevents_tble, by = c("subject_id")) %>%
# compute age at admission
mutate(age_hadm = anchor_age + year(admittime) - anchor_year) %>%
# keep only patients aged over 18 at admission
filter(age_hadm > 18) %>%
# create thirty_day_mort with T if a patient died <= 30 days and F if not
mutate(thirty_day_mort =
ifelse(is.na(deathtime), "FALSE",
ifelse(as.Date(deathtime) - as.Date(admittime) <= 30,
"TRUE", "FALSE"))) %>%
print(width = Inf) %>%
write_rds("mimic_icu_cohort.rds")
}
## # A tibble: 53,065 × 43
## subject_id hadm_id stay_id first_careunit
## <dbl> <dbl> <dbl> <chr>
## 1 10000032 29079034 39553978 Medical Intensive Care Unit (MICU)
## 2 10000980 26913865 39765666 Medical Intensive Care Unit (MICU)
## 3 10001217 24597018 37067082 Surgical Intensive Care Unit (SICU)
## 4 10001725 25563031 31205490 Medical/Surgical Intensive Care Unit (MICU/SICU)
## 5 10001884 26184834 37510196 Medical Intensive Care Unit (MICU)
## 6 10002013 23581541 39060235 Cardiac Vascular Intensive Care Unit (CVICU)
## 7 10002155 23822395 33685454 Coronary Care Unit (CCU)
## 8 10002223 22494570 39638202 Trauma SICU (TSICU)
## 9 10002348 22725460 32610785 Neuro Intermediate
## 10 10002428 28662225 33987268 Medical Intensive Care Unit (MICU)
## last_careunit intime
## <chr> <dttm>
## 1 Medical Intensive Care Unit (MICU) 2180-07-23 14:00:00
## 2 Medical Intensive Care Unit (MICU) 2189-06-27 08:42:00
## 3 Surgical Intensive Care Unit (SICU) 2157-11-20 19:18:02
## 4 Medical/Surgical Intensive Care Unit (MICU/SICU) 2110-04-11 15:52:22
## 5 Medical Intensive Care Unit (MICU) 2131-01-11 04:20:05
## 6 Cardiac Vascular Intensive Care Unit (CVICU) 2160-05-18 10:00:53
## 7 Coronary Care Unit (CCU) 2129-08-04 12:45:00
## 8 Trauma SICU (TSICU) 2158-01-15 08:01:49
## 9 Neuro Intermediate 2112-11-30 23:24:00
## 10 Medical Intensive Care Unit (MICU) 2156-04-12 16:24:18
## outtime los admittime dischtime
## <dttm> <dbl> <dttm> <dttm>
## 1 2180-07-23 23:50:47 0.410 2180-07-23 12:35:00 2180-07-25 17:55:00
## 2 2189-06-27 20:38:27 0.498 2189-06-27 07:38:00 2189-07-03 03:00:00
## 3 2157-11-21 22:08:00 1.12 2157-11-18 22:56:00 2157-11-25 18:00:00
## 4 2110-04-12 23:59:56 1.34 2110-04-11 15:08:00 2110-04-14 15:00:00
## 5 2131-01-20 08:27:30 9.17 2131-01-07 20:39:00 2131-01-20 05:15:00
## 6 2160-05-19 17:33:33 1.31 2160-05-18 07:45:00 2160-05-23 13:30:00
## 7 2129-08-10 17:02:38 6.18 2129-08-04 12:44:00 2129-08-18 16:53:00
## 8 2158-01-16 15:19:24 1.30 2158-01-15 08:00:00 2158-01-20 19:29:00
## 9 2112-12-10 18:25:13 9.79 2112-11-30 22:22:00 2112-12-10 17:56:00
## 10 2156-04-17 15:57:08 4.98 2156-04-12 14:16:00 2156-04-29 16:26:00
## deathtime admission_type admission_location
## <dttm> <chr> <chr>
## 1 NA EW EMER. EMERGENCY ROOM
## 2 NA EW EMER. EMERGENCY ROOM
## 3 NA EW EMER. EMERGENCY ROOM
## 4 NA EW EMER. PACU
## 5 2131-01-20 05:15:00 OBSERVATION ADMIT EMERGENCY ROOM
## 6 NA SURGICAL SAME DAY ADMISSION PHYSICIAN REFERRAL
## 7 NA EW EMER. PROCEDURE SITE
## 8 NA EW EMER. EMERGENCY ROOM
## 9 NA OBSERVATION ADMIT TRANSFER FROM HOSPITAL
## 10 NA EW EMER. EMERGENCY ROOM
## discharge_location insurance language marital_status
## <chr> <chr> <chr> <chr>
## 1 HOME Medicaid ENGLISH WIDOWED
## 2 HOME HEALTH CARE Medicare ENGLISH MARRIED
## 3 HOME HEALTH CARE Other ? MARRIED
## 4 HOME Other ENGLISH MARRIED
## 5 DIED Medicare ENGLISH MARRIED
## 6 HOME HEALTH CARE Medicare ENGLISH SINGLE
## 7 CHRONIC/LONG TERM ACUTE CARE Other ENGLISH MARRIED
## 8 HOME Other ENGLISH <NA>
## 9 HOME HEALTH CARE Medicare ENGLISH SINGLE
## 10 SKILLED NURSING FACILITY Medicare ENGLISH WIDOWED
## ethnicity edregtime edouttime
## <chr> <dttm> <dttm>
## 1 WHITE 2180-07-23 05:54:00 2180-07-23 14:00:00
## 2 BLACK/AFRICAN AMERICAN 2189-06-27 06:25:00 2189-06-27 08:42:00
## 3 WHITE 2157-11-18 17:38:00 2157-11-19 01:24:00
## 4 WHITE NA NA
## 5 BLACK/AFRICAN AMERICAN 2131-01-07 13:36:00 2131-01-07 22:13:00
## 6 OTHER NA NA
## 7 WHITE 2129-08-04 11:00:00 2129-08-04 12:35:00
## 8 UNABLE TO OBTAIN 2158-01-15 06:49:00 2158-01-15 07:36:00
## 9 WHITE 2112-11-30 15:08:00 2112-11-30 23:24:00
## 10 WHITE 2156-04-12 09:56:00 2156-04-12 17:11:00
## hospital_expire_flag gender anchor_age anchor_year anchor_year_group
## <dbl> <chr> <dbl> <dbl> <chr>
## 1 0 F 52 2180 2014 - 2016
## 2 0 F 73 2186 2008 - 2010
## 3 0 F 55 2157 2011 - 2013
## 4 0 F 46 2110 2011 - 2013
## 5 1 F 68 2122 2008 - 2010
## 6 0 F 53 2156 2008 - 2010
## 7 0 F 80 2128 2008 - 2010
## 8 0 M 21 2158 2008 - 2010
## 9 0 F 77 2112 2017 - 2019
## 10 0 F 80 2155 2011 - 2013
## dod Bicarbonate Calcium Chloride Creatinine Glucose Magnesium
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NA 21 9.3 102 0.5 115 2.3
## 2 NA NA NA NA NA NA NA
## 3 NA 23 8.2 104 0.4 113 1.9
## 4 NA 24 9.1 106 0.8 146 1.5
## 5 2131-01-20 33 9.8 96 1.1 148 2.2
## 6 NA 23 NA 109 1.1 98 2.1
## 7 2131-03-10 25 8.8 106 0.9 95 2
## 8 NA 26 8.7 102 0.8 88 2
## 9 NA 23 9.4 107 0.8 127 1.6
## 10 NA 20 7.4 98 0.8 99 1.4
## Potassium Sodium Hematocrit WBC HR Mean_BP Systolic_BP RR BT
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.7 132 NA NA 91 84 56 24 98.7
## 2 NA NA NA NA 77 150 92 23 98
## 3 3.6 138 33.6 19 86 151 104 18 98.5
## 4 3.9 140 39.1 17 55 73 59 19 97.7
## 5 4 136 36 18.4 38 180 46 10 98.1
## 6 4 140 28.6 18.2 80 104 77 14 97.2
## 7 4.5 139 37.9 5.5 68 126 78 18 95.9
## 8 3.9 138 32.5 10.1 100 123 80 29 98.6
## 9 4.8 142 39.3 4.3 72 129 85 14 97.9
## 10 3.9 129 28.7 22.4 124 87 53 25 103.
## age_hadm thirty_day_mort
## <dbl> <chr>
## 1 52 FALSE
## 2 76 FALSE
## 3 55 FALSE
## 4 46 FALSE
## 5 77 TRUE
## 6 57 FALSE
## 7 81 FALSE
## 8 21 FALSE
## 9 77 FALSE
## 10 81 FALSE
## # … with 53,055 more rows
table(mimic_icu_cohort$thirty_day_mort)
##
## FALSE TRUE
## 47853 5212
Summarize following information using appropriate numerics or graphs.
thirty_day_mort
vs demographic variables (ethnicity, language, insurance, marital_status, gender, age at hospital admission)Solution: I used bar plots for categorical variables (ethnicity, language, insurance, marital_status and gender) and a box plot for a numerical variable (age at hospital admission).
thirty_day_mort
vs ethnicityBrief comment: The percentage of white patients who died within 30 days is lower than that of those who did not, while the proportion of the unknown has the opposite relationship.
# numerical information
round(prop.table(table(mimic_icu_cohort$ethnicity,
mimic_icu_cohort$thirty_day_mort) ,2), 2)
##
## FALSE TRUE
## AMERICAN INDIAN/ALASKA NATIVE 0.00 0.00
## ASIAN 0.03 0.03
## BLACK/AFRICAN AMERICAN 0.09 0.07
## HISPANIC/LATINO 0.04 0.02
## OTHER 0.05 0.04
## UNABLE TO OBTAIN 0.01 0.02
## UNKNOWN 0.10 0.22
## WHITE 0.68 0.59
# graphical information
mimic_icu_cohort %>%
ggplot() +
geom_bar(mapping = aes(x = thirty_day_mort, fill = ethnicity),
position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "30 day mortality", y = "percent") +
scale_x_discrete(limits = ) +
labs(title = "30 day mortality vs ethnicity")
thirty_day_mort
vs language, insurance, marital_status, and genderBrief comment:
NA
), the adult patients who died within 30 days have a smaller proportion of the single than those who did not die, but a larger proportion of the widowed.# numerical information
round(prop.table(table(mimic_icu_cohort$language,
mimic_icu_cohort$thirty_day_mort) ,2), 2)
##
## FALSE TRUE
## ? 0.10 0.11
## ENGLISH 0.90 0.89
round(prop.table(table(mimic_icu_cohort$insurance,
mimic_icu_cohort$thirty_day_mort) ,2), 2)
##
## FALSE TRUE
## Medicaid 0.07 0.06
## Medicare 0.42 0.53
## Other 0.51 0.41
round(prop.table(table(mimic_icu_cohort$marital_status,
mimic_icu_cohort$thirty_day_mort) ,2), 2)
##
## FALSE TRUE
## DIVORCED 0.08 0.07
## MARRIED 0.50 0.49
## SINGLE 0.30 0.24
## WIDOWED 0.13 0.20
round(prop.table(table(mimic_icu_cohort$gender,
mimic_icu_cohort$thirty_day_mort) ,2), 2)
##
## FALSE TRUE
## F 0.44 0.46
## M 0.56 0.54
# graphical information
q1 <- mimic_icu_cohort %>%
ggplot() +
geom_bar(mapping = aes(x = thirty_day_mort, fill = language),
position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "30 day mortality", y = "percent") +
labs(title = "30 day mortality vs language")
q2 <- mimic_icu_cohort %>%
ggplot() +
geom_bar(mapping = aes(x = thirty_day_mort, fill = insurance),
position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "30 day mortality", y = "percent") +
labs(title = "30 day mortality vs insurance")
q3 <- subset(mimic_icu_cohort, !is.na(marital_status)) %>%
ggplot() +
geom_bar(mapping = aes(x = thirty_day_mort, fill = marital_status),
position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "30 day mortality", y = "percent") +
labs(title = "30 day mortality vs marital status")
q4 <- mimic_icu_cohort %>%
ggplot() +
geom_bar(mapping = aes(x = thirty_day_mort, fill = gender),
position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "30 day mortality", y = "percent") +
labs(title = "30 day mortality vs gender")
gridExtra::grid.arrange(q1, q2, q3, q4, nrow = 2)
thirty_day_mort
vs age at hospital admissionBrief comment: Patients aged over 18 who died within 30 days tend to be older than those who did not.
# numerical information
tapply(mimic_icu_cohort$age_hadm, mimic_icu_cohort$thirty_day_mort, summary)
## $`FALSE`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 53.00 66.00 63.81 77.00 102.00
##
## $`TRUE`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 61.00 74.00 71.28 84.00 100.00
# graphical information
ggplot(data = mimic_icu_cohort) +
geom_boxplot(aes(x = thirty_day_mort, y = age_hadm)) +
labs(title = "30 day mortality vs age at hospital admission") +
labs(x = "30 day mortality") +
labs(y = "Age at hospital admission")
thirty_day_mort
vs first lab measurementsSolution: I used a similar method with Q5-4 except for grouping by thirty_day_mort
as follows.
# numerical summary
mimic_icu_cohort %>%
select(c(27:36, 43)) %>%
gather(key = key, value = value, -thirty_day_mort) %>%
filter(!is.na(value)) %>%
filter(value > quantile(value, 0.025, na.rm = TRUE)
& value < quantile(value, 0.975, na.rm = TRUE)) %>%
group_by(key, thirty_day_mort) %>%
summarize(Mean = mean(value, na.rm = TRUE),
Median = median(value, na.rm = TRUE),
SD = sd(value, na.rm = TRUE),
Min = min(value, na.rm = TRUE),
Max = max(value, na.rm = TRUE),
Q1 = quantile(value, 0.25, na.rm = TRUE),
Q3 = quantile(value, 0.75, na.rm = TRUE))
## `summarise()` has grouped output by 'key'. You can override using the `.groups`
## argument.
## # A tibble: 20 × 9
## # Groups: key [10]
## key thirty_day_mort Mean Median SD Min Max Q1 Q3
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Bicarbonate FALSE 23.2 23 4.19 3 49 21 25
## 2 Bicarbonate TRUE 20.8 21 5.84 2 48 17 24
## 3 Calcium FALSE 8.36 8.4 0.810 1.8 27.5 7.9 8.8
## 4 Calcium TRUE 8.22 8.2 1.22 1.6 43 7.6 8.8
## 5 Chloride FALSE 105. 105 6.12 58 153 101 108
## 6 Chloride TRUE 103. 104 7.84 61 140 99 108
## 7 Creatinine FALSE 1.57 1.1 1.66 0.8 36.9 0.9 1.5
## 8 Creatinine TRUE 2.04 1.5 1.58 0.8 23.1 1.1 2.4
## 9 Glucose FALSE 114. 113 21.6 8 156 98 130
## 10 Glucose TRUE 113. 115 26.8 7 156 95 134
## 11 Hematocrit FALSE 32.8 32.7 6.43 6.4 68.6 28.1 37.3
## 12 Hematocrit TRUE 32.5 32.1 7.22 4.3 62.1 27.4 37.2
## 13 Magnesium FALSE 2.00 2 0.510 0.8 47 1.7 2.2
## 14 Magnesium TRUE 2.03 2 0.465 0.8 8.7 1.7 2.2
## 15 Potassium FALSE 4.16 4.1 0.676 0.8 13 3.7 4.5
## 16 Potassium TRUE 4.33 4.2 0.904 1.4 9.6 3.7 4.8
## 17 Sodium FALSE 138. 139 4.58 92 156 136 141
## 18 Sodium TRUE 138. 139 6.19 97 156 135 142
## 19 WBC FALSE 11.6 10.4 6.65 0.8 156. 7.6 14.2
## 20 WBC TRUE 14.8 12.7 10.6 0.8 156. 8.7 18.1
# graphical summary
mimic_icu_cohort %>%
gather(27:36, key = "key", value = "value") %>%
group_by(key) %>%
filter(value > quantile(value, 0.025, na.rm = TRUE)
& value < quantile(value, 0.975, na.rm = TRUE)) %>%
ungroup %>%
ggplot(mapping = aes(x = thirty_day_mort, y = value)) +
geom_boxplot() +
labs(x = "30 day mortality") +
facet_wrap(~key, scales = "free_y") +
labs(title = "Distributions for first lab measurements by 30 day mortality")
Brief comment: For Bicarbonate and Creatinine, there appear to be big differences between the two categories because one of the medians is closer to the edge of the other box. Also, it may be worth testing whether there are statistically significant differences for glucose and white blood cells as well.
thirty_day_mort
vs first vital measurementsSolution: I used a similar method with Q6-3 except for grouping by thirty_day_mort
.
# numerical summary
mimic_icu_cohort %>%
select(c(37:41, 43)) %>%
gather(key = key, value = value, -thirty_day_mort) %>%
filter(!is.na(value)) %>%
filter(value > quantile(value, 0.025, na.rm = TRUE)
& value < quantile(value, 0.975, na.rm = TRUE)) %>%
group_by(key, thirty_day_mort) %>%
summarize(Mean = mean(value, na.rm = TRUE),
Median = median(value, na.rm = TRUE),
SD = sd(value, na.rm = TRUE),
Min = min(value, na.rm = TRUE),
Max = max(value, na.rm = TRUE),
Q1 = quantile(value, 0.25, na.rm = TRUE),
Q3 = quantile(value, 0.75, na.rm = TRUE))
## `summarise()` has grouped output by 'key'. You can override using the `.groups`
## argument.
## # A tibble: 10 × 9
## # Groups: key [5]
## key thirty_day_mort Mean Median SD Min Max Q1 Q3
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 BT FALSE 98.1 98.2 2.23 33.8 106 97.6 98.7
## 2 BT TRUE 97.5 98 5.59 30.9 106. 97.4 98.7
## 3 HR FALSE 86.6 84 18.9 25 152 74 98
## 4 HR TRUE 92.7 92 21.4 24 152 77 108
## 5 Mean_BP FALSE 118. 119 18.4 36 152 105 132
## 6 Mean_BP TRUE 113. 113 21.5 31 152 98 130
## 7 RR FALSE 20.0 19 5.36 14 152 16 22
## 8 RR TRUE 22.2 21 6.22 14 115 18 25
## 9 Systolic_BP FALSE 82.4 81 17.4 14 152 70 93
## 10 Systolic_BP TRUE 79.9 78 20.1 19 152 66 93
# graphical summary
mimic_icu_cohort %>%
gather(37:41, key = "key", value = "value") %>%
group_by(key) %>%
filter(value > quantile(value, 0.025, na.rm = TRUE)
& value < quantile(value, 0.975, na.rm = TRUE)) %>%
ungroup %>%
ggplot(mapping = aes(x = thirty_day_mort, y = value)) +
geom_boxplot() +
labs(x = "30 day mortality") +
facet_wrap(~key, scales = "free_y") +
labs(title =
"Distributions for first vital measurements by 30 day mortality")
Brief comment: The values for Heart Rate (HR) and Respiratory Rate (RR) tend to be relatively higher for patients who died within 30 days than those who did not. For the rest, no major differences can be seen.
thirty_day_mort
vs first ICU unitSolution: I summarized this in a similar way to thirty_day_mort
vs demographic variables.
# numerical summary
round(prop.table(table(mimic_icu_cohort$first_careunit,
mimic_icu_cohort$thirty_day_mort) ,2), 2)
##
## FALSE TRUE
## Cardiac Vascular Intensive Care Unit (CVICU) 0.19 0.05
## Coronary Care Unit (CCU) 0.11 0.14
## Medical Intensive Care Unit (MICU) 0.18 0.27
## Medical/Surgical Intensive Care Unit (MICU/SICU) 0.16 0.20
## Neuro Intermediate 0.03 0.01
## Neuro Stepdown 0.01 0.00
## Neuro Surgical Intensive Care Unit (Neuro SICU) 0.02 0.04
## Surgical Intensive Care Unit (SICU) 0.15 0.17
## Trauma SICU (TSICU) 0.13 0.13
# graphical summary
mimic_icu_cohort %>%
ggplot() +
geom_bar(mapping = aes(x = thirty_day_mort, fill = first_careunit),
position = "fill") +
labs(y = "percent") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Proportion for the first ICU unit by 30 day mortality") +
<<<<<<< HEAD
labs(x = "30 day mortality", fill ="First ICU unit")
=======
labs(x = "30 day mortality", fill = "First ICU unit")
>>>>>>> develop
Brief comment: The proportion of CVICU for patients who died within 30 days is 14 points lower than that of those who did not. On the other hand, for MICU, the proportion of patients who died within 30 days is 9 points higher. Other than that, there appears to be little difference.