Pull Covariates for Regression Adjustment

Author

Jacob Simmering

library(tidyverse)
-- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
v dplyr     1.1.2     v readr     2.1.4
v forcats   1.0.0     v stringr   1.5.0
v ggplot2   3.4.2     v tibble    3.2.1
v lubridate 1.9.2     v tidyr     1.3.0
v purrr     1.0.1     
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(parallel)
library(icd)

Data Load

We want to find diagnostic information for our cohort. Start by loading the cohort data to use to filter the databases:

cohort <- read_rds("/Shared/lss_jsimmeri/als/cohort.rds")
als_dates <- read_rds("/Shared/lss_jsimmeri/als/first_als_date.rds")

Find All Diagnostic Codes

We want to find and return all diagnostic codes for the 110,618 people in our dataset from outpatient, inpatient, and facilities tables. Define functions to do the extraction:

find_outpatient_dx_by_enrolid <- function(source, year, required_enrolid) {
  db <- DBI::dbConnect(RSQLite::SQLite(),
                       glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db"))
  if (as.numeric(year) <= 14) {
    outpatient <- tbl(db, glue::glue("outpatient_dx_{source}_{year}")) %>%
      filter(enrolid %in% required_enrolid) %>%
      select(enrolid, svcdate, dx) %>%
      mutate(icd_version = 9,
             enrolid = as.character(enrolid)) %>%
      collect()
  } else {
    outpatient_9 <- tbl(db, glue::glue("outpatient_dx9_{source}_{year}")) %>%
      filter(enrolid %in% required_enrolid) %>%
      select(enrolid, svcdate, dx) %>%
      mutate(icd_version = 9,
             enrolid = as.character(enrolid)) %>%
      collect()
    outpatient_10 <- tbl(db, glue::glue("outpatient_dx10_{source}_{year}")) %>%
      filter(enrolid %in% required_enrolid) %>%
      select(enrolid, svcdate, dx) %>%
      mutate(icd_version = 10,
             enrolid = as.character(enrolid)) %>%
      collect()
    if (nrow(outpatient_9) > 0 & nrow(outpatient_10) > 0) {
      outpatient <- bind_rows(outpatient_9, outpatient_10)
    } else if (nrow(outpatient_10) > 0) {
      outpatient <- outpatient_10
    } else {
      outpatient <- outpatient_9
    }
  }

  outpatient <- outpatient %>%
    distinct()
  DBI::dbDisconnect(db)
  return(outpatient)
}

find_inpatient_dx_by_enrolid <- function(source, year, required_enrolid) {
  db <- DBI::dbConnect(RSQLite::SQLite(),
                       glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db"))
  inpatient_core <- tbl(db, glue::glue("inpatient_core_{source}_{year}")) %>%
    filter(enrolid %in% required_enrolid) %>%
    mutate(enrolid = as.character(enrolid)) %>%
    select(caseid, enrolid, admdate, los) %>%
    collect() %>%
    mutate(source = source, year = year)
  if (as.numeric(year) <= 14) {
    inpatient_dx <- tbl(db, glue::glue("inpatient_dx_{source}_{year}")) %>%
      filter(caseid %in% local(inpatient_core$caseid)) %>%
      select(caseid, dx) %>%
      collect() %>%
      mutate(icd_version = 9)
  } else {
    inpatient_9 <- tbl(db, glue::glue("inpatient_dx9_{source}_{year}")) %>%
      filter(caseid %in% local(inpatient_core$caseid)) %>%
      select(caseid, dx) %>%
      collect() %>%
      mutate(icd_version = 9)
    inpatient_10 <- tbl(db, glue::glue("inpatient_dx10_{source}_{year}")) %>%
      filter(caseid %in% local(inpatient_core$caseid)) %>%
      select(caseid, dx) %>%
      collect() %>%
      mutate(icd_version = 10)
    if (nrow(inpatient_9) > 0 & nrow(inpatient_10) > 0) {
      inpatient_dx <- bind_rows(inpatient_9, inpatient_10)
    } else if (nrow(inpatient_10) > 0) {
      inpatient_dx <- inpatient_10
    } else {
      inpatient_dx <- inpatient_9
    }
  }
  
  inpatient_dx <- inpatient_dx %>%
    inner_join(inpatient_core, by = "caseid") %>%
    select(caseid, enrolid, admdate, icd_version, dx)

  DBI::dbDisconnect(db)
  return(inpatient_dx)
}

find_facility_dx_by_enrolid <- function(source, year, required_enrolid) {
  facility_db <- DBI::dbConnect(RSQLite::SQLite(), 
                                glue::glue("/Shared/Statepi_Marketscan/databases/Truven/facilities_dbs/facilities_{year}.db"))

  events <- tbl(facility_db, glue::glue("facility_dx_{source}_{year}")) |>
    filter(enrolid %in% required_enrolid) |>
    select(enrolid, date = svcdate, dx, icd_version = dx_ver) |>
    mutate(enrolid = as.character(enrolid)) |>
    collect() |>
    distinct() |>
    mutate(icd_version = ifelse(icd_version == 0, "10", "9"))

  return(events)
}

Which, as in build_cohort.qmd needs a wrapper for functional application:

find_events <- function(args, required_enrolid) {
  source <- args[[1]]
  year <- args[[2]]
  table <- args[[3]]

  if (table == "inpatient") {
    events <- find_inpatient_dx_by_enrolid(source, year, required_enrolid)
  } else if (table == "outpatient") {
    events <- find_outpatient_dx_by_enrolid(source, year, required_enrolid)
  } else if (table == "facility") {
    events <- find_facility_dx_by_enrolid(source, year, required_enrolid)
  }

  return(events)
}

We want to pull data from 2001 to 2021. Note that the facilities table does not exist for 2001:

conditions <- vector("list", length = 2 * 21 * 3 - 2)
i <- 1
for (source in c("ccae", "mdcr")) {
  for (year in stringr::str_pad(1:21, width = 2, pad = "0")) {
    for (table in c("inpatient", "outpatient", "facility")) {
      if (year != "01" | table != "facility") {
        conditions[[i]] <- c(source, year, table)
        i <- i + 1
      }
    }
  }
}

Next, start the cluster and load packages and functions:

cluster <- makeCluster(56)
clusterEvalQ(cluster, library(tidyverse))
[[1]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[2]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[3]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[4]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[5]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[6]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[7]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[8]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[9]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[10]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[11]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[12]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[13]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[14]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[15]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[16]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[17]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[18]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[19]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[20]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[21]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[22]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[23]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[24]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[25]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[26]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[27]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[28]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[29]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[30]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[31]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[32]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[33]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[34]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[35]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[36]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[37]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[38]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[39]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[40]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[41]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[42]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[43]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[44]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[45]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[46]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[47]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[48]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[49]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[50]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[51]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[52]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[53]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[54]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[55]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     

[[56]]
 [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
 [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
[13] "grDevices" "utils"     "datasets"  "methods"   "base"     
clusterExport(cluster, c("find_inpatient_dx_by_enrolid",
                         "find_outpatient_dx_by_enrolid", 
                         "find_facility_dx_by_enrolid"))

And pull back the diagnoses:

all_dx <- parLapplyLB(cluster,
                      conditions,
                      find_events,
                      required_enrolid = cohort$enrolid)

# write_rds(all_dx, "/Shared/lss_jsimmeri/als/all_dx.rds")
# all_dx <- read_rds("/Shared/lss_jsimmeri/als/all_dx.rds")

Then clean and unnest:

clean_inpatient <- function(data) {
  data <- data |>
    select(enrolid, date = admdate, icd_version, dx)
  return(data)
}

clean_outpatient <- function(data) {
  data <- data |>
    select(enrolid, date = svcdate, icd_version, dx)
  return(data)
}

clean_switch <- function(data, setting) {
  if (setting == "inpatient") {
    data <- clean_inpatient(data)
  }
  if (setting == "outpatient") {
    data <- clean_outpatient(data)
  }
  if (setting == "facility") {
    data <- data |>
      mutate(icd_version = as.numeric(icd_version))
  }
  return(data)
}

dx_tbl <- conditions |>
  lapply(function(x) {as_tibble(x); names(x) <- c("source", "year", "setting"); return(x)}) |>
  bind_rows() |>
  mutate(
    data = all_dx
  ) |>
  mutate(
    data = map2(data, setting, clean_switch)
  ) |>
  unnest(cols = data)

And then reduce to diagnoses that occurred before the ALS diagnosis date for cases and keep all for controls:

cohort <- left_join(cohort, als_dates, by = c("enrolid"))

dx_tbl <- dx_tbl |>
  inner_join(cohort, by = "enrolid") |>
  filter(is.na(als_date) | als_date > date)

Then, reduce to unique DX by setting and date:

unique_by_setting_date <- dx_tbl |>
  select(enrolid, setting, date, icd_version, dx) |>
  distinct()

And also the set of unique diagnoses recorded during the period:

unique_by_enrolid <- dx_tbl |>
  group_by(enrolid, icd_version, dx) |>
  summarize(first_dx_date = min(date),
            .groups = "drop")

Calculate Comorbidity Flags

dx_9 <- unique_by_enrolid |>
  filter(icd_version == 9)
dx_10 <- unique_by_enrolid |>
  filter(icd_version == 10)

elix_9 <- comorbid(dx_9, 
                   icd9_map_ahrq,
                   icd_name = "dx",
                   visit_name = "enrolid",
                   return_df = TRUE) |>
                   as_tibble() 

elix_10 <- comorbid(dx_10, 
                   icd10_map_ahrq,
                   icd_name = "dx",
                   visit_name = "enrolid",
                   return_df = TRUE) |>
                   as_tibble() 

elix <- bind_rows(elix_9, elix_10) |>
  pivot_longer(-enrolid, names_to = "cm", values_to = "flag") |>
  filter(flag == TRUE) |>
  distinct() |>
  mutate(cm = glue::glue("elix_{cm}")) |>
  pivot_wider(names_from = "cm", values_from = "flag", values_fill = FALSE)

Find the Volume of Care Consumed

For the outpatient data, we reduce this to patient-days with >= 1 diagnosis since a single visit can generate many claims. So count those:

outpatient_volume <- unique_by_setting_date |>
  filter(setting == "outpatient") |>
  group_by(enrolid, date) |>
  summarize(
    n_dx = n(),
    .groups = "drop"
  ) |>
  group_by(enrolid) |>
  summarize(
    n_outpatient_visits = n(),
    mean_dx_per_outpatient_visit = mean(n_dx)
  )

For the inpatient volume (number of stays, LOS) we need to pull the data again:

find_inpatient_volume_by_enrolid <- function(args, required_enrolid) {
  source <- args[[1]]
  year <- args[[2]]
  db <- DBI::dbConnect(RSQLite::SQLite(),
                       glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db"))
  inpatient_core <- tbl(db, glue::glue("inpatient_core_{source}_{year}")) %>%
    filter(enrolid %in% required_enrolid) %>%
    mutate(enrolid = as.character(enrolid)) %>%
    select(caseid, enrolid, date = admdate, los) %>%
    collect() %>%
    mutate(source = source, year = year)

  DBI::dbDisconnect(db)
  return(inpatient_core)
}

Which we obviously only need to apply to the inpatient tables:

conditions_inpatient_only <- vector("list", length = 2 * 20)
i <- 1
for (source in c("ccae", "mdcr")) {
  for (year in stringr::str_pad(1:21, width = 2, pad = "0")) {
    conditions_inpatient_only[[i]] <- c(source, year)
    i <- i + 1
  }
}

And pull back the volume using the cluster cluster:

inpatient_volume <- parLapplyLB(cluster,
                                conditions_inpatient_only,
                                find_inpatient_volume_by_enrolid,
                                required_enrolid = cohort$enrolid)

inpatient_volume <- bind_rows(inpatient_volume)

Reduce and summarize:

inpatient_volume <- inpatient_volume |>
  inner_join(cohort, by = "enrolid") |>
  filter(is.na(als_date) | als_date > date) |>
  group_by(enrolid) |>
  summarize(
    n_hospital_admissions = n(),
    n_hospital_days = sum(los)
  )

Find Exposures

Largely based on:

https://bmcneurol.biomedcentral.com/articles/10.1186/1471-2377-13-160

bulbar_symptoms <- tibble(
  classification = c("speech", "swallowing"),
  icd_9 = list(
    c("31539", "4381", "7843", "7844", "7845", "7872", "43882"),
    c("5277", "5278", "7841")
  )
) |> 
  mutate(icd_9 = map(icd_9, children)) |>
  unnest(icd_9) |>
  mutate(group = "bulbar") |>
  filter(is_billable(icd_9))

motor_symptoms <- tibble(
  classification = c("strength", "gait", "involuntary_movement", "pain", 
                     "other", "falls"),
  icd_9 = list(
    c("7282", "72887"),
    c("7197", "7812", "7813"),
    c("72982", "72885", "7810"),
    c("7194", "7295"),
    c("728"),
    c("E8889", "V1588")

  )
) |>
  mutate(icd_9 = map(icd_9, children)) |>
  unnest(icd_9) |>
  mutate(group = "motor") |>
  filter(is_billable(icd_9))

And we need to crosswalk these using the CMS mappings. Setting the relationship to “many-to-many” due to the not 1:1 mapping between ICD-9 and ICD-10. This really just suppresses a warning from the join and doesn’t change how the join works.

mapping_9_to_10 <- read_csv("https://data.nber.org/gem/icd9toicd10cmgem.csv") |>
  rename(icd_9 = icd9cm, icd_10 = icd10cm) |>
  select(icd_9, icd_10)
Rows: 23912 Columns: 8
-- Column specification --------------------------------------------------------
Delimiter: ","
chr (3): icd9cm, icd10cm, flags
dbl (5): approximate, no_map, combination, scenario, choice_list

i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
bulbar_symptoms <- bulbar_symptoms |>
  mutate(icd_9 = as.character(icd_9)) |>
  left_join(
    mapping_9_to_10,
    by = "icd_9",    
    relationship = "many-to-many"
  )

motor_symptoms <- motor_symptoms |>
  mutate(icd_9 = as.character(icd_9)) |>
  left_join(
    mapping_9_to_10,
    by = "icd_9",
    relationship = "many-to-many"
  )

So our final set of bulbar symptoms are:

bulbar_symptoms |>
  select(group, classification, icd_9, icd_10) |>
  pivot_longer(cols = c(icd_9, icd_10), values_to = "code", names_to = "version") |>
  arrange(group, classification, version, code) |>
  group_by(version) |>
  nest() |>
  mutate(
    data = map(data, \(x) x |> mutate(label = explain_table(code)$long_desc))
  ) |>
  unnest(data)
# A tibble: 64 x 5
# Groups:   version [2]
   version group  classification code   label                                   
   <chr>   <chr>  <chr>          <chr>  <chr>                                   
 1 icd_10  bulbar speech         F800   Phonological disorder                   
 2 icd_10  bulbar speech         F8089  Other developmental disorders of speech~
 3 icd_10  bulbar speech         I69920 Aphasia following unspecified cerebrova~
 4 icd_10  bulbar speech         I69921 Dysphasia following unspecified cerebro~
 5 icd_10  bulbar speech         I69922 Dysarthria following unspecified cerebr~
 6 icd_10  bulbar speech         I69923 Fluency disorder following unspecified ~
 7 icd_10  bulbar speech         I69928 Other speech and language deficits foll~
 8 icd_10  bulbar speech         I69928 Other speech and language deficits foll~
 9 icd_10  bulbar speech         I69991 Dysphagia following unspecified cerebro~
10 icd_10  bulbar speech         R130   Aphagia                                 
# i 54 more rows

And limb or motor symptoms:

motor_symptoms |>
  select(group, classification, icd_9, icd_10) |>
  pivot_longer(cols = c(icd_9, icd_10), values_to = "code", names_to = "version") |>
  arrange(group, classification, version, code) |>
  group_by(version) |>
  nest() |>
  mutate(
    data = map(data, \(x) x |> mutate(label = explain_table(code)$long_desc))
  ) |>
  unnest(data)
# A tibble: 116 x 5
# Groups:   version [2]
   version group classification code    label                                   
   <chr>   <chr> <chr>          <chr>   <chr>                                   
 1 icd_10  motor falls          W19XXXA Unspecified fall, initial encounter     
 2 icd_10  motor falls          Z9181   History of falling                      
 3 icd_10  motor gait           R260    Ataxic gait                             
 4 icd_10  motor gait           R261    Paralytic gait                          
 5 icd_10  motor gait           R262    Difficulty in walking, not elsewhere cl~
 6 icd_10  motor gait           R2689   Other abnormalities of gait and mobility
 7 icd_10  motor gait           R269    Unspecified abnormalities of gait and m~
 8 icd_10  motor gait           R270    Ataxia, unspecified                     
 9 icd_10  motor gait           R278    Other lack of coordination              
10 icd_10  motor gait           R279    Unspecified lack of coordination        
# i 106 more rows
symptoms <- bind_rows(bulbar_symptoms, motor_symptoms) |>
  select(group, classification, icd_9, icd_10) |>
  pivot_longer(cols = c(icd_9, icd_10), values_to = "dx", names_to = "version") |>
  mutate(icd_version = ifelse(version == "icd_10", 10, 9)) |>
  select(group, classification, icd_version, dx)

And then use this table to annotate the unique_by_enrolid diagnoses:

exposures <- unique_by_enrolid |>
  inner_join(symptoms, 
             by = c("icd_version", "dx"),
             relationship = "many-to-many") |>
  group_by(enrolid, group, classification) |>
  summarize(first_dx_date = min(first_dx_date),
            .groups = "drop")

Find Urban/Rural

Urban is defined as having ever lived in an MSA. Query the detailed enrollment tables to find out:

check_msa <- function(args, required_enrolid) {
  source <- args[[1]]
  year <- args[[2]]
  db <- DBI::dbConnect(RSQLite::SQLite(),
                       glue::glue("/Shared/Statepi_Marketscan/databases/Truven/truven_{year}.db"))
  enrollment_msa <- tbl(db, glue::glue("enrollment_detail_{source}_{year}")) |>
    filter(enrolid %in% required_enrolid) |>
    select(enrolid, msa) |>
    mutate(enrolid = as.character(enrolid)) |>
    collect()
  enrollment_msa <- enrollment_msa |>
    mutate(lives_in_msa = !is.na(msa) & msa != 0 & msa != "") |>
    group_by(enrolid) |> 
    summarize(lives_in_msa = any(lives_in_msa))
  return(enrollment_msa)
}

And then apply:

resides_in_msa <- parLapplyLB(cluster,
                              conditions_inpatient_only,
                              check_msa,
                              required_enrolid = cohort$enrolid)

And then aggregate across years:

resides_in_msa <- resides_in_msa |> 
  bind_rows() |>
  group_by(enrolid) |>
  summarize(
    lives_in_msa = any(lives_in_msa)
  )

Assemble Model Data Set

In order to normalize volume, we need enrollment duration in days:

enrollment_db <- DBI::dbConnect(
  RSQLite::SQLite(), 
  "/Shared/Statepi_Marketscan/databases/Truven/enrollment_dbs/all_enroll_01_21q4.db")

enrollments <- tbl(enrollment_db, "all_enrollees") |>
  filter(enrolid %in% local(cohort$enrolid)) |>
  collect()

For people who have ALS, replace last date and calculate enrollment duration:

enrollment_duration <- enrollments |>
  mutate(enrolid = as.character(enrolid)) |>
  inner_join(cohort |> select(enrolid, als_date), by = "enrolid") |>
  mutate(
    end_date = ifelse(is.na(als_date), last_date,
                      ifelse(als_date < last_date, als_date, last_date))
  ) |>
  mutate(duration = end_date - first_date) |>
  select(enrolid, duration)

Then make normalized volume:

volume <- full_join(
  outpatient_volume |>
    inner_join(enrollment_duration, by = "enrolid") |>
    mutate(
      rate_outpatient_visits = n_outpatient_visits / duration * 365,
      intensity_outpatient_visits_dx = mean_dx_per_outpatient_visit
    ) |>
    select(enrolid, rate_outpatient_visits, intensity_outpatient_visits_dx),
  inpatient_volume |>
    inner_join(enrollment_duration, by = "enrolid") |>
    mutate(
      rate_inpatient_stays = n_hospital_admissions / duration * 365,
      intensity_inpatient_stays_los = n_hospital_days / n_hospital_admissions
    ) |>
    select(enrolid, rate_inpatient_stays, intensity_inpatient_stays_los),
  by = "enrolid"
)

And then add to the cohort data:

model_data <- cohort |>
  select(enrolid, als, first_year, last_year, dobyr, sex) |>
  left_join(volume, by = "enrolid") |>
  mutate(
    rate_outpatient_visits = replace_na(rate_outpatient_visits, 0),
    intensity_outpatient_visits_dx = replace_na(intensity_outpatient_visits_dx, 0),
    rate_inpatient_stays = replace_na(rate_inpatient_stays, 0),
    intensity_inpatient_stays_los = replace_na(intensity_inpatient_stays_los, 0)
  )

And the Elixhauser flags:

model_data <- model_data |>
  left_join(elix, by = "enrolid") |>
  mutate(
    across(
      starts_with("elix"),
      \(x) replace_na(x, FALSE)
    )
  )

Add the exposures. First, the major classes:

model_data <- model_data |>
  left_join(
    exposures |> 
      select(enrolid, group) |>
      distinct() |>
      mutate(value = TRUE) |>
      pivot_wider(names_from = group, values_from = value),
    by = "enrolid")

And then the sub-classifications:

model_data <- model_data |>
  left_join(
    exposures |> 
      select(enrolid, classification) |>
      distinct() |>
      mutate(value = TRUE) |>
      pivot_wider(names_from = classification, values_from = value),
    by = "enrolid")

And then add the exposure dates

first_exposure_date <- exposures |>
  group_by(enrolid) |>
  summarize(first_symptom_date = min(first_dx_date))

model_data <- model_data |>
  left_join(first_exposure_date, by = "enrolid")

And add end dates (ALS diagnosis or data exit, whichever comes first):

end_dates <- enrollments |>
  mutate(enrolid = as.character(enrolid)) |>
  left_join(als_dates, by = "enrolid") |>
  mutate(
    end_date = ifelse(is.na(als_date), last_date, als_date)
  ) |>
  select(enrolid, end_date)
model_data <- inner_join(model_data, end_dates, by = "enrolid")

model_data <- inner_join(
  model_data, 
  enrollments |> select(enrolid, first_date) |> mutate(enrolid = as.character(enrolid)), 
  by = "enrolid"
)

Whether or not they live in an MSA:

model_data <- model_data |>
  left_join(resides_in_msa, by = "enrolid")

And then addressing the implicit FALSE values:

model_data <- model_data |>
  mutate(
    motor = replace_na(motor, FALSE),
    bulbar = replace_na(bulbar, FALSE),
    speech = replace_na(speech, FALSE),
    swallowing = replace_na(swallowing, FALSE),
    strength = replace_na(strength, FALSE),
    gait = replace_na(gait, FALSE),
    involuntary_movement = replace_na(involuntary_movement, FALSE),
    pain = replace_na(pain, FALSE),
    other = replace_na(other, FALSE),
    falls = replace_na(falls, FALSE),
    lives_in_msa = replace_na(lives_in_msa, FALSE)
  )

Save Model Data

write_rds(model_data, "/Shared/lss_jsimmeri/als/model_data.rds")

Session Information

sessionInfo()
R version 4.1.3 (2022-03-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /cvmfs/argon.hpc.uiowa.edu/2022.1/apps/linux-centos7-x86_64/gcc-9.4.0/intel-oneapi-mkl-2022.0.2-s35g6hp/mkl/2022.0.2/lib/intel64/libmkl_gf_lp64.so.2

locale:
[1] C

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] icd_4.0.9.9000  lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0  
 [5] dplyr_1.1.2     purrr_1.0.1     readr_2.1.4     tidyr_1.3.0    
 [9] tibble_3.2.1    ggplot2_3.4.2   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.0  xfun_0.32         colorspace_2.0-3  vctrs_0.6.2      
 [5] generics_0.1.3    htmltools_0.5.3   yaml_2.3.5        utf8_1.2.3       
 [9] blob_1.2.3        rlang_1.1.0       pillar_1.9.0      glue_1.6.2       
[13] withr_2.5.0       DBI_1.1.3         bit64_4.0.5       dbplyr_2.3.3     
[17] lifecycle_1.0.3   munsell_0.5.0     gtable_0.3.1      htmlwidgets_1.5.4
[21] evaluate_0.16     memoise_2.0.1     knitr_1.40        tzdb_0.3.0       
[25] fastmap_1.1.0     curl_5.0.0        fansi_1.0.4       Rcpp_1.0.10      
[29] scales_1.2.1      cachem_1.0.6      vroom_1.6.3       jsonlite_1.8.4   
[33] bit_4.0.4         hms_1.1.2         digest_0.6.29     stringi_1.7.12   
[37] grid_4.1.3        cli_3.6.1         tools_4.1.3       magrittr_2.0.3   
[41] RSQLite_2.2.18    crayon_1.5.1      pkgconfig_2.0.3   ellipsis_0.3.2   
[45] timechange_0.1.1  rmarkdown_2.16    R6_2.5.1          compiler_4.1.3