---
title: "Downloading All Data Dictionaries"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Downloading All Data Dictionaries}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

First, get all of the data set slugs:

``` r
library(nsrr)
df = nsrr_datasets()
head(df)
#>                                          name        slug
#> 1    Apnea, Bariatric surgery, and CPAP study         abc
#> 2        Best Apnea Interventions in Research     bestair
#> 3          Childhood Adenotonsillectomy Trial        chat
#> 4 Cleveland Children's Sleep and Health Study       ccshs
#> 5                      Cleveland Family Study         cfs
#> 6    Cox & Fell (2020) Sleep Medicine Reviews coxfell2020
#>                      created_at                    updated_at
#> 1 2014-07-11T16:32:56.676-04:00 2020-04-09T09:37:04.026-04:00
#> 2 2013-10-31T11:46:58.631-04:00 2020-04-04T20:10:23.013-04:00
#> 3 2013-10-31T11:46:21.675-04:00 2020-04-04T20:10:22.986-04:00
#> 4 2015-09-02T12:16:35.426-04:00 2020-04-04T20:10:23.218-04:00
#> 5 2014-06-24T14:36:20.767-04:00 2020-04-04T20:10:23.083-04:00
#> 6 2020-12-14T13:48:47.711-05:00 2020-12-14T14:52:38.595-05:00
#>                         path                            files page
#> 1         /datasets/abc.json         /datasets/abc/files.json    1
#> 2     /datasets/bestair.json     /datasets/bestair/files.json    1
#> 3        /datasets/chat.json        /datasets/chat/files.json    1
#> 4       /datasets/ccshs.json       /datasets/ccshs/files.json    1
#> 5         /datasets/cfs.json         /datasets/cfs/files.json    1
#> 6 /datasets/coxfell2020.json /datasets/coxfell2020/files.json    1
```

Here we will loop through each data set to list the files under its `datasets/` path:

``` r
datasets = df$slug
L = vector(mode = "list", length = length(datasets))
names(L) = datasets
for (dataset in datasets) {
  a = nsrr_dataset_files(dataset = dataset, path = "datasets")
  L[[dataset]] = a
  print(dataset)
}
#> [1] "abc"
#> [1] "bestair"
#> [1] "chat"
#> [1] "ccshs"
#> [1] "cfs"
#> [1] "coxfell2020"
#> [1] "heartbeat"
#> [1] "hchs"
#> [1] "homepap"
#> [1] "haassa"
#> [1] "learn"
#> [1] "mnc"
#> [1] "mros"
#> [1] "mesa"
#> [1] "nchsdb"
#> [1] "numom2b"
#> [1] "oya"
#> [1] "shhs"
#> [1] "sof"
#> [1] "wsc"
head(L[[1]])
#>   dataset                                        full_path    folder
#> 1     abc                                 datasets/archive datasets/
#> 2     abc          datasets/abc-baseline-dataset-0.3.0.csv datasets/
#> 3     abc   datasets/abc-data-dictionary-0.3.0-domains.csv datasets/
#> 4     abc     datasets/abc-data-dictionary-0.3.0-forms.csv datasets/
#> 5     abc datasets/abc-data-dictionary-0.3.0-variables.csv datasets/
#> 6     abc           datasets/abc-month09-dataset-0.3.0.csv datasets/
#>                                 file_name is_file file_size
#> 1                                 archive   FALSE         0
#> 2          abc-baseline-dataset-0.3.0.csv    TRUE     16229
#> 3   abc-data-dictionary-0.3.0-domains.csv    TRUE       684
#> 4     abc-data-dictionary-0.3.0-forms.csv    TRUE        33
#> 5 abc-data-dictionary-0.3.0-variables.csv    TRUE     20553
#> 6           abc-month09-dataset-0.3.0.csv    TRUE     13922
#>                  file_checksum_md5 archived
#> 1                             <NA>    FALSE
#> 2 35a5cc9bfad4fc03cece0638e447d8e5    FALSE
#> 3 f66f6a8b45f6f16b909af014c7a863a2    FALSE
#> 4 d8f70b400480b548e21038a737d12691    FALSE
#> 5 03f88fb3dac38faff83bb79121e752b2    FALSE
#> 6 e3c976d638ba74867e2e20a87a46cc05    FALSE
```
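
Not every query is guaranteed to return files, which is why the next step guards against empty results. A quick base-R check of how many files were listed for each data set:

``` r
# Number of files listed for each data set (0 if nothing came back)
sapply(L, function(x) if (length(x) == 0) 0L else nrow(x))
```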

Let’s subset the data to just the files we want (the data dictionaries):

``` r
files = lapply(L, function(x) {
  if (length(x) == 0) {
    return(NULL)
  }
  x[ grepl("dictionary", tolower(x$file_name)),]
})
files = do.call("rbind", files)
rownames(files) = NULL
head(files)
#>   dataset                                            full_path    folder
#> 1     abc       datasets/abc-data-dictionary-0.3.0-domains.csv datasets/
#> 2     abc         datasets/abc-data-dictionary-0.3.0-forms.csv datasets/
#> 3     abc     datasets/abc-data-dictionary-0.3.0-variables.csv datasets/
#> 4 bestair   datasets/bestair-data-dictionary-0.5.0-domains.csv datasets/
#> 5 bestair     datasets/bestair-data-dictionary-0.5.0-forms.csv datasets/
#> 6 bestair datasets/bestair-data-dictionary-0.5.0-variables.csv datasets/
#>                                     file_name is_file file_size
#> 1       abc-data-dictionary-0.3.0-domains.csv    TRUE       684
#> 2         abc-data-dictionary-0.3.0-forms.csv    TRUE        33
#> 3     abc-data-dictionary-0.3.0-variables.csv    TRUE     20553
#> 4   bestair-data-dictionary-0.5.0-domains.csv    TRUE      3118
#> 5     bestair-data-dictionary-0.5.0-forms.csv    TRUE      1214
#> 6 bestair-data-dictionary-0.5.0-variables.csv    TRUE     30816
#>                  file_checksum_md5 archived
#> 1 f66f6a8b45f6f16b909af014c7a863a2    FALSE
#> 2 d8f70b400480b548e21038a737d12691    FALSE
#> 3 03f88fb3dac38faff83bb79121e752b2    FALSE
#> 4 ffda209b3cee629b0fa90f8583a3f1c0    FALSE
#> 5 4cb7a606ce707a009a7984070f214dd7    FALSE
#> 6 08d942218c9c38f53c8902313dce7072    FALSE
vars = files[ grepl("variables", tolower(files$file_name)),]
head(vars)
#>      dataset                                              full_path    folder
#> 3        abc       datasets/abc-data-dictionary-0.3.0-variables.csv datasets/
#> 6    bestair   datasets/bestair-data-dictionary-0.5.0-variables.csv datasets/
#> 9       chat     datasets/chat-data-dictionary-0.11.0-variables.csv datasets/
#> 12     ccshs     datasets/ccshs-data-dictionary-0.6.0-variables.csv datasets/
#> 15       cfs       datasets/cfs-data-dictionary-0.5.0-variables.csv datasets/
#> 18 heartbeat datasets/heartbeat-data-dictionary-0.4.0-variables.csv datasets/
#>                                        file_name is_file file_size
#> 3        abc-data-dictionary-0.3.0-variables.csv    TRUE     20553
#> 6    bestair-data-dictionary-0.5.0-variables.csv    TRUE     30816
#> 9      chat-data-dictionary-0.11.0-variables.csv    TRUE    538915
#> 12     ccshs-data-dictionary-0.6.0-variables.csv    TRUE     46703
#> 15       cfs-data-dictionary-0.5.0-variables.csv    TRUE    420037
#> 18 heartbeat-data-dictionary-0.4.0-variables.csv    TRUE    141127
#>                   file_checksum_md5 archived
#> 3  03f88fb3dac38faff83bb79121e752b2    FALSE
#> 6  08d942218c9c38f53c8902313dce7072    FALSE
#> 9  033ab86c850af00f86faa53b2b010ae4    FALSE
#> 12 8d4c0702431f15f8273218841da0b239    FALSE
#> 15 f1ab4073e340e6f38901adeae9b6a824    FALSE
#> 18 ff3921f090f9b4d71dc60d326508832d    FALSE
```
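
Before downloading anything, it can be useful to gauge how much data is involved. Assuming the `file_size` column is reported in bytes, a minimal sketch:

``` r
# Approximate total size of the variables dictionaries, in megabytes
sum(vars$file_size) / 1e6
```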

Here we will download one of the data dictionaries:

``` r
i = 3
# for (i in seq(nrow(vars))) {
  idf = vars[i,]
  out = nsrr::nsrr_download_file(
    dataset = idf$dataset, 
    path = idf$full_path,
    check_md5 = FALSE
  )
  if (requireNamespace("readr", quietly = TRUE)) {
    var_df = readr::read_csv(out$outfile)
  } else {
    var_df = utils::read.csv(out$outfile, as.is = TRUE)
  }
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   folder = col_character(),
#>   id = col_character(),
#>   display_name = col_character(),
#>   description = col_character(),
#>   type = col_character(),
#>   units = col_character(),
#>   domain = col_character(),
#>   labels = col_character(),
#>   calculation = col_character(),
#>   commonly_used = col_logical(),
#>   forms = col_character()
#> )
  print(head(var_df))
#> # A tibble: 6 x 11
#>   folder  id     display_name  description type  units domain labels calculation
#>   <chr>   <chr>  <chr>         <chr>       <chr> <chr> <chr>  <chr>  <chr>      
#> 1 Admini… chang… Change in sy… <NA>        choi… <NA>  sympt… <NA>   <NA>       
#> 2 Admini… compl… Participant … <NA>        choi… <NA>  noyes  <NA>   <NA>       
#> 3 Admini… cross… Did this chi… <NA>        choi… <NA>  noyes  <NA>   <NA>       
#> 4 Admini… follo… Patient rece… <NA>        choi… <NA>  follo… <NA>   <NA>       
#> 5 Admini… had_s… Child had su… <NA>        choi… <NA>  noyes  tonsi… <NA>       
#> 6 Admini… itt    Completed st… <NA>        choi… <NA>  noyes  <NA>   if ran7=1 …
#> # … with 2 more variables: commonly_used <lgl>, forms <chr>
# }
```

To get the variables from all of the data sets, run the outer loop above (uncomment the `for` statement) and bind the results together, using the same list-of-data.frames approach as before.
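
A sketch of that loop is below, assuming the `readr` and `dplyr` packages are available (the names `dict_list` and `all_vars` are just placeholders). Reading every column as character avoids type clashes when dictionaries from different data sets are combined.

``` r
# Download each "variables" data dictionary, tag it with its data set
# slug, and bind everything into one data.frame.
dict_list = vector(mode = "list", length = nrow(vars))
names(dict_list) = vars$dataset
for (i in seq_len(nrow(vars))) {
  idf = vars[i, ]
  out = nsrr::nsrr_download_file(
    dataset = idf$dataset,
    path = idf$full_path,
    check_md5 = FALSE
  )
  # Read all columns as character so differing column types across
  # data sets do not prevent binding.
  var_df = readr::read_csv(
    out$outfile,
    col_types = readr::cols(.default = readr::col_character())
  )
  var_df$dataset = idf$dataset
  dict_list[[i]] = var_df
}
# bind_rows() fills in columns that are missing from some dictionaries.
all_vars = dplyr::bind_rows(dict_list)
```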