vignettes/dictionary.Rmd
dictionary.Rmd
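---
title: "Downloading All Data Dictionaries"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Downloading All Data Dictionaries}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---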
# Attach nsrr, request the table of data sets, and preview its first rows.
library(nsrr)
df <- nsrr_datasets()
head(df, n = 6L)
#> name slug
#> 1 Apnea, Bariatric surgery, and CPAP study abc
#> 2 Best Apnea Interventions in Research bestair
#> 3 Childhood Adenotonsillectomy Trial chat
#> 4 Cleveland Children's Sleep and Health Study ccshs
#> 5 Cleveland Family Study cfs
#> 6 Cox & Fell (2020) Sleep Medicine Reviews coxfell2020
#> created_at updated_at
#> 1 2014-07-11T16:32:56.676-04:00 2020-04-09T09:37:04.026-04:00
#> 2 2013-10-31T11:46:58.631-04:00 2020-04-04T20:10:23.013-04:00
#> 3 2013-10-31T11:46:21.675-04:00 2020-04-04T20:10:22.986-04:00
#> 4 2015-09-02T12:16:35.426-04:00 2020-04-04T20:10:23.218-04:00
#> 5 2014-06-24T14:36:20.767-04:00 2020-04-04T20:10:23.083-04:00
#> 6 2020-12-14T13:48:47.711-05:00 2020-12-14T14:52:38.595-05:00
#> path files page
#> 1 /datasets/abc.json /datasets/abc/files.json 1
#> 2 /datasets/bestair.json /datasets/bestair/files.json 1
#> 3 /datasets/chat.json /datasets/chat/files.json 1
#> 4 /datasets/ccshs.json /datasets/ccshs/files.json 1
#> 5 /datasets/cfs.json /datasets/cfs/files.json 1
#> 6 /datasets/coxfell2020.json /datasets/coxfell2020/files.json 1
Here we loop over every data set and list the files found under its `datasets/`
path:
# Build a list, named by data set slug, holding the file listing that
# nsrr_dataset_files() returns for the "datasets" path of each data set.
datasets <- df$slug
L <- setNames(vector(mode = "list", length = length(datasets)), datasets)
for (dataset in datasets) {
  # Assign by name (not by position) so each result lands in its own slot.
  L[[dataset]] <- nsrr_dataset_files(dataset = dataset, path = "datasets")
  # Echo the slug as a simple progress indicator.
  print(dataset)
}
#> [1] "abc"
#> [1] "bestair"
#> [1] "chat"
#> [1] "ccshs"
#> [1] "cfs"
#> [1] "coxfell2020"
#> [1] "heartbeat"
#> [1] "hchs"
#> [1] "homepap"
#> [1] "haassa"
#> [1] "learn"
#> [1] "mnc"
#> [1] "mros"
#> [1] "mesa"
#> [1] "nchsdb"
#> [1] "numom2b"
#> [1] "oya"
#> [1] "shhs"
#> [1] "sof"
#> [1] "wsc"
# Preview the first rows of the first data set's file listing.
head(L[[1]], n = 6L)
#> dataset full_path folder
#> 1 abc datasets/archive datasets/
#> 2 abc datasets/abc-baseline-dataset-0.3.0.csv datasets/
#> 3 abc datasets/abc-data-dictionary-0.3.0-domains.csv datasets/
#> 4 abc datasets/abc-data-dictionary-0.3.0-forms.csv datasets/
#> 5 abc datasets/abc-data-dictionary-0.3.0-variables.csv datasets/
#> 6 abc datasets/abc-month09-dataset-0.3.0.csv datasets/
#> file_name is_file file_size
#> 1 archive FALSE 0
#> 2 abc-baseline-dataset-0.3.0.csv TRUE 16229
#> 3 abc-data-dictionary-0.3.0-domains.csv TRUE 684
#> 4 abc-data-dictionary-0.3.0-forms.csv TRUE 33
#> 5 abc-data-dictionary-0.3.0-variables.csv TRUE 20553
#> 6 abc-month09-dataset-0.3.0.csv TRUE 13922
#> file_checksum_md5 archived
#> 1 <NA> FALSE
#> 2 35a5cc9bfad4fc03cece0638e447d8e5 FALSE
#> 3 f66f6a8b45f6f16b909af014c7a863a2 FALSE
#> 4 d8f70b400480b548e21038a737d12691 FALSE
#> 5 03f88fb3dac38faff83bb79121e752b2 FALSE
#> 6 e3c976d638ba74867e2e20a87a46cc05 FALSE
Let's keep only the files we are looking for, namely the data dictionaries:
# Keep only the rows of one file listing whose file name mentions
# "dictionary" (case-insensitive); an empty listing yields NULL.
keep_dictionaries <- function(listing) {
  if (length(listing) == 0) {
    return(NULL)
  }
  is_dictionary <- grepl("dictionary", tolower(listing$file_name))
  listing[is_dictionary, ]
}

# Filter every listing, stack the survivors into one data.frame, and drop the
# row names that rbind() derives from the list names.
files <- do.call("rbind", lapply(L, keep_dictionaries))
rownames(files) <- NULL
head(files)
#> dataset full_path folder
#> 1 abc datasets/abc-data-dictionary-0.3.0-domains.csv datasets/
#> 2 abc datasets/abc-data-dictionary-0.3.0-forms.csv datasets/
#> 3 abc datasets/abc-data-dictionary-0.3.0-variables.csv datasets/
#> 4 bestair datasets/bestair-data-dictionary-0.5.0-domains.csv datasets/
#> 5 bestair datasets/bestair-data-dictionary-0.5.0-forms.csv datasets/
#> 6 bestair datasets/bestair-data-dictionary-0.5.0-variables.csv datasets/
#> file_name is_file file_size
#> 1 abc-data-dictionary-0.3.0-domains.csv TRUE 684
#> 2 abc-data-dictionary-0.3.0-forms.csv TRUE 33
#> 3 abc-data-dictionary-0.3.0-variables.csv TRUE 20553
#> 4 bestair-data-dictionary-0.5.0-domains.csv TRUE 3118
#> 5 bestair-data-dictionary-0.5.0-forms.csv TRUE 1214
#> 6 bestair-data-dictionary-0.5.0-variables.csv TRUE 30816
#> file_checksum_md5 archived
#> 1 f66f6a8b45f6f16b909af014c7a863a2 FALSE
#> 2 d8f70b400480b548e21038a737d12691 FALSE
#> 3 03f88fb3dac38faff83bb79121e752b2 FALSE
#> 4 ffda209b3cee629b0fa90f8583a3f1c0 FALSE
#> 5 4cb7a606ce707a009a7984070f214dd7 FALSE
#> 6 08d942218c9c38f53c8902313dce7072 FALSE
# From the dictionary files, keep those whose name mentions "variables"
# (case-insensitive), then preview the result.
is_variables_file <- grepl("variables", tolower(files$file_name))
vars <- files[is_variables_file, ]
head(vars)
#> dataset full_path folder
#> 3 abc datasets/abc-data-dictionary-0.3.0-variables.csv datasets/
#> 6 bestair datasets/bestair-data-dictionary-0.5.0-variables.csv datasets/
#> 9 chat datasets/chat-data-dictionary-0.11.0-variables.csv datasets/
#> 12 ccshs datasets/ccshs-data-dictionary-0.6.0-variables.csv datasets/
#> 15 cfs datasets/cfs-data-dictionary-0.5.0-variables.csv datasets/
#> 18 heartbeat datasets/heartbeat-data-dictionary-0.4.0-variables.csv datasets/
#> file_name is_file file_size
#> 3 abc-data-dictionary-0.3.0-variables.csv TRUE 20553
#> 6 bestair-data-dictionary-0.5.0-variables.csv TRUE 30816
#> 9 chat-data-dictionary-0.11.0-variables.csv TRUE 538915
#> 12 ccshs-data-dictionary-0.6.0-variables.csv TRUE 46703
#> 15 cfs-data-dictionary-0.5.0-variables.csv TRUE 420037
#> 18 heartbeat-data-dictionary-0.4.0-variables.csv TRUE 141127
#> file_checksum_md5 archived
#> 3 03f88fb3dac38faff83bb79121e752b2 FALSE
#> 6 08d942218c9c38f53c8902313dce7072 FALSE
#> 9 033ab86c850af00f86faa53b2b010ae4 FALSE
#> 12 8d4c0702431f15f8273218841da0b239 FALSE
#> 15 f1ab4073e340e6f38901adeae9b6a824 FALSE
#> 18 ff3921f090f9b4d71dc60d326508832d FALSE
Here we will download one of the data dictionaries:
# Download the dictionary described by row `i` of `vars` and read it into
# `var_df`. The commented-out loop header (closed by the matching "# }"
# further down) shows how to repeat this for every row.
i <- 3
# for (i in seq_len(nrow(vars))) {
dict_row <- vars[i, ]
download <- nsrr::nsrr_download_file(
  dataset = dict_row$dataset,
  path = dict_row$full_path,
  check_md5 = FALSE
)
csv_path <- download$outfile
# Use readr when it is installed; otherwise fall back to utils::read.csv().
has_readr <- requireNamespace("readr", quietly = TRUE)
var_df <- if (has_readr) {
  readr::read_csv(csv_path)
} else {
  utils::read.csv(csv_path, as.is = TRUE)
}
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> folder = col_character(),
#> id = col_character(),
#> display_name = col_character(),
#> description = col_character(),
#> type = col_character(),
#> units = col_character(),
#> domain = col_character(),
#> labels = col_character(),
#> calculation = col_character(),
#> commonly_used = col_logical(),
#> forms = col_character()
#> )
# Show the first rows of the dictionary that was just read.
print(head(var_df, n = 6L))
#> # A tibble: 6 x 11
#> folder id display_name description type units domain labels calculation
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Admini… chang… Change in sy… <NA> choi… <NA> sympt… <NA> <NA>
#> 2 Admini… compl… Participant … <NA> choi… <NA> noyes <NA> <NA>
#> 3 Admini… cross… Did this chi… <NA> choi… <NA> noyes <NA> <NA>
#> 4 Admini… follo… Patient rece… <NA> choi… <NA> follo… <NA> <NA>
#> 5 Admini… had_s… Child had su… <NA> choi… <NA> noyes tonsi… <NA>
#> 6 Admini… itt Completed st… <NA> choi… <NA> noyes <NA> if ran7=1 …
#> # … with 2 more variables: commonly_used <lgl>, forms <chr>
# }
To get the variables from all data sets, run the outer loop, store each `var_df`
in a list, and bind the resulting `data.frame`s together as was done above.