Title: | Import Data from EDC Software |
---|---|
Description: | A convenient toolbox to import data exported from Electronic Data Capture (EDC) software 'TrialMaster'. |
Authors: | Dan Chaltiel [aut, cre] |
Maintainer: | Dan Chaltiel <[email protected]> |
License: | GPL-3 |
Version: | 0.5.2.9000 |
Built: | 2024-11-14 23:23:04 UTC |
Source: | https://github.com/DanChaltiel/EDCimport |
Check that there is no duplicate on the column holding patient ID in a pipeable style.
Mostly useful after joining two datasets.
assert_no_duplicate(df, by = NULL, id_col = get_subjid_cols())
assert_no_duplicate(df, by = NULL, id_col = get_subjid_cols())
df |
a dataframe |
by |
(optional) grouping columns |
id_col |
the name of the columns holding patient ID |
the df
dataset, unchanged
## Not run: #without duplicate => no error, continue the pipeline tibble(subjid=c(1:10)) %>% assert_no_duplicate() %>% nrow() #with duplicate => throws an error tibble(subjid=c(1:10, 1:2)) %>% assert_no_duplicate() %>% nrow() #By groups df = tibble(subjid=rep(1:10, 4), visit=rep(c("V1", "V2"), 2, each=10), group=rep(c("A", "B"), each=20)) df %>% assert_no_duplicate() #error df %>% assert_no_duplicate(by=c(visit, group)) #no error ## End(Not run)
## Not run: #without duplicate => no error, continue the pipeline tibble(subjid=c(1:10)) %>% assert_no_duplicate() %>% nrow() #with duplicate => throws an error tibble(subjid=c(1:10, 1:2)) %>% assert_no_duplicate() %>% nrow() #By groups df = tibble(subjid=rep(1:10, 4), visit=rep(c("V1", "V2"), 2, each=10), group=rep(c("A", "B"), each=20)) df %>% assert_no_duplicate() #error df %>% assert_no_duplicate(by=c(visit, group)) #no error ## End(Not run)
Generate a lookup table
build_lookup(data_list)
build_lookup(data_list)
data_list |
a list containing at least 1 dataframe |
a dataframe summarizing column names and labels
x = edc_example() x$.lookup=NULL lk = build_lookup(x) lk lk %>% tidyr::unnest(c(names, labels))
x = edc_example() x$.lookup=NULL lk = build_lookup(x) lk lk %>% tidyr::unnest(c(names, labels))
Generate a barplot showing the distribution of CRF status (Complete, Incomplete, ...) for each dataset of the database.
crf_status_plot( crfstat_col = "CRFSTAT", ..., details = FALSE, pal = edc_pal_crf(), crfstat_lvls = names(pal), x_label = "{dataset}", treat_as_worst = NULL ) edc_pal_crf()
crf_status_plot( crfstat_col = "CRFSTAT", ..., details = FALSE, pal = edc_pal_crf(), crfstat_lvls = names(pal), x_label = "{dataset}", treat_as_worst = NULL ) edc_pal_crf()
crfstat_col |
the column name of the CRF status |
... |
unused |
details |
whether to show all the CRF status levels. When |
pal |
the palette, defaulting to the helper |
crfstat_lvls |
the CRF status levels, from "best" to "worst". The plot is ordered by the "worst" level. |
x_label |
a glue pattern determining the tick label in the x axis. Available variables are |
treat_as_worst |
a regex for levels that should be treated as worst in the ordering |
a ggplot
ggsci:::ggsci_db$lancet[["lanonc"]] %>% dput()
## Not run: #import a TM database and use load_list(), then: crf_status_plot() + ggtitle(date_extraction) crf_status_plot(pal=rev(edc_pal_crf())) crf_status_plot(details=TRUE, treat_as_worst="No Data") crf_status_plot(x_label="{crfname} (N={n_id}, n={nrow})") p = crf_status_plot(details=TRUE) p$data$crfstat %>% unique() #> [1] "Incomplete" "No Data Locked" "No Data" "Signed" #> [5] "Partial Monitored" "Monitored" "Complete Locked" "Complete" ## End(Not run)
## Not run: #import a TM database and use load_list(), then: crf_status_plot() + ggtitle(date_extraction) crf_status_plot(pal=rev(edc_pal_crf())) crf_status_plot(details=TRUE, treat_as_worst="No Data") crf_status_plot(x_label="{crfname} (N={n_id}, n={nrow})") p = crf_status_plot(details=TRUE) p$data$crfstat %>% unique() #> [1] "Incomplete" "No Data Locked" "No Data" "Signed" #> [5] "Partial Monitored" "Monitored" "Complete Locked" "Complete" ## End(Not run)
List of tables used in EDCimport examples:
edc_example()
can be used as the result of read_trialmaster()
edc_example_plot()
can be used to test edc_swimmerplot()
edc_example_mixed()
can be used to test split_mixed_datasets()
edc_example_mixed(N = 100, seed = 42) edc_example(N = 50, seed = 42) edc_example_plot(N = 50, seed = 42) edc_example_ae(N = 50, seed = 42)
edc_example_mixed(N = 100, seed = 42) edc_example(N = 50, seed = 42) edc_example_plot(N = 50, seed = 42) edc_example_ae(N = 50, seed = 42)
N |
the number of patients |
seed |
the random seed |
a list of tables
When checking your data, filter your dataset to get only problematic rows.
Then, use either:
edc_data_warn()
to generate a standardized warning that can be forwarded to the datamanager
edc_data_stop()
to abort the script if the problem is too serious
Database issues should be traced in a separate file, each with an identifying row number, and the file should be shared with the data-manager.
Use edc_data_warnings()
to generate the table for such a file.
edc_data_warn( df, message, ..., issue_n = "xx", max_subjid = 5, csv_path = FALSE, col_subjid = get_subjid_cols() ) edc_data_stop(df, message, ..., issue_n, max_subjid, csv_path, col_subjid) edc_data_warnings()
edc_data_warn( df, message, ..., issue_n = "xx", max_subjid = 5, csv_path = FALSE, col_subjid = get_subjid_cols() ) edc_data_stop(df, message, ..., issue_n, max_subjid, csv_path, col_subjid) edc_data_warnings()
df |
the filtered dataframe |
message |
the message. Can use cli formats. |
... |
unused |
issue_n |
identifying row number |
max_subjid |
max number of subject ID to show in the message |
csv_path |
a path to save |
col_subjid |
column name for subject ID. Set to |
df
invisibly
library(dplyr) tm = edc_example() load_list(tm) db0 %>% filter(age>70) %>% edc_data_warn("Age should not be >70", issue_n=1) db0 %>% filter(age<25) %>% edc_data_warn("Age should not be <25", issue_n=2) db1 %>% filter(n()>1, .by=SUBJID) %>% edc_data_warn("There are duplicated patients in `db1` ({nrow(.data)} rows)", issue_n=3) db0 %>% filter(age<25) %>% edc_data_warn("Age should not be <25", issue_n=NULL) edc_data_warnings() ## Not run: db0 %>% filter(age<25) %>% edc_data_warn("Age should not be <25", csv_path="check/check_age_25.csv") db0 %>% filter(age<25) %>% edc_data_stop("Age should *never* be <25") ## End(Not run)
library(dplyr) tm = edc_example() load_list(tm) db0 %>% filter(age>70) %>% edc_data_warn("Age should not be >70", issue_n=1) db0 %>% filter(age<25) %>% edc_data_warn("Age should not be <25", issue_n=2) db1 %>% filter(n()>1, .by=SUBJID) %>% edc_data_warn("There are duplicated patients in `db1` ({nrow(.data)} rows)", issue_n=3) db0 %>% filter(age<25) %>% edc_data_warn("Age should not be <25", issue_n=NULL) edc_data_warnings() ## Not run: db0 %>% filter(age<25) %>% edc_data_warn("Age should not be <25", csv_path="check/check_age_25.csv") db0 %>% filter(age<25) %>% edc_data_stop("Age should *never* be <25") ## End(Not run)
Because RStudio is not very good at showing data, it can be more convenient to browse the
database using MS Excel. This function turns the whole TM export (or any named list of datasets)
into an Excel workbook, with one tab for each dataset.
Use edc_db_to_excel()
to create the file and edc_browse_excel()
to open it.
edc_db_to_excel( filename = tempfile(fileext = ".xlsx"), ..., datasets = get_datasets(), overwrite = FALSE, open = FALSE ) edc_browse_excel()
edc_db_to_excel( filename = tempfile(fileext = ".xlsx"), ..., datasets = get_datasets(), overwrite = FALSE, open = FALSE ) edc_browse_excel()
filename |
the path to the Excel output file. Default to a temporary file. Use the special value |
... |
unused |
datasets |
a named list of dataframes. Default to the TM export. |
overwrite |
whether to overwrite any existing file. Default to |
open |
whether to open the Excel file afterward. Default to |
nothing
## Not run: tm = edc_example() load_list(tm) edc_db_to_excel() #default arguments are usually OK edc_db_to_excel(filename=TRUE) ## End(Not run)
## Not run: tm = edc_example() load_list(tm) edc_db_to_excel() #default arguments are usually OK edc_db_to_excel(filename=TRUE) ## End(Not run)
Shows how much code you wrote
edc_inform_code(main = "main.R", Rdir = "R/")
edc_inform_code(main = "main.R", Rdir = "R/")
main |
the main R file, which sources the other ones |
Rdir |
the R directory, where sourced R files are located |
Nothing
Retrieve the lookup table from options
edc_lookup(..., check_null = TRUE)
edc_lookup(..., check_null = TRUE)
... |
passed on to |
check_null |
whether to stop if lookup is NULL |
the lookup dataframe summarizing the database import
build_lookup()
, extend_lookup()
tm = edc_example() load_list(tm) edc_lookup() edc_lookup(dataset)
tm = edc_example() load_list(tm) edc_lookup() edc_lookup(dataset)
Set global options for EDCimport
Use this function to manage your EDCimport
parameters globally while taking advantage of autocompletion.
Use edc_peek_options()
to see which option is currently set and edc_reset_options()
to set all options back to default.
edc_options( ..., trialmaster_pw, path_7zip, edc_lookup, edc_subjid_ref, edc_plotly, edc_fct_yesno, edc_cols_subjid, edc_cols_meta, edc_cols_id, edc_cols_crfname, edc_meta_cols_pct, edc_warn_max_subjid, edc_read_verbose, edc_correction_verbose, edc_get_key_cols_verbose, edc_lookup_overwrite_warn, .local = FALSE )
edc_options( ..., trialmaster_pw, path_7zip, edc_lookup, edc_subjid_ref, edc_plotly, edc_fct_yesno, edc_cols_subjid, edc_cols_meta, edc_cols_id, edc_cols_crfname, edc_meta_cols_pct, edc_warn_max_subjid, edc_read_verbose, edc_correction_verbose, edc_get_key_cols_verbose, edc_lookup_overwrite_warn, .local = FALSE )
... |
unused |
trialmaster_pw |
the password of the trialmaster zip archive. For instance, you can use |
path_7zip |
the path to the 7zip executable. Default to |
edc_lookup |
(Internal) a reference to the lookup table (usually |
edc_subjid_ref |
used in edc_warn_patient_diffs the vector of the reference subject IDs. You should usually write |
edc_plotly |
used in edc_swimmerplot whether to use plotly to visualize the plot. |
edc_fct_yesno |
used in fct_yesno list of values to be considered as Yes/No values. Defaults to |
edc_cols_subjid , edc_cols_meta
|
used in get_key_cols the name of the columns holding the subject id (default to |
edc_cols_id , edc_cols_crfname
|
deprecated |
edc_meta_cols_pct |
The minimal proportion of datasets a column has to reach to be considered "meta" |
edc_warn_max_subjid |
The max number of subject IDs to show in edc_data_warn |
edc_read_verbose , edc_correction_verbose , edc_get_key_cols_verbose
|
the verbosity of the output of functions read_trialmaster and read_tm_all_xpt, manual_correction, and get_key_cols. For example, set |
edc_lookup_overwrite_warn |
default to TRUE. Whether there should be warning when overwriting |
.local |
if TRUE, the effect will only apply to the local frame (internally using |
Nothing, called for its side effects
See which EDCimport option is currently set.
See which EDCimport
option is currently set.
edc_peek_options(keep_null = FALSE)
edc_peek_options(keep_null = FALSE)
keep_null |
set to TRUE to get a list |
A named list of EDCimport options
In an RCT, you usually have several analysis populations; this function shows graphically which patient is in which population.
edc_population_plot(x, id_per_row = 50, ref = "first")
edc_population_plot(x, id_per_row = 50, ref = "first")
x |
a named list of subject ID. |
id_per_row |
number of patients per rows. |
ref |
the whole population. Default to the first member of |
a ggplot
#in real word code, use filter and pull to get these vectors pop_total = c(1:180) %>% setdiff(55) #screen failure, no patient 55 pop_itt = pop_total %>% setdiff(10) #patient 10 has had the wrong treatment pop_safety = pop_total %>% setdiff(c(40,160)) #patients 40 and 160 didn't receive any treatment pop_m_itt = pop_total %>% setdiff(c(40,160,80)) #patient 80 had a wrong inclusion criterion pop_evaluable = pop_total %>% setdiff(c(40,160,101,147,186)) #patients with no recist evaluation l = list( "Total population"=pop_total, "ITT population"=pop_itt, "Safety population"=pop_safety, "mITT population"=pop_m_itt, "Evaluable population"=pop_evaluable ) edc_population_plot(l) edc_population_plot(l[-1], ref=pop_total) edc_population_plot(l, ref=1:200) edc_population_plot(l, id_per_row=60)
#in real word code, use filter and pull to get these vectors pop_total = c(1:180) %>% setdiff(55) #screen failure, no patient 55 pop_itt = pop_total %>% setdiff(10) #patient 10 has had the wrong treatment pop_safety = pop_total %>% setdiff(c(40,160)) #patients 40 and 160 didn't receive any treatment pop_m_itt = pop_total %>% setdiff(c(40,160,80)) #patient 80 had a wrong inclusion criterion pop_evaluable = pop_total %>% setdiff(c(40,160,101,147,186)) #patients with no recist evaluation l = list( "Total population"=pop_total, "ITT population"=pop_itt, "Safety population"=pop_safety, "mITT population"=pop_m_itt, "Evaluable population"=pop_evaluable ) edc_population_plot(l) edc_population_plot(l[-1], ref=pop_total) edc_population_plot(l, ref=1:200) edc_population_plot(l, id_per_row=60)
Reset all EDCimport options.
Reset all EDCimport
options.
edc_reset_options( except = c("edc_lookup", "trialmaster_pw", "path_7zip"), quiet = FALSE )
edc_reset_options( except = c("edc_lookup", "trialmaster_pw", "path_7zip"), quiet = FALSE )
except |
options that are not reset by default |
quiet |
set to |
Nothing, called for its side effects
Join all tables from .lookup$dataset
on id
edc_swimmerplot( .lookup = edc_lookup(), ..., id = get_subjid_cols(), group = NULL, origin = NULL, id_lim = NULL, exclude = NULL, time_unit = c("days", "weeks", "months", "years"), aes_color = c("variable", "label"), plotly = getOption("edc_plotly", FALSE) )
edc_swimmerplot( .lookup = edc_lookup(), ..., id = get_subjid_cols(), group = NULL, origin = NULL, id_lim = NULL, exclude = NULL, time_unit = c("days", "weeks", "months", "years"), aes_color = c("variable", "label"), plotly = getOption("edc_plotly", FALSE) )
.lookup |
the lookup table, default to |
... |
not used |
id |
the patient identifier. Will be coerced as numeric. |
group |
a grouping variable, given as "dataset$column" |
origin |
a variable to consider as time 0, given as "dataset$column" |
id_lim |
a numeric vector of length 2 providing the minimum and maximum |
exclude |
a character vector of variables to exclude, in the form |
time_unit |
if |
aes_color |
either |
plotly |
whether to use |
either a plotly
or a ggplot
#tm = read_trialmaster("filename.zip", pw="xx") tm = edc_example_plot() load_list(tm) p = edc_swimmerplot(.lookup, id_lim=c(5,45)) p2 = edc_swimmerplot(.lookup, origin="db0$date_naissance", time_unit="weeks", exclude=c("DB1$DATE2", "db3$.*")) p3 = edc_swimmerplot(.lookup, group="db0$group", aes_color="label") ## Not run: #save the plotly plot as HTML to share it save_plotly(p, "edc_swimmerplot.html") ## End(Not run)
#tm = read_trialmaster("filename.zip", pw="xx") tm = edc_example_plot() load_list(tm) p = edc_swimmerplot(.lookup, id_lim=c(5,45)) p2 = edc_swimmerplot(.lookup, origin="db0$date_naissance", time_unit="weeks", exclude=c("DB1$DATE2", "db3$.*")) p3 = edc_swimmerplot(.lookup, group="db0$group", aes_color="label") ## Not run: #save the plotly plot as HTML to share it save_plotly(p, "edc_swimmerplot.html") ## End(Not run)
Warn if extraction is too old
edc_warn_extraction_date(max_days = 30)
edc_warn_extraction_date(max_days = 30)
max_days |
the max acceptable age of the data |
nothing
tm = edc_example() load_list(tm) edc_warn_extraction_date()
tm = edc_example() load_list(tm) edc_warn_extraction_date()
Compare a subject ID vector to the study's reference subject ID (usually something like enrolres$subjid
), and warn if any patient is missing or extra. check_subjid()
is the old, deprecated name.
edc_warn_patient_diffs( x, ref = getOption("edc_subjid_ref"), issue_n = "xx", data_name = NULL, col_subjid = get_subjid_cols() )
edc_warn_patient_diffs( x, ref = getOption("edc_subjid_ref"), issue_n = "xx", data_name = NULL, col_subjid = get_subjid_cols() )
x |
the subject ID vector to check, or a dataframe which ID column will be guessed |
ref |
the reference for subject ID. Should usually be set through |
issue_n |
identifying row number |
data_name |
the name of the data (for the warning message) |
col_subjid |
name of the subject ID column if |
nothing, called for errors/warnings
tm = edc_example() load_list(tm) options(edc_subjid_ref=db0$SUBJID) #usually, you set something like: #options(edc_subjid_ref=enrolres$subjid) edc_warn_patient_diffs(db1) db1 %>% dplyr::filter(SUBJID>1) %>% edc_warn_patient_diffs() edc_warn_patient_diffs(c(db1$SUBJID, 99, 999))
tm = edc_example() load_list(tm) options(edc_subjid_ref=db0$SUBJID) #usually, you set something like: #options(edc_subjid_ref=enrolres$subjid) edc_warn_patient_diffs(db1) db1 %>% dplyr::filter(SUBJID>1) %>% edc_warn_patient_diffs() edc_warn_patient_diffs(c(db1$SUBJID, 99, 999))
This utility extends the lookup table to include:
n_id
the number of patients present in the dataset
rows_per_id
the mean number of rows per patient
crfname
the actual name of the dataset
extend_lookup( lookup, ..., id_cols = get_subjid_cols(lookup), crf_cols = get_crfname_cols(lookup), datasets = get_datasets(lookup, envir = parent.frame()) )
extend_lookup( lookup, ..., id_cols = get_subjid_cols(lookup), crf_cols = get_crfname_cols(lookup), datasets = get_datasets(lookup, envir = parent.frame()) )
lookup |
[ |
... |
unused |
id_cols , crf_cols
|
[ |
datasets |
[ |
the lookup, extended
#tm = read_trialmaster("filename.zip", pw="xx") tm = edc_example_mixed() load_list(tm) .lookup .lookup = extend_lookup(.lookup) .lookup
#tm = read_trialmaster("filename.zip", pw="xx") tm = edc_example_mixed() load_list(tm) .lookup .lookup = extend_lookup(.lookup) .lookup
Format factor levels as arbitrary values of Yes/No (with Yes always first) while leaving untouched all vectors that contain other information.
fct_yesno( x, input = list(yes = c("Yes", "Oui"), no = c("No", "Non")), output = c("Yes", "No"), strict = FALSE, mutate_character = TRUE, fail = TRUE )
fct_yesno( x, input = list(yes = c("Yes", "Oui"), no = c("No", "Non")), output = c("Yes", "No"), strict = FALSE, mutate_character = TRUE, fail = TRUE )
x |
a vector of any type/class. |
input |
list of values to be considered as "yes" and "no". |
output |
the output factor levels. |
strict |
whether to match the input strictly or use stringr::str_detect to find them. |
mutate_character |
whether to turn characters into factor. |
fail |
whether to fail if some levels cannot be recoded to yes/no. |
a factor, or x
untouched.
fct_yesno(c("No", "Yes")) #levels are in order set.seed(42) N=6 x = tibble( a=sample(c("Yes", "No"), size=N, replace=TRUE), b=sample(c("Oui", "Non"), size=N, replace=TRUE), c=sample(0:1, size=N, replace=TRUE), d=sample(c(TRUE, FALSE), size=N, replace=TRUE), e=sample(c("1-Yes", "0-No"), size=N, replace=TRUE), y=sample(c("aaa", "bbb", "ccc"), size=N, replace=TRUE), z=1:N, ) x #y and z are left untouched (or throw an error if fail=TRUE) sapply(x, fct_yesno, fail=FALSE) # as "1-Yes" is not in `input`, x$e is untouched/fails if strict=TRUE fct_yesno(x$e) fct_yesno(x$e, strict=TRUE, fail=FALSE) fct_yesno(x$e, output=c("Ja", "Nein"))
fct_yesno(c("No", "Yes")) #levels are in order set.seed(42) N=6 x = tibble( a=sample(c("Yes", "No"), size=N, replace=TRUE), b=sample(c("Oui", "Non"), size=N, replace=TRUE), c=sample(0:1, size=N, replace=TRUE), d=sample(c(TRUE, FALSE), size=N, replace=TRUE), e=sample(c("1-Yes", "0-No"), size=N, replace=TRUE), y=sample(c("aaa", "bbb", "ccc"), size=N, replace=TRUE), z=1:N, ) x #y and z are left untouched (or throw an error if fail=TRUE) sapply(x, fct_yesno, fail=FALSE) # as "1-Yes" is not in `input`, x$e is untouched/fails if strict=TRUE fct_yesno(x$e) fct_yesno(x$e, strict=TRUE, fail=FALSE) fct_yesno(x$e, output=c("Ja", "Nein"))
Find a keyword in all names and labels of a list of datasets.
find_keyword(keyword, data = edc_lookup(), ignore_case = TRUE)
find_keyword(keyword, data = edc_lookup(), ignore_case = TRUE)
keyword |
the keyword to search for. Can handle regular expressions (see examples). |
data |
the lookup dataframe where to search the keyword. Can be set using |
ignore_case |
should case differences be ignored in the match? Default to |
a tibble
## Not run: path = system.file("extdata/Example_Export_SAS_XPORT_2022_08_25_15_16.zip", package="EDCimport", mustWork=TRUE) w = read_trialmaster(path, verbose=FALSE) find_keyword("patient") #with regex find_keyword("patient$") find_keyword("\\d") find_keyword("(Trial|Form) Name") find_keyword("\\(") #you need to escape special characters ## End(Not run)
## Not run: path = system.file("extdata/Example_Export_SAS_XPORT_2022_08_25_15_16.zip", package="EDCimport", mustWork=TRUE) w = read_trialmaster(path, verbose=FALSE) find_keyword("patient") #with regex find_keyword("patient$") find_keyword("\\d") find_keyword("(Trial|Form) Name") find_keyword("\\(") #you need to escape special characters ## End(Not run)
Attempt to list all columns in the database and group the ones that are common to some datasets. Useful to find keys to pivot or summarise data.
get_common_cols(lookup = edc_lookup(), min_datasets = 3) ## S3 method for class 'common_cols' summary(object, ...)
get_common_cols(lookup = edc_lookup(), min_datasets = 3) ## S3 method for class 'common_cols' summary(object, ...)
lookup |
the lookup table, default to |
min_datasets |
the minimal number of datasets to be considered |
object |
an object of class "common_cols" |
... |
unused |
a tibble of class "common_cols"
tm = edc_example() load_list(tm) x = get_common_cols(min_datasets=1) x summary(x)
tm = edc_example() load_list(tm) x = get_common_cols(min_datasets=1) x summary(x)
Get the datasets from the lookup table as a list of data.frames.
get_datasets(lookup = edc_lookup(), envir = parent.frame())
get_datasets(lookup = edc_lookup(), envir = parent.frame())
lookup |
the lookup table |
envir |
(internal use) |
a list of all datasets
Retrieve names of patient_id
(usually "SUBJID" and "PATNO") and crfname
(usually "CRFNAME") from the actual names of the datasets
get_key_cols(lookup = edc_lookup())
get_key_cols(lookup = edc_lookup())
lookup |
the lookup table |
a list(2) of characters with names patient_id
and crfname
In most trialmaster exports, many datasets share a certain amount of columns containing meta-data that are often irrelevant to the point. This function identifies the columns that are present in at least 95% of datasets (by default)
get_meta_cols(min_pct = getOption("edc_meta_cols_pct", 0.95))
get_meta_cols(min_pct = getOption("edc_meta_cols_pct", 0.95))
min_pct |
Default= |
a character vector
tm = edc_example_mixed() load_list(tm) meta_cols = get_meta_cols() long_mixed %>% dplyr::select(-dplyr::any_of(meta_cols))
tm = edc_example_mixed() load_list(tm) meta_cols = get_meta_cols() long_mixed %>% dplyr::select(-dplyr::any_of(meta_cols))
Retrieve the names of the patient ID and CRF name columns from the actual names of the datasets, regardless of case. Default values should be set through options.
get_subjid_cols(lookup = edc_lookup()) get_crfname_cols(lookup = edc_lookup())
get_subjid_cols(lookup = edc_lookup()) get_crfname_cols(lookup = edc_lookup())
lookup |
the lookup table |
a character vector
Use edc_options()
to set default values:
edc_cols_subjid
defaults to c("PTNO", "SUBJID")
edc_cols_crfname
defaults to c("CRFNAME")
get_subjid_cols() get_crfname_cols()
get_subjid_cols() get_crfname_cols()
Turns the subject ID columns of all datasets into a factor containing levels for all the subjects of the database. This avoids problems when joining tables, and allows some checks to be performed on the levels.
harmonize_subjid(datalist, preprocess = NULL, col_subjid = get_subjid_cols())
harmonize_subjid(datalist, preprocess = NULL, col_subjid = get_subjid_cols())
datalist |
a list of dataframes |
preprocess |
an optional function to modify the subject ID column, for example |
col_subjid |
the names of the columns holding the subject ID (as character) |
datalist, with subject id modified
db = edc_example() db$db0 = head(db$db0, 10) db$db0$SUBJID %>% head() db = harmonize_subjid(db) db$db0$SUBJID %>% head() db = harmonize_subjid(db, preprocess=function(x) paste0("#", x)) db$db0$SUBJID %>% head()
db = edc_example() db$db0 = head(db$db0, 10) db$db0$SUBJID %>% head() db = harmonize_subjid(db) db$db0$SUBJID %>% head() db = harmonize_subjid(db, preprocess=function(x) paste0("#", x)) db$db0$SUBJID %>% head()
This function searches for date columns in every table and returns the latest date for each patient, along with the variable it comes from. Useful in survival analysis to get the right censoring time.
lastnews_table( except = NULL, with_ties = FALSE, numeric_id = TRUE, prefer = NULL, warn_if_future = TRUE )
lastnews_table( except = NULL, with_ties = FALSE, numeric_id = TRUE, prefer = NULL, warn_if_future = TRUE )
except |
the datasets/columns that should not be searched. Example: a scheduled visit for which the patient may have died before attending should not be considered. |
with_ties |
in case of tie, whether to return the first |
numeric_id |
set to FALSE if the patient ID column is not numeric |
prefer |
preferred origins in the event of a tie. Usually the followup table. |
warn_if_future |
whether to show a warning about dates that are after the extraction date |
a dataframe
tm = edc_example_plot() load_list(tm) lastnews_table() lastnews_table(except="db3") lastnews_table(except="db3$date9") lastnews_table(prefer="db2")
tm = edc_example_plot() load_list(tm) lastnews_table() lastnews_table(except="db3") lastnews_table(except="db3$date9") lastnews_table(prefer="db2")
Load a .RData file as a list
Instead of loading a .RData
file in the global environment, extract every object into a list.
load_as_list(filename)
load_as_list(filename)
filename |
the filename, with the |
a list
x = list(a=1, b=mtcars) save_list(x, "test.RData") y = load_as_list("test.RData") print(y$a)
x = list(a=1, b=mtcars) save_list(x, "test.RData") y = load_as_list("test.RData") print(y$a)
Load a list in an environment
load_list(x, env = parent.frame(), remove = TRUE)
load_list(x, env = parent.frame(), remove = TRUE)
x |
a list |
env |
the environment onto which the list should be loaded |
remove |
if |
nothing, called for its side-effect
x=list(a=1, b=mtcars) load_list(x, remove=FALSE) print(a) print(nrow(b))
x=list(a=1, b=mtcars) load_list(x, remove=FALSE) print(a) print(nrow(b))
When finding wrong or unexpected values in an exported dataset, it can be useful to temporarily correct them by hard-coding a value. However, this manual correction should be undone as soon as the central database is updated with the correction.
manual_correction()
applies a correction in a specific dataset column location and throws an error if the correction is already in place. This check applies only once per R session so you can source your script without errors.
reset_manual_correction()
resets all checks. For instance, it is called by read_trialmaster()
.
manual_correction( data, col, rows, wrong, correct, verbose = getOption("edc_correction_verbose", TRUE) ) reset_manual_correction()
manual_correction( data, col, rows, wrong, correct, verbose = getOption("edc_correction_verbose", TRUE) ) reset_manual_correction()
data , col , rows
|
the rows of a column of a dataframe where the error lies |
wrong |
the actual wrong value |
correct |
the temporary correction value |
verbose |
whether to print information (once) |
Nothing, used for side effects
library(dplyr) x = iris %>% mutate(id=row_number(), .before=1) %>% as_tibble() x$Sepal.Length[c(1,3,5)] #1st correction is silent manual_correction(x, Sepal.Length, rows=c(1,3,5), wrong=c(5.1, 4.7, 5.0), correct=c(5, 4, 3)) x$Sepal.Length[c(1,3,5)] #further correction is silent manual_correction(x, Sepal.Length, rows=c(1,3,5), wrong=c(5.1, 4.7, 5.0), correct=c(5, 4, 3)) #if the database is corrected, an error is thrown ## Not run: reset_manual_correction() x$Sepal.Length[c(1,3,5)] = c(5, 4, 3) #mimics db correction manual_correction(x, Sepal.Length, rows=c(1,3,5), wrong=c(5.1, 4.7, 5.0), correct=c(5, 4, 3)) ## End(Not run)
library(dplyr) x = iris %>% mutate(id=row_number(), .before=1) %>% as_tibble() x$Sepal.Length[c(1,3,5)] #1st correction is silent manual_correction(x, Sepal.Length, rows=c(1,3,5), wrong=c(5.1, 4.7, 5.0), correct=c(5, 4, 3)) x$Sepal.Length[c(1,3,5)] #further correction is silent manual_correction(x, Sepal.Length, rows=c(1,3,5), wrong=c(5.1, 4.7, 5.0), correct=c(5, 4, 3)) #if the database is corrected, an error is thrown ## Not run: reset_manual_correction() x$Sepal.Length[c(1,3,5)] = c(5, 4, 3) #mimics db correction manual_correction(x, Sepal.Length, rows=c(1,3,5), wrong=c(5.1, 4.7, 5.0), correct=c(5, 4, 3)) ## End(Not run)
Read all .csv files in a directory
Read all .csv
files in a directory, with labels if specified.
read_all_csv( path, ..., labels_from = NULL, clean_names_fun = NULL, read_fun = "guess", datetime_extraction = "guess", verbose = getOption("edc_read_verbose", 1) )
read_all_csv( path, ..., labels_from = NULL, clean_names_fun = NULL, read_fun = "guess", datetime_extraction = "guess", verbose = getOption("edc_read_verbose", 1) )
path |
[ |
... |
unused |
labels_from |
[ |
clean_names_fun |
[ |
read_fun |
[ |
datetime_extraction |
[ |
verbose |
[ |
a list containing one dataframe for each .csv
file in the folder, the extraction date (datetime_extraction
), and a summary of all imported tables (.lookup
).
labels_from
should contain the information about column labels. It should be a data file (.csv
) containing 2 columns: one for the column name and the other for its associated label. Use options(edc_col_name="xxx", edc_col_label="xxx")
to specify the names of the columns.
Read all .sas7bdat files in a directory
Read all .sas7bdat
files in a directory. Formats can be applied from a procformat.sas
SAS file, from a catalog file, or from a data file (see the description of format_file below).
read_all_sas( path, ..., format_file = "procformat.sas", clean_names_fun = NULL, datetime_extraction = "guess", verbose = getOption("edc_read_verbose", 1) )
read_all_sas( path, ..., format_file = "procformat.sas", clean_names_fun = NULL, datetime_extraction = "guess", verbose = getOption("edc_read_verbose", 1) )
path |
[ |
... |
unused |
format_file |
[ |
clean_names_fun |
[ |
datetime_extraction |
[ |
verbose |
[ |
a list containing one dataframe for each .sas7bdat
file in the folder, the extraction date (datetime_extraction
), and a summary of all imported tables (.lookup
).
format_file
should contain the information about SAS formats. It can be either
a procformat.sas
file, containing the whole PROC FORMAT
a catalog file (.sas7bcat
)
or a data file (.csv
or .sas7bdat
) containing 3 columns: the SAS format name (repeated),
each level, and its associated label. Use options(edc_var_format_name="xxx", edc_var_level="xxx", edc_var_label="xxx")
to specify the names of the columns.
Read all .xpt files in a directory
Read all .xpt
files in a directory (unzipped TrialMaster archive).
If 7zip
is installed, you should probably rather use read_trialmaster()
instead.
If a procformat.sas
file exists in the directory, formats will be applied.
read_all_xpt( path, ..., format_file = "procformat.sas", clean_names_fun = NULL, split_mixed = FALSE, extend_lookup = TRUE, datetime_extraction = "guess", verbose = getOption("edc_read_verbose", 1), directory = "deprecated", key_columns = "deprecated" )
read_all_xpt( path, ..., format_file = "procformat.sas", clean_names_fun = NULL, split_mixed = FALSE, extend_lookup = TRUE, datetime_extraction = "guess", verbose = getOption("edc_read_verbose", 1), directory = "deprecated", key_columns = "deprecated" )
path |
[ |
... |
unused |
format_file |
[ |
clean_names_fun |
[ |
split_mixed |
[ |
extend_lookup |
[ |
datetime_extraction |
[ |
verbose |
[ |
directory |
deprecated |
key_columns |
deprecated |
a list containing one dataframe for each .xpt
file in the folder, the extraction date (datetime_extraction
), and a summary of all imported tables (.lookup
).
format_file
should contain the information about SAS formats. It can be either
a procformat.sas
file, containing the whole PROC FORMAT
or a data file (.csv or .sas7bdat) containing 3 columns: the SAS format name (repeated),
each level, and its associated label. Use options(edc_var_format_name="xxx", edc_var_level="xxx", edc_var_label="xxx")
to specify the names of the columns.
.zip
archive of a TrialMaster export
Import the .zip
archive of a TrialMaster trial export as a list of dataframes. The archive filename should be left untouched as it contains the project name and the date of extraction.
Generate a .rds
cache file for future reads.
If 7zip
is not installed or available, use read_all_xpt()
instead.
read_trialmaster( archive, ..., use_cache = "write", clean_names_fun = NULL, split_mixed = FALSE, extend_lookup = TRUE, pw = getOption("trialmaster_pw"), verbose = getOption("edc_read_verbose", 1), key_columns = "deprecated" )
read_trialmaster( archive, ..., use_cache = "write", clean_names_fun = NULL, split_mixed = FALSE, extend_lookup = TRUE, pw = getOption("trialmaster_pw"), verbose = getOption("edc_read_verbose", 1), key_columns = "deprecated" )
archive |
[ |
... |
unused |
use_cache |
[ |
clean_names_fun |
[ |
split_mixed |
[ |
extend_lookup |
[ |
pw |
[ |
verbose |
[ |
key_columns |
deprecated |
a list containing one dataframe for each .xpt
file in the folder, the extraction date (datetime_extraction
), and a summary of all imported tables (.lookup
).
.RData
file
Save a list as .RData
file
save_list(x, filename)
save_list(x, filename)
x |
a list |
filename |
the filename, with the |
nothing, called for its side-effect
x=list(a=1, b=mtcars) save_list(x, "test.RData") load("test.RData") file.remove("test.RData") print(a) print(nrow(b))
x=list(a=1, b=mtcars) save_list(x, "test.RData") load("test.RData") file.remove("test.RData") print(a) print(nrow(b))
Save a plotly to an HTML file
save_plotly(p, file, ...)
save_plotly(p, file, ...)
p |
a plot object ( |
file |
a file path to save the HTML file |
... |
passed on to htmlwidgets::saveWidget |
nothing, used for side effect
## Not run: tm = edc_example_plot() p = edc_swimmerplot(tm$.lookup, id_lim=c(5,45)) save_plotly(p, "graph/swimplots/edc_swimmerplot.html", title="My Swimmerplot") ## End(Not run)
## Not run: tm = edc_example_plot() p = edc_swimmerplot(tm$.lookup, id_lim=c(5,45)) save_plotly(p, "graph/swimplots/edc_swimmerplot.html", title="My Swimmerplot") ## End(Not run)
sessionInfo()
output
Save sessionInfo()
output into a text file.
save_sessioninfo(path = "check/session_info.txt", with_date = TRUE)
save_sessioninfo(path = "check/session_info.txt", with_date = TRUE)
path |
target path to write the file |
with_date |
whether to insert the date before the file extension |
nothing
## Not run: save_sessioninfo() ## End(Not run)
## Not run: save_sessioninfo() ## End(Not run)
Search in some folders if a TrialMaster database more recent than the current extraction is present. By default, it will search the "data" folder and the OS's usual "Downloads" folder. If a newer database is found, the user will be asked if they want to move it to the "data" folder.
search_for_newer_data( archive, ..., source = path_home("Downloads"), target = "data", ask = TRUE, advice = TRUE )
search_for_newer_data( archive, ..., source = path_home("Downloads"), target = "data", ask = TRUE, advice = TRUE )
archive |
TM archive path, giving the project name and date |
... |
unused |
source |
the path vector to be searched, default to both "data" and the usual "Downloads" folder |
target |
the path where files should be copied |
ask |
whether to ask the user to move the file to "data" |
advice |
whether to advise how to move it instead, if |
the path to the newer file, invisibly.
## Not run: archive = "data/MYPROJECT_ExportTemplate_xxx_SAS_XPORT_2024_06_01_12_00.zip" #tm = read_trialmaster(archive) search_for_newer_data(archive) ## End(Not run)
## Not run: archive = "data/MYPROJECT_ExportTemplate_xxx_SAS_XPORT_2024_06_01_12_00.zip" #tm = read_trialmaster(archive) search_for_newer_data(archive) ## End(Not run)
Select all columns that have only one level for a given grouping scope. Useful when dealing with mixed datasets containing both long data and repeated short data.
select_distinct(df, .by)
select_distinct(df, .by)
df |
a dataframe |
.by |
optional grouping columns |
df
with fewer columns
tm = edc_example_ae() tm$ae %>% names tm$ae %>% select_distinct() %>% names tm$ae %>% select_distinct(.by=subjid) %>% names
tm = edc_example_ae() tm$ae %>% names tm$ae %>% select_distinct() %>% names tm$ae %>% select_distinct(.by=subjid) %>% names
Split mixed tables, i.e. tables that hold both long data (N values per patient) and short data (one value per patient, duplicated on N lines), into one long table and one short table.
split_mixed_datasets( datasets = get_datasets(), id = get_subjid_cols(), ..., ignore_cols = get_meta_cols(0.95), output_code = FALSE, verbose = TRUE )
split_mixed_datasets( datasets = get_datasets(), id = get_subjid_cols(), ..., ignore_cols = get_meta_cols(0.95), output_code = FALSE, verbose = TRUE )
datasets |
a dataframe or a list of dataframes to split. Default to all the datasets from |
id |
the patient identifier, probably "SUBJID". Should be shared by all datasets. Case-insensitive. |
... |
not used |
ignore_cols |
columns to ignore when considering a table as long. Default to |
output_code |
whether to print the code to explicitly write. Can also be a file path. |
verbose |
whether to print information about the process. |
a list of the new long and short tables. Use load_list()
to load them into the global environment.
#tm = read_trialmaster("filename.zip", pw="xx") tm = edc_example_mixed() names(tm) #load_list(tm) print(tm$long_mixed) #`val1` and `val2` are long but `val3` is short mixed_data = split_mixed_datasets(tm, id="subjid", verbose=TRUE) load_list(mixed_data) print(long_mixed_short) print(long_mixed_long) #alternatively, get the code and only use the datasets you need split_mixed_datasets(tm, id="SUBJID", output_code=TRUE) filename = tempfile("mixed_code", fileext=".R") split_mixed_datasets(tm, id="SUBJID", output_code=filename) readLines(filename)
#tm = read_trialmaster("filename.zip", pw="xx") tm = edc_example_mixed() names(tm) #load_list(tm) print(tm$long_mixed) #`val1` and `val2` are long but `val3` is short mixed_data = split_mixed_datasets(tm, id="subjid", verbose=TRUE) load_list(mixed_data) print(long_mixed_short) print(long_mixed_long) #alternatively, get the code and only use the datasets you need split_mixed_datasets(tm, id="SUBJID", output_code=TRUE) filename = tempfile("mixed_code", fileext=".R") split_mixed_datasets(tm, id="SUBJID", output_code=filename) readLines(filename)
A dataset is either in the wide format or in the long format (link). This function identifies the format of a dataframe with respect to a subject ID. If a dataframe has some wide and long columns, it is considered "mixed".
table_format( df, id = get_subjid_cols(), ..., ignore_cols = get_meta_cols(0.95), na_rm = FALSE, warn = TRUE )
table_format( df, id = get_subjid_cols(), ..., ignore_cols = get_meta_cols(0.95), na_rm = FALSE, warn = TRUE )
df |
a dataframe |
id |
the identifying subject ID |
... |
not used |
ignore_cols |
columns to ignore. Usually meta columns (see get_meta_cols). |
na_rm |
whether to consider missing values |
warn |
whether to warn if ID is not found |
a string value in c("wide", "long", "mixed")
tm = edc_example_mixed() sapply(tm, table_format, warn=FALSE)
tm = edc_example_mixed() sapply(tm, table_format, warn=FALSE)
Turn a vector of length N to a vector of length 1 after checking that there is only one unique value. Useful to safely flatten a duplicated table. This preserves the label
attribute if set.
unify(x)
unify(x)
x |
a vector |
a vector of length 1
unify(c(1,1,1,1)) #unify(c(1,1,2,1)) #warning library(dplyr) x=tibble(id=rep(letters[1:5],10), value=rep(1:5,10)) x %>% group_by(id) %>% summarise(value=unify(value)) #safer than `value=value[1]` x$value[2]=1 #x %>% group_by(id) %>% summarise(value=unify(value)) #warning about that non-unique value
unify(c(1,1,1,1)) #unify(c(1,1,2,1)) #warning library(dplyr) x=tibble(id=rep(letters[1:5],10), value=rep(1:5,10)) x %>% group_by(id) %>% summarise(value=unify(value)) #safer than `value=value[1]` x$value[2]=1 #x %>% group_by(id) %>% summarise(value=unify(value)) #warning about that non-unique value