Title: Import Data from EDC Software
Description: A convenient toolbox to import data exported from Electronic Data Capture (EDC) software 'TrialMaster'.
Authors: Dan Chaltiel [aut, cre]
Maintainer: Dan Chaltiel <[email protected]>
License: GPL-3
Version: 0.5.2.9038
Built: 2025-02-16 15:19:33 UTC
Source: https://github.com/DanChaltiel/EDCimport
Check that there are no duplicates in the column holding patient ID, in a pipeable style.
Mostly useful after joining two datasets.
assert_no_duplicate(df, by = NULL, id_col = get_subjid_cols())
df: a dataframe
by: (optional) grouping columns
id_col: the name of the column(s) holding the patient ID
the `df` dataset, unchanged
## Not run:
# without duplicate => no error, continue the pipeline
tibble(subjid=c(1:10)) %>% assert_no_duplicate() %>% nrow()
# with duplicate => throws an error
tibble(subjid=c(1:10, 1:2)) %>% assert_no_duplicate() %>% nrow()
# by groups
df = tibble(subjid=rep(1:10, 4), visit=rep(c("V1", "V2"), 2, each=10),
            group=rep(c("A", "B"), each=20))
df %>% assert_no_duplicate() # error
df %>% assert_no_duplicate(by=c(visit, group)) # no error
## End(Not run)
Clean the names of all the datasets in the database. By default, it converts names to lowercase letters, numbers, and underscores only.
edc_clean_names(database, clean_fun = NULL)
database: an edc_database object, from one of the EDCimport reading functions (e.g. read_trialmaster())
clean_fun: a cleaning function to be applied to column names
an edc_database object
# db = read_trialmaster("filename.zip", pw="xx")
db = edc_example() %>% edc_clean_names()
names(db$enrol)
Generate a barplot showing the distribution of CRF status (Complete, Incomplete, ...) for each dataset of the database.
edc_crf_plot(
  crfstat_col = "CRFSTAT",
  ...,
  details = FALSE,
  pal = edc_pal_crf(),
  reverse = FALSE,
  x_label = "{dataset}",
  treat_as_worst = NULL,
  datasets = get_datasets(),
  lookup = edc_lookup()
)

edc_pal_crf()
crfstat_col: the column name of the CRF status
...: unused
details: whether to show all the CRF status levels
pal: the palette, defaulting to the helper edc_pal_crf()
reverse: whether to reverse the CRF status level order
x_label: a glue pattern determining the tick labels on the x axis. Available variables are the ones of the lookup table (e.g. dataset, crfname, n_id, nrow).
treat_as_worst: a regex for levels that should be treated as worst in the ordering
datasets, lookup: internal
a ggplot
Palette source: ggsci:::ggsci_db$lancet[["lanonc"]] %>% dput()
## Not run:
# import a TM database and use load_database(), then:
edc_crf_plot() + ggtitle(date_extraction)
edc_crf_plot(reverse=TRUE)
edc_crf_plot(details=TRUE, treat_as_worst="No Data")
edc_crf_plot(x_label="{crfname} (N={n_id}, n={nrow})")
p = edc_crf_plot(details=TRUE)
p$data$crfstat %>% unique()
#> [1] "Incomplete"        "No Data Locked"    "No Data"           "Signed"
#> [5] "Partial Monitored" "Monitored"         "Complete Locked"   "Complete"
## End(Not run)
When checking your data, filter your dataset to keep only the problematic rows.
Then, use either:
- edc_data_warn() to generate a standardized warning that can be forwarded to the data manager
- edc_data_stop() to abort the script if the problem is too serious
Database issues should be traced in a separate file, each with an identifying row number, and the file should be shared with the data manager. Use edc_data_warnings() to generate the table for such a file.
edc_data_warn(
  df,
  message,
  ...,
  issue_n = "xx",
  max_subjid = 5,
  csv_path = FALSE,
  col_subjid = get_subjid_cols()
)

edc_data_stop(df, message, ..., issue_n, max_subjid, csv_path, col_subjid)

edc_data_warnings()
df: the filtered dataframe
message: the message; can use cli formats
...: unused
issue_n: identifying row number
max_subjid: max number of subject IDs to show in the message
csv_path: a path to save the offending rows as a .csv file
col_subjid: column name for subject ID. Set to NULL to ignore.
`df`, invisibly
library(dplyr)
db = edc_example()
load_database(db)
enrol %>% filter(age>70) %>% edc_data_warn("Age should not be >70", issue_n=1)
enrol %>% filter(age<25) %>% edc_data_warn("Age should not be <25", issue_n=2)
data1 %>% filter(n()>1, .by=subjid) %>%
  edc_data_warn("There are duplicated patients in `data1` ({nrow(.data)} rows)", issue_n=3)
enrol %>% filter(age<25) %>% edc_data_warn("Age should not be <25", issue_n=NULL)
edc_data_warnings()
## Not run:
enrol %>% filter(age<25) %>%
  edc_data_warn("Age should not be <25", csv_path="check/check_age_25.csv")
enrol %>% filter(age<25) %>% edc_data_stop("Age should *never* be <25")
## End(Not run)
This class of object represents a database, as the result of an EDCimport reading function. It has its own print() method.
As of now, the reading functions are read_trialmaster(), read_all_sas(), read_all_xpt(), and read_all_csv().
While it is not usually useful to query them directly, an edc_database object is a named list containing:
- all the datasets from the source files
- datetime_extraction and date_extraction: the inferred date of data extraction
- .lookup: a temporary copy of the lookup table
Because RStudio is not very good at showing data, it can be more convenient to browse the
database using MS Excel. This function turns the whole TM export (or any named list of datasets)
into an Excel workbook, with one tab for each dataset.
Use edc_db_to_excel() to create the file and edc_browse_excel() to open it.
edc_db_to_excel(
  filename = tempfile(fileext = ".xlsx"),
  ...,
  datasets = get_datasets(),
  overwrite = FALSE,
  open = FALSE
)

edc_browse_excel()
filename: the path to the Excel output file. Defaults to a temporary file. Use the special value TRUE to write to a default path.
...: unused
datasets: a named list of dataframes. Defaults to the TM export.
overwrite: whether to overwrite any existing file. Defaults to FALSE.
open: whether to open the Excel file afterward. Defaults to FALSE.
nothing
## Not run:
db = edc_example()
load_database(db)
edc_db_to_excel() # default arguments are usually OK
edc_db_to_excel(filename=TRUE)
## End(Not run)
A list of tables that simulates the extraction of a clinical database. Used in EDCimport
examples and tests.
edc_example(N = 50, seed = 42, outdated = FALSE)
N: the number of patients
seed: the random seed
outdated: whether to simulate times after the data extraction date
A list of tables of class edc_database.
Find a keyword in columns or values, in all the datasets of the database.
edc_find_value(keyword, ignore_case = TRUE, data = get_datasets()) edc_find_column(keyword, ignore_case = TRUE, data = edc_lookup())
keyword: the keyword to search for. Regular expressions are only supported in edc_find_column().
ignore_case: logical. If TRUE (the default), the search is case-insensitive.
data: either a lookup table (for edc_find_column()) or a list of datasets (for edc_find_value())
a tibble
db = edc_example()
load_database(db)
edc_find_value("respi")
edc_find_value(2010)
edc_find_column("ad")
edc_find_column("date")
# with regex
edc_find_column("\\d")
edc_find_column("\\(") # you need to escape special characters
Shows how much code you have written.
edc_inform_code(main = "main.R", Rdir = "R/")
main: the main R file, which sources the other ones
Rdir: the R directory, where the sourced R files are located
Nothing
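A minimal sketch, assuming your project has a main.R entry script that sources files from an R/ directory:
## Not run:
# counts the code in main.R and in every file it sources from R/
edc_inform_code(main="main.R", Rdir="R/")
## End(Not run)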
Perform a join with `by` defaulting to the subject ID and `suffix` defaulting to the name of the `y` dataset. See [dplyr::mutate-joins] for a description of the join logic.
edc_left_join(
  x,
  y,
  by = NULL,
  suffix = NULL,
  cols = everything(),
  remove_dups = TRUE
)
x, y: data frames to join
by: the key to join on. Defaults to the subject ID column.
suffix: the disambiguation suffix. Defaults to the actual name of the `y` dataset.
cols: the columns to select in `y`
remove_dups: whether to remove duplicated columns in `y`
a dataframe
db = edc_example()
load_database(db)
data1$common = data2$common = "Common"
x = enrol %>%
  edc_left_join(data2) %>%
  edc_right_join(data1) # crfname gets a suffix, common
names(x)
Retrieve the lookup table from options
edc_lookup(..., check = TRUE)
...: passed on to dplyr::arrange()
check: whether to check for internal consistency
the lookup dataframe summarizing the database import
db = edc_example()
load_database(db)
edc_lookup()
edc_lookup(dataset)
Set EDCimport options. Use this function to manage your EDCimport parameters globally while taking advantage of autocompletion. Use edc_peek_options() to see which options are currently set and edc_reset_options() to set all options back to default.
edc_options(
  ...,
  trialmaster_pw,
  path_7zip,
  edc_lookup,
  edc_subjid_ref,
  edc_plotly,
  edc_fct_yesno,
  edc_cols_subjid,
  edc_cols_meta,
  edc_cols_id,
  edc_cols_crfname,
  edc_meta_cols_pct,
  edc_warn_max_subjid,
  edc_read_verbose,
  edc_correction_verbose,
  edc_get_key_cols_verbose,
  edc_lookup_overwrite_warn,
  .local = FALSE
)
...: unused
trialmaster_pw: the password of the TrialMaster zip archive
path_7zip: the path to the 7zip executable
edc_lookup: (internal) a reference to the lookup table (usually .lookup)
edc_subjid_ref: used in edc_warn_patient_diffs(): the vector of reference subject IDs. You should usually write options(edc_subjid_ref=enrolres$subjid).
edc_plotly: used in edc_swimmerplot(): whether to use plotly to visualize the plot
edc_fct_yesno: used in fct_yesno(): list of values to be considered as Yes/No values. Defaults to list(yes=c("Yes", "Oui"), no=c("No", "Non")).
edc_cols_subjid, edc_cols_meta: the names of the columns holding the subject ID and the metadata
edc_cols_id, edc_cols_crfname: deprecated
edc_meta_cols_pct: the minimal proportion of datasets a column has to reach to be considered "meta"
edc_warn_max_subjid: the max number of subject IDs to show in edc_data_warn()
edc_read_verbose, edc_correction_verbose, edc_get_key_cols_verbose: the verbosity of the output of the functions read_trialmaster(), read_all_xpt(), and manual_correction(); for example, set them to 0 to silence the output
edc_lookup_overwrite_warn: defaults to TRUE; whether to warn when overwriting the lookup table
.local: if TRUE, the effect will only apply to the local frame
Nothing, called for its side effects
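For illustration, a hedged sketch of a setup block at the top of an analysis script (the password value is of course an assumption):
edc_options(
  trialmaster_pw = "my_password", # assumption: your archive password
  edc_read_verbose = 0,           # silence the reading functions
  edc_warn_max_subjid = 10
)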
Draw a gridplot giving, for each patient and each dataset, whether the patient is present in the dataset. Data are drawn from get_datasets().
edc_patient_gridplot(
  sort_rows = TRUE,
  sort_cols = TRUE,
  gradient = FALSE,
  axes_flip = FALSE,
  show_grid = TRUE,
  preprocess = NULL,
  palette = c(Yes = "#00468BFF", No = "#ED0000FF"),
  datasets = get_datasets(),
  lookup = edc_lookup()
)
sort_rows: whether to sort patients from "present in most datasets" to "present in least datasets"
sort_cols: whether to sort datasets from "containing the most patients" to "containing the least patients"
gradient: whether to add a color gradient for repeating measures
axes_flip: whether to flip the axes, so that patients are on the Y axis and datasets on the X axis
show_grid: whether to show the grid
preprocess: a function to preprocess the patient ID, e.g. ~str_remove(.x, "\\D*") to keep only digits
palette: the colors to use
datasets, lookup: internal
a ggplot object
## Not run:
db = read_trialmaster("path/to/archive.zip")
load_database(db)
edc_patient_gridplot(sort_rows=FALSE, sort_cols=FALSE)
edc_patient_gridplot(axes_flip=TRUE, show_grid=TRUE,
                     preprocess=~str_remove(.x, "\\D*")) # remove all non-digits
## End(Not run)
See which EDCimport options are currently set.
edc_peek_options(keep_null = FALSE)
keep_null: set to TRUE to keep options that are set to NULL in the returned list
A named list of EDCimport options
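A quick sketch of both modes (the NULL behaviour follows the keep_null description above):
edc_peek_options()               # only options that are currently set
edc_peek_options(keep_null=TRUE) # full list, including unset (NULL) options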
In an RCT, you usually have several analysis populations, and this function shows graphically which patient belongs to which population.
edc_population_plot(x, id_per_row = 50, ref = "first")
x: a named list of subject IDs, as numeric or factor
id_per_row: the number of patients per row
ref: the whole population. Defaults to the first member of x.
a ggplot
# in real world code, use filter() and pull() to get these vectors
pop_total = c(1:180) %>% setdiff(55) # screen failure, no patient 55
pop_itt = pop_total %>% setdiff(10) # patient 10 got the wrong treatment
pop_safety = pop_total %>% setdiff(c(40,160)) # patients 40 and 160 didn't receive any treatment
pop_m_itt = pop_total %>% setdiff(c(40,160,80)) # patient 80 had a wrong inclusion criterion
pop_evaluable = pop_total %>% setdiff(c(40,160,101,147,186)) # patients with no RECIST evaluation
l = list(
  "Total population"=pop_total,
  "ITT population"=pop_itt,
  "Safety population"=pop_safety,
  "mITT population"=pop_m_itt,
  "Evaluable population"=pop_evaluable
)
edc_population_plot(l)
edc_population_plot(l[-1], ref=pop_total)
edc_population_plot(l, ref=1:200)
edc_population_plot(l, id_per_row=60)
Reset all EDCimport options.
edc_reset_options( except = c("edc_lookup", "trialmaster_pw", "path_7zip"), quiet = FALSE )
except: options that are not reset by default
quiet: set to TRUE to silence the output
Nothing, called for its side effects
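A minimal sketch:
## Not run:
edc_reset_options() # resets everything except the options listed in `except`
## End(Not run)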
Split mixed tables, i.e. tables that hold both long data (N values per patient) and short data (one value per patient, duplicated on N lines), into one long table and one short table.
edc_split_mixed(
  database,
  datasets = everything(),
  ...,
  ignore_cols = NULL,
  verbose = FALSE
)
database: an edc_database object, from one of the EDCimport reading functions
datasets: the datasets to split in the database
...: not used, ensures arguments are named
ignore_cols: columns to ignore in long tables
verbose: whether to print information about the process
an edc_database object
# db = read_trialmaster("filename.zip", pw="xx")
db = edc_example() %>%
  edc_split_mixed(c(ae, starts_with("long")), ignore_cols="crfstat")
names(db)
edc_lookup()
db$ae # `aesoc`, `aegr`, and `sae` are long, but `n_ae` is short
db$ae_short
db$ae_long
Join all tables on `id`, keeping only date columns, to build a ggplot (or a plotly if plotly=TRUE) showing all dates for each patient. This makes outliers easy to identify.
edc_swimmerplot(
  ...,
  group = NULL,
  origin = NULL,
  id_lim = NULL,
  exclude = NULL,
  id = get_subjid_cols(),
  time_unit = c("days", "weeks", "months", "years"),
  aes_color = c("variable", "label"),
  plotly = getOption("edc_plotly", FALSE),
  .lookup = "deprecated"
)
...: not used
group: a grouping variable, given as "dataset$column"
origin: a variable to consider as time 0, given as "dataset$column"
id_lim: a numeric vector of length 2 providing the minimum and maximum subject ID to plot
exclude: a character vector of variables to exclude, in the form "dataset$column" (regular expressions are supported)
id: the patient identifier. Will be coerced as numeric if possible.
time_unit: if origin is provided, the unit used to display the time ("days", "weeks", "months", or "years")
aes_color: either "variable" or "label", the aesthetic mapped to color
plotly: whether to use plotly for an interactive plot
.lookup: deprecated
either a plotly or a ggplot
# db = read_trialmaster("filename.zip", pw="xx")
db = edc_example()
load_database(db)
p = edc_swimmerplot(id_lim=c(5,45))
p2 = edc_swimmerplot(origin="enrol$enrol_date", time_unit="weeks",
                     exclude=c("DATA1$DATE2", "data3$.*"))
p3 = edc_swimmerplot(group="enrol$arm", aes_color="label")
## Not run:
# save the plotly plot as HTML to share it
save_plotly(p, "edc_swimmerplot.html")
## End(Not run)
Turns the subject ID columns of all datasets into a factor containing levels for all the subjects of the database. This avoids problems when joining tables, and allows some checks to be performed on the levels.
edc_unify_subjid(database, preprocess = NULL, col_subjid = NULL)
database: an edc_database object, from one of the EDCimport reading functions
preprocess: an optional function to modify the subject ID column
col_subjid: the names of the columns holding the subject ID (as character)
the database, with the subject ID columns modified
db = edc_example()
db$enrol = head(db$enrol, 10)
db$enrol$subjid %>% head()
db = edc_unify_subjid(db)
db$enrol$subjid %>% head()
db = edc_unify_subjid(db, preprocess=function(x) paste0("#", x))
db$enrol$subjid %>% head()
Run a Shiny application to browse the datasets.
edc_viewer(background = TRUE, port = 1209)
background: whether the app should run in a background process
port: the TCP port the application should listen on
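A minimal sketch, assuming a database has been read and loaded first:
## Not run:
db = edc_example()
load_database(db)
edc_viewer()                 # runs in a background process by default
edc_viewer(background=FALSE) # blocks the R session instead
## End(Not run)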
Warn if extraction is too old
edc_warn_extraction_date(max_days = 30)
max_days: the maximum acceptable age of the data, in days
nothing
db = edc_example()
load_database(db)
edc_warn_extraction_date()
Compare a subject ID vector to the study's reference subject IDs (usually something like enrolres$subjid) and warn if any patient is missing or extra. check_subjid() is the old, deprecated name.
edc_warn_patient_diffs(
  x,
  ref = getOption("edc_subjid_ref"),
  issue_n = "xx",
  data_name = NULL,
  col_subjid = get_subjid_cols()
)
x: the subject ID vector to check, or a dataframe whose ID column will be guessed
ref: the reference for subject IDs. Should usually be set through the edc_subjid_ref option.
issue_n: identifying row number
data_name: the name of the data (for the warning message)
col_subjid: the name of the subject ID column, if x is a dataframe
nothing, called for errors/warnings
db = edc_example()
load_database(db)
options(edc_subjid_ref=enrol$subjid)
# usually, you set something like:
# options(edc_subjid_ref=enrolres$subjid)
edc_warn_patient_diffs(data1)
data1 %>% dplyr::filter(subjid>1) %>% edc_warn_patient_diffs(issue_n=NULL)
edc_warn_patient_diffs(c(data1$subjid, 99, 999))
Format factor levels as arbitrary values of Yes/No (with Yes always first) while leaving untouched all vectors that contain other information.
fct_yesno(
  x,
  input = list(yes = c("Yes", "Oui"), no = c("No", "Non")),
  output = c("Yes", "No"),
  strict = FALSE,
  mutate_character = TRUE,
  fail = TRUE
)
x: a vector of any type/class
input: list of values to be considered as "yes" and "no"
output: the output factor levels
strict: whether to match the input strictly or use stringr::str_detect() to find them
mutate_character: whether to turn character vectors into factors
fail: whether to fail if some levels cannot be recoded to yes/no
a factor, or `x` untouched
fct_yesno(c("No", "Yes")) #levels are in order set.seed(42) N=6 x = tibble( a=sample(c("Yes", "No"), size=N, replace=TRUE), b=sample(c("Oui", "Non"), size=N, replace=TRUE), c=sample(0:1, size=N, replace=TRUE), d=sample(c(TRUE, FALSE), size=N, replace=TRUE), e=sample(c("1-Yes", "0-No"), size=N, replace=TRUE), y=sample(c("aaa", "bbb", "ccc"), size=N, replace=TRUE), z=1:N, ) x #y and z are left untouched (or throw an error if fail=TRUE) sapply(x, fct_yesno, fail=FALSE) # as "1-Yes" is not in `input`, x$e is untouched/fails if strict=TRUE fct_yesno(x$e) fct_yesno(x$e, strict=TRUE, fail=FALSE) fct_yesno(x$e, output=c("Ja", "Nein"))
fct_yesno(c("No", "Yes")) #levels are in order set.seed(42) N=6 x = tibble( a=sample(c("Yes", "No"), size=N, replace=TRUE), b=sample(c("Oui", "Non"), size=N, replace=TRUE), c=sample(0:1, size=N, replace=TRUE), d=sample(c(TRUE, FALSE), size=N, replace=TRUE), e=sample(c("1-Yes", "0-No"), size=N, replace=TRUE), y=sample(c("aaa", "bbb", "ccc"), size=N, replace=TRUE), z=1:N, ) x #y and z are left untouched (or throw an error if fail=TRUE) sapply(x, fct_yesno, fail=FALSE) # as "1-Yes" is not in `input`, x$e is untouched/fails if strict=TRUE fct_yesno(x$e) fct_yesno(x$e, strict=TRUE, fail=FALSE) fct_yesno(x$e, output=c("Ja", "Nein"))
Attempt to list all columns in the database and group the ones that are
common to some datasets.
Useful to find keys to pivot or summarise data.
get_common_cols(lookup = edc_lookup(), min_datasets = 3)

## S3 method for class 'common_cols'
summary(object, ...)
lookup: the lookup table, defaults to edc_lookup()
min_datasets: the minimal number of datasets for a column to be considered common
object: an object of class "common_cols"
...: unused
a tibble of class "common_cols"
db = edc_example()
load_database(db)
x = get_common_cols(min_datasets=1)
x
summary(x)
Get the datasets from the lookup table as a list of data.frames.
get_datasets(lookup = edc_lookup(), envir = parent.frame())
lookup: the lookup table
envir: (internal use)
a list of all datasets
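A minimal sketch, using the package's example database:
db = edc_example()
load_database(db)
datasets = get_datasets()
names(datasets)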
This function searches for date columns in every table and returns the latest date for each patient, along with the variable it comes from. Useful in survival analysis to compute the right censoring time.
lastnews_table(
  except = NULL,
  with_ties = FALSE,
  show_delta = FALSE,
  numeric_id = TRUE,
  prefer = NULL,
  regex = FALSE,
  warn_if_future = TRUE
)
except: the datasets/columns that should not be searched. Example: a scheduled visit for which the patient may have died before attending should not be considered.
with_ties: in case of a tie, whether to return only the first origin or all of them
show_delta: whether to compute the difference between the last date and the other candidate dates
numeric_id: set to FALSE if the patient ID column is not numeric
prefer: preferred origins in the event of a tie. Usually the follow-up table.
regex: whether to treat `except` and `prefer` as regular expressions
warn_if_future: whether to show a warning about dates that are after the extraction date. Can also be a csv file path to save the warning as csv (see edc_data_warn()).
a dataframe
db = edc_example()
load_database(db)
lastnews_table()
lastnews_table(except="data3")
lastnews_table(except="data3$date9")
lastnews_table(prefer="date10", show_delta=TRUE)
lastnews_table() %>%
  dplyr::count(origin = glue::glue("{origin_data}${origin_col}"), sort=TRUE)
csv_file = tempfile(fileext=".csv")
lastnews_table(prefer="date9", warn_if_future=csv_file)
Load a list into an environment.
load_database(db, env = parent.frame(), remove = TRUE)
db: an edc_database object (to be fair, any list would do)
env: the environment into which the list should be loaded
remove: if TRUE, the object `db` is removed from its environment afterward
nothing, called for its side-effect
db = edc_example()
load_database(db, remove=FALSE)
print(db)
print(lengths(db))
When finding wrong or unexpected values in an exported dataset, it can be useful to temporarily correct them by hard-coding a value. However, this manual correction should be undone as soon as the central database is updated with the correction.
manual_correction() applies a correction in a specific dataset column location and throws an error if the correction is already in place. This check applies only once per R session, so you can source your script without errors.
reset_manual_correction() resets all checks. For instance, it is called by read_trialmaster().
manual_correction(
  data,
  col,
  rows,
  wrong,
  correct,
  verbose = getOption("edc_correction_verbose", TRUE)
)

reset_manual_correction()
data, col, rows: the rows of a column of a dataframe where the error lies
wrong: the actual wrong value
correct: the temporary correction value
verbose: whether to print information (once)
Nothing, used for side effects
library(dplyr)
x = iris %>% mutate(id=row_number(), .before=1) %>% as_tibble()
x$Sepal.Length[c(1,3,5)]
# 1st correction is silent
manual_correction(x, Sepal.Length, rows=c(1,3,5),
                  wrong=c(5.1, 4.7, 5.0), correct=c(5, 4, 3))
x$Sepal.Length[c(1,3,5)]
# further correction is silent
manual_correction(x, Sepal.Length, rows=c(1,3,5),
                  wrong=c(5.1, 4.7, 5.0), correct=c(5, 4, 3))
# if the database is corrected, an error is thrown
## Not run:
reset_manual_correction()
x$Sepal.Length[c(1,3,5)] = c(5, 4, 3) # mimics db correction
manual_correction(x, Sepal.Length, rows=c(1,3,5),
                  wrong=c(5.1, 4.7, 5.0), correct=c(5, 4, 3))
## End(Not run)
Read all .csv files in a directory, with labels if specified.
read_all_csv(
  path,
  ...,
  labels_from = NULL,
  clean_names_fun = NULL,
  read_fun = "guess",
  subdirectories = FALSE,
  datetime_extraction = "guess",
  verbose = getOption("edc_read_verbose", 1)
)
path: the path to the directory containing the .csv files
...: unused
labels_from: the file to read column labels from (see Details)
clean_names_fun: a cleaning function to be applied to column names
read_fun: the function used to read the files ("guess" by default)
subdirectories: whether to also read subdirectories
datetime_extraction: the datetime of the data extraction ("guess" by default)
verbose: the verbosity of the output
a list containing one dataframe for each .csv file in the folder, the extraction date (datetime_extraction), and a summary of all imported tables (.lookup)
labels_from should contain the information about column labels. It should be a data file (.csv) containing 2 columns: one for the column name and the other for its associated label. Use options(edc_col_name="xxx", edc_col_label="xxx") to specify the names of the columns.
Other EDCimport reading functions: read_all_sas(), read_all_xpt(), read_trialmaster().
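A hedged sketch of a typical call; the folder and label file names are assumptions:
## Not run:
db = read_all_csv("data/csv_export/", labels_from="data/labels.csv")
load_database(db)
edc_lookup()
## End(Not run)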
Read all .sas7bdat files in a directory. Formats can be applied from a procformat.sas SAS file, from a .sas7bcat catalog file, or from a data file (see Details).
read_all_sas(
  path,
  ...,
  format_file = "procformat.sas",
  clean_names_fun = NULL,
  subdirectories = FALSE,
  datetime_extraction = "guess",
  verbose = getOption("edc_read_verbose", 1)
)
path: the path to the directory containing the .sas7bdat files
...: unused
format_file: the file to read formats from (see Details)
clean_names_fun: a cleaning function to be applied to column names
subdirectories: whether to also read subdirectories
datetime_extraction: the datetime of the data extraction ("guess" by default)
verbose: the verbosity of the output
a list containing one dataframe for each .sas7bdat file in the folder, the extraction date (datetime_extraction), and a summary of all imported tables (.lookup)
format_file should contain the information about SAS formats. It can be either:
- a procformat.sas file, containing the whole PROC FORMAT
- a catalog file (.sas7bcat)
- a data file (.csv or .sas7bdat) containing 3 columns: the SAS format name (repeated), each level, and its associated label. Use options(edc_var_format_name="xxx", edc_var_level="xxx", edc_var_label="xxx") to specify the names of the columns.
Other EDCimport reading functions: read_all_csv(), read_all_xpt(), read_trialmaster().
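A hedged sketch of a typical call; the folder name is an assumption:
## Not run:
db = read_all_sas("data/sas_export/", format_file="procformat.sas")
load_database(db)
## End(Not run)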
Read all .xpt files in a directory (unzipped TrialMaster archive). If 7zip is installed, you should probably use read_trialmaster() instead. If a procformat.sas file exists in the directory, formats will be applied.
read_all_xpt(
  path,
  ...,
  format_file = "procformat.sas",
  clean_names_fun = NULL,
  datetime_extraction = "guess",
  subdirectories = FALSE,
  verbose = getOption("edc_read_verbose", 1),
  directory = "deprecated",
  key_columns = "deprecated"
)
path: the path to the directory containing the .xpt files
...: unused
format_file: the file to read formats from (see Details)
clean_names_fun: a cleaning function to be applied to column names
datetime_extraction: the datetime of the data extraction ("guess" by default)
subdirectories: whether to also read subdirectories
verbose: the verbosity of the output
directory: deprecated in favour of path
key_columns: deprecated
a list containing one dataframe for each .xpt file in the folder, the extraction date (datetime_extraction), and a summary of all imported tables (.lookup)
format_file should contain the information about SAS formats. It can be either:
- a procformat.sas file, containing the whole PROC FORMAT
- a data file (.csv or .sas7bdat) containing 3 columns: the SAS format name (repeated), each level, and its associated label. Use options(edc_var_format_name="xxx", edc_var_level="xxx", edc_var_label="xxx") to specify the names of the columns.
Other EDCimport reading functions: read_all_csv(), read_all_sas(), read_trialmaster().
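A hedged sketch of a typical call; the folder name (an unzipped archive) is an assumption:
## Not run:
db = read_all_xpt("data/xpt_export/")
load_database(db)
## End(Not run)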
Import the .zip archive of a TrialMaster trial export as a list of dataframes. The archive filename should be left untouched, as it contains the project name and the date of extraction. A .rds cache file is generated for future reads. If 7zip is not installed or available, use read_all_xpt() instead. The TM export should be of type SAS Xport, with the checkbox "Include Codelists" ticked.
read_trialmaster(
  archive,
  ...,
  use_cache = "write",
  clean_names_fun = NULL,
  subdirectories = FALSE,
  pw = getOption("trialmaster_pw"),
  verbose = getOption("edc_read_verbose", 1),
  key_columns = "deprecated"
)
archive: the path to the TrialMaster .zip archive
...: unused
use_cache: whether to read/write an .rds cache of the import ("write" by default)
clean_names_fun: a cleaning function to be applied to column names
subdirectories: whether to also read subdirectories
pw: the password of the archive, if any (defaults to getOption("trialmaster_pw"))
verbose: the verbosity of the output
key_columns: deprecated
a list containing one dataframe for each .xpt file in the archive, the extraction date (datetime_extraction), and a summary of all imported tables (.lookup)
Other EDCimport reading functions: read_all_csv(), read_all_sas(), read_all_xpt().
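A hedged sketch of a typical import; the archive path follows the naming scheme shown in the search_for_newer_data() example below:
## Not run:
archive = "data/MYPROJECT_ExportTemplate_xxx_SAS_XPORT_2024_06_01_12_00.zip"
db = read_trialmaster(archive, pw=getOption("trialmaster_pw"))
load_database(db)
edc_lookup()
## End(Not run)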
Save a plotly to an HTML file
save_plotly(p, file, ...)
p: a plot object (plotly or ggplot)
file: a file path to save the HTML file
...: passed on to htmlwidgets::saveWidget
nothing, used for side effect
## Not run:
db = edc_example()
load_database(db)
p = edc_swimmerplot(id_lim=c(5,45))
save_plotly(p, "graph/swimplots/edc_swimmerplot.html", title="My Swimmerplot")
## End(Not run)
Save sessionInfo() output into a text file.
save_sessioninfo(path = "check/session_info.txt", with_date = TRUE)
path: target path to write the file
with_date: whether to insert the date before the file extension
nothing
## Not run:
save_sessioninfo()
## End(Not run)
Search some folders to check whether a TrialMaster export more recent than the current extraction is available. By default, it searches the "data" folder and the OS's usual "Downloads" folder. If a newer database is found, the user is asked whether they want to move it to the "data" folder.
search_for_newer_data(
  archive,
  ...,
  source = path_home("Downloads"),
  target = "data",
  ask = TRUE,
  advice = TRUE
)
archive: TM archive path, giving the project name and date
...: unused
source: the path vector to be searched; defaults to both "data" and the usual "Downloads" folder
target: the path where files should be copied
ask: whether to ask the user to move the file to "data"
advice: whether to advise how to move the file instead, when ask is FALSE
the path to the newer file, invisibly.
## Not run:
archive = "data/MYPROJECT_ExportTemplate_xxx_SAS_XPORT_2024_06_01_12_00.zip"
# tm = read_trialmaster(archive)
search_for_newer_data(archive)
## End(Not run)
Select all columns that have only one level for a given grouping scope. Useful when dealing with mixed datasets containing both long data and repeated short data.
select_distinct(df, .by)
df: a dataframe
.by: optional grouping columns
`df`, with fewer columns
db = edc_example()
db$ae %>% colnames()
# `crfname` has one level for the whole dataset
db$ae %>% select_distinct() %>% colnames()
# `n_ae` has one level per patient
db$ae %>% select_distinct(.by=subjid) %>% colnames()
A dataset is either in the wide format or in the long format. This function identifies the format of a dataframe with respect to a subject ID. If a dataframe has some wide and some long columns, it is considered "mixed".
table_format(
  df,
  id = get_subjid_cols(),
  ...,
  ignore_cols = get_meta_cols(0.95),
  na_rm = FALSE,
  warn = TRUE
)
df: a dataframe
id: the identifying subject ID
...: not used
ignore_cols: columns to ignore
na_rm: whether to consider missing values
warn: whether to warn if the ID is not found
a string value in c("wide", "long", "mixed")
db = edc_example()
sapply(db, table_format, warn=FALSE)
Turn a vector of length N into a vector of length 1 after checking that there is only one unique value. Useful to safely flatten a duplicated table. This preserves the label attribute if set.
unify(x)
x: a vector
a vector of length 1
unify(c(1,1,1,1))
# unify(c(1,1,2,1)) # warning
library(dplyr)
x = tibble(id=rep(letters[1:5], 10), value=rep(1:5, 10))
x %>% group_by(id) %>% summarise(value=unify(value)) # safer than `value=value[1]`
x$value[2] = 1
# x %>% group_by(id) %>% summarise(value=unify(value)) # warning about that non-unique value