Skip to contents

Evenly distributes a given number of patients across a given number of sites, then simulates event reporting for each patient, reducing the number of reported events for patients assigned to event-under-reporting sites.

Usage

sim_test_data_study(
  n_pat = 1000,
  n_sites = 20,
  ratio_out = 0,
  factor_event_rate = 0,
  max_visit_mean = 20,
  max_visit_sd = 4,
  event_rates = dgamma(seq(1, 20, 0.5), shape = 5, rate = 2) * 5 + 0.1,
  event_names = c("event"),
  study_id = "A"
)

Arguments

n_pat

integer, number of patients, Default: 1000

n_sites

integer, number of sites, Default: 20

ratio_out

ratio of sites with outlier, Default: 0

factor_event_rate

event reporting rate factor for outlier sites, modifies the mean event-per-visit rate used for outlier sites. Negative values simulate under-reporting, positive values simulate over-reporting, e.g. -0.4 -> 40% under-reporting, +0.4 -> 40% over-reporting, Default: 0

max_visit_mean

mean of the maximum number of visits of each patient, Default: 20

max_visit_sd

standard deviation of maximum number of visits of each patient, Default: 4

event_rates

list or vector with visit-specific event rates. Use list for multiple event names, Default: dgamma(seq(1, 20, 0.5), shape = 5, rate = 2) * 5 + 0.1

event_names

vector, contains the event names, Default: c("event")

study_id

character, Default: "A"

Value

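tibble with columns patient_id, site_id, is_out, max_visit_mean, max_visit_sd, event_per_visit_mean, visit, n_event, study_id. When several event names are supplied, the event columns are created once per event name (e.g. ae_per_visit_mean, pd_per_visit_mean, n_ae, n_pd).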

Details

The maximum visit number of each patient is sampled from a normal distribution with characteristics derived from max_visit_mean and max_visit_sd, while the events per visit are sampled from a Poisson distribution described by event_per_visit_mean.

Examples

set.seed(1)
# no outlier
df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5)
df_visit[which(df_visit$patient_id == "P000001"),]
#> # A tibble: 17 × 9
#>    patient_id site_id is_out max_visit_mean max_visit_sd event_per_visit_mean
#>    <chr>      <chr>   <lgl>           <dbl>        <dbl>                <dbl>
#>  1 P000001    S0001   FALSE              20            4                0.353
#>  2 P000001    S0001   FALSE              20            4                0.353
#>  3 P000001    S0001   FALSE              20            4                0.353
#>  4 P000001    S0001   FALSE              20            4                0.353
#>  5 P000001    S0001   FALSE              20            4                0.353
#>  6 P000001    S0001   FALSE              20            4                0.353
#>  7 P000001    S0001   FALSE              20            4                0.353
#>  8 P000001    S0001   FALSE              20            4                0.353
#>  9 P000001    S0001   FALSE              20            4                0.353
#> 10 P000001    S0001   FALSE              20            4                0.353
#> 11 P000001    S0001   FALSE              20            4                0.353
#> 12 P000001    S0001   FALSE              20            4                0.353
#> 13 P000001    S0001   FALSE              20            4                0.353
#> 14 P000001    S0001   FALSE              20            4                0.353
#> 15 P000001    S0001   FALSE              20            4                0.353
#> 16 P000001    S0001   FALSE              20            4                0.353
#> 17 P000001    S0001   FALSE              20            4                0.353
#> # ℹ 3 more variables: visit <int>, n_event <dbl>, study_id <chr>

# under-reporting outlier
df_visit <- sim_test_data_study(n_pat = 100, n_sites = 5,
    ratio_out = 0.2, factor_event_rate = -0.5)
df_visit[which(df_visit$patient_id == "P000001"),]
#> # A tibble: 23 × 9
#>    patient_id site_id is_out max_visit_mean max_visit_sd event_per_visit_mean
#>    <chr>      <chr>   <lgl>           <dbl>        <dbl>                <dbl>
#>  1 P000001    S0001   TRUE               20            4                0.176
#>  2 P000001    S0001   TRUE               20            4                0.176
#>  3 P000001    S0001   TRUE               20            4                0.176
#>  4 P000001    S0001   TRUE               20            4                0.176
#>  5 P000001    S0001   TRUE               20            4                0.176
#>  6 P000001    S0001   TRUE               20            4                0.176
#>  7 P000001    S0001   TRUE               20            4                0.176
#>  8 P000001    S0001   TRUE               20            4                0.176
#>  9 P000001    S0001   TRUE               20            4                0.176
#> 10 P000001    S0001   TRUE               20            4                0.176
#> # ℹ 13 more rows
#> # ℹ 3 more variables: visit <int>, n_event <dbl>, study_id <chr>

# constant event rates
sim_test_data_study(n_pat = 100, n_sites = 5, event_rates = 0.5)
#> # A tibble: 1,968 × 9
#>    patient_id site_id is_out max_visit_mean max_visit_sd event_per_visit_mean
#>    <chr>      <chr>   <lgl>           <dbl>        <dbl>                <dbl>
#>  1 P000001    S0001   FALSE              20            4                  0.5
#>  2 P000001    S0001   FALSE              20            4                  0.5
#>  3 P000001    S0001   FALSE              20            4                  0.5
#>  4 P000001    S0001   FALSE              20            4                  0.5
#>  5 P000001    S0001   FALSE              20            4                  0.5
#>  6 P000001    S0001   FALSE              20            4                  0.5
#>  7 P000001    S0001   FALSE              20            4                  0.5
#>  8 P000001    S0001   FALSE              20            4                  0.5
#>  9 P000001    S0001   FALSE              20            4                  0.5
#> 10 P000001    S0001   FALSE              20            4                  0.5
#> # ℹ 1,958 more rows
#> # ℹ 3 more variables: visit <int>, n_event <dbl>, study_id <chr>

# non-constant event rates for two event types
event_rates_ae <- c(0.7, rep(0.5, 8), rep(0.3, 5))
event_rates_pd <- c(0.3, rep(0.4, 6), rep(0.1, 5))

sim_test_data_study(
n_pat = 100,
n_sites = 5,
event_names = c("ae", "pd"),
event_rates = list(event_rates_ae, event_rates_pd)
)
#> # A tibble: 1,904 × 11
#>    patient_id site_id is_out max_visit_mean max_visit_sd ae_per_visit_mean
#>    <chr>      <chr>   <lgl>           <dbl>        <dbl>             <dbl>
#>  1 P000001    S0001   FALSE              20            4             0.443
#>  2 P000001    S0001   FALSE              20            4             0.443
#>  3 P000001    S0001   FALSE              20            4             0.443
#>  4 P000001    S0001   FALSE              20            4             0.443
#>  5 P000001    S0001   FALSE              20            4             0.443
#>  6 P000001    S0001   FALSE              20            4             0.443
#>  7 P000001    S0001   FALSE              20            4             0.443
#>  8 P000001    S0001   FALSE              20            4             0.443
#>  9 P000001    S0001   FALSE              20            4             0.443
#> 10 P000001    S0001   FALSE              20            4             0.443
#> # ℹ 1,894 more rows
#> # ℹ 5 more variables: pd_per_visit_mean <dbl>, visit <int>, n_ae <dbl>,
#> #   n_pd <dbl>, study_id <chr>