Introduction

We generally aggregate AEs by visit. Each patient follows the same visit schedule, and a specified number of days passes between consecutive visits. As visits are a scheduled contact point between the patient and physicians, the day of the visit is usually also the day when most AEs get reported. Alternatively, we can also choose to use simaerep to aggregate AEs by day instead of by visit.

Load Data

We load a public clinical trial data set which only contains data of the control arm; see the article "SAS files as a Data Source".

# Adverse event records: keep the study/subject/site keys and the AESTDY day column.
df_ae <- select(
  haven::read_sas('adae.sas7bdat'),
  STUDYID, SUBJID, SITEID, AESTDY
)

# Vital sign records: keep the same keys and the ADY day column.
df_vs <- select(
  haven::read_sas('advs.sas7bdat'),
  STUDYID, SUBJID, SITEID, ADY
)

# Give both tables a common day column (DY) and tag every row with its origin.
df_ae <- df_ae %>%
  mutate(EVENT = "AE") %>%
  rename(DY = AESTDY)

df_vs <- df_vs %>%
  # visits without a day cannot be placed on the timeline
  filter(!is.na(ADY)) %>%
  # several records on the same day collapse into a single visit
  distinct() %>%
  rename(DY = ADY) %>%
  mutate(EVENT = "VS")

df_aevs <- bind_rows(df_ae, df_vs) %>%
  # rows with a missing DY sort last, so undated AEs land after the patient's last visit
  arrange(STUDYID, SITEID, SUBJID, DY) %>%
  group_by(STUDYID, SITEID, SUBJID) %>%
  # running per-patient counts of AEs and visits along the timeline
  mutate(
    AE_NO = cumsum(as.numeric(EVENT == "AE")),
    VS_NO = cumsum(as.numeric(EVENT == "VS"))
  ) %>%
  # keep only patients that have at least one visit
  filter(max(VS_NO) > 0) %>%
  # AEs before the first visit are counted towards visit 1 instead of visit 0
  mutate(VS_NO = pmax(VS_NO, 1))


df_aevs_aggr <- df_aevs %>%
  group_by(STUDYID, SITEID, SUBJID, VS_NO) %>%
  # lowest and highest running AE count seen within each visit number
  summarise(
    MIN_AE_NO = min(AE_NO),
    MAX_AE_NO = max(AE_NO),
    .groups = "drop"
  ) %>%
  group_by(STUDYID, SITEID, SUBJID) %>%
  mutate(MAX_VS_PAT = max(VS_NO)) %>%
  ungroup() %>%
  # AEs that occur after the last visit are attributed to that last visit;
  # every earlier visit takes the lowest running count recorded for it
  mutate(CUM_AE = ifelse(VS_NO < MAX_VS_PAT, MIN_AE_NO, MAX_AE_NO))

# Rename to the column names used by the simaerep calls further down and
# keep only those five columns, in this order.
df_visit <- df_aevs_aggr %>%
  select(
    study_id = STUDYID,
    site_number = SITEID,
    patnum = SUBJID,
    n_ae = CUM_AE,
    visit = VS_NO
  )

Aggregate on Days

For aggregating on days we need to align the reference timelines of the single patients.

# Day of the first and of the last visit for every patient.
df_vs_min_max <- df_vs %>%
  group_by(STUDYID, SUBJID, SITEID) %>%
  summarise(
    min_DY = min(DY, na.rm = TRUE),
    max_DY = max(DY, na.rm = TRUE),
    .groups = "drop"
  )

df_vs_min_max[["min_DY"]][1:25]
##  [1]  -7 -14 -14  -7  -3 -10 -19 -12  -6  -9  -9 -12  -8  -7  -6  -8  -3 -14  -7
## [20] -14 -19 -12  -9 -21 -28
df_vs_min_max[["max_DY"]][1:25]
##  [1] 309 134  43   1 224 265 100 163  51 103  40  38 125 168  85  92 100 708 119
## [20]  64  43  51  70   1 225

The day of the first visit is different for each patient, and the values start out negative. First we shift all values to be positive, and then we normalize the AE day values to the day of each patient's first visit.

# Offset that lifts the earliest first-visit day in the data (negative in
# this data set) up to zero.
corr_factor <- abs(min(df_vs_min_max$min_DY))

# Day-based counterpart of the visit table: one row per patient and aligned
# day, carrying the cumulative AE count reached on that day.
df_days <- df_ae %>%
  # include patients with vs but no AE
  right_join(df_vs_min_max, by = c("STUDYID", "SUBJID", "SITEID")) %>%
  group_by(STUDYID, SUBJID, SITEID) %>%
  # AEs without a day are placed on the patient's latest dated AE
  mutate(DY = ifelse(is.na(DY) & ! is.na(EVENT), max(DY, na.rm = TRUE), DY)) %>%
  # patients without any AE (EVENT is NA after the join) get a single row
  # on the day of their last visit
  mutate(DY = ifelse(is.na(DY) & is.na(EVENT), max_DY, DY)) %>%
  # correct timelines: shift both day columns by the same offset, then
  # express each AE day relative to the patient's own first visit. This has
  # to be a subtraction; adding min_DY would not align the patients and
  # would apply the offset twice.
  mutate(DY = DY + corr_factor,
         min_DY = min_DY + corr_factor,
         DY_corr = DY - min_DY) %>%
  group_by(STUDYID, SITEID, SUBJID) %>%
  arrange(STUDYID, SITEID, SUBJID, DY_corr) %>%
  # cumulative AE count per patient along the aligned timeline
  mutate(n_ae = row_number()) %>%
  ungroup() %>%
  # set AE count to 0 for patients with no AEs
  mutate(n_ae = ifelse(is.na(EVENT), 0 , n_ae)) %>%
  rename(
    study_id = STUDYID,
    site_number = SITEID,
    patnum = SUBJID,
    visit = DY_corr
  ) %>%
  # several AEs on the same day: keep the highest cumulative count
  group_by(study_id, site_number, patnum, visit) %>%
  summarise(n_ae = max(n_ae), .groups = "drop")

Check that we get the same sites, patients and zero-AE patients as for the visit aggregation.

# Both aggregations must cover the same number of sites and patients.
stopifnot(n_distinct(df_days$site_number) == n_distinct(df_visit$site_number))
stopifnot(n_distinct(df_days$patnum) == n_distinct(df_visit$patnum))

# Sorted, de-duplicated ids of the patients whose AE count never rises
# above zero in the given visit-style table.
pat_without_ae <- function(df) {
  df %>%
    group_by(study_id, site_number, patnum) %>%
    filter(max(n_ae) == 0) %>%
    pull(patnum) %>%
    unique() %>%
    sort()
}

pat0_days <- pat_without_ae(df_days)
pat0_vs <- pat_without_ae(df_visit)

# Compare lengths first: `==` recycles vectors of different length, and
# against an empty vector it yields logical(0), which all() treats as TRUE,
# so the element-wise check alone would pass when only one side is empty.
stopifnot(
  length(pat0_days) == length(pat0_vs),
  all(pat0_days == pat0_vs)
)
df_days 
## # A tibble: 3,220 × 5
##    study_id   site_number patnum   visit  n_ae
##    <chr>      <chr>       <chr>    <dbl> <dbl>
##  1 CO-101-001 001         01001001    43     1
##  2 CO-101-001 001         01001001    44     2
##  3 CO-101-001 001         01001001   107     3
##  4 CO-101-001 001         01001001   124     4
##  5 CO-101-001 001         01001001   134     5
##  6 CO-101-001 001         01001001   141     6
##  7 CO-101-001 001         01001001   156     8
##  8 CO-101-001 001         01001001   184    10
##  9 CO-101-001 001         01001001   240    13
## 10 CO-101-001 001         01001001   241    14
## # ℹ 3,210 more rows

We do have gaps in between the days leading to implicitly missing values. simaerep will correct this automatically and throw a warning.

# Site-level aggregation of the day-based table.
df_site <- df_days %>%
  site_aggr()
## Warning in exp_implicit_missing_visits(df_visit): implicitly missing visit
## numbers detected and corrected

To silence the warning in the subsequent calls, we can run check_df_visit() ourselves; it is also called internally by all other functions accepting df_visit as an argument.

# Run simaerep's df_visit check once up front and keep its return value, so
# the table printed below includes the rows added for the missing days.
# NOTE(review): `:::` reaches into the package namespace; presumably
# check_df_visit() is not exported, so confirm before relying on it.
df_days <- simaerep:::check_df_visit(df_days)
## Warning in exp_implicit_missing_visits(df_visit): implicitly missing visit
## numbers detected and corrected
df_days
## # A tibble: 57,873 × 5
##    study_id   site_number patnum   visit  n_ae
##    <chr>      <chr>       <chr>    <dbl> <dbl>
##  1 CO-101-001 001         01001001     5     0
##  2 CO-101-001 001         01001001     6     0
##  3 CO-101-001 001         01001001     7     0
##  4 CO-101-001 001         01001001     8     0
##  5 CO-101-001 001         01001001     9     0
##  6 CO-101-001 001         01001001    10     0
##  7 CO-101-001 001         01001001    11     0
##  8 CO-101-001 001         01001001    12     0
##  9 CO-101-001 001         01001001    13     0
## 10 CO-101-001 001         01001001    14     0
## # ℹ 57,863 more rows

Then we proceed as usual.

# Simulate and evaluate the sites on the day-based aggregation, then plot.
df_sim_sites <- sim_sites(df_site, df_visit = df_days)

df_eval_days <- df_sim_sites %>%
  eval_sites()

simaerep::plot_study(
  df_visit = df_days,
  df_site = df_site,
  df_eval = df_eval_days,
  study = unique(df_days$study_id)
)

Aggregate on Visits

How do the results compare to aggregating on visits?

# Aggregate, simulate, evaluate and plot again, this time on the
# visit-based table; df_site and df_sim_sites are overwritten here.
df_site <- df_visit %>%
  site_aggr()

df_sim_sites <- sim_sites(df_site, df_visit)

df_eval_vs <- df_sim_sites %>%
  eval_sites()

simaerep::plot_study(
  df_visit,
  df_site,
  df_eval_vs,
  study = unique(df_visit$study_id)
)

Compare

We observe a difference in the results, which is largely attributable to the difference in the visit_med75 cut-off points that influence the set of patients included. In any case, we observe a high rank correlation with a low p-value across all results greater than 0.

As the inclusion/exclusion of patients in the analysis of a site in an ongoing trial can shift results, we recommend aggregating on visits that have actually occurred, because then all included patients had an equal number of opportunities to report AEs.

# Per-site comparison of the day-based and the visit-based evaluation.
df_comp <- df_eval_days %>%
  select(
    site_number,
    prob_low_prob_ur_days = prob_low_prob_ur,
    n_pat_with_med75_days = n_pat_with_med75
  ) %>%
  left_join(
    df_eval_vs %>%
      select(
        site_number,
        prob_low_prob_ur_vs = prob_low_prob_ur,
        n_pat_with_med75_vs = n_pat_with_med75
      ),
    by = "site_number"
  ) %>%
  # keep sites with a non-zero probability in at least one aggregation
  filter(prob_low_prob_ur_vs > 0 | prob_low_prob_ur_days > 0) %>%
  # probabilities first, then patient counts
  select(
    site_number,
    prob_low_prob_ur_days,
    prob_low_prob_ur_vs,
    n_pat_with_med75_days,
    n_pat_with_med75_vs
  ) %>%
  arrange(desc(prob_low_prob_ur_vs))

knitr::kable(df_comp)
site_number prob_low_prob_ur_days prob_low_prob_ur_vs n_pat_with_med75_days n_pat_with_med75_vs
084 0.9750000 0.9625000 5 5
078 0.9750000 0.9625000 5 5
073 0.4093750 0.8000000 6 5
067 1.0000000 0.7375000 17 15
079 0.8312500 0.6875000 13 12
086 0.4500000 0.6875000 5 5
071 0.0000000 0.2071429 2 3
066 0.6100000 0.1923077 6 5
070 0.4093750 0.1923077 15 17
080 0.1068182 0.1923077 20 20
085 0.1068182 0.1923077 1 1
069 0.1068182 0.1923077 3 5
090 0.0000000 0.1923077 3 4
# Spearman rank correlation between the per-site probabilities of the
# visit-based and the day-based aggregation (rows of df_comp only).
cor.test(
  df_comp$prob_low_prob_ur_vs,
  df_comp$prob_low_prob_ur_days,
  method = "spearman"
)
## Warning in cor.test.default(df_comp$prob_low_prob_ur_vs,
## df_comp$prob_low_prob_ur_days, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  df_comp$prob_low_prob_ur_vs and df_comp$prob_low_prob_ur_days
## S = 114.71, p-value = 0.009797
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.6848696