SAS Files as a Data Source • simaerep

Load

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(simaerep))
suppressPackageStartupMessages(library(haven))

SAS files

Typically, clinical data is stored in several SAS files in a standardized format. We need the files in which the visits and the AEs are recorded. For this demo we have selected an anonymized data set which only contains patients enrolled into the control arm. In those data sets the AE onset dates and the visit dates have been replaced with the number of days that have passed since a specific cut-off date. Data sets that contain actual dates instead of day numbers can be processed in a similar way.

# Read the AE data set and keep only the study, subject and site identifiers
# plus AESTDY, the AE onset given as a day number rather than a calendar date.
df_ae <- select(
  haven::read_sas("adae.sas7bdat"),
  STUDYID, SUBJID, SITEID, AESTDY
)

df_ae

## # A tibble: 4,702 × 4
##    STUDYID    SUBJID   SITEID AESTDY
##    <chr>      <chr>    <chr>   <dbl>
##  1 CO-101-001 01001001 001       -14
##  2 CO-101-001 01001001 001       -13
##  3 CO-101-001 01001001 001        50
##  4 CO-101-001 01001001 001        67
##  5 CO-101-001 01001001 001        77
##  6 CO-101-001 01001001 001        84
##  7 CO-101-001 01001001 001        99
##  8 CO-101-001 01001001 001        99
##  9 CO-101-001 01001001 001       127
## 10 CO-101-001 01001001 001       127
## # ℹ 4,692 more rows

# Read the visit data set and keep only the study, subject and site
# identifiers plus ADY, the visit day given as a day number.
df_vs <- select(
  haven::read_sas("advs.sas7bdat"),
  STUDYID, SUBJID, SITEID, ADY
)

df_vs

## # A tibble: 45,176 × 4
##    STUDYID    SUBJID   SITEID   ADY
##    <chr>      <chr>    <chr>  <dbl>
##  1 CO-101-001 01001001 001       -7
##  2 CO-101-001 01001001 001        1
##  3 CO-101-001 01001001 001        1
##  4 CO-101-001 01001001 001        8
##  5 CO-101-001 01001001 001        8
##  6 CO-101-001 01001001 001       15
##  7 CO-101-001 01001001 001       15
##  8 CO-101-001 01001001 001       29
##  9 CO-101-001 01001001 001       29
## 10 CO-101-001 01001001 001       38
## # ℹ 45,166 more rows

In order to assign each AE to a visit we union both event tables and sort by date.

# Give the AE day column the shared name DY and tag every row as an AE event
# so that the table can be stacked with the visit table.
df_ae <- rename(df_ae, DY = AESTDY)
df_ae <- mutate(df_ae, EVENT = "AE")

# Clean the visit records, then bring them into the same shape as the AE
# table (day column DY, event tag EVENT).
df_vs <- df_vs %>%
  # visits without a day cannot be placed on the timeline
  filter(!is.na(ADY)) %>%
  # several records of one patient on the same day count as a single visit
  distinct() %>%
  rename(DY = ADY) %>%
  mutate(EVENT = "VS")

# Stack AEs and visits into one event table and number both event types
# along each patient's timeline.
df_aevs <- bind_rows(df_ae, df_vs) %>%
  # Sort each patient's events by day. NAs are sorted to the end, so AEs
  # without a day end up after the patient's last visit. EVENT is an explicit
  # tie-breaker: on a shared day "AE" rows come before the "VS" row, so the
  # result does not depend on the order in which the tables were stacked.
  arrange(STUDYID, SITEID, SUBJID, DY, EVENT) %>%
  group_by(STUDYID, SITEID, SUBJID) %>%
  # running counts of AEs and of visits per patient
  mutate(
    AE_NO = cumsum(ifelse(EVENT == "AE", 1, 0)),
    VS_NO = cumsum(ifelse(EVENT == "VS", 1, 0))
  ) %>%
  # we remove patients with 0 visits
  filter(max(VS_NO) > 0) %>%
  # AEs before the first visit should register to visit 1, not visit 0
  mutate(VS_NO = ifelse(VS_NO == 0, 1, VS_NO)) %>%
  # drop the per-patient grouping so it does not leak into later steps
  ungroup()

Example of a patient with an AE before the first visit and AEs with a missing date (NA):

# Render all event rows of one example patient as a table.
knitr::kable(
  filter(df_aevs, SUBJID == "01007004")
)

STUDYID	SUBJID	SITEID	DY	EVENT	AE_NO	VS_NO
CO-101-001	01007004	007	-14	AE	1	1
CO-101-001	01007004	007	-8	VS	1	1
CO-101-001	01007004	007	-3	AE	2	1
CO-101-001	01007004	007	-1	AE	3	1
CO-101-001	01007004	007	1	AE	4	1
CO-101-001	01007004	007	1	VS	4	2
CO-101-001	01007004	007	8	VS	4	3
CO-101-001	01007004	007	15	AE	5	3
CO-101-001	01007004	007	15	AE	6	3
CO-101-001	01007004	007	18	AE	7	3
CO-101-001	01007004	007	20	VS	7	4
CO-101-001	01007004	007	29	VS	7	5
CO-101-001	01007004	007	36	AE	8	5
CO-101-001	01007004	007	36	AE	9	5
CO-101-001	01007004	007	36	AE	10	5
CO-101-001	01007004	007	36	VS	10	6
CO-101-001	01007004	007	43	VS	10	7
CO-101-001	01007004	007	57	AE	11	7
CO-101-001	01007004	007	57	AE	12	7
CO-101-001	01007004	007	57	VS	12	8
CO-101-001	01007004	007	58	AE	13	8
CO-101-001	01007004	007	64	VS	13	9
CO-101-001	01007004	007	71	VS	13	10
CO-101-001	01007004	007	85	VS	13	11
CO-101-001	01007004	007	92	VS	13	12
CO-101-001	01007004	007	99	AE	14	12
CO-101-001	01007004	007	99	VS	14	13
CO-101-001	01007004	007	112	VS	14	14
CO-101-001	01007004	007	120	AE	15	14
CO-101-001	01007004	007	123	AE	16	14
CO-101-001	01007004	007	123	AE	17	14
CO-101-001	01007004	007	123	AE	18	14
CO-101-001	01007004	007	125	VS	18	15
CO-101-001	01007004	007	NA	AE	19	15
CO-101-001	01007004	007	NA	AE	20	15
CO-101-001	01007004	007	NA	AE	21	15

Then we aggregate on visit number.

# Collapse the event table to one row per patient visit carrying the
# cumulative AE count (CUM_AE) reported for that visit.
df_aevs_aggr <- df_aevs %>%
  group_by(STUDYID, SITEID, SUBJID, VS_NO) %>%
  # lowest and highest running AE count among the rows of each visit number
  summarise(
    MIN_AE_NO = min(AE_NO),
    MAX_AE_NO = max(AE_NO),
    # keep the per-patient grouping for the next step
    .groups = "drop_last"
  ) %>%
  mutate(
    # last visit number of the patient
    MAX_VS_PAT = max(VS_NO),
    # A visit reports the lowest running count among its rows. The last visit
    # reports the highest one instead, so that AEs occurring after the last
    # visit are assigned to the last visit.
    # NOTE(review): when a patient has several AEs before the first visit,
    # MIN_AE_NO of visit 1 is 1, so only one of them is counted at visit 1 and
    # the others first show up at visit 2 - confirm this is intended.
    CUM_AE = ifelse(VS_NO == MAX_VS_PAT, MAX_AE_NO, MIN_AE_NO)
  ) %>%
  ungroup()

Same patient example as before.

# Render the aggregated rows of the same example patient as a table.
knitr::kable(
  filter(df_aevs_aggr, SUBJID == "01007004")
)

STUDYID	SITEID	SUBJID	VS_NO	MIN_AE_NO	MAX_AE_NO	MAX_VS_PAT	CUM_AE
CO-101-001	007	01007004	1	1	4	15	1
CO-101-001	007	01007004	2	4	4	15	4
CO-101-001	007	01007004	3	4	7	15	4
CO-101-001	007	01007004	4	7	7	15	7
CO-101-001	007	01007004	5	7	10	15	7
CO-101-001	007	01007004	6	10	10	15	10
CO-101-001	007	01007004	7	10	12	15	10
CO-101-001	007	01007004	8	12	13	15	12
CO-101-001	007	01007004	9	13	13	15	13
CO-101-001	007	01007004	10	13	13	15	13
CO-101-001	007	01007004	11	13	13	15	13
CO-101-001	007	01007004	12	13	14	15	13
CO-101-001	007	01007004	13	14	14	15	14
CO-101-001	007	01007004	14	14	18	15	14
CO-101-001	007	01007004	15	18	21	15	21

As a control we check whether the numbers of visits and AEs in our processed data still match the numbers of visits and AEs in our original data.

# The aggregate has one row per patient visit, so its row count must equal
# the row count of the cleaned visit table.
stopifnot(nrow(df_aevs_aggr) == nrow(df_vs))

# Total AE count after processing: the highest cumulative count of each
# patient, summed over all patients. A patient is identified by STUDYID,
# SITEID and SUBJID, the same key that was used to build df_aevs_aggr, so the
# check does not rely on SUBJID being unique across sites or studies.
n_aes <- df_aevs_aggr %>%
  group_by(STUDYID, SITEID, SUBJID) %>%
  summarise(n_aes = max(CUM_AE), .groups = "drop") %>%
  pull(n_aes) %>%
  sum()

# Total AE count in the input data, restricted to the patients that were
# kept, i.e. patients with at least one visit.
n_aes_original <- df_ae %>%
  semi_join(df_aevs_aggr, by = c("STUDYID", "SITEID", "SUBJID")) %>%
  nrow()

stopifnot(n_aes == n_aes_original)

{simaerep}

After renaming some of the columns we can pass the aggregated data from the SAS files to simaerep

# Keep the five columns that are passed on to simaerep and rename them in the
# same step (new_name = OLD_NAME).
df_visit <- df_aevs_aggr %>%
  select(
    study_id = STUDYID,
    site_number = SITEID,
    patnum = SUBJID,
    n_ae = CUM_AE,
    visit = VS_NO
  )

df_visit

## # A tibble: 5,432 × 5
##    study_id   site_number patnum    n_ae visit
##    <chr>      <chr>       <chr>    <dbl> <dbl>
##  1 CO-101-001 001         01001001     1     1
##  2 CO-101-001 001         01001001     2     2
##  3 CO-101-001 001         01001001     2     3
##  4 CO-101-001 001         01001001     2     4
##  5 CO-101-001 001         01001001     2     5
##  6 CO-101-001 001         01001001     2     6
##  7 CO-101-001 001         01001001     2     7
##  8 CO-101-001 001         01001001     3     8
##  9 CO-101-001 001         01001001     3     9
## 10 CO-101-001 001         01001001     6    10
## # ℹ 5,422 more rows

# Run simaerep on the visit-level table and plot the returned object.
aerep <- simaerep::simaerep(df_visit)

plot(aerep)

The left panel shows the mean AE reporting per site (light blue and dark blue lines) against the mean AE reporting of the entire study (golden line). Single sites are plotted in the right panel in descending order of AE under-reporting probability; there, grey lines denote the cumulative AE count of single patients. Grey dots in the left panel indicate the sites that were picked for single plotting. For sites drawn in dark blue, the AE under-reporting probability crossed the threshold of 95%. The numbers in the upper left corner indicate the ratio of patients that have been used for the analysis to the total number of patients. Patients that have not been on the study long enough to reach the evaluation point (visit_med75, see introduction) are ignored.