(Setup, input, description)

(Setup)

Note: Unfortunately, part of this analysis depends on data/numbers that cannot be publicly shared, and are being ‘gitignored’. This work has been done in a separate file called ‘private_work.R’. This is switched on if you change the code below to `private <- TRUE’

#private <- TRUE
private <- FALSE

if (private) {
  source(here::here("code", "private_work.R"))
}

Sourcing functions base options

knitr::opts_chunk$set(echo = TRUE, results = 'asis')

#options(scipen=100) #'penalizes' using scientific notation

sumnumopt <- setSummaries(numeric= c("centralValue" ,"quartiles", "minMax","countMissing"))

#st_options("round.digits",3)

#options(todor_rmd=TRUE)

local({
  r <- getOption("repos")
  r["CRAN"] <- "http://cran.r-project.org"
  options(repos = r)
})

Assign names to folders and files

data_folder <- here("data","dv_data")
#note -- running this interactively seems to require one step down:  "../data/dv_data"... but the 'here' seems to solve it

data_2018 <- "Stats-for-DV-Formula_TYVid_dr_editing.xlsx"
data_2019 <- "stats-for-dv-formula_tyvid_2019_dr_editing.xlsx"

(ALL INPUT AND DESCRIPTION)

Note: file documentation from DV experiments has been moved to section below

Input, clean, mutate

Our charity partner gave us three types of summary statistics on the data:

Basic statistics: counts of recipients (N), clicks/conversions, etc., and amount raised
Donation crosstabs, by donation ranges
Ranks of donations within each run/year

… each classed or identified by run/year and treatment.

We were also able to infer the combination of 2018 and 2019 ‘treatments’ (mailings) for each individual, and recover donation counts and summary statistics for these combination.

Input basic statistics (sample sizes, outcome counts and totals)

Create objects: Data input parameters, names of estimation objects

range_names    <- list("2018"="moved_from...!R11:V41", "2019"="bins_data_input!A1:E31")
range_names_ss <- list("2018"="TY-Video!H24:O26", "2019"="bins!A3:I5")
data_filenames <- list("2018"=data_2018, "2019"=data_2019)

Actual input

base_stats <- purrr::map2_dfr(data_filenames, range_names_ss, ~read_excel(file.path(data_folder, .x), range = .y,  col_types = "text"), .id='run') %>%
  mutate_at(vars(-matches("email|revenue")), as.numeric) %>%
          clean_names(case = c("snake"))  %>%
          mutate(revenue = as.numeric(gsub('[$,]', '', revenue)),
          recipients = coalesce(count_overall, recipients)) %>%
 mutate(
    treatment = case_when(
      stringr::str_detect(email,regex("impact", ignore_case = TRUE)) ~ "treat",
      TRUE ~ "control"),
    av_pos_gift    = revenue/conversions,
    rev_per_recip  = revenue/recipients,
    conv_per_recip = conversions/recipients,
    conv_per_click = conversions/clicks,
    add_zeroes     = recipients - conversions,
    run            = as.character(run)
    ) %>%
  dplyr::select(run, treatment, recipients, bounces, opens, clicks, conversions, av_pos_gift, conv_per_recip, conv_per_click, rev_per_recip, everything(), -avg_gift, -count_overall) %>% #note redundant variables are removed here
  arrange(run,treatment) %>%
  setter::set_comment("2018-19 Summary statistics and totals (recip/open/click/conversion...) by treatment and year")

#TODO - CHECK: For 2018 -- Are these the relevant outcomes, or should we be focusing on the "moved_from...!R11:V41" data instead?

#TODO: Can we get the 'modes other than email' values for this? Can they be imputed from the range-coded data?

#TODO: We are missing the important sum of squares etc... to allow t-tests! It is in cel H of the 'stats' table, but that seems to be from a different (earlier?) cut of the data; we need this information from something with the 38+47=85 conversions (from email?)

Aggregate 2018-19 for ‘total’ rows

Note that I ‘hard code in’ the (partial) total revenue and number of gifts including indirect for 2018 only. This is from the sheet TYEmailVideo_Results2.xlsx which we do not use for anything else. This is not good practice but the data we were sent was so chaotic that I don’t think this is consequential.

Note that they only sent these figures including indirect donations on 17-Jan-19 and did not update it. I don’t think it includes the outlier.

#for `treatments`
base_stats <- base_stats %>%
  group_by(treatment) %>%
  summarise_at(c("recipients","bounces","opens","clicks","conversions","revenue","unsubs","add_zeroes"), sum) %>%
mutate(
    av_pos_gift    = revenue/conversions,
    rev_per_recip  = revenue/recipients,
    conv_per_recip = conversions/recipients,
    conv_per_click = conversions/clicks,
    run            = "both"
    )  %>%
  ungroup() %>%
  bind_rows(base_stats) %>%
  mutate(
    conversions_including_indirect = case_when(
      run==2018 & treatment=="control" ~ 241,
      run==2018 & treatment=="treat" ~ 267
    ),
    rev_incl_indirect = case_when(
      run==2018 & treatment=="control" ~ 30931,
      run==2018 & treatment=="treat" ~ 43448
    )
  ) %>%
  dplyr::select(treatment, run, everything())

Set (and also get) names of key variables and objects to model, sample sizes

treatments <- c("control", "treat") #If we had crossed arms we would probably specify this as a list of character vectors\

outcomes_all <- c("opens", "clicks", "bounces", "conversions", "unsubs", "rev_per_recip", "av_pos_gift", "revenue")

outcomes     <- c("opens", "clicks", "conversions", "rev_per_recip", "av_pos_gift" )

#TODO: Add the 'donation mode' and sum of squares variables when I get these

bin_outcomes  <- c("opens", "clicks", "conversions")
cont_outcomes <- c("rev_per_recip", "av_pos_gift")

years <- c("2018", "2019")

#splits <- list(years, mode)

samp_size <- matrix(data=base_stats$recipients, nrow=length(years), ncol = length(treatments), dimnames=list(treatments, years))

samp_size_opens <- matrix(data=base_stats$opens, nrow=length(years), ncol = length(treatments), dimnames=list(treatments, years))

Input donation crosstabs (summary data we can analyse)

Input rank data

Add zero rows to allow easy ranksum tests

for (h in treatments) {
  for (i in 2018:2019) {
    assign(glue('z_{h}_{i}'),
        dv_ranks %>%
          filter(run=={i}) %>%
            slice(1) %>%
          #start with the first row of ranks only
          ungroup() %>%
         mutate(
           rank      = max_rank+1, #assign highest rank+1 in other data (for all zeroes)
           rev_rank  = min_revrank-1, #reverse rank (here it seems to be 0)
           treatment = as.factor({h}), #for control obs
           run       = as.character({i})
         ) %>%
         slice(rep(1:n(), each = base_stats$add_zeroes[base_stats$run=={i} & base_stats$treatment=={h}])) #repeat this for the number of controls less 1 less number of control donations
    )
  }
}

dv_ranks_all <- as_tibble(bind_rows(dv_ranks, z_control_2018, z_treat_2018, z_control_2019, z_treat_2019)) %>%
  setter::set_comment("Donation rank within each year, most to least, and reverse, with non-donations (zeroes) added")

rm(list = ls(pattern = "^z_"))

5.5.1 Cross-year data

(Note: this is brought in from data/dv_data/archive-and-original and coded in private_work.R. These folders and files are not shared.)

dv_link_sum <- readRDS(file = here::here("data", "dv_data", "dv_link_sum.rds")) %>%
      mutate(year = as.character(year)) %>% 
  setter::set_comment("Incidence and totals; combined 2018 ad 2019 treatments")

Cleaning, Adding 2019 aggregation categories for 2018

dv_link_sum_all18 <- dv_link_sum %>% 
  filter(year==2019) %>%
  group_by(treat_2019) %>%
  select(-matches("mean|per|names")) %>% 
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE))) %>%
  mutate(treat_2018="Any", year = "2019")
  
dv_link_sum_not_treat18 <- dv_link_sum %>% 
  filter(year==2019) %>%
  filter(treat_2018=="Control" | is.na(treat_2018)) %>%
  group_by(treat_2019) %>%
  select(-matches("mean|per|names")) %>% 
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE))) %>%
    mutate(treat_2018="control_or_NA", year = "2019")

dv_link_sum <- bind_rows(dv_link_sum, dv_link_sum_all18, dv_link_sum_not_treat18)

names(dv_link_sum) <-  to_snake_case(names(dv_link_sum))

Rename and reformat to agree with individual year data:

dv_link_sum <- dv_link_sum %>% 
  dplyr::rename(
    run = year,
    bounces = bounced_sum,
    opens = opened_sum,
    clicks = clicked_sum,
    conversions = converted_sum,
    conv_per_recip = converted_mean, 
    revenue = contribution_sum,
    rev_per_recip = contribution_mean,
    unsubs = unsubscribed_sum
  ) %>%
  mutate(
    recipients = counter_sum - bounces)

Basic statistics for treatment combinations:

(
dt_link_counts <- dv_link_sum %>% 
  select(-matches("mean|per|names")) %>% 
  DT::datatable() 
)

(dt_link_mean <- dv_link_sum %>% 
  filter(treat_2018!="Control or NA" & treat_2018!="Any") %>% 
  select(matches("mean|run|treat|per")) %>% 
  DT::datatable()  %>%
  formatRound(columns=c('opened_mean', 'clicked_mean', 'rev_per_recip'), digits=4) %>% 
  formatRound(columns=c('conv_per_recip', 'bounced_mean', 'unsubscribed_mean'), digits=6) 
)

Description/depiction/codebook of (summary) data

Key data frames

base_stats:
dv_ranges: Donation stats in ranges by treatment, mode, year
dv_ranks_all: Donation rank within each year, most to least, and reverse, with non-donations (zeroes) added
dv_link_sum : Incidence and totals; combined 2018 ad 2019 treatments

Key summary statistics and graphs

big_stats_table <-  base_stats %>%
  as_tibble() %>%
mutate (
  conv_per_10k_recip = conv_per_recip*10000
) %>%
  dplyr::select(run, treatment, recipients, bounces, opens, clicks, conversions, conv_per_10k_recip, conv_per_click, rev_per_recip, av_pos_gift, conversions_including_indirect, rev_incl_indirect, -email,-add_zeroes) %>%
  kable() %>%
  kable_styling("striped", full_width = F) #as.tibble just to remove row label

big_stats_table

run	treatment	recipients	bounces	opens	clicks	conversions	conv_per_10k_recip	conv_per_click	rev_per_recip	av_pos_gift	conversions_including_indirect	rev_incl_indirect
both	control	131175	180	29047	681	74	5.641319	0.1086637	0.1399200	248.02703	NA	NA
both	treat	131173	178	28558	645	109	8.309637	0.1689922	0.0859399	103.42202	NA	NA
2018	control	91298	39	16906	414	27	2.957349	0.0652174	0.1587768	536.88889	241	30931
2018	treat	91296	42	16195	371	71	7.776902	0.1913747	0.0703536	90.46479	267	43448
2019	control	39877	141	12141	267	47	11.786243	0.1760300	0.0967475	82.08511	NA	NA
2019	treat	39877	136	12363	274	38	9.529303	0.1386861	0.1216240	127.63158	NA	NA

funnel <- plot_ly(base_stats %>% filter(run=="both" & treatment=="control"))

funnel <- funnel %>% add_trace(type="funnel",
                  y = c("recipients", "opens", "clicks", "conversions"),
                  x = c(~recipients, ~opens, ~clicks, ~conversions),
            textinfo = "value+percent initial")

funnel <- funnel %>%
  layout(yaxis=list(categoryarray = c("recipients", "opens", "clicks", "conversions")))

funnel

Histogram of donation ranges by year treatment (and, for 2018, by response mode)

Key descriptives

Years (runs): 20182019

Treatments: controltreat