(Setup, input, description)

(Setup)

Note: Unfortunately, part of this analysis depends on data/numbers that cannot be publicly shared, and are being ‘gitignored’. This work has been done in a separate file called ‘private_work.R’. This is switched on if you change the code below to `private <- TRUE’

#private <- TRUE
private <- FALSE

if (private) {
  source(here::here("code", "private_work.R"))
}

Sourcing functions base options

knitr::opts_chunk$set(echo = TRUE, results = 'asis')

#options(scipen=100) #'penalizes' using scientific notation

sumnumopt <- setSummaries(numeric= c("centralValue" ,"quartiles", "minMax","countMissing"))

#st_options("round.digits",3)

#options(todor_rmd=TRUE)

local({
  r <- getOption("repos")
  r["CRAN"] <- "http://cran.r-project.org"
  options(repos = r)
})

Assign names to folders and files

data_folder <- here("data","dv_data")
#note -- running this interactively seems to require one step down:  "../data/dv_data"... but the 'here' seems to solve it

data_2018 <- "Stats-for-DV-Formula_TYVid_dr_editing.xlsx"
data_2019 <- "stats-for-dv-formula_tyvid_2019_dr_editing.xlsx"

(ALL INPUT AND DESCRIPTION)

Note: file documentation from DV experiments has been moved to section below

Input, clean, mutate

Our charity partner gave us three types of summary statistics on the data:

  1. Basic statistics: counts of recipients (N), clicks/conversions, etc., and amount raised

  2. Donation crosstabs, by donation ranges

  3. Ranks of donations within each run/year

each classed or identified by run/year and treatment.

We were also able to infer the combination of 2018 and 2019 ‘treatments’ (mailings) for each individual, and recover donation counts and summary statistics for these combination.


Input basic statistics (sample sizes, outcome counts and totals)

Create objects: Data input parameters, names of estimation objects

range_names    <- list("2018"="moved_from...!R11:V41", "2019"="bins_data_input!A1:E31")
range_names_ss <- list("2018"="TY-Video!H24:O26", "2019"="bins!A3:I5")
data_filenames <- list("2018"=data_2018, "2019"=data_2019)

Actual input

base_stats <- purrr::map2_dfr(data_filenames, range_names_ss, ~read_excel(file.path(data_folder, .x), range = .y,  col_types = "text"), .id='run') %>%
  mutate_at(vars(-matches("email|revenue")), as.numeric) %>%
          clean_names(case = c("snake"))  %>%
          mutate(revenue = as.numeric(gsub('[$,]', '', revenue)),
          recipients = coalesce(count_overall, recipients)) %>%
 mutate(
    treatment = case_when(
      stringr::str_detect(email,regex("impact", ignore_case = TRUE)) ~ "treat",
      TRUE ~ "control"),
    av_pos_gift    = revenue/conversions,
    rev_per_recip  = revenue/recipients,
    conv_per_recip = conversions/recipients,
    conv_per_click = conversions/clicks,
    add_zeroes     = recipients - conversions,
    run            = as.character(run)
    ) %>%
  dplyr::select(run, treatment, recipients, bounces, opens, clicks, conversions, av_pos_gift, conv_per_recip, conv_per_click, rev_per_recip, everything(), -avg_gift, -count_overall) %>% #note redundant variables are removed here
  arrange(run,treatment) %>%
  setter::set_comment("2018-19 Summary statistics and totals (recip/open/click/conversion...) by treatment and year")

#TODO - CHECK: For 2018 -- Are these the relevant outcomes, or should we be focusing on the "moved_from...!R11:V41" data instead?

#TODO: Can we get the 'modes other than email' values for this? Can they be imputed from the range-coded data?

#TODO: We are missing the important sum of squares etc... to allow t-tests! It is in cel H of the 'stats' table, but that seems to be from a different (earlier?) cut of the data; we need this information from something with the 38+47=85 conversions (from email?)

Aggregate 2018-19 for ‘total’ rows

Note that I ‘hard code in’ the (partial) total revenue and number of gifts including indirect for 2018 only. This is from the sheet TYEmailVideo_Results2.xlsx which we do not use for anything else. This is not good practice but the data we were sent was so chaotic that I don’t think this is consequential.

Note that they only sent these figures including indirect donations on 17-Jan-19 and did not update it. I don’t think it includes the outlier.

#for `treatments`
base_stats <- base_stats %>%
  group_by(treatment) %>%
  summarise_at(c("recipients","bounces","opens","clicks","conversions","revenue","unsubs","add_zeroes"), sum) %>%
mutate(
    av_pos_gift    = revenue/conversions,
    rev_per_recip  = revenue/recipients,
    conv_per_recip = conversions/recipients,
    conv_per_click = conversions/clicks,
    run            = "both"
    )  %>%
  ungroup() %>%
  bind_rows(base_stats) %>%
  mutate(
    conversions_including_indirect = case_when(
      run==2018 & treatment=="control" ~ 241,
      run==2018 & treatment=="treat" ~ 267
    ),
    rev_incl_indirect = case_when(
      run==2018 & treatment=="control" ~ 30931,
      run==2018 & treatment=="treat" ~ 43448
    )
  ) %>%
  dplyr::select(treatment, run, everything())



Set (and also get) names of key variables and objects to model, sample sizes

treatments <- c("control", "treat") #If we had crossed arms we would probably specify this as a list of character vectors\

outcomes_all <- c("opens", "clicks", "bounces", "conversions", "unsubs", "rev_per_recip", "av_pos_gift", "revenue")

outcomes     <- c("opens", "clicks", "conversions", "rev_per_recip", "av_pos_gift" )

#TODO: Add the 'donation mode' and sum of squares variables when I get these

bin_outcomes  <- c("opens", "clicks", "conversions")
cont_outcomes <- c("rev_per_recip", "av_pos_gift")

years <- c("2018", "2019")

#splits <- list(years, mode)

samp_size <- matrix(data=base_stats$recipients, nrow=length(years), ncol = length(treatments), dimnames=list(treatments, years))

samp_size_opens <- matrix(data=base_stats$opens, nrow=length(years), ncol = length(treatments), dimnames=list(treatments, years))

Input donation crosstabs (summary data we can analyse)

Input rank data

Add zero rows to allow easy ranksum tests

for (h in treatments) {
  for (i in 2018:2019) {
    assign(glue('z_{h}_{i}'),
        dv_ranks %>%
          filter(run=={i}) %>%
            slice(1) %>%
          #start with the first row of ranks only
          ungroup() %>%
         mutate(
           rank      = max_rank+1, #assign highest rank+1 in other data (for all zeroes)
           rev_rank  = min_revrank-1, #reverse rank (here it seems to be 0)
           treatment = as.factor({h}), #for control obs
           run       = as.character({i})
         ) %>%
         slice(rep(1:n(), each = base_stats$add_zeroes[base_stats$run=={i} & base_stats$treatment=={h}])) #repeat this for the number of controls less 1 less number of control donations
    )
  }
}

dv_ranks_all <- as_tibble(bind_rows(dv_ranks, z_control_2018, z_treat_2018, z_control_2019, z_treat_2019)) %>%
  setter::set_comment("Donation rank within each year, most to least, and reverse, with non-donations (zeroes) added")

rm(list = ls(pattern = "^z_"))

5.5.1 Cross-year data

(Note: this is brought in from data/dv_data/archive-and-original and coded in private_work.R. These folders and files are not shared.)

dv_link_sum <- readRDS(file = here::here("data", "dv_data", "dv_link_sum.rds")) %>%
      mutate(year = as.character(year)) %>% 
  setter::set_comment("Incidence and totals; combined 2018 ad 2019 treatments")  

Cleaning, Adding 2019 aggregation categories for 2018

dv_link_sum_all18 <- dv_link_sum %>% 
  filter(year==2019) %>%
  group_by(treat_2019) %>%
  select(-matches("mean|per|names")) %>% 
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE))) %>%
  mutate(treat_2018="Any", year = "2019")
  
dv_link_sum_not_treat18 <- dv_link_sum %>% 
  filter(year==2019) %>%
  filter(treat_2018=="Control" | is.na(treat_2018)) %>%
  group_by(treat_2019) %>%
  select(-matches("mean|per|names")) %>% 
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE))) %>%
    mutate(treat_2018="control_or_NA", year = "2019")

dv_link_sum <- bind_rows(dv_link_sum, dv_link_sum_all18, dv_link_sum_not_treat18)

names(dv_link_sum) <-  to_snake_case(names(dv_link_sum))

Rename and reformat to agree with individual year data:

dv_link_sum <- dv_link_sum %>% 
  dplyr::rename(
    run = year,
    bounces = bounced_sum,
    opens = opened_sum,
    clicks = clicked_sum,
    conversions = converted_sum,
    conv_per_recip = converted_mean, 
    revenue = contribution_sum,
    rev_per_recip = contribution_mean,
    unsubs = unsubscribed_sum
  ) %>%
  mutate(
    recipients = counter_sum - bounces)

Basic statistics for treatment combinations:

(
dt_link_counts <- dv_link_sum %>% 
  select(-matches("mean|per|names")) %>% 
  DT::datatable() 
)
(dt_link_mean <- dv_link_sum %>% 
  filter(treat_2018!="Control or NA" & treat_2018!="Any") %>% 
  select(matches("mean|run|treat|per")) %>% 
  DT::datatable()  %>%
  formatRound(columns=c('opened_mean', 'clicked_mean', 'rev_per_recip'), digits=4) %>% 
  formatRound(columns=c('conv_per_recip', 'bounced_mean', 'unsubscribed_mean'), digits=6) 
)

Description/depiction/codebook of (summary) data

Key data frames

  • base_stats:

  • dv_ranges: Donation stats in ranges by treatment, mode, year

  • dv_ranks_all: Donation rank within each year, most to least, and reverse, with non-donations (zeroes) added

  • dv_link_sum : Incidence and totals; combined 2018 ad 2019 treatments

Key summary statistics and graphs

big_stats_table <-  base_stats %>%
  as_tibble() %>%
mutate (
  conv_per_10k_recip = conv_per_recip*10000
) %>%
  dplyr::select(run, treatment, recipients, bounces, opens, clicks, conversions, conv_per_10k_recip, conv_per_click, rev_per_recip, av_pos_gift, conversions_including_indirect, rev_incl_indirect, -email,-add_zeroes) %>%
  kable() %>%
  kable_styling("striped", full_width = F) #as.tibble just to remove row label

big_stats_table
run treatment recipients bounces opens clicks conversions conv_per_10k_recip conv_per_click rev_per_recip av_pos_gift conversions_including_indirect rev_incl_indirect
both control 131175 180 29047 681 74 5.641319 0.1086637 0.1399200 248.02703 NA NA
both treat 131173 178 28558 645 109 8.309637 0.1689922 0.0859399 103.42202 NA NA
2018 control 91298 39 16906 414 27 2.957349 0.0652174 0.1587768 536.88889 241 30931
2018 treat 91296 42 16195 371 71 7.776902 0.1913747 0.0703536 90.46479 267 43448
2019 control 39877 141 12141 267 47 11.786243 0.1760300 0.0967475 82.08511 NA NA
2019 treat 39877 136 12363 274 38 9.529303 0.1386861 0.1216240 127.63158 NA NA
funnel <- plot_ly(base_stats %>% filter(run=="both" & treatment=="control"))

funnel <- funnel %>% add_trace(type="funnel",
                  y = c("recipients", "opens", "clicks", "conversions"),
                  x = c(~recipients, ~opens, ~clicks, ~conversions),
            textinfo = "value+percent initial")

funnel <- funnel %>%
  layout(yaxis=list(categoryarray = c("recipients", "opens", "clicks", "conversions")))

funnel


Histogram of donation ranges by year treatment (and, for 2018, by response mode)

Key descriptives

Years (runs): 20182019

Treatments: controltreat