(Setup, input, description)
(Setup)
Note: Unfortunately, part of this analysis depends on data/numbers that cannot be publicly shared, and are being ‘gitignored’. This work has been done in a separate file called ‘private_work.R’. This is switched on if you change the code below to `private <- TRUE’
#private <- TRUE
<- FALSE
private
if (private) {
source(here::here("code", "private_work.R"))
}
Sourcing functions base options
::opts_chunk$set(echo = TRUE, results = 'asis')
knitr
#options(scipen=100) #'penalizes' using scientific notation
<- setSummaries(numeric= c("centralValue" ,"quartiles", "minMax","countMissing"))
sumnumopt
#st_options("round.digits",3)
#options(todor_rmd=TRUE)
local({
<- getOption("repos")
r "CRAN"] <- "http://cran.r-project.org"
r[options(repos = r)
})
Assign names to folders and files
<- here("data","dv_data")
data_folder #note -- running this interactively seems to require one step down: "../data/dv_data"... but the 'here' seems to solve it
<- "Stats-for-DV-Formula_TYVid_dr_editing.xlsx"
data_2018 <- "stats-for-dv-formula_tyvid_2019_dr_editing.xlsx" data_2019
(ALL INPUT AND DESCRIPTION)
Note: file documentation from DV experiments has been moved to section below
Input, clean, mutate
Our charity partner gave us three types of summary statistics on the data:
Basic statistics: counts of recipients (N), clicks/conversions, etc., and amount raised
Donation crosstabs, by donation ranges
Ranks of donations within each run/year
… each classed or identified by run/year and treatment.
We were also able to infer the combination of 2018 and 2019 ‘treatments’ (mailings) for each individual, and recover donation counts and summary statistics for these combination.
Input basic statistics (sample sizes, outcome counts and totals)
Create objects: Data input parameters, names of estimation objects
<- list("2018"="moved_from...!R11:V41", "2019"="bins_data_input!A1:E31")
range_names <- list("2018"="TY-Video!H24:O26", "2019"="bins!A3:I5")
range_names_ss <- list("2018"=data_2018, "2019"=data_2019) data_filenames
Actual input
<- purrr::map2_dfr(data_filenames, range_names_ss, ~read_excel(file.path(data_folder, .x), range = .y, col_types = "text"), .id='run') %>%
base_stats mutate_at(vars(-matches("email|revenue")), as.numeric) %>%
clean_names(case = c("snake")) %>%
mutate(revenue = as.numeric(gsub('[$,]', '', revenue)),
recipients = coalesce(count_overall, recipients)) %>%
mutate(
treatment = case_when(
::str_detect(email,regex("impact", ignore_case = TRUE)) ~ "treat",
stringrTRUE ~ "control"),
av_pos_gift = revenue/conversions,
rev_per_recip = revenue/recipients,
conv_per_recip = conversions/recipients,
conv_per_click = conversions/clicks,
add_zeroes = recipients - conversions,
run = as.character(run)
%>%
) ::select(run, treatment, recipients, bounces, opens, clicks, conversions, av_pos_gift, conv_per_recip, conv_per_click, rev_per_recip, everything(), -avg_gift, -count_overall) %>% #note redundant variables are removed here
dplyrarrange(run,treatment) %>%
::set_comment("2018-19 Summary statistics and totals (recip/open/click/conversion...) by treatment and year")
setter
#TODO - CHECK: For 2018 -- Are these the relevant outcomes, or should we be focusing on the "moved_from...!R11:V41" data instead?
#TODO: Can we get the 'modes other than email' values for this? Can they be imputed from the range-coded data?
#TODO: We are missing the important sum of squares etc... to allow t-tests! It is in cel H of the 'stats' table, but that seems to be from a different (earlier?) cut of the data; we need this information from something with the 38+47=85 conversions (from email?)
Aggregate 2018-19 for ‘total’ rows
Note that I ‘hard code in’ the (partial) total revenue and number of gifts including indirect for 2018 only. This is from the sheet TYEmailVideo_Results2.xlsx
which we do not use for anything else. This is not good practice but the data we were sent was so chaotic that I don’t think this is consequential.
Note that they only sent these figures including indirect donations on 17-Jan-19 and did not update it. I don’t think it includes the outlier.
#for `treatments`
<- base_stats %>%
base_stats group_by(treatment) %>%
summarise_at(c("recipients","bounces","opens","clicks","conversions","revenue","unsubs","add_zeroes"), sum) %>%
mutate(
av_pos_gift = revenue/conversions,
rev_per_recip = revenue/recipients,
conv_per_recip = conversions/recipients,
conv_per_click = conversions/clicks,
run = "both"
%>%
) ungroup() %>%
bind_rows(base_stats) %>%
mutate(
conversions_including_indirect = case_when(
==2018 & treatment=="control" ~ 241,
run==2018 & treatment=="treat" ~ 267
run
),rev_incl_indirect = case_when(
==2018 & treatment=="control" ~ 30931,
run==2018 & treatment=="treat" ~ 43448
run
)%>%
) ::select(treatment, run, everything()) dplyr
Set (and also get) names of key variables and objects to model, sample sizes
<- c("control", "treat") #If we had crossed arms we would probably specify this as a list of character vectors\
treatments
<- c("opens", "clicks", "bounces", "conversions", "unsubs", "rev_per_recip", "av_pos_gift", "revenue")
outcomes_all
<- c("opens", "clicks", "conversions", "rev_per_recip", "av_pos_gift" )
outcomes
#TODO: Add the 'donation mode' and sum of squares variables when I get these
<- c("opens", "clicks", "conversions")
bin_outcomes <- c("rev_per_recip", "av_pos_gift")
cont_outcomes
<- c("2018", "2019")
years
#splits <- list(years, mode)
<- matrix(data=base_stats$recipients, nrow=length(years), ncol = length(treatments), dimnames=list(treatments, years))
samp_size
<- matrix(data=base_stats$opens, nrow=length(years), ncol = length(treatments), dimnames=list(treatments, years)) samp_size_opens
Add zero rows to allow easy ranksum tests
for (h in treatments) {
for (i in 2018:2019) {
assign(glue('z_{h}_{i}'),
%>%
dv_ranks filter(run=={i}) %>%
slice(1) %>%
#start with the first row of ranks only
ungroup() %>%
mutate(
rank = max_rank+1, #assign highest rank+1 in other data (for all zeroes)
rev_rank = min_revrank-1, #reverse rank (here it seems to be 0)
treatment = as.factor({h}), #for control obs
run = as.character({i})
%>%
) slice(rep(1:n(), each = base_stats$add_zeroes[base_stats$run=={i} & base_stats$treatment=={h}])) #repeat this for the number of controls less 1 less number of control donations
)
}
}
<- as_tibble(bind_rows(dv_ranks, z_control_2018, z_treat_2018, z_control_2019, z_treat_2019)) %>%
dv_ranks_all ::set_comment("Donation rank within each year, most to least, and reverse, with non-donations (zeroes) added")
setter
rm(list = ls(pattern = "^z_"))
5.5.1 Cross-year data
(Note: this is brought in from data/dv_data/archive-and-original
and coded in private_work.R
. These folders and files are not shared.)
<- readRDS(file = here::here("data", "dv_data", "dv_link_sum.rds")) %>%
dv_link_sum mutate(year = as.character(year)) %>%
::set_comment("Incidence and totals; combined 2018 ad 2019 treatments") setter
Cleaning, Adding 2019 aggregation categories for 2018
<- dv_link_sum %>%
dv_link_sum_all18 filter(year==2019) %>%
group_by(treat_2019) %>%
select(-matches("mean|per|names")) %>%
summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE))) %>%
mutate(treat_2018="Any", year = "2019")
<- dv_link_sum %>%
dv_link_sum_not_treat18 filter(year==2019) %>%
filter(treat_2018=="Control" | is.na(treat_2018)) %>%
group_by(treat_2019) %>%
select(-matches("mean|per|names")) %>%
summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE))) %>%
mutate(treat_2018="control_or_NA", year = "2019")
<- bind_rows(dv_link_sum, dv_link_sum_all18, dv_link_sum_not_treat18)
dv_link_sum
names(dv_link_sum) <- to_snake_case(names(dv_link_sum))
Rename and reformat to agree with individual year data:
<- dv_link_sum %>%
dv_link_sum ::rename(
dplyrrun = year,
bounces = bounced_sum,
opens = opened_sum,
clicks = clicked_sum,
conversions = converted_sum,
conv_per_recip = converted_mean,
revenue = contribution_sum,
rev_per_recip = contribution_mean,
unsubs = unsubscribed_sum
%>%
) mutate(
recipients = counter_sum - bounces)
Basic statistics for treatment combinations:
(<- dv_link_sum %>%
dt_link_counts select(-matches("mean|per|names")) %>%
::datatable()
DT )
<- dv_link_sum %>%
(dt_link_mean filter(treat_2018!="Control or NA" & treat_2018!="Any") %>%
select(matches("mean|run|treat|per")) %>%
::datatable() %>%
DTformatRound(columns=c('opened_mean', 'clicked_mean', 'rev_per_recip'), digits=4) %>%
formatRound(columns=c('conv_per_recip', 'bounced_mean', 'unsubscribed_mean'), digits=6)
)
Description/depiction/codebook of (summary) data
Key data frames
base_stats
:dv_ranges
: Donation stats in ranges by treatment, mode, yeardv_ranks_all
: Donation rank within each year, most to least, and reverse, with non-donations (zeroes) addeddv_link_sum
: Incidence and totals; combined 2018 ad 2019 treatments
Key summary statistics and graphs
<- base_stats %>%
big_stats_table as_tibble() %>%
mutate (
conv_per_10k_recip = conv_per_recip*10000
%>%
) ::select(run, treatment, recipients, bounces, opens, clicks, conversions, conv_per_10k_recip, conv_per_click, rev_per_recip, av_pos_gift, conversions_including_indirect, rev_incl_indirect, -email,-add_zeroes) %>%
dplyrkable() %>%
kable_styling("striped", full_width = F) #as.tibble just to remove row label
big_stats_table
run | treatment | recipients | bounces | opens | clicks | conversions | conv_per_10k_recip | conv_per_click | rev_per_recip | av_pos_gift | conversions_including_indirect | rev_incl_indirect |
---|---|---|---|---|---|---|---|---|---|---|---|---|
both | control | 131175 | 180 | 29047 | 681 | 74 | 5.641319 | 0.1086637 | 0.1399200 | 248.02703 | NA | NA |
both | treat | 131173 | 178 | 28558 | 645 | 109 | 8.309637 | 0.1689922 | 0.0859399 | 103.42202 | NA | NA |
2018 | control | 91298 | 39 | 16906 | 414 | 27 | 2.957349 | 0.0652174 | 0.1587768 | 536.88889 | 241 | 30931 |
2018 | treat | 91296 | 42 | 16195 | 371 | 71 | 7.776902 | 0.1913747 | 0.0703536 | 90.46479 | 267 | 43448 |
2019 | control | 39877 | 141 | 12141 | 267 | 47 | 11.786243 | 0.1760300 | 0.0967475 | 82.08511 | NA | NA |
2019 | treat | 39877 | 136 | 12363 | 274 | 38 | 9.529303 | 0.1386861 | 0.1216240 | 127.63158 | NA | NA |
<- plot_ly(base_stats %>% filter(run=="both" & treatment=="control"))
funnel
<- funnel %>% add_trace(type="funnel",
funnel y = c("recipients", "opens", "clicks", "conversions"),
x = c(~recipients, ~opens, ~clicks, ~conversions),
textinfo = "value+percent initial")
<- funnel %>%
funnel layout(yaxis=list(categoryarray = c("recipients", "opens", "clicks", "conversions")))
funnel
Histogram of donation ranges by year treatment (and, for 2018, by response mode)