Analytic choices for analyzing multidimensional behavior: Many analysts test hypotheses about human speech --- Prospective simulations

2021-05-10

1 Set up

We first create the necessary directories, if they don't exist already.

# Save simulation output here
dir.create("./models/sims", FALSE, TRUE)
# Save figure output here
dir.create("./figs/sample-size", FALSE)

2 Sample size

2.1 Estimate teams' standard deviation

We obtain an estimate of the teams' standard deviation based on data from Silberzahn et al.

# Load in Silberzahn et al. data: https://osf.io/fa743/
silber <- read_csv("https://osf.io/fa743/download") %>%
  mutate(
    z_score = log(OR) / ((log(OR) - log(OR_lo)) / 1.96)
  ) %>%
  # Filter out extremely high z_scores
  filter(z_score < 6)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Team = col_double(),
##   Analytic.Approach = col_character(),
##   Effect.size.units = col_character(),
##   Estimate = col_double(),
##   Low = col_double(),
##   Hi = col_double(),
##   OR = col_double(),
##   OR_lo = col_double(),
##   OR_hi = col_double()
## )

# Mean z-score
round(mean(silber$z_score), 2)

## [1] 2.29

# Set weakly informative priors for intercept and scale parameters.
priors <- c(
  prior(normal(0, 3), class = Intercept),
  prior(cauchy(0, 1), class = sd)
)

# Run brm to estimate teams' SD (= `(1 | Team)` in the model)
silber_sd <- brm(
  z_score ~ (1 | Team),
  data = silber,
  prior = priors,
  chain = 4,
  iter = 4000,
  cores = 4,
  seed = my_seed,
  # Use cmdstanr to enable multi-core multi-threading processing
  backend = "cmdstanr", threads = threading(4),
  control = list(adapt_delta = 0.9999, max_treedepth = 15),
  file = "./models/sims/silber_sd"
)

# Extract SD estimate and 95 CrI
SD <- summary(silber_sd)$random$Team[1]

## Warning: There were 3 divergent transitions after warmup. Increasing adapt_delta
## above may help. See http://mc-stan.org/misc/warnings.html#divergent-transitions-
## after-warmup

lower <- summary(silber_sd)$random$Team[3]

## Warning: There were 3 divergent transitions after warmup. Increasing adapt_delta
## above may help. See http://mc-stan.org/misc/warnings.html#divergent-transitions-
## after-warmup

upper <- summary(silber_sd)$random$Team[4]

## Warning: There were 3 divergent transitions after warmup. Increasing adapt_delta
## above may help. See http://mc-stan.org/misc/warnings.html#divergent-transitions-
## after-warmup

# Create a vector of quantiles of the estimated teams' SD
# min, 1st quartile, mean, 3rd quartile, max = 0.00 0.25 0.50 0.75 1.00
quantiles <- quantile(seq(lower, upper, by = 0.1), probs = seq(0, 1, 0.25),  na.rm = FALSE)
quantiles <- c(quantiles[[1]], quantiles[[2]], quantiles[[3]], quantiles[[4]], quantiles[[5]])
n_teams <- seq(5, 40, by = 5)

# Get all unique combos of number of teams and quantiles of SD.
n_ts_qs <- expand_grid(n_teams, quantiles)

2.2 Prepare functions for simulation

The following functions define routines to simulate data, fir the models to the data, and set up the simulation loop.

# Simulate data ####
# For each iteration in the simulation loop, simulate data table of z-scores
# n = number of teams
# sd = teams' SD used to simulate z-scores
simulate_data <- function(n, sd) {
  tibble::tibble(
    z_score = rnorm(n, 0, sd),
    team = letters[1:n]
  )
}

# Fit brm to simulated data ####
run_brm <- function(data) {
  # weakly informative priors
  priors <- c(
    brms::prior(normal(0, 3), class = Intercept),
    brms::prior(cauchy(0, 1), class = sd)
  )
  
  # fit brm
  meta_bm <- brms::brm(
    z_score ~
      (1 | team),
    data = data,
    prior = priors,
    chain = 4,
    iter = 6000,
    cores = 4,
    backend = "cmdstanr", threads = brms::threading(4),
    control = list(adapt_delta = 0.9999, max_treedepth = 15),
  )
  
  return(meta_bm)
}

# Simulation iteration loop ####
# This function defines the simulation loop.
# The mean and SD of the simulated teams' SD across 10 iterations (per
# n-teams/sd combos) are returned.
run_simulations <- function(n, sd) {
  data <- simulate_data(n, sd)

  sd_team_i <- numeric()
  
  log_info("Model: {n}, {sd}")

  for (iter in 1:10) {
    log_info("--- Iteration {iter}")
    brm_model <- run_brm(data)
    brm_summ <- summary(brm_model)
    sd_team_i <- c(sd_team_i, brm_summ$random$team[4] - brm_summ$random$team[3]) 
  }

  sd_team_mean <- mean(sd_team_i, na.rm = TRUE)
  sd_team_sd <- sd(sd_team_i, na.rm = TRUE)
  
  return(list(sd_team_mean, sd_team_sd))
}

2.3 Run the simulations

sample_file <- "./models/sims/sample_sims.rds"

# If results file already exists, read it. Otherwise run the simulations.
if (file.exists(sample_file)) {
  # read results file
  log_info("Reading file...\n")
  sample_sims <- readRDS(sample_file)
  log_info("Done!")
} else {
  # run simulations
  sample_sims <- tibble(
    n_teams = n_ts_qs$n_teams,
    quants = n_ts_qs$quantiles,
    sd_team = map2(
      n_teams, quants,
      ~run_simulations(.x, .y)
    )
  )

  # save results to file
  saveRDS(sample_sims, sample_file)
}

2.4 Plot results

sample_cri <- sample_sims %>%
  mutate(
    cri = as.numeric(map(sd_team, ~.x[[1]])),
    cri_sd = as.numeric(map(sd_team, ~.x[[2]]))
  )

labels <- tibble(
  x = c(0.5, 0.25, 0.9, 0.3, 1.3),
  y = c(1.25, 0.7, 1.5, 0.1, 0.9),
  labs = c("Mean", "1st quart.", "3rd quart.", "Min", "Max")
)

arrows <- tibble(
    x1 = c(0.5, 0.25, 0.9, 0.25, 1.3),
    y1 = c(1.15, 0.6, 1.4, 0.1, 1),
    x2 = c(0.63, 0.33, 0.97, 0.05, 1.33),
    y2 = c(0.8, 0.45, 1.2, 0.1, 1.2)
  )

sample_cri %>%
  ggplot(aes(quants, cri)) +
  geom_line(aes(colour = as.factor(n_teams))) +
  geom_point(aes(colour = as.factor(n_teams)), size = 4) +
  geom_hline(yintercept = 1) +
  geom_hline(yintercept = 2, linetype = "dashed") +
  geom_text(
    data = labels, aes(x, y, label = labs),
    family = "Lato"
  ) +
  geom_curve(
    data = arrows, aes(x = x1, y = y1, xend = x2, yend = y2),
    arrow = arrow(length = unit(0.07, "inch")), size = 0.4,
    color = "gray20", curvature = 0.3
  ) +
  scale_x_continuous(breaks = round(quantiles, 2)) +
  scale_color_manual(values = c("#e0ecf4", "#bfd3e6", "#9ebcda", "#8c96c6", "#8c6bb1", "#88419d", "#810f7c", "#4d004b")) +
  labs(
    title = "95% CrI width depending on teams' true SD and number of teams",
    x = "Teams' true standard deviation",
    y = "95% CrI width",
    colour = "N of teams",
    caption = "The values on the x-axis are the quartiles (0, 0.25, 0.5, 0.75, 1)\nof the estimated teams' SD from Silberzahn et al."
  )

ggsave("./figs/sample-size/cri-width-nteams-true-sd.pdf", width = 7, height = 5, device = cairo_pdf)

3 Simulate data

We first simulate a dataset (eta_tbl) of standardised effect sizes. Each team (\(n = 12\)) contributes one standardized effect size \(\eta_i\), with respective standard error \(\text{se}_i\).

3.1 Helper functions

Let's create helper functions:

at_least() ensures that there is at least a 1 when sampling for predictors (= there must be at least one predictor in the model).
sdi_mean() calculates the mean SDI for the pop-level and group-level predictions.

at_least <- function(x) {
  if (1 %in% x) {x} else {sample(c(1, 0, 0, 0, 0))}
}

sdi_mean <- function(preds) {
  bpair <- tibble::tibble(preds) %>% tidyr::unnest_wider(preds, names_sep = ".") %>%
    as.matrix() %>%
    betapart::beta.pair(.)
  sdim <- as.matrix(bpair$beta.sor)
  diag(sdim) <- NA
  rowSums(sdim, na.rm = T) / length(preds)
}

3.2 Tibble

The \(\eta_i\) is randomly sampled from a \(Normal(1, 0.5)\) distribution, while \(\text{se}_i\) from a \(HalfCauchy(0, 0.15)\). See comments in code for the other variables.

set.seed(my_seed)

# Number of teams, each team contributes one effect size
n <- 12
# standardized effect size
eta <- rnorm(n, 1, 0.5)
# standard error of the standardised effect size
se <- rhcauchy(n, 0.15)
# label each team
team <- letters[1:n]

# pop-level predictors
pop_pred <- lapply(replicate(n, list(rbinom(5, 1, 0.5))), at_least)
# group-level predictors
group_pred <- lapply(replicate(n, list(rbinom(5, 1, 0.5))), at_least)
# number of post-hoc changes to the measures
phoc_n <- rpois(n, 1)
# number of models run
mod_n <- sample(1:5, n, replace = T)
# major dimension
mdim <- sample(c("duration", "amplitude", "f0", "spectral", "other"), n, TRUE)
# temporal window
twin <- sample(c("segment", "syllable", "word", "phrase"), n, T)
# reviewers' ratings
rating <- replicate(n, sample(0:100, sample(3:4, 1), replace = T))

# research experience as years from PhD
exp <- replicate(n, sample(-3:20, sample(1:10, 1), replace = T))
# prior belief in the research hypothesis
belf <- lapply(lengths(exp), function(.x) rbinom(.x, 1, 0.5))

We can now put the tibble together.

# put together in a tibble
eta_tbl <- tibble(
  eta,
  se,
  team,
  pop_pred,
  pop_n = map(pop_pred, sum),
  group_pred,
  group_n = map(group_pred, sum),
  pop_sdi = sdi_mean(pop_pred),
  phoc_n,
  mod_n,
  mdim,
  twin,
  rating,
  exp,
  belf,
  rating_mean = unlist(map(rating, mean)),
  rating_sd = unlist(map(rating, sd)),
  exp_mean = unlist(map(exp, mean)),
  exp_sd = unlist(map(exp, sd)),
  belf_mean = unlist(map(belf, mean)),
  belf_sd = unlist(map(belf, sd))
) %>%
  mutate(
    # need the following in cases where score = 0
    rating_mean = ifelse(rating_mean == 0, 0.01, rating_mean),
    exp_mean = ifelse(exp_mean == 0, 0.01, exp_mean),
    belf_mean = ifelse(belf_mean == 0, 0.01, belf_mean),
    # need the following in cases where there is only one person in the team
    rating_sd = ifelse(is.na(rating_sd) | rating_sd == 0, 0.01, rating_sd),
    exp_sd = ifelse(is.na(exp_sd) | exp_sd == 0, 0.01, exp_sd),
    belf_sd = ifelse(is.na(belf_sd) | belf_sd == 0, 0.01, belf_sd)
  )

4 Meta-analytical model

See paper for details.

To summarize the variability in reported effect sized, we will follow Bayesian random-effects meta-analytical techniques. Since the variables used by the analysis teams might substantially differ in their measurement scales (e.g, Hz for frequency vs ms for duration), we will standardize all reported effects by refitting each reported model with centered and scaled continuous variables (z-scores, i.e. the observed values subtracted from the mean divided by the standard deviation) and sum-coded factor variables.

4.1 Model formula

The estimated coefficients of the critical predictors (i.e. critical according to the analysis teams' self-reported inferential criteria), as obtained from the standardized models, will be used as the standardized effect size (\(\eta_i\)) of each reported model. If multiple predictors within a single analysis have been reported as critical, each will be included in the meta-analytical model (described in details in the next paragraph). Moreover, to account for the differing degree of uncertainty around each standardized effect size, we will use the standard deviation of each effect size returned by the standardized models as the standard error (\(\text{se}_i\)) of the effect size.

After having obtained the standardized effect sizes \(\eta_i\) with related standard errors \(\text{se}_i\), for each critical predictor of the individual reported analyses, the initiating authors will fit a Bayesian random-effects meta-analysis using a multilevel (intercept-only) regression model, as described in the following. The outcome variable will be the set of standardized effect sizes \(\eta_i\). The likelihood of \(\eta_i\) is assumed to correspond to a normal distribution (Knight 2000). The analysis teams will be entered as a group-level effect (i.e., random effect, (1 | team)). The standard errors \(\text{se}_i\) will be included as the standard deviation \(\sigma_i\) of \(\eta_i\) to fit a measurement-error model, as discussed above. We will use regularizing weakly-informative priors for the intercept (\(Normal(0, 1)\)) and for the group-level standard deviation \(\sigma_{\alpha_{\text{t}}}\) (\(HalfCauchy(0, 1)\)).

\[ \begin{aligned} \eta_i & \sim \text{Normal}(\mu_i, \sigma_i) \\ \mu_i & = \alpha + \alpha_{\text{t}[i]} \\ \alpha & \sim \text{Normal}(0, 1) \\ \sigma_{\alpha_{\text{t}}} & \sim \text{HalfCauchy}(0, 1) \\ \sigma_i & = \text{se}_i \end{aligned} \]

4.2 Model priors

For the meta-analytical model, we will use the following priors:

A \(Normal(0, 1)\) distribution for the prior of the intercept \(\alpha\).
A \(HalfCauchy(0, 1)\) distribution for the prior of the teams' intercept standard deviation \(\sigma_{\alpha_{\text{t}}}\).

The following graph illustrates the density function of \(HalfCauchy(0, 1)\). The long tail ensures that values of \(\sigma_{\alpha_{\text{t}}}\) greater than 5 be possible but less probable. This is desirable since the effect sizes are standardized (i.e. they should in principle be within a [-3, +3] range).

tibble(
  x = seq(0, 10, by = 0.05),
  density = dhcauchy(x, 1)
) %>%
  ggplot(aes(x, density)) +
  geom_area(fill = "#325756", alpha = 0.9) +
  labs(
    title = "HalfCauchy(0, 1)",
    x = "Teams' intercept standard deviation"
  )

priors <- c(
  prior(normal(0, 1), class = Intercept),
  prior(cauchy(0, 1), class = sd)
)

4.2.1 Prior predictive checks

We run prior predictive checks to assess the goodness of the prior specification before model fitting.

# run checks
meta_bm_ppred <- brm(
  eta | se(se) ~
    (1 | team),
  data = eta_tbl,
  prior = priors,
  sample_prior = "only",
  chain = 4,
  seed = my_seed,
  file = "./models/sims/meta_bm_ppred"
)

# get samples
ppred_samples <- posterior_samples(meta_bm_ppred) %>%
  rename(intercept = b_Intercept, `group-level SD` = sd_team__Intercept)

We can now inspect the prior probability distributions of the intercept and the standard deviation.

ppred_long <- ppred_samples %>%
  pivot_longer(intercept:`group-level SD`, names_to = "term") %>%
  mutate(term = factor(term, levels = c("intercept", "group-level SD")))
  
int <- ppred_long %>%
  filter(term == "intercept") %>%
  ggplot(aes(value)) +
  geom_density(colour = NA, fill = "#325756", alpha = 0.9, bw = 0.4) +
  labs(title = "Intercept")

sd <- ppred_long %>%
  filter(term == "group-level SD") %>%
  ggplot(aes(value)) +
  geom_density(colour = NA, fill = "#325756", alpha = 0.9, bw = 1) +
  xlim(0, 25) +
  labs(title = "Group-level SD")

int + sd

And the joint prior probability distribution.

ppred_samples %>%
  select(intercept, `r_team[a,Intercept]`:`r_team[l,Intercept]`) %>%
  pivot_longer(-intercept, names_to = "team") %>%
  mutate(eta = intercept + value) %>%
  ggplot(aes(eta)) +
  geom_density(colour = NA, fill = "#325756", alpha = 0.9, bw = 0.4) +
  xlim(-7, 7)

4.3 Run

The following code fits the meta-analytical model to the simulated data.

meta_bm <- brm(
  eta | se(se) ~
    (1 | team),
  data = eta_tbl,
  prior = priors,
  chain = 4,
  seed = my_seed,
  cores = 4,
  file = "./models/sims/meta_bm"
)

meta_bm

## Warning: There were 35 divergent transitions after warmup. Increasing
## adapt_delta above 0.8 may help. See http://mc-stan.org/misc/
## warnings.html#divergent-transitions-after-warmup

##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: eta | se(se) ~ (1 | team) 
##    Data: eta_tbl (Number of observations: 12) 
## Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup samples = 4000
## 
## Group-Level Effects: 
## ~team (Number of levels: 12) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.24      0.08     0.13     0.45 1.01      984     1208
## 
## Population-Level Effects: 
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     0.98      0.10     0.79     1.18 1.00      883     1234
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.00      0.00     0.00     0.00 1.00     4000     4000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

plot(meta_bm)

The degree of variability among the team's reported effects is within the range [0.41, 1.14] in SD units at 95% probability.

5 Analytic and researcher-related predictors model

As a second step, we will investigate the extent to which the individual standardized effect sizes are affected by a series of predictors related to analytical and researcher-related factors.

5.1 Model formula

uniqueness_i: Measure of uniqueness of individual analyses for the set of predictors in each model [numeric].
conservativeness_i: Measure of conservativeness of the model specification, as the number of random/group-level effects included [numeric].
posthoc_i: Number of post-hoc changes to the acoustic measurements the teams will report to have carried out [numeric].
numModels_i: Number of models the teams will report to have run [numeric].
majorDimension_i: Major dimension that has been measured to answer the research question [categorical].
timeWindow_i: Temporal window that the measurement is taken over [categorical].
PRrating_i: Average peer-review rating, as the mean of the overall peer-review ratings for each analysis [numeric].
resExpirience_i: Research experience as the elapsed time from receiving the PhD. Negative values will indicate that the person is a student or graduate studens [numeric].
initialBelief_i: Initial belief in the presence of an effect of atypical noun-adjective pairs on acoustics, as answered during the intake questionnaire [numeric].

\[ \begin{aligned} \alpha_{\text{t}[i]} & \sim \text{Normal}(\mu_{\alpha_{t[i]}}, \sigma_i) \\ \mu_{\alpha_{t[i]}} & = \beta + \upsilon_u\cdot uniqueness_i + \upsilon_c\cdot conservativeness_i \\ & \qquad + \upsilon_p\cdot posthoc_i + \upsilon_m\cdot numModels_i \\ & \qquad + \upsilon_d\cdot majorDimension_i + \upsilon_w\cdot timeWindow_i \\ & \qquad + \upsilon_r\cdot PRrating_i + \\ & \qquad + \upsilon_{xp}\cdot resExpirience_i + \upsilon_{be}\cdot initialBelief_i \\ \iota & \sim \text{Normal}(0, 1) \\ \upsilon_{[\ldots]} & \sim \text{Normal}(0,1) \\ rat_i & \sim \text{Normal}(\mu_{rat_i}, \sigma_{rat_i}) \\ resp_i & \sim \text{Normal}(\mu_{resp_i}, \sigma_{resp_i}) \\ belf_i & \sim \text{Normal}(\mu_{belf_i}, \sigma_{belf_i}) \\ \sigma_i & = \sigma_{\alpha_{t[i]}} \end{aligned} \]

5.2 Model priors

Weakly-informative regularising priors.

preds_priors <- c(
  prior(normal(0, 1), class = Intercept),
  prior(normal(0, 1), class = b),
  prior(normal(0, 1), class = meanme),
  prior(cauchy(0, 1), class = sdme),
  prior(lkj(2), class = corme)
)

5.3 Run

predictors_bm <- brm(
  eta | se(se) ~
    pop_sdi +
    phoc_n +
    mod_n +
    mdim +
    twin +
    me(rating_mean, rating_sd) +
    me(exp_mean, exp_sd) +
    me(belf_mean, belf_sd),
  data = eta_tbl,
  prior = preds_priors,
  chain = 4,
  iter = 8000,
  control = list(adapt_delta = 0.9999, max_treedepth = 15),
  seed = my_seed,
  cores = 4,
  file = "./models/sims/predictors_bm"
)

predictors_bm

##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: eta | se(se) ~ pop_sdi + phoc_n + mod_n + mdim + twin + me(rating_mean, rating_sd) + me(exp_mean, exp_sd) + me(belf_mean, belf_sd) 
##    Data: eta_tbl (Number of observations: 12) 
## Samples: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
##          total post-warmup samples = 16000
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept                  0.13      1.17    -2.31     2.36 1.00     1369
## pop_sdi                   -0.02      0.90    -1.81     1.78 1.00     2109
## phoc_n                     0.05      0.30    -0.51     0.70 1.00     1175
## mod_n                      0.01      0.22    -0.43     0.46 1.00      924
## mdimduration              -0.31      0.72    -1.70     1.10 1.00     1219
## mdimf0                     0.15      0.60    -1.05     1.32 1.00     1189
## mdimother                 -0.02      0.79    -1.58     1.52 1.00     1468
## mdimspectral               0.14      0.57    -0.98     1.26 1.00     1098
## twinsegment               -0.01      0.70    -1.41     1.30 1.00     1030
## twinsyllable               0.09      0.58    -1.03     1.21 1.00     1081
## twinword                   0.27      0.64    -1.01     1.50 1.00     1010
## merating_meanrating_sd     0.01      0.02    -0.03     0.05 1.00      950
## meexp_meanexp_sd           0.04      0.12    -0.20     0.27 1.00     1123
## mebelf_meanbelf_sd         0.09      0.74    -1.43     1.58 1.00     1593
##                        Tail_ESS
## Intercept                  2441
## pop_sdi                    2693
## phoc_n                     1620
## mod_n                      1977
## mdimduration               2872
## mdimf0                     2662
## mdimother                  3567
## mdimspectral               2590
## twinsegment                2619
## twinsyllable               2638
## twinword                   2383
## merating_meanrating_sd     1607
## meexp_meanexp_sd           1768
## mebelf_meanbelf_sd         2392
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.00      0.00     0.00     0.00 1.00    16000    16000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

plot(predictors_bm, ask = FALSE)

References

Knight, K. 2000. Mathematical Statistics. Chapman; Hall, New York.