Results

Setup and libraries

source("setup_params.R")

library("tidyverse")
library("janitor")
library("stringr")
library("here")
library("knitr")
library("kableExtra")
library("ggrepel")
library("scales")
library("jsonlite")
library("purrr")
library("tibble")

# Brand palette: three saturated, well-separated anchor hues.
UJ_ORANGE <- "#E8722A" # saffron orange
UJ_GREEN <- "#2D9D5E" # emerald green
UJ_BLUE <- "#2B7CE9" # azure blue

# One colour per rating source. The focal model, Gemini and the human panel
# reuse the anchor hues above; the remaining models get their own colours.
MODEL_COLORS <- c(
  "GPT-5 Pro" = UJ_ORANGE,
  "GPT-5.2 Pro" = "#D62839", # crimson
  "GPT-4o-mini" = "#17B890", # teal
  "Claude Sonnet 4" = "#A855F7", # purple
  "Claude Opus 4.6" = "#7C3AED", # violet
  "Gemini 2.0 Flash" = UJ_BLUE,
  "Human" = UJ_GREEN
)

# House ggplot2 theme: theme_minimal() with lighter grid lines, bold titles
# and a bottom legend.
#
# @param base_size Base font size handed to theme_minimal(); the rel() sizes
#   below scale from it.
# @return A ggplot2 theme object that can be added to a plot.
theme_uj <- function(base_size = 12) {
  grid_and_titles <- theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(linewidth = 0.3, color = "grey88"),
    plot.title = element_text(face = "bold", size = rel(1.1)),
    plot.title.position = "plot",
    plot.subtitle = element_text(color = "grey40", size = rel(0.9)),
    plot.caption = element_text(color = "grey50", size = rel(0.8), hjust = 0)
  )

  axes <- theme(
    axis.title = element_text(size = rel(0.95)),
    axis.text = element_text(size = rel(0.88))
  )

  legend_and_strips <- theme(
    legend.position = "bottom",
    legend.text = element_text(size = rel(0.88)),
    legend.title = element_text(size = rel(0.9), face = "bold"),
    strip.text = element_text(face = "bold", size = rel(0.95))
  )

  theme_minimal(base_size = base_size) +
    grid_and_titles +
    axes +
    legend_and_strips
}

# Map long-form metric names onto the short canonical keys used throughout
# the analysis; any value without an alias is returned unchanged.
#
# @param x Vector of metric / criterion names.
# @return `x` with the five long-form names replaced by their short keys.
canon_metric <- function(x) {
  aliases <- c(
    advancing_knowledge = "adv_knowledge",
    open_science        = "open_sci",
    logic_communication = "logic_comms",
    global_relevance    = "gp_relevance",
    claims_evidence     = "claims"
  )
  # Splice the lookup table into recode(); unmatched values fall back to x.
  dplyr::recode(x, !!!aliases, .default = x)
}

# Null-default operator: yields `y` only when `x` is NULL, otherwise `x`.
`%||%` <- function(x, y) {
  if (is.null(x)) y else x
}

Load human evaluation data

# Title -> short paper label lookup (semicolon-delimited file).
UJmap <- read_delim("data/UJ_map.csv", delim = ";", show_col_types = FALSE) |>
  transmute(label_paper_title = research, label_paper = paper)

# Evaluator ratings; the `research` column is carried forward under the
# shared join key name `label_paper_title`.
rsx <- read_csv("data/rsx_evalr_rating.csv", show_col_types = FALSE) |>
  clean_names() |>
  mutate(label_paper_title = research, .keep = "unused")

# Published-evaluation papers with their short labels. Four papers are
# labelled by DOI, overriding whatever the title lookup produced.
research <- read_csv("data/research.csv", show_col_types = FALSE) |>
  clean_names() |>
  filter(status == "50_published evaluations (on PubPub, by Unjournal)") |>
  left_join(UJmap, by = "label_paper_title") |>
  mutate(
    doi = str_trim(doi),
    label_paper = case_when(
      doi == "https://doi.org/10.3386/w31162" ~ "Walker et al. 2023",
      doi == "doi.org/10.3386/w32728" ~ "Hahn et al. 2025",
      doi == "https://doi.org/10.3386/w30011" ~ "Bhat et al. 2022",
      doi == "10.1093/wbro/lkae010" ~ "Crawfurd et al. 2023",
      .default = label_paper
    )
  ) |>
  left_join(rsx, by = "label_paper_title")

# One label per (trimmed) title; the first row per title wins.
key_map <- research |>
  mutate(label_paper_title = str_trim(label_paper_title)) |>
  select(label_paper_title, label_paper) |>
  filter(!is.na(label_paper_title)) |>
  distinct() |>
  group_by(label_paper_title) |>
  slice_head(n = 1) |>
  ungroup()

# Attach the short label to every rating row; the join must stay many-to-one.
rsx_research <- left_join(
  mutate(rsx, label_paper_title = str_trim(label_paper_title)),
  key_map,
  by = join_by(label_paper_title),
  relationship = "many-to-one"
)

# Long table of human ratings on the seven canonical criteria: one row per
# paper x criterion x evaluator, with midpoint and interval bounds.
metrics_human <- rsx_research |>
  mutate(criteria = canon_metric(criteria)) |>
  filter(
    criteria %in% c(
      "overall", "claims", "methods", "adv_knowledge",
      "logic_comms", "open_sci", "gp_relevance"
    )
  ) |>
  transmute(
    paper = label_paper,
    criteria,
    evaluator,
    model = "Human",
    mid = as.numeric(middle_rating),
    lo = suppressWarnings(as.numeric(lower_ci)),
    hi = suppressWarnings(as.numeric(upper_ci))
  ) |>
  filter(!(is.na(paper) | is.na(mid))) |>
  # Clamp finite bounds into [0, 100]; anything non-finite becomes NA.
  mutate(
    across(c(lo, hi), \(v) ifelse(is.finite(v), pmax(0, pmin(100, v)), NA_real_))
  ) |>
  mutate(across(c(mid, lo, hi), \(v) round(v, 4))) |>
  distinct(paper, criteria, model, evaluator, mid, lo, hi)

# Per-paper means of the "overall" ratings, plus the number of rating rows.
human_avg <- metrics_human |>
  filter(criteria == "overall") |>
  group_by(paper) |>
  summarise(
    across(c(mid, lo, hi), \(v) mean(v, na.rm = TRUE), .names = "human_{.col}"),
    n_human = n(),
    .groups = "drop"
  )

n_human_papers <- length(unique(metrics_human$paper))
n_human_evaluators <- length(unique(metrics_human$evaluator))

Load LLM evaluation data (all models)

# Results sub-directory name -> display name of the model whose responses
# live there.
model_dirs <- as.list(c(
  gpt5_pro_updated_jan2026 = "GPT-5 Pro",
  gpt52_pro_focal_jan2026  = "GPT-5.2 Pro",
  gpt_4o_mini_2024_07_18   = "GPT-4o-mini",
  claude_sonnet_4_20250514 = "Claude Sonnet 4",
  claude_opus_4_6          = "Claude Opus 4.6",
  "gemini_2.0_flash"       = "Gemini 2.0 Flash"
))

# Parse one `*.response.json` file into tidy metric / tier / token tables.
#
# The payload is looked for in three places, in order: a non-empty `parsed`
# field, a non-empty `output_text` string (an optional Markdown code fence is
# stripped before parsing), and finally the text of the first `output` item
# whose `type` is "message".
#
# @param path       Path to the response file. The paper label is its basename
#                   with the `.response.json` suffix removed and underscores
#                   replaced by spaces.
# @param model_name Display name stored in the `model` column of every table.
# @return A list with tibbles `metrics` (midpoint / lower_bound / upper_bound
#   per metric), `tiers` (score / ci_lower / ci_upper per tier kind) and
#   `tokens` (input / output token counts), or NULL when no payload is found.
#   If reading or parsing fails, NULL is returned as well, but a warning
#   naming the file is raised so that dropped files are not lost silently.
parse_response <- function(path, model_name) {
  tryCatch({
    r <- jsonlite::fromJSON(path, simplifyVector = FALSE)
    paper <- basename(path) |>
      str_replace("\\.response\\.json$", "") |>
      str_replace_all("_", " ")

    parsed <- NULL
    if (!is.null(r$parsed) && length(r$parsed) > 0) {
      parsed <- r$parsed
    } else if (!is.null(r$output_text) && nchar(r$output_text) > 0) {
      txt <- r$output_text
      # Strip a leading ```lang fence and a trailing ``` fence, if present.
      txt <- sub("^\\s*```[a-z]*\\s*\n?", "", txt)
      txt <- sub("\\s*```\\s*$", "", txt)
      parsed <- jsonlite::fromJSON(txt, simplifyVector = TRUE)
    } else if (!is.null(r$output)) {
      # identical() stays a scalar logical even when an item has no `type`
      # field, where `==` would return logical(0) and make detect() fail.
      msg <- purrr::detect(r$output, \(item) identical(item$type, "message"))
      if (!is.null(msg) && length(msg$content) > 0) {
        parsed <- jsonlite::fromJSON(msg$content[[1]]$text, simplifyVector = TRUE)
      }
    }

    if (is.null(parsed)) return(NULL)

    metrics <- parsed$metrics
    metric_rows <- list()
    tier_rows <- list()
    tier_names <- c("tier_should", "tier_will", "journal_should", "journal_will")

    for (nm in names(metrics)) {
      entry <- metrics[[nm]]
      if (nm %in% tier_names) {
        # `journal_*` keys are stored under the `tier_*` name.
        tier_rows[[length(tier_rows) + 1]] <- tibble(
          paper = paper, model = model_name,
          tier_kind = sub("^journal_", "tier_", nm),
          score = entry$score,
          ci_lower = entry$ci_lower,
          ci_upper = entry$ci_upper
        )
      } else {
        metric_rows[[length(metric_rows) + 1]] <- tibble(
          paper = paper, model = model_name, metric = nm,
          midpoint = entry$midpoint,
          lower_bound = entry$lower_bound,
          upper_bound = entry$upper_bound
        )
      }
    }

    # Token counts are read from `usage` first, then from the top level.
    input_tok <- r$usage$input_tokens %||% r$input_tokens
    output_tok <- r$usage$output_tokens %||% r$output_tokens

    list(
      metrics = bind_rows(metric_rows),
      tiers = bind_rows(tier_rows),
      tokens = tibble(
        paper = paper, model = model_name,
        input_tokens = input_tok %||% NA_integer_,
        output_tokens = output_tok %||% NA_integer_
      )
    )
  }, error = function(e) {
    warning(
      "parse_response: skipping '", path, "': ", conditionMessage(e),
      call. = FALSE
    )
    NULL
  })
}

# Load every parsable LLM response under results/<model dir>/json for the
# models listed in `model_dirs`.
#
# @return A list of three tibbles: `metrics` (with an added canonical
#   `criteria` column), `tiers` and `tokens`. When nothing could be loaded,
#   each table is a typed zero-row tibble that still carries its columns, so
#   downstream filters and joins work instead of failing on a missing column.
load_all_llm <- function() {
  # Zero-row fallbacks with the columns parse_response() emits.
  empty_metrics <- tibble(
    paper = character(), model = character(), metric = character(),
    midpoint = numeric(), lower_bound = numeric(), upper_bound = numeric()
  )
  empty_tiers <- tibble(
    paper = character(), model = character(), tier_kind = character(),
    score = numeric(), ci_lower = numeric(), ci_upper = numeric()
  )
  empty_tokens <- tibble(
    paper = character(), model = character(),
    input_tokens = integer(), output_tokens = integer()
  )

  # Bind the pieces; fall back to the typed schema only when no rows exist,
  # so the result for non-empty input is exactly what bind_rows() gives.
  bind_or_empty <- function(tables, empty) {
    out <- bind_rows(tables)
    if (nrow(out) == 0) empty else out
  }

  # Per model directory: parse every response file, dropping failed parses.
  per_model <- purrr::imap(model_dirs, function(model_name, dir_name) {
    json_dir <- here("results", dir_name, "json")
    if (!dir.exists(json_dir)) {
      return(list())
    }
    files <- list.files(
      json_dir,
      pattern = "\\.response\\.json$", full.names = TRUE
    )
    purrr::compact(purrr::map(files, \(f) parse_response(f, model_name)))
  })

  # Flatten one level: a single list of parse results in directory/file order.
  results <- unlist(unname(per_model), recursive = FALSE)

  list(
    metrics = bind_or_empty(purrr::map(results, "metrics"), empty_metrics) |>
      mutate(criteria = canon_metric(metric)),
    tiers = bind_or_empty(purrr::map(results, "tiers"), empty_tiers),
    tokens = bind_or_empty(purrr::map(results, "tokens"), empty_tokens)
  )
}

# Load once, then expose the three tables under their own names.
llm_data <- load_all_llm()
llm_metrics <- llm_data[["metrics"]]
llm_tiers <- llm_data[["tiers"]]
llm_tokens <- llm_data[["tokens"]]

n_llm_models <- length(unique(llm_metrics$model))
n_llm_papers <- length(unique(llm_metrics$paper))

# Papers rated by humans and by at least one LLM.
matched_papers <- unique(metrics_human$paper) |>
  intersect(unique(llm_metrics$paper))
n_matched <- length(matched_papers)

# Opus sample: papers rated by both Claude Opus 4.6 and humans. The ranked
# slopegraph and the agreement tables use it so that every model is compared
# on the same set of papers.
opus_papers <- intersect(
  unique(llm_metrics$paper[which(llm_metrics$model == "Claude Opus 4.6")]),
  unique(metrics_human$paper)
)
n_opus <- length(opus_papers)

# Focal model: GPT-5 Pro when it was loaded, otherwise the first model found.
primary_model <- if (!("GPT-5 Pro" %in% llm_metrics$model)) {
  unique(llm_metrics$model)[1]
} else {
  "GPT-5 Pro"
}

We evaluate 6 frontier LLMs against human expert reviews from The Unjournal. Across all models, 45 papers have both human and LLM evaluations; 33 of these were also evaluated by Claude Opus 4.6 and form the matched sample used for rank comparisons and agreement tables below. Each model receives the same PDF, system prompt mirroring The Unjournal rubric, and JSON schema requiring a diagnostic summary plus numeric midpoints and 90% credible intervals for every metric. Full methodological details appear in Methods.

We do not treat human ratings as ground truth. Quantitative percentile scoring is genuinely difficult: even domain experts disagree, and individual scores reflect both signal about paper quality and idiosyncratic tendencies (severity, topic familiarity, interpretation of the scale). Our question is therefore whether LLMs provide signal comparable to an additional expert rater. If two human evaluators agree at Spearman ρ = 0.55, an LLM achieving ρ = 0.57 against the human mean is performing within human inter-rater range. The Human–Human baseline row in Table 2.1 makes this explicit; Krippendorff’s α_HH in the human-baseline table in Appendix A provides the criterion-level ceiling.

Per-paper overview and model comparison. Figure 2.1 presents three complementary views of overall (0–100 percentile) ratings. Panel (a) displays individual human evaluator ratings alongside GPT-5 Pro (orange diamonds) and Claude Opus 4.6 (purple diamonds) ratings for each paper, revealing inter-rater variability—CIs often span 20–40 points. In most cases both LLMs fall within the range of human opinions, though several papers show substantial divergence. Panel (c) compares the two highest-performing reasoning models directly against human mean ratings. Panel (b) plots evaluator pairs against each other for all papers with ≥2 raters, making the human-human agreement ceiling directly visible alongside LLM-human agreement in panel (c).

Show code

library("patchwork")

# ── Panel (a): Forest plot ──
# Individual human "overall" ratings for papers that also have LLM ratings;
# interval bounds are kept within [0, 100] and non-finite bounds become NA.
H_ind <- metrics_human |>
  filter(criteria == "overall", paper %in% matched_papers) |>
  mutate(
    lo = ifelse(is.finite(lo), pmax(0, lo), NA_real_),
    hi = ifelse(is.finite(hi), pmin(100, hi), NA_real_)
  )

# x-axis order: papers sorted by descending human mean.
ord <- H_ind |>
  group_by(paper) |>
  summarise(h_mean = mean(mid, na.rm = TRUE), .groups = "drop") |>
  arrange(desc(h_mean)) |>
  mutate(pos = row_number())

# Spread each paper's evaluators symmetrically around its integer position.
# NOTE(review): with three evaluators the offsets are -0.18 / 0 / +0.18, the
# same x positions as the two LLM markers below, so those bars overlap.
# Consider widening one of the two spacings.
H_plot <- H_ind |>
  inner_join(ord, by = "paper") |>
  group_by(paper) |>
  mutate(
    off = (row_number() - (n() + 1) / 2) * 0.18,
    x   = pos + off
  ) |>
  ungroup()

# Per-paper "overall" summary for one model: mean midpoint and the widest
# reported interval (a missing bound falls back to the midpoint), positioned
# `x_shift` away from the paper's slot on the x axis.
forest_llm_summary <- function(model_name, x_shift) {
  llm_metrics |>
    filter(criteria == "overall", model == model_name, paper %in% matched_papers) |>
    group_by(paper) |>
    summarise(
      mid = mean(midpoint, na.rm = TRUE),
      lo  = suppressWarnings(min(coalesce(lower_bound, midpoint), na.rm = TRUE)),
      hi  = suppressWarnings(max(coalesce(upper_bound, midpoint), na.rm = TRUE)),
      .groups = "drop"
    ) |>
    inner_join(ord, by = "paper") |>
    mutate(x = pos + x_shift)
}

L_c    <- forest_llm_summary(primary_model, -0.18)
L_opus <- forest_llm_summary("Claude Opus 4.6", 0.18)

hbar      <- mean(ord$h_mean, na.rm = TRUE)
lbar      <- mean(L_c$mid, na.rm = TRUE)
lbar_opus <- mean(L_opus$mid, na.rm = TRUE)

OPUS_PURPLE <- MODEL_COLORS["Claude Opus 4.6"]

# Legend labels embed grand means so the annotation text can be dropped.
# The focal label is built from `primary_model`, which is what L_c holds, so
# the legend stays truthful if the focal model falls back to another one.
lbl_human <- sprintf("Human evaluators (mean = %.1f)", hbar)
lbl_gpt   <- sprintf("%s (mean = %.1f)", primary_model, lbar)
lbl_opus  <- sprintf("Claude Opus 4.6 (mean = %.1f)", lbar_opus)

leg_colors <- setNames(c(UJ_GREEN, UJ_ORANGE, unname(OPUS_PURPLE)),
                       c(lbl_human, lbl_gpt, lbl_opus))
leg_shapes <- setNames(c(16L, 18L, 18L),
                       c(lbl_human, lbl_gpt, lbl_opus))

p_forest <- if (nrow(L_c) > 0) {
  ggplot() +
    # Faint guide per paper plus one dotted grand-mean line per source.
    geom_vline(data = ord, aes(xintercept = pos), color = "grey92", linewidth = 0.3) +
    geom_hline(yintercept = hbar,      color = UJ_GREEN,    linetype = "dotted", linewidth = 0.8) +
    geom_hline(yintercept = lbar,      color = UJ_ORANGE,   linetype = "dotted", linewidth = 0.8) +
    geom_hline(yintercept = lbar_opus, color = OPUS_PURPLE, linetype = "dotted", linewidth = 0.8) +
    # Intervals are drawn only where both bounds are finite.
    geom_errorbar(
      data = subset(H_plot, is.finite(lo) & is.finite(hi)),
      aes(x = x, ymin = lo, ymax = hi),
      width = 0, linewidth = 1, alpha = 0.5, color = UJ_GREEN
    ) +
    geom_point(data = H_plot,
               aes(x = x, y = mid, color = lbl_human, shape = lbl_human),
               size = 3.0, alpha = 0.9) +
    geom_errorbar(
      data = subset(L_c, is.finite(lo) & is.finite(hi)),
      aes(x = x, ymin = lo, ymax = hi),
      width = 0, linewidth = 1.0, color = UJ_ORANGE
    ) +
    geom_point(data = L_c,
               aes(x = x, y = mid, color = lbl_gpt, shape = lbl_gpt),
               size = 3.6) +
    geom_errorbar(
      data = subset(L_opus, is.finite(lo) & is.finite(hi)),
      aes(x = x, ymin = lo, ymax = hi),
      width = 0, linewidth = 1.0, color = OPUS_PURPLE
    ) +
    geom_point(data = L_opus,
               aes(x = x, y = mid, color = lbl_opus, shape = lbl_opus),
               size = 3.6) +
    scale_color_manual(name = NULL, values = leg_colors,
                       breaks = c(lbl_human, lbl_gpt, lbl_opus)) +
    scale_shape_manual(name = NULL, values = leg_shapes,
                       breaks = c(lbl_human, lbl_gpt, lbl_opus)) +
    guides(
      color = guide_legend(override.aes = list(size = 3.8, alpha = 1)),
      shape = guide_legend(override.aes = list(size = 3.8))
    ) +
    scale_x_continuous(
      breaks = ord$pos, labels = ord$paper,
      expand = expansion(mult = c(0.01, 0.03))
    ) +
    coord_cartesian(ylim = c(0, 100), clip = "off") +
    labs(x = NULL, y = "Percentile (0\u2013100)", tag = "(a)") +
    theme_uj() +
    theme(
      axis.text.x      = element_text(angle = 55, hjust = 1, vjust = 1, size = 9),
      axis.title.y     = element_text(size = 12),
      panel.grid.major.x = element_blank(),
      plot.margin      = margin(5, 40, 5, 5),
      plot.tag         = element_text(face = "bold", size = 14),
      legend.position  = "bottom",
      legend.direction = "horizontal",
      legend.text      = element_text(size = 10),
      legend.key.size  = unit(0.5, "cm"),
      legend.spacing.x = unit(0.5, "cm")
    )
} else {
  ggplot() + annotate("text", x = 0.5, y = 0.5, label = "Insufficient data") + theme_void()
}

# ── Panel (b): Human-Human scatter ──
# All pairwise evaluator combinations for matched papers with ≥2 human raters.
# The `matched_papers` filter keeps this panel on the paper set that the
# figure caption and the surrounding text describe.
hh_scatter_data <- metrics_human |>
  filter(criteria == "overall", paper %in% matched_papers) |>
  select(paper, evaluator, mid) |>
  distinct() |>
  group_by(paper) |>
  filter(n() >= 2) |>
  group_modify(\(df, key) {
    # Every unordered pair of rows within the paper: k raters -> choose(k, 2).
    idx <- combn(nrow(df), 2)
    tibble(mid_a = df$mid[idx[1, ]], mid_b = df$mid[idx[2, ]])
  }) |>
  ungroup()

p_hh <- if (nrow(hh_scatter_data) > 0) {
  # NOTE(review): which rater of a pair lands on the x axis follows row order,
  # so the pooled correlation is not invariant to that ordering.
  hh_rho <- cor(hh_scatter_data$mid_a, hh_scatter_data$mid_b,
                method = "spearman", use = "complete.obs")
  hh_lbl <- sprintf("Spearman \u03c1 = %.2f\nN = %d pairs", hh_rho, nrow(hh_scatter_data))
  ggplot(hh_scatter_data, aes(x = mid_a, y = mid_b)) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
    geom_point(color = UJ_GREEN, size = 3, alpha = 0.7) +
    annotate("text", x = 4, y = 96, label = hh_lbl,
             hjust = 0, vjust = 1, size = 3.4, color = "grey30") +
    coord_fixed(ratio = 1, xlim = c(0, 100), ylim = c(0, 100)) +
    labs(x = "Evaluator A (overall)", y = "Evaluator B (overall)", tag = "(b)") +
    theme_uj() +
    theme(plot.tag = element_text(face = "bold", size = 14))
} else {
  ggplot() + annotate("text", x = 0.5, y = 0.5, label = "Insufficient data") + theme_void()
}

# ── Panel (c): LLM vs Human scatter ──
focal_models <- c("GPT-5 Pro", "Claude Opus 4.6")

# Overall LLM rating per paper for the two focal models, paired with the
# human mean. Missing interval bounds fall back to the matching midpoint.
scatter_data <- llm_metrics |>
  filter(criteria == "overall" & model %in% focal_models) |>
  inner_join(human_avg, by = join_by(paper)) |>
  mutate(
    diff        = midpoint - human_mid,
    paper_short = str_trunc(paper, 25),
    human_lo    = coalesce(human_lo, human_mid),
    human_hi    = coalesce(human_hi, human_mid),
    lower_bound = coalesce(lower_bound, midpoint),
    upper_bound = coalesce(upper_bound, midpoint)
  ) |>
  filter(!(is.na(human_mid) | is.na(midpoint)))

# One Spearman rho label per facet, pinned near the top-left corner.
scatter_rho <- scatter_data |>
  group_by(model) |>
  summarise(
    rho = cor(human_mid, midpoint, method = "spearman", use = "complete.obs"),
    .groups = "drop"
  ) |>
  mutate(
    lbl = sprintf("Spearman \u03c1 = %.2f", rho),
    x = 4,
    y = 96
  )

p_scatter <- if (nrow(scatter_data) == 0) {
  ggplot() +
    annotate("text", x = 0.5, y = 0.5, label = "No matching data") +
    theme_void()
} else {
  ggplot(scatter_data, aes(x = human_mid, y = midpoint, color = model)) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
    geom_point(size = 3.5, alpha = 0.7) +
    ggrepel::geom_text_repel(
      aes(label = paper_short),
      size = 2.4, max.overlaps = 8, show.legend = FALSE
    ) +
    geom_text(
      data = scatter_rho,
      aes(x = x, y = y, label = lbl),
      hjust = 0, vjust = 1, size = 3.4, color = "grey30",
      inherit.aes = FALSE
    ) +
    scale_color_manual(values = MODEL_COLORS) +
    coord_fixed(ratio = 1, xlim = c(0, 100), ylim = c(0, 100)) +
    facet_wrap(~model, ncol = 2) +
    labs(
      x = "Human mean rating (0\u2013100)",
      y = "LLM rating (0\u2013100)",
      color = NULL,
      tag = "(c)"
    ) +
    theme_uj() +
    theme(
      strip.text = element_text(size = 13, face = "bold"),
      plot.tag = element_text(face = "bold", size = 14),
      legend.position = "none"
    )
}

# ── Combine ──
# Bottom row: human-human scatter next to the LLM-human facets, with width
# ratio 1:2. The forest plot is stacked on top with height ratio 1:0.85.
bottom_row <- wrap_plots(p_hh, p_scatter, widths = c(1, 2))
(p_forest / bottom_row) + plot_layout(heights = c(1, 0.85))

Figure 2.1: Per-paper overall ratings (0–100 percentile). **(a)** Individual human evaluator midpoints (green circles) with each evaluator’s self-reported 90% credible interval (reflecting their own uncertainty about the true score; the vertical separation between green dots per paper reflects inter-rater disagreement), GPT-5 Pro (orange diamond with CI), and Claude Opus 4.6 (purple diamond with CI), sorted by descending human mean. Dotted horizontal lines show grand means. **(b)** Pairwise human evaluator agreement across all matched papers: each point is one evaluator pair (papers with 3 raters contribute 3 points); uses all `r n_matched` matched papers (the Human–Human row in Table 2.1 is restricted to the `r n_opus`-paper matched sample). **(c)** Human mean vs LLM overall rating for the two focal models with per-model Spearman ρ annotated; dashed diagonal is the identity line. Compare panels (b) and (c) directly to see whether LLM-human scatter is tighter than human-human scatter.

Panel (b) makes the human-human ceiling visual: the scatter of evaluator pairs is no tighter than the LLM-human scatter in panel (c), and individual pairs disagree by as much as 30–40 percentile points on individual papers. Note that panel (b) uses all 45 matched papers while the Human–Human row in Table 2.1 is restricted to the 33-paper matched sample (explaining the slightly different ρ values). In panel (c), both models cluster around the identity line in the 40–80 range but diverge more at the extremes. LLMs tend to compress ratings toward the centre of the scale relative to humans, producing narrower spread—a pattern consistent with alignment training that discourages extreme outputs. Where humans rate a paper very highly or very harshly, the LLM typically pulls toward the middle. Full agreement metrics including Pearson r, Spearman ρ, mean bias, and RMSE for all six models appear in the agreement table in Appendix A.

Rank comparison. Figure 2.2 places human rankings in the centre column with GPT-5 Pro on the left and Claude Opus 4.6 on the right, restricted to the 33 papers with Claude Opus 4.6 evaluations. Each paper appears as a pair of S-curves connecting its human rank to its rank under each LLM. Horizontal curves indicate agreement; curves that climb toward the top indicate the LLM ranked the paper higher (better) than humans did. Colour encodes the rank shift: green when humans rank it higher, orange (left, GPT-5 Pro) or purple (right, Opus) when the LLM does. The triptych layout makes it easy to spot papers where both LLMs agree with each other but disagree with humans (both curves slope the same way), versus papers where the two LLMs diverge.

Show code

library("ggforce")

# --- Rank-based triptych ---
# Ranking happens only after both inner joins, so each of the three rank
# columns runs over exactly the papers present in all three sources (1..n).
rank_all <- human_avg |>
  filter(paper %in% matched_papers) |>
  transmute(paper, human_mid) |>
  inner_join(
    llm_metrics |>
      filter(criteria == "overall" & model == "GPT-5 Pro") |>
      transmute(paper, gpt_mid = midpoint),
    by = join_by(paper)
  ) |>
  inner_join(
    llm_metrics |>
      filter(criteria == "overall" & model == "Claude Opus 4.6") |>
      transmute(paper, claude_mid = midpoint),
    by = join_by(paper)
  ) |>
  # Rank 1 = highest rating; ties are broken by row order.
  mutate(
    rank_h      = rank(-human_mid, ties.method = "first"),
    rank_gpt    = rank(-gpt_mid, ties.method = "first"),
    rank_claude = rank(-claude_mid, ties.method = "first"),
    label_use   = str_trunc(paper, 28)
  ) |>
  # Rank shifts: positive means the LLM ranks the paper higher than humans
  # do, i.e. the LLM's rank number is smaller.
  mutate(
    d_left  = rank_h - rank_gpt,
    d_right = rank_h - rank_claude
  )

n_papers_slope <- nrow(rank_all)

# --- Bezier curve data ---
# Build the four Bezier control points of every paper's curve in one
# vectorised step: two points at the human rank, two at the LLM rank.
#
# Unlike a per-paper map, this returns a zero-row tibble that still has all
# five columns when `rank_all` is empty, so the colouring step below keeps
# working and the "insufficient data" branch of the plot can be reached.
#
# @param prefix  Letter prepended to the paper index to form the curve id.
# @param x_ctrl  x positions of the four control points (human side first).
# @param rank_to LLM rank per paper (the curve's end point).
# @param delta   Rank shift per paper, stored as `dr`; `mag` is its absolute
#                value.
# @return Tibble with columns group, x, y, dr, mag (four rows per paper).
bezier_control_points <- function(prefix, x_ctrl, rank_to, delta) {
  n <- nrow(rank_all)
  tibble(
    group = rep(paste0(prefix, seq_len(n)), each = 4L),
    x     = rep(x_ctrl, times = n),
    # Column-major unrolling gives h, h, llm, llm for paper 1, then paper 2, ...
    y     = as.vector(rbind(rank_all$rank_h, rank_all$rank_h, rank_to, rank_to)),
    dr    = rep(delta, each = 4L),
    mag   = abs(dr)
  )
}

left_curves <- bezier_control_points(
  "L", c(0.5, 0.35, 0.15, 0), rank_all$rank_gpt, rank_all$d_left
)

right_curves <- bezier_control_points(
  "R", c(0.5, 0.65, 0.85, 1), rank_all$rank_claude, rank_all$d_right
)

# Diverging colour for a rank shift: UJ_GREEN where humans rank the paper
# higher, `high_col` (orange or purple) where the LLM does, light grey at zero.
#
# @param dr        Rank shifts (human rank minus LLM rank).
# @param high_col  Colour used at the positive end of the ramp.
# @param max_shift Shift at which the ramp saturates; larger shifts are clamped.
# @return Character vector of hex colours, one per element of `dr`.
make_slope_color <- function(dr, high_col, max_shift = 15) {
  # Rescale [-max_shift, max_shift] to [0, 1] and clamp to that range.
  pos <- (dr + max_shift) / (2 * max_shift)
  pos <- pmin(pmax(pos, 0), 1)

  to_rgb <- grDevices::colorRamp(c(UJ_GREEN, "grey88", high_col))
  channels <- to_rgb(pos)
  grDevices::rgb(
    channels[, 1], channels[, 2], channels[, 3],
    maxColorValue = 255
  )
}

# Colour each side with its own "LLM ranks higher" hue.
left_curves <- mutate(
  left_curves,
  curve_col = make_slope_color(dr, high_col = UJ_ORANGE)
)
right_curves <- mutate(
  right_curves,
  curve_col = make_slope_color(dr, high_col = MODEL_COLORS[["Claude Opus 4.6"]])
)

# Opacity grows with the size of the rank shift and saturates once the shift
# reaches a third of the number of papers.
all_curves <- mutate(
  bind_rows(left_curves, right_curves),
  alpha_val = pmin(mag / (n_papers_slope / 3), 1)
)

# Triptych slopegraph: GPT-5 Pro ranks at x = 0, human ranks at x = 0.5 and
# Claude Opus 4.6 ranks at x = 1, joined per paper by Bezier curves.
if (n_papers_slope > 0) {
  ggplot() +
    # Curves carry their own precomputed colour (identity scale) and opacity.
    ggforce::geom_bezier(
      data = all_curves,
      aes(x = x, y = y, group = group, colour = curve_col, alpha = alpha_val),
      linewidth = 0.75
    ) +
    scale_color_identity() +
    scale_alpha(range = c(0.20, 1), guide = "none") +

    # Column dots
    geom_point(data = rank_all, aes(x = 0,   y = rank_gpt),
               color = MODEL_COLORS["GPT-5 Pro"], size = 2.2) +
    geom_point(data = rank_all, aes(x = 0.5, y = rank_h),
               color = MODEL_COLORS["Human"], size = 2.6, shape = 16) +
    geom_point(data = rank_all, aes(x = 1,   y = rank_claude),
               color = MODEL_COLORS["Claude Opus 4.6"], size = 2.2) +

    # Paper labels left — repelled vertically to avoid overlap
    ggrepel::geom_text_repel(
      data = rank_all,
      aes(x = 0, y = rank_gpt, label = label_use),
      hjust = 1, nudge_x = -0.06, direction = "y",
      size = 2.5, color = "grey25",
      segment.color = "grey72", segment.size = 0.25,
      force = 0.8, max.overlaps = Inf, min.segment.length = 0.1
    ) +

    # Right annotations: human rank + deltas
    geom_text(data = rank_all,
              aes(x = 1, y = rank_claude,
                  label = sprintf("H%d  \u0394G%+d  \u0394C%+d",
                                  rank_h, d_left, d_right)),
              hjust = -0.06, size = 2.3, color = "grey35") +

    # Extra x range on both sides leaves room for the text columns.
    scale_x_continuous(
      breaks = c(0, 0.5, 1),
      labels = c("GPT-5 Pro", "Human", "Claude Opus 4.6"),
      limits = c(-0.65, 1.6),
      expand = expansion(mult = 0)
    ) +
    # Reversed axis so that rank 1 sits at the top.
    scale_y_reverse(
      breaks = seq(1, n_papers_slope, by = 4),
      expand = expansion(mult = c(0.02, 0.04))
    ) +
    coord_cartesian(clip = "off") +
    labs(x = NULL, y = "Rank (1 = highest rated)") +
    theme_uj() +
    theme(
      axis.text.x        = element_text(size = 13, face = "bold"),
      axis.text.y        = element_text(size = 9, color = "grey50"),
      axis.ticks.y       = element_blank(),
      panel.grid.major.y = element_line(linewidth = 0.25, color = "grey92"),
      panel.grid.major.x = element_line(linewidth = 0.35, color = "grey82"),
      plot.margin        = margin(t = 8, r = 190, b = 8, l = 140)
    )
} else {
  # Single-escaped "\n" so the message ends with a real newline rather than
  # the two literal characters backslash and n.
  cat("Insufficient data for triptych slopegraph.\n")
}

Figure 2.2: Triptych slopegraph of overall ratings for the `r n_papers_slope`-paper matched sample (papers evaluated by all three sources). Each column shows rank order (1 = highest rated) within that source; papers are connected by S-shaped curves. Green = human panel ranks higher; orange (left) = GPT-5 Pro ranks higher; purple (right) = Claude Opus ranks higher. Colour intensity encodes magnitude of rank shift. Right-side labels show human rank (H) and rank delta for each model (ΔG, ΔC; positive = LLM ranks higher).

Papers where both curves slope the same direction indicate systematic human–LLM disagreement; papers where left and right curves diverge indicate model-specific differences—GPT and Claude “see” the paper differently, not just differently from humans. Horizontal curves indicate agreement with the human ordering. The criteria-level Krippendorff’s alpha analysis in the human baseline table in Appendix A quantifies where human–LLM agreement approaches the ceiling set by human inter-rater variability.

Rating differences by paper and criterion. While the scatter plots and slopegraph summarise agreement on a single dimension, Figure 2.3 unpacks Human − LLM differences across all seven criteria simultaneously. Each column is a paper, each row a criterion, and tile colour encodes the signed difference (human mean minus LLM midpoint). Green tiles indicate the human panel rated the paper higher; orange (GPT-5 Pro) or purple (Claude Opus) tiles indicate the LLM was more generous. Papers are sorted by GPT-5 Pro’s overall difference so that both panels share the same column order, making model-specific patterns immediately visible.

Show code

library("patchwork")

# Canonical criterion keys in display order (top row of the heatmap first).
metric_order <- c(
  "overall",
  "claims",
  "methods",
  "adv_knowledge",
  "logic_comms",
  "open_sci",
  "gp_relevance"
)

# Display label for each key, listed in the same order as `metric_order`.
metric_lab <- setNames(
  c(
    "Overall",
    "Claims & Evidence",
    "Methods",
    "Adv. Knowledge",
    "Logic & Comms",
    "Open Science",
    "Global Relevance"
  ),
  metric_order
)

# Human mean per paper and criterion, matched papers only.
H_mean <- metrics_human |>
  filter(criteria %in% metric_order & paper %in% matched_papers) |>
  group_by(paper, criteria) |>
  summarise(h = mean(mid, na.rm = TRUE), .groups = "drop")

# Same summary for the focal (`primary_model`) LLM; it fixes the paper order.
L_mean_primary <- llm_metrics |>
  filter(
    criteria %in% metric_order &
      model == primary_model &
      paper %in% matched_papers
  ) |>
  group_by(paper, criteria) |>
  summarise(l = mean(midpoint, na.rm = TRUE), .groups = "drop")

Ddiff_primary <- H_mean |>
  inner_join(L_mean_primary, by = join_by(paper, criteria)) |>
  mutate(diff = h - l)

# Column order: largest "human minus focal LLM" overall gap first.
ord_p <- pull(
  arrange(filter(Ddiff_primary, criteria == "overall"), desc(diff)),
  paper
)

# Build one "human minus LLM" heatmap panel: papers on the x axis in the
# shared `ord_p` order, criteria on the y axis, fill = human mean minus the
# model's mean midpoint.
#
# @param model_name    Model to compare against the human means; also used as
#                      the panel title.
# @param show_y_labels Draw the criterion labels on the y axis (TRUE) or blank
#                      them (FALSE, for a panel placed to the right of another).
# @param llm_col       Colour at the "LLM rated higher" (negative) end of the
#                      fill scale.
# @return A ggplot object.
build_gap_panel <- function(model_name, show_y_labels = TRUE,
                            llm_col = UJ_ORANGE) {
  L_mean_m <- llm_metrics |>
    filter(criteria %in% metric_order, model == model_name, paper %in% matched_papers) |>
    group_by(paper, criteria) |>
    summarise(l = mean(midpoint, na.rm = TRUE), .groups = "drop")

  Ddiff_m <- inner_join(H_mean, L_mean_m, by = c("paper", "criteria")) |>
    # Papers missing from the shared order would become an NA factor level
    # and be drawn as a stray "NA" column, so they are dropped here.
    filter(paper %in% ord_p) |>
    mutate(
      diff = h - l,
      crit = factor(criteria, levels = metric_order, labels = metric_lab[metric_order])
    )

  p <- ggplot(Ddiff_m, aes(x = factor(paper, levels = ord_p), y = crit, fill = diff)) +
    geom_tile(color = "white", linewidth = 0.25) +
    # Diverging scale centred on zero; values beyond ±30 are squished to the
    # limits rather than dropped.
    scale_fill_gradient2(
      low = llm_col, mid = "grey95", high = UJ_GREEN, midpoint = 0,
      name = "Human \u2212 LLM",
      limits = c(-30, 30),
      oob = scales::squish
    ) +
    labs(x = NULL, y = NULL, title = model_name) +
    theme_uj() +
    theme(
      axis.text.x = element_text(angle = 55, hjust = 1, vjust = 1, size = 8),
      panel.grid = element_blank()
    )

  if (show_y_labels) {
    p <- p + theme(axis.text.y = element_text(size = 11))
  } else {
    p <- p + theme(axis.text.y = element_blank())
  }

  p
}

# Draw the two heatmap panels side by side, each keeping its own legend at
# the bottom; with no ordered papers, print a notice instead.
if (length(ord_p) == 0) {
  cat("Insufficient data for gap heatmap.\n")
} else {
  p_left <- build_gap_panel(
    "GPT-5 Pro",
    show_y_labels = TRUE,
    llm_col = UJ_ORANGE
  )
  p_right <- build_gap_panel(
    "Claude Opus 4.6",
    show_y_labels = FALSE,
    llm_col = unname(MODEL_COLORS["Claude Opus 4.6"])
  )

  (p_left + p_right + plot_layout(guides = "keep")) &
    theme(legend.position = "bottom")
}

Figure 2.3: Human minus LLM rating difference for every paper (columns) and criterion (rows), comparing GPT-5 Pro (left panel) and Claude Opus 4.6 (right panel). Green tiles indicate the human mean was higher; orange (GPT-5 Pro) or purple (Claude Opus) tiles indicate the LLM rated the paper higher. Colour is clamped to ±30 percentile points. Papers are sorted by GPT-5 Pro overall difference so the two panels are directly comparable.

Columns with uniformly green or orange/purple tiles indicate papers where humans and the LLM disagree systematically across all criteria, not just on one dimension. Rows with consistent colour suggest criteria-level biases—for instance, if “Open Science” is green across nearly all papers, humans may systematically reward data-sharing practices more than the LLM does. Comparing the two panels reveals whether disagreement patterns are model-specific or shared across frontier LLMs; tiles that change colour between panels highlight papers where the models diverge from each other, not just from humans.

Qualitative critique comparison. Beyond numeric ratings, we compare the substantive critiques each model raises against the consensus issues identified by human experts. The figure below plots coverage—the fraction of human-identified concerns that the LLM also raised in some form—against precision—the fraction of LLM-raised issues that correspond to a substantive human concern. These metrics are assessed by GPT-5.2 Pro acting as an independent judge (see Methods for the judging protocol).

Load critique comparison data

# Judge output may exist under either file name; take the first one found
# (NA when neither exists).
comparison_results_file <- Filter(
  file.exists,
  c(
    "results/key_issue_comp_results.json",
    "results/key_issues_comparison_results.json"
  )
)[1]

has_critique_data <- FALSE

if (!is.na(comparison_results_file)) {
  comparison_file <- "results/key_issues_comparison.json"
  if (file.exists(comparison_file)) {
    comparison_data <- fromJSON(comparison_file)
    llm_results_raw <- fromJSON(comparison_results_file)

    # Flatten the nested `comparison` field and keep the judged columns;
    # the five issue-list columns are optional.
    llm_results <- select(
      unnest_wider(as_tibble(llm_results_raw), comparison),
      gpt_paper,
      coverage_pct,
      precision_pct,
      any_of(c(
        "matched_pairs", "unmatched_human", "unmatched_llm",
        "missed_issues", "extra_issues"
      )),
      overall_rating,
      overall_justification,
      detailed_notes
    )

    comparison_data <- left_join(comparison_data, llm_results, by = "gpt_paper")

    # Usable only if at least one paper has a coverage score.
    has_critique_data <- nrow(comparison_data) > 0 &&
      hasName(comparison_data, "coverage_pct") &&
      !all(is.na(comparison_data$coverage_pct))
  }
}

Show code

# Coverage vs precision scatter: one point per paper that has both scores.
# Dashed lines mark the across-paper mean of each metric.
if (has_critique_data) {
  crit_results <- comparison_data |>
    filter(!is.na(coverage_pct), !is.na(precision_pct)) |>
    mutate(paper_short = str_trunc(gpt_paper, 25))

  # `has_critique_data` only guarantees some non-missing coverage values; rows
  # can still all be dropped when precision is missing, in which case the mean
  # reference lines would be NaN. Skip the plot in that case.
  if (nrow(crit_results) > 0) {
    ggplot(crit_results, aes(x = coverage_pct, y = precision_pct)) +
      geom_point(size = 4.5, color = UJ_ORANGE, alpha = 0.85) +
      # Repelled labels (as in the LLM-vs-human scatter) instead of
      # geom_text(check_overlap = TRUE): no label is silently dropped, and
      # labels for points near 100% stay inside the panel instead of being
      # clipped. A fixed seed keeps the layout reproducible between renders.
      ggrepel::geom_text_repel(
        aes(label = paper_short),
        size = 2.6,
        max.overlaps = Inf,
        min.segment.length = 0.1,
        segment.color = "grey72",
        segment.size = 0.25,
        seed = 42
      ) +
      geom_vline(xintercept = mean(crit_results$coverage_pct),
                 linetype = "dashed", color = UJ_BLUE, linewidth = 0.6) +
      geom_hline(yintercept = mean(crit_results$precision_pct),
                 linetype = "dashed", color = UJ_BLUE, linewidth = 0.6) +
      scale_x_continuous(limits = c(0, 100), breaks = seq(0, 100, 20)) +
      scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 20)) +
      labs(
        x = "Coverage (%) \u2014 Human issues captured by LLM",
        y = "Precision (%) \u2014 LLM issues that are substantive"
      ) +
      theme_uj()
  } else {
    cat("Insufficient data for critique comparison plot.\n")
  }
}

Coverage and precision vary substantially across papers. Some papers achieve high scores on both dimensions, indicating strong alignment between human and LLM critiques, while others reveal the LLM missing key human concerns or raising issues absent from the expert consensus. Because these metrics are themselves LLM-assessed, they should be interpreted with the caveat that an LLM judge may systematically over- or under-credit matches relative to a human annotator; manual validation through our annotation tool is underway and preliminary results are consistent with the automated scores.

Detailed paper-by-paper comparisons—including matched issue pairs with severity labels, structural difference tables, and per-evaluator breakdowns—appear in Appendix B: Critiques & Key Issues. Extended quantitative analysis, including per-criterion correlations, bootstrap confidence intervals, mutual information, tier prediction accuracy, cost-quality trade-offs across all six models, and top disagreement cases, is reported in Appendix A: Results Ratings. The full LLM reasoning traces and assessment summaries are available in Appendix C: LLM Traces.

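Model comparison summary. Table 2.1 compares all six models on overall-rating agreement metrics against the human mean, restricted to the 33-paper matched sample so that models are compared on a common pool of papers. Note that GPT-5.2 Pro and Claude Sonnet 4 were run on only a subset of these papers (see the N column), so their metrics rest on far fewer observations and are not directly comparable with the other rows. The Human–Human row is the empirical ceiling: it shows pairwise agreement between evaluators on the same papers. Models are sorted by Spearman ρ.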

Show code

# Overall-criterion LLM ratings for the matched sample, paired with the human
# mean for the same paper; rows without a human mean are dropped.
model_compare_data <- llm_metrics |>
  filter(criteria == "overall" & paper %in% opus_papers) |>
  left_join(human_avg[, c("paper", "human_mid")], by = "paper") |>
  filter(!is.na(human_mid))

if (nrow(model_compare_data) > 0) {
  # One row of agreement metrics per model. `err` is the signed LLM-minus-human
  # difference, so a positive mean bias means the LLM rated higher.
  llm_rows <- model_compare_data |>
    mutate(err = midpoint - human_mid) |>
    group_by(Model = model) |>
    summarise(
      N = n(),
      `Spearman ρ` = round(
        cor(human_mid, midpoint, method = "spearman", use = "complete.obs"),
        3
      ),
      `Pearson r` = round(
        cor(human_mid, midpoint, use = "complete.obs"),
        3
      ),
      `Mean bias` = sprintf("%+.1f", mean(err, na.rm = TRUE)),
      RMSE = round(sqrt(mean(err^2, na.rm = TRUE)), 1),
      MAE = round(mean(abs(err), na.rm = TRUE), 1),
      .groups = "drop"
    ) |>
    arrange(desc(`Spearman ρ`))

  # Human-Human baseline goes first so it can be highlighted as row 1.
  model_compare_tbl <- bind_rows(hh_baseline_row, llm_rows)

  model_compare_tbl |>
    knitr::kable(align = c("l", rep_len("r", 6)), booktabs = TRUE) |>
    kableExtra::row_spec(1, bold = TRUE, background = "#f0f8f0")
}

Table 2.1: Agreement between each LLM and human mean overall rating (matched sample, N = 33 papers). Human–Human row (top, bold) shows pairwise evaluator agreement as an empirical upper bound. Spearman ρ (rank-based) and MAE (in percentile points, directly interpretable) are primary metrics; Pearson r shown for completeness. Bias = LLM − Human (positive = LLM more generous). Models sorted by Spearman ρ.

Model	N	Spearman ρ	Pearson r	Mean bias	RMSE	MAE
Human–Human	27	0.472	0.584	+3.5	15.6	11.3
GPT-5 Pro	33	0.553	0.396	+5.1	14.1	10.0
Claude Opus 4.6	33	0.477	0.567	-7.6	17.2	12.4
Gemini 2.0 Flash	33	0.447	0.467	-5.3	13.2	10.8
GPT-5.2 Pro	5	0.410	0.433	-6.6	19.9	15.4
GPT-4o-mini	33	0.309	0.356	-0.1	15.9	10.7
Claude Sonnet 4	8	-0.058	-0.232	-1.6	9.1	6.8

Table 2.2 breaks the same agreement metrics down by evaluation criterion, with ratings pooled across all models (each metric is computed once per criterion over all model–paper pairs). Crucially, the H-H ρ column shows pairwise human-human Spearman ρ for each criterion — the ceiling available to any model. Criteria where H-H ρ is itself low indicate genuine expert disagreement; low LLM agreement on those criteria is therefore expected rather than a model failure.

Show code

# Display labels keyed by canonical criterion name. The order of this vector
# is also the row order used in the per-criterion table.
criteria_labels <- c(
  overall       = "Overall",
  claims        = "Claims & Evidence",
  methods       = "Methods",
  adv_knowledge = "Adv. Knowledge",
  logic_comms   = "Logic & Comms",
  open_sci      = "Open Science",
  gp_relevance  = "Global Relevance"
)

# Canonical criterion names, in presentation order.
criteria_order <- names(criteria_labels)

# Human-human pairwise Spearman rho per criterion (ceiling reference).
# Within each paper x criterion cell that has at least two distinct ratings,
# evaluators are numbered E1, E2, ... in row order and spread to wide format;
# only cells where both the E1 and E2 midpoints are present are kept.
hh_crit_pairs <- metrics_human |>
  filter(criteria %in% criteria_order & paper %in% opus_papers) |>
  select(paper, criteria, evaluator, mid) |>
  distinct() |>
  add_count(paper, criteria, name = "n_ratings") |>
  filter(n_ratings >= 2) |>
  select(-n_ratings) |>
  mutate(
    slot = paste0("E", row_number()),
    .by = c(paper, criteria)
  ) |>
  pivot_wider(
    names_from = slot,
    values_from = c(mid, evaluator)
  ) |>
  filter(!(is.na(mid_E1) | is.na(mid_E2)))

# Correlate the E1 and E2 midpoints across papers, separately per criterion.
hh_crit_rho <- hh_crit_pairs |>
  group_by(criteria) |>
  summarise(
    `H-H ρ` = round(
      cor(mid_E1, mid_E2, method = "spearman", use = "complete.obs"),
      3
    ),
    .groups = "drop"
  )

# Mean human midpoint for every paper x criterion cell in the matched sample.
human_by_crit <- metrics_human |>
  filter(criteria %in% criteria_order & paper %in% opus_papers) |>
  group_by(paper, criteria) |>
  summarise(
    human_mid = mean(mid, na.rm = TRUE),
    .groups = "drop"
  )

# LLM ratings (all models) paired with the human mean for the same paper and
# criterion; cells lacking a human mean are dropped by the inner join.
crit_compare_data <- llm_metrics |>
  filter(criteria %in% criteria_order & paper %in% opus_papers) |>
  inner_join(human_by_crit, by = join_by(paper, criteria))

if (nrow(crit_compare_data) > 0) {
  # Per-criterion agreement, computed over all model x paper rows together
  # (rows from every model are pooled into one correlation per criterion).
  # `err` is the signed LLM-minus-human difference.
  crit_tbl <- crit_compare_data |>
    mutate(err = midpoint - human_mid) |>
    group_by(criteria) |>
    summarise(
      `LLM Spearman ρ` = round(
        cor(human_mid, midpoint, method = "spearman", use = "complete.obs"),
        3
      ),
      `LLM Pearson r` = round(
        cor(human_mid, midpoint, use = "complete.obs"),
        3
      ),
      `Mean bias` = sprintf("%+.1f", mean(err, na.rm = TRUE)),
      RMSE = round(sqrt(mean(err^2, na.rm = TRUE)), 1),
      .groups = "drop"
    ) |>
    # Add the human-human reference column, then order rows by the
    # presentation order of the criteria via an ordered-level factor.
    left_join(hh_crit_rho, by = "criteria") |>
    mutate(
      Criterion = factor(
        criteria,
        levels = criteria_order,
        labels = criteria_labels[criteria_order]
      )
    ) |>
    select(
      Criterion, `H-H ρ`, `LLM Spearman ρ`, `LLM Pearson r`,
      `Mean bias`, RMSE
    ) |>
    arrange(Criterion)

  # Bold the H-H column (column 2) to mark it as the reference ceiling.
  crit_tbl |>
    knitr::kable(align = c("l", rep_len("r", 5))) |>
    kableExtra::column_spec(2, bold = TRUE)
}

Table 2.2: Human–LLM agreement by criterion, pooled across all models (matched sample, N = 33 papers). H-H ρ shows pairwise human evaluator Spearman ρ (the ceiling for each criterion); LLM Spearman ρ and other metrics are computed on ratings pooled across all six models. Positive bias = LLMs rate higher on average. Note the ceiling variation across criteria: human evaluators agree moderately on some dimensions (Overall) and barely at all on others (Adv. Knowledge, Claims & Evidence), which sets expectations for LLM performance.

Criterion	H-H ρ	LLM Spearman ρ	LLM Pearson r	Mean bias	RMSE
Overall	0.472	0.406	0.415	-2.1	15.1
Claims & Evidence	0.087	0.279	0.362	-4.5	16.9
Methods	0.178	0.323	0.304	-1.2	18.4
Adv. Knowledge	0.026	0.218	0.230	+1.1	16.8
Logic & Comms	0.291	0.041	0.082	+3.9	16.9
Open Science	0.238	0.008	0.037	-11.7	27.0
Global Relevance	0.201	0.257	0.181	-0.3	17.6

--- engine: knitr --- # Results ```{r} #| label: setup-results #| code-summary: "Setup and libraries" #| code-fold: true #| message: false #| warning: false source("setup_params.R") library("tidyverse") library("janitor") library("stringr") library("here") library("knitr") library("kableExtra") library("ggrepel") library("scales") library("jsonlite") library("purrr") library("tibble") # Theme and colors — crisp palette (high saturation, maximum separation) UJ_ORANGE <- "#E8722A" # vivid saffron orange UJ_GREEN <- "#2D9D5E" # rich emerald green UJ_BLUE <- "#2B7CE9" # clear azure blue MODEL_COLORS <- c( "GPT-5 Pro" = "#E8722A", # saffron orange (focal) "GPT-5.2 Pro" = "#D62839", # bright crimson "GPT-4o-mini" = "#17B890", # vivid teal "Claude Sonnet 4" = "#A855F7", # vivid purple "Claude Opus 4.6" = "#7C3AED", # deep violet "Gemini 2.0 Flash" = "#2B7CE9", # clear azure "Human" = "#2D9D5E" # emerald green ) theme_uj <- function(base_size = 12) { theme_minimal(base_size = base_size) + theme( panel.grid.minor = element_blank(), panel.grid.major = element_line(linewidth = 0.3, color = "grey88"), plot.title = element_text(face = "bold", size = rel(1.1)), plot.title.position = "plot", plot.subtitle = element_text(color = "grey40", size = rel(0.9)), plot.caption = element_text(color = "grey50", size = rel(0.8), hjust = 0), axis.title = element_text(size = rel(0.95)), axis.text = element_text(size = rel(0.88)), legend.position = "bottom", legend.text = element_text(size = rel(0.88)), legend.title = element_text(size = rel(0.9), face = "bold"), strip.text = element_text(face = "bold", size = rel(0.95)) ) } canon_metric <- function(x) dplyr::recode( x, "advancing_knowledge" = "adv_knowledge", "open_science" = "open_sci", "logic_communication" = "logic_comms", "global_relevance" = "gp_relevance", "claims_evidence" = "claims", .default = x ) `%||%` <- function(x, y) if (!is.null(x)) x else y ``` ```{r} #| label: load-human-data-results #| code-fold: true #| code-summary: "Load 
human evaluation data" #| message: false UJmap <- read_delim("data/UJ_map.csv", delim = ";", show_col_types = FALSE) |> mutate(label_paper_title = research, label_paper = paper) |> select(c("label_paper_title", "label_paper")) rsx <- read_csv("data/rsx_evalr_rating.csv", show_col_types = FALSE) |> clean_names() |> mutate(label_paper_title = research) |> select(-c("research")) research <- read_csv("data/research.csv", show_col_types = FALSE) |> clean_names() |> filter(status == "50_published evaluations (on PubPub, by Unjournal)") |> left_join(UJmap, by = c("label_paper_title")) |> mutate(doi = str_trim(doi)) |> mutate(label_paper = case_when( doi == "https://doi.org/10.3386/w31162" ~ "Walker et al. 2023", doi == "doi.org/10.3386/w32728" ~ "Hahn et al. 2025", doi == "https://doi.org/10.3386/w30011" ~ "Bhat et al. 2022", doi == "10.1093/wbro/lkae010" ~ "Crawfurd et al. 2023", TRUE ~ label_paper )) |> left_join(rsx, by = c("label_paper_title")) key_map <- research |> transmute(label_paper_title = str_trim(label_paper_title), label_paper = label_paper) |> filter(!is.na(label_paper_title)) |> distinct(label_paper_title, label_paper) |> group_by(label_paper_title) |> slice(1) |> ungroup() rsx_research <- rsx |> mutate(label_paper_title = str_trim(label_paper_title)) |> left_join(key_map, by = "label_paper_title", relationship = "many-to-one") metrics_human <- rsx_research |> mutate(criteria = canon_metric(criteria)) |> filter(criteria %in% c("overall", "claims", "methods", "adv_knowledge", "logic_comms", "open_sci", "gp_relevance")) |> transmute( paper = label_paper, criteria, evaluator, model = "Human", mid = as.numeric(middle_rating), lo = suppressWarnings(as.numeric(lower_ci)), hi = suppressWarnings(as.numeric(upper_ci)) ) |> filter(!is.na(paper), !is.na(mid)) |> mutate( lo = ifelse(is.finite(lo), pmax(0, pmin(100, lo)), NA_real_), hi = ifelse(is.finite(hi), pmax(0, pmin(100, hi)), NA_real_) ) |> mutate(across(c(mid, lo, hi), ~ round(.x, 4))) |> distinct(paper, 
criteria, model, evaluator, mid, lo, hi) human_avg <- metrics_human |> filter(criteria == "overall") |> group_by(paper) |> summarise( human_mid = mean(mid, na.rm = TRUE), human_lo = mean(lo, na.rm = TRUE), human_hi = mean(hi, na.rm = TRUE), n_human = n(), .groups = "drop" ) n_human_papers <- n_distinct(metrics_human$paper) n_human_evaluators <- n_distinct(metrics_human$evaluator) ``` ```{r} #| label: load-llm-data-results #| code-fold: true #| code-summary: "Load LLM evaluation data (all models)" #| message: false model_dirs <- list( "gpt5_pro_updated_jan2026" = "GPT-5 Pro", "gpt52_pro_focal_jan2026" = "GPT-5.2 Pro", "gpt_4o_mini_2024_07_18" = "GPT-4o-mini", "claude_sonnet_4_20250514" = "Claude Sonnet 4", "claude_opus_4_6" = "Claude Opus 4.6", "gemini_2.0_flash" = "Gemini 2.0 Flash" ) parse_response <- function(path, model_name) { tryCatch({ r <- jsonlite::fromJSON(path, simplifyVector = FALSE) paper <- basename(path) |> str_replace("\\.response\\.json$", "") |> str_replace_all("_", " ") parsed <- NULL if (!is.null(r$parsed) && length(r$parsed) > 0) { parsed <- r$parsed } else if (!is.null(r$output_text) && nchar(r$output_text) > 0) { txt <- r$output_text txt <- sub("^\\s*```[a-z]*\\s*\n?", "", txt) txt <- sub("\\s*```\\s*$", "", txt) parsed <- jsonlite::fromJSON(txt, simplifyVector = TRUE) } else if (!is.null(r$output)) { msg <- purrr::detect(r$output, ~ .x$type == "message", .default = NULL) if (!is.null(msg) && length(msg$content) > 0) { parsed <- jsonlite::fromJSON(msg$content[[1]]$text, simplifyVector = TRUE) } } if (is.null(parsed)) return(NULL) metrics <- parsed$metrics metric_rows <- list() tier_rows <- list() tier_names <- c("tier_should", "tier_will", "journal_should", "journal_will") for (nm in names(metrics)) { if (nm %in% tier_names) { tier_kind <- sub("^journal_", "tier_", nm) tier_rows[[length(tier_rows) + 1]] <- tibble( paper = paper, model = model_name, tier_kind = tier_kind, score = metrics[[nm]]$score, ci_lower = metrics[[nm]]$ci_lower, ci_upper 
= metrics[[nm]]$ci_upper ) } else { metric_rows[[length(metric_rows) + 1]] <- tibble( paper = paper, model = model_name, metric = nm, midpoint = metrics[[nm]]$midpoint, lower_bound = metrics[[nm]]$lower_bound, upper_bound = metrics[[nm]]$upper_bound ) } } input_tok <- r$usage$input_tokens %||% r$input_tokens output_tok <- r$usage$output_tokens %||% r$output_tokens list( metrics = bind_rows(metric_rows), tiers = bind_rows(tier_rows), tokens = tibble( paper = paper, model = model_name, input_tokens = input_tok %||% NA_integer_, output_tokens = output_tok %||% NA_integer_ ) ) }, error = function(e) NULL) } load_all_llm <- function() { all_metrics <- list() all_tiers <- list() all_tokens <- list() for (dir_name in names(model_dirs)) { model_name <- model_dirs[[dir_name]] json_dir <- here("results", dir_name, "json") if (dir.exists(json_dir)) { files <- list.files(json_dir, pattern = "\\.response\\.json$", full.names = TRUE) for (f in files) { result <- parse_response(f, model_name) if (!is.null(result)) { all_metrics[[length(all_metrics) + 1]] <- result$metrics all_tiers[[length(all_tiers) + 1]] <- result$tiers all_tokens[[length(all_tokens) + 1]] <- result$tokens } } } } list( metrics = bind_rows(all_metrics) |> mutate(criteria = canon_metric(metric)), tiers = bind_rows(all_tiers), tokens = bind_rows(all_tokens) ) } llm_data <- load_all_llm() llm_metrics <- llm_data$metrics llm_tiers <- llm_data$tiers llm_tokens <- llm_data$tokens n_llm_models <- n_distinct(llm_metrics$model) n_llm_papers <- n_distinct(llm_metrics$paper) matched_papers <- intersect( unique(metrics_human$paper), unique(llm_metrics$paper) ) n_matched <- length(matched_papers) # Opus sample: papers evaluated by Claude Opus 4.6 AND humans # Used for the ranked slopegraph and agreement tables so all models # are compared on the same set of papers. 
opus_papers <- intersect( llm_metrics |> filter(model == "Claude Opus 4.6") |> pull(paper) |> unique(), unique(metrics_human$paper) ) n_opus <- length(opus_papers) primary_model <- if ("GPT-5 Pro" %in% unique(llm_metrics$model)) "GPT-5 Pro" else unique(llm_metrics$model)[1] ``` ```{r} #| label: compute-hh-baseline #| include: false # Human-Human pairwise agreement (baseline for agreement tables). # For papers with ≥2 evaluators, assign E1/E2 by row order within paper, # then compute pairwise correlation and error metrics across all such papers. hh_pairs <- metrics_human |> filter(criteria == "overall", paper %in% opus_papers) |> select(paper, evaluator, mid) |> distinct() |> group_by(paper) |> filter(n() >= 2) |> mutate(slot = paste0("E", row_number())) |> ungroup() |> pivot_wider(names_from = slot, values_from = c(mid, evaluator)) |> filter(!is.na(mid_E1), !is.na(mid_E2)) hh_n <- nrow(hh_pairs) hh_pearson <- round(cor(hh_pairs$mid_E1, hh_pairs$mid_E2), 3) hh_spearman <- round(cor(hh_pairs$mid_E1, hh_pairs$mid_E2, method = "spearman"), 3) hh_mae <- round(mean(abs(hh_pairs$mid_E1 - hh_pairs$mid_E2), na.rm = TRUE), 1) hh_rmse <- round(sqrt(mean((hh_pairs$mid_E1 - hh_pairs$mid_E2)^2, na.rm = TRUE)), 1) hh_bias <- sprintf("%+.1f", mean(hh_pairs$mid_E1 - hh_pairs$mid_E2, na.rm = TRUE)) hh_baseline_row <- tibble( Model = "Human\u2013Human", N = hh_n, `Spearman ρ` = hh_spearman, `Pearson r` = hh_pearson, `Mean bias` = hh_bias, RMSE = hh_rmse, MAE = hh_mae ) ``` We evaluate `r n_llm_models` frontier LLMs against human expert reviews from [The Unjournal](https://unjournal.pubpub.org). Across all models, `r n_matched` papers have both human and LLM evaluations; `r n_opus` of these were also evaluated by Claude Opus 4.6 and form the matched sample used for rank comparisons and agreement tables below. 
Each model receives the same PDF, system prompt mirroring The Unjournal rubric, and JSON schema requiring a diagnostic summary plus numeric midpoints and 90% credible intervals for every metric. Full methodological details appear in [Methods](methods.qmd). We do not treat human ratings as ground truth. Quantitative percentile scoring is genuinely difficult: even domain experts disagree, and individual scores reflect both signal about paper quality and idiosyncratic tendencies (severity, topic familiarity, interpretation of the scale). Our question is therefore whether LLMs provide signal *comparable to an additional expert rater*. If two human evaluators agree at Spearman ρ = 0.55, an LLM achieving ρ = 0.57 against the human mean is performing within human inter-rater range. The Human–Human baseline row in @tbl-model-agreement-main makes this explicit; Krippendorff's α~HH~ in the [human-baseline table in Appendix A](results_ratings.qmd) provides the criterion-level ceiling. **Per-paper overview and model comparison.** @fig-overview-combined presents three complementary views of overall (0--100 percentile) ratings. Panel (a) displays individual human evaluator ratings alongside GPT-5 Pro (orange diamonds) and Claude Opus 4.6 (purple diamonds) ratings for each paper, revealing inter-rater variability---CIs often span 20--40 points. In most cases both LLMs fall within the range of human opinions, though several papers show substantial divergence. Panel (b) plots evaluator pairs against each other for all papers with ≥2 raters, making the human-human agreement ceiling directly visible. Panel (c) compares the two highest-performing reasoning models directly against human mean ratings, so LLM-human agreement can be read against the human-human ceiling in panel (b). ```{r} #| label: fig-overview-combined #| fig-cap: "Per-paper overall ratings (0--100 percentile). 
**(a)** Individual human evaluator midpoints (green circles) with each evaluator's self-reported 90% credible interval (reflecting their own uncertainty about the true score; the vertical separation between green dots per paper reflects inter-rater disagreement), GPT-5 Pro (orange diamond with CI), and Claude Opus 4.6 (purple diamond with CI), sorted by descending human mean. Dotted horizontal lines show grand means. **(b)** Pairwise human evaluator agreement across all matched papers: each point is one evaluator pair (papers with 3 raters contribute 3 points); uses all `r n_matched` matched papers (the Human–Human row in Table 1 is restricted to the `r n_opus`-paper matched sample). **(c)** Human mean vs LLM overall rating for the two focal models with per-model Spearman ρ annotated; dashed diagonal is the identity line. Compare panels (b) and (c) directly to see whether LLM-human scatter is tighter than human-human scatter." #| fig-width: 14 #| fig-height: 14 #| code-fold: true library("patchwork") # ── Panel (a): Forest plot ── H_ind <- metrics_human |> filter(criteria == "overall", paper %in% matched_papers) |> mutate( lo = ifelse(is.finite(lo), pmax(0, lo), NA_real_), hi = ifelse(is.finite(hi), pmin(100, hi), NA_real_) ) ord <- H_ind |> group_by(paper) |> summarise(h_mean = mean(mid, na.rm = TRUE), .groups = "drop") |> arrange(desc(h_mean)) |> mutate(pos = row_number()) H_plot <- H_ind |> inner_join(ord, by = "paper") |> group_by(paper) |> mutate( off = (row_number() - (n() + 1) / 2) * 0.18, x = pos + off ) |> ungroup() L_c <- llm_metrics |> filter(criteria == "overall", model == primary_model, paper %in% matched_papers) |> group_by(paper) |> summarise( mid = mean(midpoint, na.rm = TRUE), lo = suppressWarnings(min(coalesce(lower_bound, midpoint), na.rm = TRUE)), hi = suppressWarnings(max(coalesce(upper_bound, midpoint), na.rm = TRUE)), .groups = "drop" ) |> inner_join(ord, by = "paper") |> mutate(x = pos - 0.18) L_opus <- llm_metrics |> filter(criteria == 
"overall", model == "Claude Opus 4.6", paper %in% matched_papers) |> group_by(paper) |> summarise( mid = mean(midpoint, na.rm = TRUE), lo = suppressWarnings(min(coalesce(lower_bound, midpoint), na.rm = TRUE)), hi = suppressWarnings(max(coalesce(upper_bound, midpoint), na.rm = TRUE)), .groups = "drop" ) |> inner_join(ord, by = "paper") |> mutate(x = pos + 0.18) hbar <- mean(ord$h_mean, na.rm = TRUE) lbar <- mean(L_c$mid, na.rm = TRUE) lbar_opus <- mean(L_opus$mid, na.rm = TRUE) OPUS_PURPLE <- MODEL_COLORS["Claude Opus 4.6"] # Legend labels embed grand means so the annotation text can be dropped lbl_human <- sprintf("Human evaluators (mean = %.1f)", hbar) lbl_gpt <- sprintf("GPT-5 Pro (mean = %.1f)", lbar) lbl_opus <- sprintf("Claude Opus 4.6 (mean = %.1f)", lbar_opus) leg_colors <- setNames(c(UJ_GREEN, UJ_ORANGE, unname(OPUS_PURPLE)), c(lbl_human, lbl_gpt, lbl_opus)) leg_shapes <- setNames(c(16L, 18L, 18L), c(lbl_human, lbl_gpt, lbl_opus)) p_forest <- if (nrow(L_c) > 0) { ggplot() + geom_vline(data = ord, aes(xintercept = pos), color = "grey92", linewidth = 0.3) + geom_hline(yintercept = hbar, color = UJ_GREEN, linetype = "dotted", linewidth = 0.8) + geom_hline(yintercept = lbar, color = UJ_ORANGE, linetype = "dotted", linewidth = 0.8) + geom_hline(yintercept = lbar_opus, color = OPUS_PURPLE, linetype = "dotted", linewidth = 0.8) + geom_errorbar( data = subset(H_plot, is.finite(lo) & is.finite(hi)), aes(x = x, ymin = lo, ymax = hi), width = 0, linewidth = 1, alpha = 0.5, color = UJ_GREEN ) + geom_point(data = H_plot, aes(x = x, y = mid, color = lbl_human, shape = lbl_human), size = 3.0, alpha = 0.9) + geom_errorbar( data = subset(L_c, is.finite(lo) & is.finite(hi)), aes(x = x, ymin = lo, ymax = hi), width = 0, linewidth = 1.0, color = UJ_ORANGE ) + geom_point(data = L_c, aes(x = x, y = mid, color = lbl_gpt, shape = lbl_gpt), size = 3.6) + geom_errorbar( data = subset(L_opus, is.finite(lo) & is.finite(hi)), aes(x = x, ymin = lo, ymax = hi), width = 0, linewidth = 
1.0, color = OPUS_PURPLE ) + geom_point(data = L_opus, aes(x = x, y = mid, color = lbl_opus, shape = lbl_opus), size = 3.6) + scale_color_manual(name = NULL, values = leg_colors, breaks = c(lbl_human, lbl_gpt, lbl_opus)) + scale_shape_manual(name = NULL, values = leg_shapes, breaks = c(lbl_human, lbl_gpt, lbl_opus)) + guides( color = guide_legend(override.aes = list(size = 3.8, alpha = 1)), shape = guide_legend(override.aes = list(size = 3.8)) ) + scale_x_continuous( breaks = ord$pos, labels = ord$paper, expand = expansion(mult = c(0.01, 0.03)) ) + coord_cartesian(ylim = c(0, 100), clip = "off") + labs(x = NULL, y = "Percentile (0\u2013100)", tag = "(a)") + theme_uj() + theme( axis.text.x = element_text(angle = 55, hjust = 1, vjust = 1, size = 9), axis.title.y = element_text(size = 12), panel.grid.major.x = element_blank(), plot.margin = margin(5, 40, 5, 5), plot.tag = element_text(face = "bold", size = 14), legend.position = "bottom", legend.direction = "horizontal", legend.text = element_text(size = 10), legend.key.size = unit(0.5, "cm"), legend.spacing.x = unit(0.5, "cm") ) } else { ggplot() + annotate("text", x = 0.5, y = 0.5, label = "Insufficient data") + theme_void() } # ── Panel (b): Human-Human scatter ── # All pairwise evaluator combinations for papers with ≥2 human raters hh_scatter_data <- metrics_human |> filter(criteria == "overall") |> select(paper, evaluator, mid) |> distinct() |> group_by(paper) |> filter(n() >= 2) |> group_modify(\(df, key) { idx <- combn(nrow(df), 2) tibble(mid_a = df$mid[idx[1, ]], mid_b = df$mid[idx[2, ]]) }) |> ungroup() p_hh <- if (nrow(hh_scatter_data) > 0) { hh_rho <- cor(hh_scatter_data$mid_a, hh_scatter_data$mid_b, method = "spearman", use = "complete.obs") hh_lbl <- sprintf("Spearman \u03c1 = %.2f\nN = %d pairs", hh_rho, nrow(hh_scatter_data)) ggplot(hh_scatter_data, aes(x = mid_a, y = mid_b)) + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") + geom_point(color = UJ_GREEN, size = 3, alpha = 
0.7) + annotate("text", x = 4, y = 96, label = hh_lbl, hjust = 0, vjust = 1, size = 3.4, color = "grey30") + coord_fixed(ratio = 1, xlim = c(0, 100), ylim = c(0, 100)) + labs(x = "Evaluator A (overall)", y = "Evaluator B (overall)", tag = "(b)") + theme_uj() + theme(plot.tag = element_text(face = "bold", size = 14)) } else { ggplot() + annotate("text", x = 0.5, y = 0.5, label = "Insufficient data") + theme_void() } # ── Panel (c): LLM vs Human scatter ── focal_models <- c("GPT-5 Pro", "Claude Opus 4.6") scatter_data <- llm_metrics |> filter(criteria == "overall", model %in% focal_models) |> inner_join(human_avg, by = "paper") |> mutate( diff = midpoint - human_mid, paper_short = str_trunc(paper, 25), human_lo = coalesce(human_lo, human_mid), human_hi = coalesce(human_hi, human_mid), lower_bound = coalesce(lower_bound, midpoint), upper_bound = coalesce(upper_bound, midpoint) ) |> filter(!is.na(human_mid), !is.na(midpoint)) # Per-model Spearman rho for facet annotations scatter_rho <- scatter_data |> group_by(model) |> summarise( rho = cor(human_mid, midpoint, method = "spearman", use = "complete.obs"), .groups = "drop" ) |> mutate(lbl = sprintf("Spearman \u03c1 = %.2f", rho), x = 4, y = 96) p_scatter <- if (nrow(scatter_data) > 0) { ggplot(scatter_data, aes(x = human_mid, y = midpoint, color = model)) + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") + geom_point(size = 3.5, alpha = 0.7) + ggrepel::geom_text_repel(aes(label = paper_short), size = 2.4, max.overlaps = 8, show.legend = FALSE) + geom_text(data = scatter_rho, aes(x = x, y = y, label = lbl), hjust = 0, vjust = 1, size = 3.4, color = "grey30", inherit.aes = FALSE) + scale_color_manual(values = MODEL_COLORS) + coord_fixed(ratio = 1, xlim = c(0, 100), ylim = c(0, 100)) + facet_wrap(~model, ncol = 2) + labs(x = "Human mean rating (0\u2013100)", y = "LLM rating (0\u2013100)", color = NULL, tag = "(c)") + theme_uj() + theme( strip.text = element_text(size = 13, face = "bold"), 
plot.tag = element_text(face = "bold", size = 14), legend.position = "none" ) } else { ggplot() + annotate("text", x = 0.5, y = 0.5, label = "No matching data") + theme_void() } # ── Combine ── bottom_row <- wrap_plots(p_hh, p_scatter, widths = c(1, 2)) (p_forest / bottom_row) + plot_layout(heights = c(1, 0.85)) ``` Panel (b) makes the human-human ceiling visual: the scatter of evaluator pairs is no tighter than the LLM-human scatter in panel (c), and individual pairs disagree by as much as 30--40 percentile points on individual papers. Note that panel (b) uses all `r n_matched` matched papers while the Human–Human row in @tbl-model-agreement-main is restricted to the `r n_opus`-paper matched sample (explaining the slightly different ρ values). Both models cluster around the identity line in the 40--80 range but diverge more at the extremes. LLMs tend to compress ratings toward the centre of the scale relative to humans, producing narrower spread---a pattern consistent with alignment training that discourages extreme outputs. Where humans rate a paper very highly or very harshly, the LLM typically pulls toward the middle. Full agreement metrics including Pearson *r*, Spearman *ρ*, mean bias, and RMSE for all six models appear in the [agreement table in Appendix A](results_ratings.qmd). **Rank comparison.** @fig-rankslope-main places human rankings in the centre column with GPT-5 Pro on the left and Claude Opus 4.6 on the right, restricted to the `r n_opus` papers with Claude Opus 4.6 evaluations. Each paper appears as a pair of S-curves connecting its human rank to its rank under each LLM. Horizontal curves indicate agreement; curves that climb toward the top indicate the LLM ranked the paper higher (better) than humans did. Colour encodes the rank shift: green when humans rank it higher, orange (left, GPT-5 Pro) or purple (right, Opus) when the LLM does. 
The triptych layout makes it easy to spot papers where both LLMs agree with each other but disagree with humans (both curves slope the same way), versus papers where the two LLMs diverge.

```{r}
#| label: fig-rankslope-main
#| fig-cap: "Triptych slopegraph of overall ratings for the `r n_papers_slope`-paper matched sample (papers evaluated by all three sources). Each column shows rank order (1 = highest rated) within that source; papers are connected by S-shaped curves. Green = human panel ranks higher; orange (left) = GPT-5 Pro ranks higher; purple (right) = Claude Opus ranks higher. Colour intensity encodes magnitude of rank shift. Right-side labels show human rank (H) and rank delta for each model (\u0394G, \u0394C; positive = LLM ranks higher)."
#| fig-height: 13
#| fig-width: 14
#| code-fold: true

library("ggforce")

# --- Rank-based triptych ---
# Ranks are computed AFTER the inner join so all three columns span exactly 1..n.
rank_all <- human_avg |>
  filter(paper %in% matched_papers) |>
  select(paper, human_mid) |>
  inner_join(
    llm_metrics |>
      filter(criteria == "overall", model == "GPT-5 Pro") |>
      select(paper, gpt_mid = midpoint),
    by = "paper"
  ) |>
  inner_join(
    llm_metrics |>
      filter(criteria == "overall", model == "Claude Opus 4.6") |>
      select(paper, claude_mid = midpoint),
    by = "paper"
  ) |>
  mutate(
    rank_h      = rank(-human_mid,  ties.method = "first"),
    rank_gpt    = rank(-gpt_mid,    ties.method = "first"),
    rank_claude = rank(-claude_mid, ties.method = "first"),
    label_use   = str_trunc(paper, 28),
    d_left      = rank_h - rank_gpt,    # positive = GPT ranks it higher (lower number)
    d_right     = rank_h - rank_claude  # positive = Claude ranks it higher
  )

n_papers_slope <- nrow(rank_all)

# --- Bezier curve data ---
# Build the four Bezier control points of every paper's curve in one
# vectorised step.
#
# ranks    Data frame with a `rank_h` column (human rank, one row per paper).
# prefix   String prepended to the row index to form the curve's group id.
# x_ctrl   Length-4 vector of control-point x positions, human side first.
# rank_to  Rank of each paper in the target (LLM) column.
# delta    Signed rank shift per paper (human rank minus LLM rank).
#
# Returns a tibble with columns group, x, y, dr, mag (four rows per paper).
# The columns exist even when `ranks` has zero rows, so the downstream
# mutate() calls never fail before the empty-data guard is reached.
bezier_ctrl <- function(ranks, prefix, x_ctrl, rank_to, delta) {
  paper_idx <- rep(seq_len(nrow(ranks)), each = 4L)
  ctrl_idx  <- rep(seq_len(4L), times = nrow(ranks))
  tibble(
    group = paste0(prefix, paper_idx),
    x     = x_ctrl[ctrl_idx],
    # First two control points sit at the human rank, last two at the LLM
    # rank; this is what produces the S-shape.
    y     = dplyr::if_else(ctrl_idx <= 2L, ranks$rank_h[paper_idx], rank_to[paper_idx]),
    dr    = delta[paper_idx],
    mag   = abs(delta[paper_idx])
  )
}

left_curves <- bezier_ctrl(
  rank_all, "L", c(0.5, 0.35, 0.15, 0),
  rank_to = rank_all$rank_gpt, delta = rank_all$d_left
)
right_curves <- bezier_ctrl(
  rank_all, "R", c(0.5, 0.65, 0.85, 1),
  rank_to = rank_all$rank_claude, delta = rank_all$d_right
)

# Colour: green = human ranks higher, orange/purple = LLM ranks higher.
# `dr` is clamped to [-max_shift, max_shift] and mapped onto a
# green -> grey -> high_col ramp; returns one hex colour per element of `dr`.
make_slope_color <- function(dr, high_col, max_shift = 15) {
  if (length(dr) == 0) {
    return(character(0))
  }
  t <- pmax(0, pmin(1, (dr + max_shift) / (2 * max_shift)))
  ramp <- grDevices::colorRamp(c(UJ_GREEN, "grey88", high_col))
  m <- ramp(t)
  grDevices::rgb(m[, 1], m[, 2], m[, 3], maxColorValue = 255)
}

left_curves <- left_curves |>
  mutate(curve_col = make_slope_color(dr, high_col = UJ_ORANGE))
right_curves <- right_curves |>
  mutate(curve_col = make_slope_color(dr, high_col = unname(MODEL_COLORS["Claude Opus 4.6"])))

# Opacity grows with the size of the rank shift and saturates at n/3 places.
all_curves <- bind_rows(left_curves, right_curves) |>
  mutate(alpha_val = pmin(mag / (n_papers_slope / 3), 1))

if (n_papers_slope > 0) {
  ggplot() +
    ggforce::geom_bezier(
      data = all_curves,
      aes(x = x, y = y, group = group, colour = curve_col, alpha = alpha_val),
      linewidth = 0.75
    ) +
    scale_color_identity() +
    scale_alpha(range = c(0.20, 1), guide = "none") +
    # Column dots
    geom_point(data = rank_all, aes(x = 0, y = rank_gpt),
               color = MODEL_COLORS["GPT-5 Pro"], size = 2.2) +
    geom_point(data = rank_all, aes(x = 0.5, y = rank_h),
               color = MODEL_COLORS["Human"], size = 2.6, shape = 16) +
    geom_point(data = rank_all, aes(x = 1, y = rank_claude),
               color = MODEL_COLORS["Claude Opus 4.6"], size = 2.2) +
    # Paper labels left — repelled vertically to avoid overlap
    ggrepel::geom_text_repel(
      data = rank_all,
      aes(x = 0, y = rank_gpt, label = label_use),
      hjust = 1, nudge_x = -0.06, direction = "y",
      size = 2.5, color = "grey25",
      segment.color = "grey72", segment.size = 0.25,
      force = 0.8, max.overlaps = Inf, min.segment.length = 0.1
    ) +
    # Right annotations: human rank + deltas
    geom_text(
      data = rank_all,
      aes(x = 1, y = rank_claude,
          label = sprintf("H%d \u0394G%+d \u0394C%+d", rank_h, d_left, d_right)),
      hjust = -0.06, size = 2.3, color = "grey35"
    ) +
    scale_x_continuous(
      breaks = c(0, 0.5, 1),
      labels = c("GPT-5 Pro", "Human", "Claude Opus 4.6"),
      limits = c(-0.65, 1.6),
      expand = expansion(mult = 0)
    ) +
    scale_y_reverse(
      breaks = seq(1, n_papers_slope, by = 4),
      expand = expansion(mult = c(0.02, 0.04))
    ) +
    coord_cartesian(clip = "off") +
    labs(x = NULL, y = "Rank (1 = highest rated)") +
    theme_uj() +
    theme(
      axis.text.x = element_text(size = 13, face = "bold"),
      axis.text.y = element_text(size = 9, color = "grey50"),
      axis.ticks.y = element_blank(),
      panel.grid.major.y = element_line(linewidth = 0.25, color = "grey92"),
      panel.grid.major.x = element_line(linewidth = 0.35, color = "grey82"),
      plot.margin = margin(t = 8, r = 190, b = 8, l = 140)
    )
} else {
  cat("Insufficient data for triptych slopegraph.\n")
}
```

Papers where both curves slope the same direction indicate systematic human--LLM disagreement; papers where left and right curves diverge indicate model-specific differences—GPT and Claude "see" the paper differently, not just differently from humans. Horizontal curves indicate agreement with the human ordering. The criteria-level Krippendorff's alpha analysis in the [human baseline table in Appendix A](results_ratings.qmd) quantifies where human--LLM agreement approaches the ceiling set by human inter-rater variability.

**Rating differences by paper and criterion.** While the scatter plots and slopegraph summarise agreement on a single dimension, @fig-gap-heatmap-main unpacks Human − LLM differences across all seven criteria simultaneously. Each column is a paper, each row a criterion, and tile colour encodes the signed difference (human mean minus LLM midpoint). Green tiles indicate the human panel rated the paper higher; orange (GPT-5 Pro) or purple (Claude Opus) tiles indicate the LLM was more generous.
Papers are sorted by GPT-5 Pro's overall difference so that both panels share the same column order, making model-specific patterns immediately visible.

```{r}
#| label: fig-gap-heatmap-main
#| fig-cap: "Human minus LLM rating difference for every paper (columns) and criterion (rows), comparing GPT-5 Pro (left panel) and Claude Opus 4.6 (right panel). Green tiles indicate the human mean was higher; orange (GPT-5 Pro) or purple (Claude Opus) tiles indicate the LLM rated the paper higher. Colour is clamped to \u00b130 percentile points. Papers are sorted by GPT-5 Pro overall difference so the two panels are directly comparable."
#| fig-width: 16
#| fig-height: 8
#| code-fold: true

library("patchwork")

# Criteria shown as heatmap rows, in display order, with their row labels.
metric_order <- c(
  "overall", "claims", "methods", "adv_knowledge",
  "logic_comms", "open_sci", "gp_relevance"
)
metric_lab <- c(
  overall       = "Overall",
  claims        = "Claims & Evidence",
  methods       = "Methods",
  adv_knowledge = "Adv. Knowledge",
  logic_comms   = "Logic & Comms",
  open_sci      = "Open Science",
  gp_relevance  = "Global Relevance"
)

# Human mean rating per paper x criterion on the matched sample.
H_mean <- metrics_human |>
  filter(criteria %in% metric_order, paper %in% matched_papers) |>
  group_by(paper, criteria) |>
  summarise(h = mean(mid, na.rm = TRUE), .groups = "drop")

# Fixed paper order based on GPT-5 Pro overall difference
# NOTE(review): the ordering filters on `primary_model`, while the left panel
# below is hard-coded to "GPT-5 Pro" — confirm the two always coincide.
L_mean_primary <- llm_metrics |>
  filter(
    criteria %in% metric_order,
    model == primary_model,
    paper %in% matched_papers
  ) |>
  group_by(paper, criteria) |>
  summarise(l = mean(midpoint, na.rm = TRUE), .groups = "drop")

Ddiff_primary <- H_mean |>
  inner_join(L_mean_primary, by = c("paper", "criteria")) |>
  mutate(diff = h - l)

ord_p <- Ddiff_primary |>
  filter(criteria == "overall") |>
  arrange(desc(diff)) |>
  pull(paper)

# Draw one heatmap panel of (human mean - LLM midpoint) for `model_name`.
#
# model_name     Value of `llm_metrics$model` to plot; also used as the title.
# show_y_labels  Draw criterion labels on the y axis? (off for the right panel)
# llm_col        Colour of the "LLM rated higher" end of the diverging scale.
#
# Returns a ggplot object whose x axis follows the shared paper order `ord_p`.
build_gap_panel <- function(model_name, show_y_labels = TRUE, llm_col = UJ_ORANGE) {
  llm_mean <- llm_metrics |>
    filter(
      criteria %in% metric_order,
      model == model_name,
      paper %in% matched_papers
    ) |>
    group_by(paper, criteria) |>
    summarise(l = mean(midpoint, na.rm = TRUE), .groups = "drop")

  gap <- H_mean |>
    inner_join(llm_mean, by = c("paper", "criteria")) |>
    mutate(
      diff = h - l,
      crit = factor(criteria, levels = metric_order, labels = metric_lab[metric_order])
    )

  # Only the y-axis text differs between the left and right panels.
  y_text <- if (show_y_labels) element_text(size = 11) else element_blank()

  ggplot(gap, aes(x = factor(paper, levels = ord_p), y = crit, fill = diff)) +
    geom_tile(color = "white", linewidth = 0.25) +
    scale_fill_gradient2(
      low = llm_col, mid = "grey95", high = UJ_GREEN, midpoint = 0,
      name = "Human \u2212 LLM",
      limits = c(-30, 30),
      oob = scales::squish  # clamp out-of-range differences instead of dropping them
    ) +
    labs(x = NULL, y = NULL, title = model_name) +
    theme_uj() +
    theme(
      axis.text.x = element_text(angle = 55, hjust = 1, vjust = 1, size = 8),
      panel.grid = element_blank()
    ) +
    theme(axis.text.y = y_text)
}

if (length(ord_p) > 0) {
  p_left <- build_gap_panel(
    "GPT-5 Pro",
    show_y_labels = TRUE,
    llm_col = UJ_ORANGE
  )
  p_right <- build_gap_panel(
    "Claude Opus 4.6",
    show_y_labels = FALSE,
    llm_col = unname(MODEL_COLORS["Claude Opus 4.6"])
  )
  # Each panel keeps its own legend because the two colour scales differ.
  p_left + p_right + plot_layout(guides = "keep") &
    theme(legend.position = "bottom")
} else {
  cat("Insufficient data for gap heatmap.\n")
}
```

Columns with uniformly green or orange tiles indicate papers where humans and the LLM disagree systematically across all criteria, not just on one dimension. Rows with consistent colour suggest criteria-level biases---for instance, if "Open Science" is green across nearly all papers, humans may systematically reward data-sharing practices more than the LLM does. Comparing the two panels reveals whether disagreement patterns are model-specific or shared across frontier LLMs; tiles that change colour between panels highlight papers where the models diverge from each other, not just from humans.

**Qualitative critique comparison.** Beyond numeric ratings, we compare the substantive critiques each model raises against the consensus issues identified by human experts.
The figure below plots coverage---the fraction of human-identified concerns that the LLM also raised in some form---against precision---the fraction of LLM-raised issues that correspond to a substantive human concern. These metrics are assessed by GPT-5.2 Pro acting as an independent judge (see [Methods](methods.qmd) for the judging protocol).

```{r}
#| label: load-critique-data
#| code-fold: true
#| code-summary: "Load critique comparison data"
#| message: false

# Candidate locations of the judge output; the first one that exists is used
# (NA when neither file is present).
candidate_results <- c(
  "results/key_issue_comp_results.json",
  "results/key_issues_comparison_results.json"
)
comparison_results_file <- candidate_results[file.exists(candidate_results)][1]

# Stays FALSE unless both inputs load and at least one coverage score exists.
has_critique_data <- FALSE

if (!is.na(comparison_results_file)) {
  comparison_file <- "results/key_issues_comparison.json"

  if (file.exists(comparison_file)) {
    comparison_data <- fromJSON(comparison_file)
    llm_results_raw <- fromJSON(comparison_results_file)

    # Columns that may or may not be present in the judge output.
    optional_cols <- c(
      "matched_pairs", "unmatched_human", "unmatched_llm",
      "missed_issues", "extra_issues"
    )

    llm_results <- llm_results_raw |>
      as_tibble() |>
      unnest_wider(comparison) |>
      select(
        gpt_paper, coverage_pct, precision_pct,
        any_of(optional_cols),
        overall_rating, overall_justification, detailed_notes
      )

    comparison_data <- left_join(comparison_data, llm_results, by = "gpt_paper")

    has_critique_data <-
      nrow(comparison_data) > 0 &&
      "coverage_pct" %in% names(comparison_data) &&
      any(!is.na(comparison_data$coverage_pct))
  }
}
```

```{r}
#| label: fig-coverage-precision-main
#| fig-cap: "Coverage (% of human-identified issues that the LLM also raised) vs Precision (% of LLM-raised issues that match a substantive human concern) for each paper, as assessed by GPT-5.2 Pro acting as judge. Dashed lines mark the cross-paper mean for each axis. Papers in the upper-right quadrant show strong alignment between human and LLM critiques."
#| fig-width: 10
#| fig-height: 7

if (has_critique_data) {
  # Keep only papers scored on both axes; shorten titles for the point labels.
  crit_results <- comparison_data |>
    filter(!is.na(coverage_pct), !is.na(precision_pct)) |>
    mutate(paper_short = str_trunc(gpt_paper, 25))

  pct_breaks <- seq(0, 100, 20)

  ggplot(crit_results, aes(x = coverage_pct, y = precision_pct)) +
    geom_point(size = 4.5, color = UJ_ORANGE, alpha = 0.85) +
    geom_text(
      aes(label = paper_short),
      hjust = -0.1, vjust = 0.5, size = 2.6, check_overlap = TRUE
    ) +
    # Dashed reference lines at the cross-paper means
    geom_vline(
      xintercept = mean(crit_results$coverage_pct),
      linetype = "dashed", color = UJ_BLUE, linewidth = 0.6
    ) +
    geom_hline(
      yintercept = mean(crit_results$precision_pct),
      linetype = "dashed", color = UJ_BLUE, linewidth = 0.6
    ) +
    scale_x_continuous(limits = c(0, 100), breaks = pct_breaks) +
    scale_y_continuous(limits = c(0, 100), breaks = pct_breaks) +
    labs(
      x = "Coverage (%) \u2014 Human issues captured by LLM",
      y = "Precision (%) \u2014 LLM issues that are substantive"
    ) +
    theme_uj()
}
```

Coverage and precision vary substantially across papers. Some papers achieve high scores on both dimensions, indicating strong alignment between human and LLM critiques, while others reveal the LLM missing key human concerns or raising issues absent from the expert consensus. Because these metrics are themselves LLM-assessed, they should be interpreted with the caveat that an LLM judge may systematically over- or under-credit matches relative to a human annotator; manual validation through our annotation tool is underway and preliminary results are consistent with the automated scores. Detailed paper-by-paper comparisons---including matched issue pairs with severity labels, structural difference tables, and per-evaluator breakdowns---appear in [Appendix B: Critiques & Key Issues](results_critiques.qmd).
Extended quantitative analysis, including per-criterion correlations, bootstrap confidence intervals, mutual information, tier prediction accuracy, cost-quality trade-offs across all six models, and top disagreement cases, is reported in [Appendix A: Results Ratings](results_ratings.qmd). The full LLM reasoning traces and assessment summaries are available in [Appendix C: LLM Traces](appendix_llm_traces.qmd).

**Model comparison summary.** @tbl-model-agreement-main compares all six models on overall-rating agreement metrics against the human mean, restricted to the `r n_opus`-paper matched sample so every model is evaluated on identical papers. The **Human–Human** row is the empirical ceiling: it shows pairwise agreement between evaluators on the same papers. Models are sorted by Spearman *ρ*.

```{r}
#| label: tbl-model-agreement-main
#| tbl-cap: "Agreement between each LLM and human mean overall rating (matched sample, N = `r n_opus` papers). **Human–Human** row (top, bold) shows pairwise evaluator agreement as an empirical upper bound. Spearman *ρ* (rank-based) and MAE (in percentile points, directly interpretable) are primary metrics; Pearson *r* shown for completeness. Bias = LLM − Human (positive = LLM more generous). Models sorted by Spearman *ρ*."
#| code-fold: true

# Overall LLM midpoints paired with the human mean, matched sample only.
model_compare_data <- llm_metrics |>
  filter(criteria == "overall", paper %in% opus_papers) |>
  left_join(
    human_avg |> select(paper, human_mid),
    by = "paper"
  ) |>
  filter(!is.na(human_mid))

if (nrow(model_compare_data) > 0) {
  # One row of agreement metrics per model, best rank agreement first.
  llm_rows <- model_compare_data |>
    group_by(Model = model) |>
    summarise(
      N = n(),
      `Spearman ρ` = round(cor(human_mid, midpoint, method = "spearman", use = "complete.obs"), 3),
      `Pearson r`  = round(cor(human_mid, midpoint, use = "complete.obs"), 3),
      `Mean bias`  = sprintf("%+.1f", mean(midpoint - human_mid, na.rm = TRUE)),
      RMSE = round(sqrt(mean((midpoint - human_mid)^2, na.rm = TRUE)), 1),
      MAE  = round(mean(abs(midpoint - human_mid), na.rm = TRUE), 1),
      .groups = "drop"
    ) |>
    arrange(desc(`Spearman ρ`))

  # Human-human baseline goes on top so it can be highlighted as row 1.
  model_compare_tbl <- bind_rows(hh_baseline_row, llm_rows)

  knitr::kable(model_compare_tbl, align = c("l", rep("r", 6)), booktabs = TRUE) |>
    kableExtra::row_spec(1, bold = TRUE, background = "#f0f8f0")
}
```

@tbl-criteria-agreement-main breaks the same agreement metrics down by evaluation criterion, averaged across all models. Crucially, the **H-H ρ** column shows pairwise human-human Spearman ρ for each criterion — the ceiling available to any model. Criteria where H-H ρ is itself low indicate genuine expert disagreement; low LLM agreement on those criteria is therefore expected rather than a model failure.

```{r}
#| label: tbl-criteria-agreement-main
#| tbl-cap: "Human--LLM agreement by criterion, averaged across all models (matched sample, N = `r n_opus` papers). **H-H ρ** shows pairwise human evaluator Spearman *ρ* (the ceiling for each criterion); LLM Spearman *ρ* and other metrics are averages across all six models. Positive bias = LLMs rate higher on average. Note the ceiling variation across criteria: human evaluators agree strongly on some dimensions and barely at all on others (Open Science), which sets expectations for LLM performance."
#| code-fold: true

criteria_order <- c(
  "overall", "claims", "methods", "adv_knowledge",
  "logic_comms", "open_sci", "gp_relevance"
)
criteria_labels <- c(
  overall       = "Overall",
  claims        = "Claims & Evidence",
  methods       = "Methods",
  adv_knowledge = "Adv. Knowledge",
  logic_comms   = "Logic & Comms",
  open_sci      = "Open Science",
  gp_relevance  = "Global Relevance"
)

# Human-human pairwise Spearman rho per criterion (ceiling reference).
# Each paper x criterion contributes its first two evaluator rows (E1, E2);
# any further evaluators are pivoted out but not used in the correlation.
hh_crit_pairs <- metrics_human |>
  filter(criteria %in% criteria_order, paper %in% opus_papers) |>
  select(paper, criteria, evaluator, mid) |>
  distinct() |>
  group_by(paper, criteria) |>
  filter(n() >= 2) |>
  mutate(slot = paste0("E", row_number())) |>
  ungroup() |>
  pivot_wider(names_from = slot, values_from = c(mid, evaluator)) |>
  filter(!is.na(mid_E1), !is.na(mid_E2))

hh_crit_rho <- hh_crit_pairs |>
  group_by(criteria) |>
  summarise(
    `H-H ρ` = round(cor(mid_E1, mid_E2, method = "spearman", use = "complete.obs"), 3),
    .groups = "drop"
  )

# Human mean per paper x criterion, joined onto every LLM rating.
human_by_crit <- metrics_human |>
  filter(criteria %in% criteria_order, paper %in% opus_papers) |>
  group_by(paper, criteria) |>
  summarise(human_mid = mean(mid, na.rm = TRUE), .groups = "drop")

crit_compare_data <- llm_metrics |>
  filter(criteria %in% criteria_order, paper %in% opus_papers) |>
  inner_join(human_by_crit, by = c("paper", "criteria"))

if (nrow(crit_compare_data) > 0) {
  # Step 1: agreement metrics for each model separately. Pooling every
  # model x paper row into one correlation would fold between-model level
  # differences into rho/r and would not be the "average across models"
  # that the caption reports.
  # `use = "na.or.complete"` yields NA (instead of an error) for a model
  # with no complete pairs on a criterion; such NAs are skipped in step 2.
  crit_by_model <- crit_compare_data |>
    group_by(criteria, model) |>
    summarise(
      rho  = cor(human_mid, midpoint, method = "spearman", use = "na.or.complete"),
      r    = cor(human_mid, midpoint, use = "na.or.complete"),
      bias = mean(midpoint - human_mid, na.rm = TRUE),
      rmse = sqrt(mean((midpoint - human_mid)^2, na.rm = TRUE)),
      .groups = "drop"
    )

  # Step 2: unweighted average of the per-model metrics within each criterion.
  crit_tbl <- crit_by_model |>
    group_by(criteria) |>
    summarise(
      `LLM Spearman ρ` = round(mean(rho, na.rm = TRUE), 3),
      `LLM Pearson r`  = round(mean(r, na.rm = TRUE), 3),
      `Mean bias`      = sprintf("%+.1f", mean(bias, na.rm = TRUE)),
      RMSE             = round(mean(rmse, na.rm = TRUE), 1),
      .groups = "drop"
    ) |>
    left_join(hh_crit_rho, by = "criteria") |>
    mutate(
      Criterion = factor(criteria, levels = criteria_order,
                         labels = criteria_labels[criteria_order])
    ) |>
    select(Criterion, `H-H ρ`, `LLM Spearman ρ`, `LLM Pearson r`, `Mean bias`, RMSE) |>
    arrange(Criterion)

  knitr::kable(crit_tbl, align = c("l", rep("r", 5))) |>
    kableExtra::column_spec(2, bold = TRUE)
}
```

::: {.content-visible when-format="pdf"}
All appendices, interactive figures, and full code are available at the companion website: <https://llm-uj-research-eval.netlify.app>. Source code and data are hosted at <https://github.com/valentinklotzbuecher/llm-uj-research-eval>.
:::