Mirror for Hypothes.is annotation PDF Main Site Source

Appendix A — Results: Ratings

Setup and libraries
source("setup_params.R")

library("tidyverse")
library("janitor")
library("stringr")
library("here")
library("knitr")
library("kableExtra")
library("ggrepel")
library("scales")
library("jsonlite")
library("purrr")
library("tibble")
# library("plotly")  # removed: all figures are now static ggplot

# Theme and colors — crisp palette (high saturation, maximum separation)
# Brand accent colors referenced directly in plots (e.g. heatmap high end).
UJ_ORANGE <- "#E8722A"   # vivid saffron orange
UJ_GREEN  <- "#2D9D5E"   # rich emerald green
UJ_BLUE   <- "#2B7CE9"   # clear azure blue

# One fixed colour per evaluator so identities stay consistent across all
# figures; names must match the `model` values produced by the loaders below.
MODEL_COLORS <- c(
  "GPT-5 Pro"        = "#E8722A",   # saffron orange (focal)
  "GPT-5.2 Pro"      = "#D62839",   # bright crimson
  "GPT-4o-mini"      = "#17B890",   # vivid teal
  "Claude Sonnet 4"  = "#A855F7",   # vivid purple
  "Claude Opus 4.6"  = "#7C3AED",   # deep violet
  "Gemini 2.0 Flash" = "#2B7CE9",   # clear azure
  "Human"            = "#2D9D5E"    # emerald green
)

theme_uj <- function(base_size = 12) {
  # Project-wide ggplot theme: minimal base with bold plot-aligned titles,
  # muted subtitle/caption, light major gridlines only, and bottom legend.
  overrides <- theme(
    # Titles and captions
    plot.title          = element_text(face = "bold", size = rel(1.1)),
    plot.title.position = "plot",
    plot.subtitle       = element_text(color = "grey40", size = rel(0.9)),
    plot.caption        = element_text(color = "grey50", size = rel(0.8), hjust = 0),
    # Axes
    axis.title          = element_text(size = rel(0.95)),
    axis.text           = element_text(size = rel(0.88)),
    # Legend (kept below the panel to maximise plot width)
    legend.position     = "bottom",
    legend.text         = element_text(size = rel(0.88)),
    legend.title        = element_text(size = rel(0.9), face = "bold"),
    # Gridlines and facet strips
    panel.grid.minor    = element_blank(),
    panel.grid.major    = element_line(linewidth = 0.3, color = "grey88"),
    strip.text          = element_text(face = "bold", size = rel(0.95))
  )
  theme_minimal(base_size = base_size) + overrides
}

# Canonicalise criterion names so human and LLM data share one vocabulary.
# Listed long-form names are mapped to their short keys; anything else
# (including NA) passes through unchanged. Expects a character vector.
canon_metric <- function(x) {
  lookup <- c(
    advancing_knowledge = "adv_knowledge",
    open_science        = "open_sci",
    logic_communication = "logic_comms",
    global_relevance    = "gp_relevance",
    claims_evidence     = "claims"
  )
  out <- unname(lookup[x])
  keep <- is.na(out)
  out[keep] <- x[keep]
  out
}

`%||%` <- function(x, y) if (!is.null(x)) x else y

# Pricing per 1M tokens (USD)
# NOTE(review): these are hard-coded snapshots of provider list prices;
# confirm they are current before re-running cost estimates.
pricing <- tribble(
  ~model,              ~input_per_m, ~output_per_m,
  "GPT-5 Pro",         15.00,        60.00,
  "GPT-5.2 Pro",       15.00,        60.00,
  "GPT-4o-mini",       0.15,         0.60,
  "Claude Sonnet 4",   3.00,         15.00,
  "Claude Opus 4.6",   15.00,        75.00,
  "Gemini 2.0 Flash",  0.075,        0.30
)
Load human evaluation data
# Load human ratings from Unjournal
# UJ_map.csv links full paper titles to short citation labels; keep just
# those two columns under harmonised names for joining below.
UJmap <- read_delim("data/UJ_map.csv", delim = ";", show_col_types = FALSE) |>
  mutate(label_paper_title = research, label_paper = paper) |>
  select(c("label_paper_title", "label_paper"))

# Per-evaluator ratings; `research` is renamed to match UJmap's join key.
rsx <- read_csv("data/rsx_evalr_rating.csv", show_col_types = FALSE) |>
  clean_names() |>
  mutate(label_paper_title = research) |>
  select(-c("research"))

# Paper metadata restricted to fully published evaluations. Short labels
# come from UJmap; four papers whose titles failed to match are patched by
# DOI (note the DOIs appear in several inconsistent formats in the data).
research <- read_csv("data/research.csv", show_col_types = FALSE) |>
  clean_names() |>
  filter(status == "50_published evaluations (on PubPub, by Unjournal)") |>
  left_join(UJmap, by = c("label_paper_title")) |>
  mutate(doi = str_trim(doi)) |>
  mutate(label_paper = case_when(
    doi == "https://doi.org/10.3386/w31162" ~ "Walker et al. 2023",
    doi == "doi.org/10.3386/w32728" ~ "Hahn et al. 2025",
    doi == "https://doi.org/10.3386/w30011" ~ "Bhat et al. 2022",
    doi == "10.1093/wbro/lkae010" ~ "Crawfurd et al. 2023",
    TRUE ~ label_paper
  )) |>
  left_join(rsx, by = c("label_paper_title"))

# Title -> short-label lookup, deduplicated: if one title maps to several
# labels, keep the first so the join below is genuinely many-to-one.
key_map <- research |>
  transmute(label_paper_title = str_trim(label_paper_title), label_paper = label_paper) |>
  filter(!is.na(label_paper_title)) |>
  distinct(label_paper_title, label_paper) |>
  group_by(label_paper_title) |>
  slice(1) |>
  ungroup()

# Attach short labels to every rating row; `relationship` errors loudly if
# key_map ever fans out, rather than silently duplicating ratings.
rsx_research <- rsx |>
  mutate(label_paper_title = str_trim(label_paper_title)) |>
  left_join(key_map, by = "label_paper_title", relationship = "many-to-one")

# Long-format human ratings: one row per (paper, criterion, evaluator) with
# the midpoint and lower/upper CI bounds, bounds clamped to the 0-100 scale.
metrics_human <- rsx_research |>
  mutate(criteria = canon_metric(criteria)) |>
  filter(criteria %in% c("overall", "claims", "methods", "adv_knowledge", "logic_comms", "open_sci", "gp_relevance")) |>
  transmute(
    paper = label_paper, criteria, evaluator, model = "Human",
    mid = as.numeric(middle_rating),
    # suppressWarnings: CI columns may contain non-numeric entries -> NA
    lo = suppressWarnings(as.numeric(lower_ci)),
    hi = suppressWarnings(as.numeric(upper_ci))
  ) |>
  filter(!is.na(paper), !is.na(mid)) |>
  mutate(
    lo = ifelse(is.finite(lo), pmax(0, pmin(100, lo)), NA_real_),
    hi = ifelse(is.finite(hi), pmax(0, pmin(100, hi)), NA_real_)
  ) |>
  # Round before distinct() so floating-point noise cannot defeat dedup.
  mutate(across(c(mid, lo, hi), ~ round(.x, 4))) |>
  distinct(paper, criteria, model, evaluator, mid, lo, hi)

# Per-paper mean of the "overall" criterion across human evaluators;
# n_human records how many evaluators contribute to each mean.
human_avg <- metrics_human |>
  filter(criteria == "overall") |>
  group_by(paper) |>
  summarise(
    human_mid = mean(mid, na.rm = TRUE),
    human_lo = mean(lo, na.rm = TRUE),
    human_hi = mean(hi, na.rm = TRUE),
    n_human = n(),
    .groups = "drop"
  )

# Summary stats for inline use
n_human_papers <- n_distinct(metrics_human$paper)
n_human_evaluators <- n_distinct(metrics_human$evaluator)
Load LLM evaluation data (all models)
# Map from results/<dir>/json directory names (on disk) to the display
# names used throughout the tables and figures.
model_dirs <- list(
  "gpt5_pro_updated_jan2026" = "GPT-5 Pro",
  "gpt52_pro_focal_jan2026" = "GPT-5.2 Pro",
  "gpt_4o_mini_2024_07_18" = "GPT-4o-mini",
  "claude_sonnet_4_20250514" = "Claude Sonnet 4",
  "claude_opus_4_6" = "Claude Opus 4.6",
  "gemini_2.0_flash" = "Gemini 2.0 Flash"
)

# Parse one LLM response JSON file into tidy tibbles.
#
# path:       a "<paper>.response.json" file from an evaluation run.
# model_name: display name stamped onto every output row.
#
# Returns NULL when the file cannot be read or contains no parseable
# payload; otherwise a list of four tibbles: metrics (midpoint + bounds
# per criterion), tiers (journal-tier scores), tokens (usage), summary.
parse_response <- function(path, model_name) {
  tryCatch({
    r <- jsonlite::fromJSON(path, simplifyVector = FALSE)
    # The paper label is recovered from the file name itself.
    paper <- basename(path) |>
      str_replace("\\.response\\.json$", "") |>
      str_replace_all("_", " ")

    # Locate the structured payload, trying three layouts in turn:
    # (1) an already-parsed object, (2) raw output_text (possibly wrapped
    # in markdown code fences), (3) an OpenAI Responses-style `output`
    # list containing a "message" item.
    parsed <- NULL
    if (!is.null(r$parsed) && length(r$parsed) > 0) {
      parsed <- r$parsed
    } else if (!is.null(r$output_text) && nchar(r$output_text) > 0) {
      # Strip markdown code fences if present
      txt <- r$output_text
      txt <- sub("^\\s*```[a-z]*\\s*\n?", "", txt)
      txt <- sub("\\s*```\\s*$", "", txt)
      parsed <- jsonlite::fromJSON(txt, simplifyVector = TRUE)
    } else if (!is.null(r$output)) {
      msg <- purrr::detect(r$output, ~ .x$type == "message", .default = NULL)
      if (!is.null(msg) && length(msg$content) > 0) {
        parsed <- jsonlite::fromJSON(msg$content[[1]]$text, simplifyVector = TRUE)
      }
    }

    if (is.null(parsed)) return(NULL)

    metrics <- parsed$metrics
    metric_rows <- list()
    tier_rows <- list()

    # Journal-tier entries are split out from ordinary criterion metrics.
    tier_names <- c("tier_should", "tier_will", "journal_should", "journal_will")

    for (nm in names(metrics)) {
      if (nm %in% tier_names) {
        # Normalise journal_should/journal_will → tier_should/tier_will
        tier_kind <- sub("^journal_", "tier_", nm)
        tier_rows[[length(tier_rows) + 1]] <- tibble(
          paper = paper, model = model_name, tier_kind = tier_kind,
          score = metrics[[nm]]$score,
          ci_lower = metrics[[nm]]$ci_lower,
          ci_upper = metrics[[nm]]$ci_upper
        )
      } else {
        metric_rows[[length(metric_rows) + 1]] <- tibble(
          paper = paper, model = model_name, metric = nm,
          midpoint = metrics[[nm]]$midpoint,
          lower_bound = metrics[[nm]]$lower_bound,
          upper_bound = metrics[[nm]]$upper_bound
        )
      }
    }

    # Token usage: prefer the nested `usage` object, then top-level
    # fields. `$` on NULL yields NULL, so these chains are NULL-safe.
    input_tok <- r$usage$input_tokens %||% r$input_tokens
    output_tok <- r$usage$output_tokens %||% r$output_tokens
    reasoning_tok <- r$usage$output_tokens_details$reasoning_tokens

    list(
      metrics = bind_rows(metric_rows),
      tiers = bind_rows(tier_rows),
      tokens = tibble(
        paper = paper, model = model_name,
        input_tokens = input_tok %||% NA_integer_,
        output_tokens = output_tok %||% NA_integer_,
        reasoning_tokens = reasoning_tok %||% NA_integer_
      ),
      summary = tibble(
        paper = paper, model = model_name,
        assessment_summary = parsed$assessment_summary
      )
    )
  }, error = function(e) NULL)  # any failure => NULL; caller skips the file
}

# Load every parsed response for every configured model directory and
# stack the per-file tibbles into four combined tables.
# Returns a list: metrics (with canonicalised `criteria`), tiers, tokens,
# summaries. Files that fail to parse (parse_response -> NULL) are skipped.
load_all_llm <- function() {
  parse_dir <- function(dir_name) {
    model_name <- model_dirs[[dir_name]]
    json_dir <- here("results", dir_name, "json")
    if (!dir.exists(json_dir)) return(list())
    files <- list.files(json_dir, pattern = "\\.response\\.json$", full.names = TRUE)
    # Drop files whose parse failed.
    Filter(Negate(is.null), lapply(files, parse_response, model_name = model_name))
  }

  # Flat list of per-file result lists, in directory order then file order.
  parsed <- do.call(c, lapply(names(model_dirs), parse_dir))

  # Pull one named component out of every result and row-bind them.
  stack <- function(field) bind_rows(lapply(parsed, `[[`, field))

  list(
    metrics = stack("metrics") |> mutate(criteria = canon_metric(metric)),
    tiers = stack("tiers"),
    tokens = stack("tokens"),
    summaries = stack("summary")
  )
}

# Unpack the combined LLM tables once so downstream chunks use flat names.
llm_data <- load_all_llm()
llm_metrics <- llm_data$metrics
llm_tiers <- llm_data$tiers
llm_tokens <- llm_data$tokens
llm_summaries <- llm_data$summaries

# Summary stats for inline use
n_llm_models <- n_distinct(llm_metrics$model)
n_llm_papers <- n_distinct(llm_metrics$paper)
llm_model_names <- unique(llm_metrics$model)
llm_model_list <- paste(llm_model_names, collapse = ", ")

# Papers with both human and LLM data
matched_papers <- intersect(
 unique(metrics_human$paper),
 unique(llm_metrics$paper)
)
n_matched <- length(matched_papers)

# Opus sample: papers evaluated by Claude Opus 4.6 AND humans
# Agreement tables and Krippendorff alpha use this sample so all
# models are compared on the same set of papers.
opus_papers <- intersect(
  llm_metrics |> filter(model == "Claude Opus 4.6") |> pull(paper) |> unique(),
  unique(metrics_human$paper)
)
n_opus <- length(opus_papers)

# Cost summary
# Cost uses API-reported input/output tokens and the per-1M price table.
# NOTE(review): reasoning_tokens is normalised here but not priced
# separately — presumably billed within output tokens; confirm against
# each provider's billing documentation.
cost_data <- llm_tokens |>
  left_join(pricing, by = "model") |>
  mutate(
    reasoning_tokens = coalesce(reasoning_tokens, 0L),
    cost_usd = (input_tokens * input_per_m + output_tokens * output_per_m) / 1e6
  )

total_cost <- sum(cost_data$cost_usd, na.rm = TRUE)
total_evaluations <- nrow(llm_tokens)

# Per-model summaries
# avg_reasoning averages only over papers that reported reasoning tokens;
# it is NaN for models that never report any (rendered as an em dash in
# the table chunk, which checks is.finite()).
model_summary <- cost_data |>
  group_by(model) |>
  summarise(
    n_papers = n(),
    avg_input = round(mean(input_tokens, na.rm = TRUE)),
    avg_output = round(mean(output_tokens, na.rm = TRUE)),
    avg_reasoning = round(mean(reasoning_tokens[reasoning_tokens > 0], na.rm = TRUE)),
    total_cost = sum(cost_usd, na.rm = TRUE),
    cost_per_paper = mean(cost_usd, na.rm = TRUE),
    .groups = "drop"
  )

We compare quantitative ratings from 6 frontier LLMs (GPT-5 Pro, GPT-5.2 Pro, GPT-4o-mini, Claude Sonnet 4, Claude Opus 4.6, Gemini 2.0 Flash) against human expert reviews from The Unjournal. Across all models, 45 papers have both human and LLM evaluations. Agreement metrics and Krippendorff’s alpha below are computed on the 33-paper matched sample with Claude Opus 4.6 evaluations, so that all models are compared on identical papers.

We submit each PDF through a single, schema-enforced API call. The system prompt mirrors The Unjournal rubric, blocks extrinsic priors (authors, venue, web context), and requires strict JSON: a ~1k-word diagnostic summary plus numeric midpoints and 90% credible intervals for every metric.1 Each numeric field carries a midpoint (model’s 50% belief) and a 90% credible interval; wide intervals indicate acknowledged uncertainty.

Cost and token usage. Token costs are computed from API-reported input/output tokens (GPT-5 Pro runs with reasoning = high).

Show code
# Table A.1: token usage and estimated cost per model.
# Fix: sort on the numeric total_cost BEFORE formatting as currency.
# The previous version re-parsed the formatted "$" strings with
# as.numeric(gsub(...)), which is fragile (format/locale dependent) and
# redundant when the numeric column is still available.
if (nrow(llm_tokens) > 0) {

  cost_table <- model_summary |>
    arrange(desc(total_cost)) |>
    transmute(
      Model = model,
      Papers = n_papers,
      `Avg input` = scales::comma(avg_input),
      `Avg output` = scales::comma(avg_output),
      # avg_reasoning is NaN when a model reports no reasoning tokens;
      # show an em dash in that case.
      `Avg reasoning` = ifelse(is.finite(avg_reasoning) & avg_reasoning > 0,
                                scales::comma(avg_reasoning), "—"),
      `Total cost` = scales::dollar(total_cost, accuracy = 0.01),
      `Cost/paper` = scales::dollar(cost_per_paper, accuracy = 0.001)
    )

  knitr::kable(cost_table, align = c("l", rep("r", 6)))
}
Table A.1: Token usage and estimated cost per model
Model Papers Avg input Avg output Avg reasoning Total cost Cost/paper
Claude Opus 4.6 45 116,930 1,509 — $84.02 $1.867
GPT-5 Pro 60 38,516 6,400 4,717 $57.71 $0.962
GPT-5.2 Pro 14 45,143 2,889 707 $11.91 $0.850
Claude Sonnet 4 16 113,798 727 — $5.64 $0.352
GPT-4o-mini 60 71,581 470 — $0.66 $0.011
Gemini 2.0 Flash 60 18,084 1,024 — $0.10 $0.002

Total cost across all 255 LLM evaluations: $160.03. Reasoning-capable models (GPT-5 Pro, GPT-5.2 Pro, Claude Opus 4.6) cost substantially more per paper than their lighter counterparts (GPT-4o-mini, Gemini 2.0 Flash), raising practical questions about cost-quality trade-offs for deployment at scale.

Agreement metrics. We assess agreement using Spearman ρ (rank-based, robust to outliers), MAE in percentile points (directly interpretable), RMSE, mean bias (LLM − Human; positive = LLM rates higher), and Pearson r for completeness. Full definitions appear in Methods. The Human–Human row in each table provides the empirical ceiling.

Show code
# Table A.2: per-model agreement with the human mean overall rating.
# NOTE(review): `scatter_data` and `hh_baseline_row` are built in an
# earlier chunk not visible here — confirm both exist before this runs.
if (nrow(scatter_data) > 0) {

  llm_rows <- scatter_data |>
    group_by(Model = model) |>
    summarise(
      N              = n(),
      # Spearman and Pearson on (human mean, LLM midpoint) pairs;
      # complete.obs drops pairs with a missing side.
      `Spearman ρ` = round(cor(human_mid, midpoint, method = "spearman",
                                     use = "complete.obs"), 3),
      `Pearson r`    = round(cor(human_mid, midpoint, use = "complete.obs"), 3),
      # Signed so the table shows direction: positive = LLM more generous.
      `Mean bias`    = sprintf("%+.1f", mean(midpoint - human_mid, na.rm = TRUE)),
      RMSE           = round(sqrt(mean((midpoint - human_mid)^2, na.rm = TRUE)), 1),
      MAE            = round(mean(abs(midpoint - human_mid), na.rm = TRUE), 1),
      .groups = "drop"
    ) |>
    arrange(desc(`Spearman ρ`))

  # Human–Human baseline goes first so row_spec(1) below highlights it.
  agreement_tbl <- bind_rows(hh_baseline_row, llm_rows)

  knitr::kable(agreement_tbl, align = c("l", rep("r", 6))) |>
    kableExtra::row_spec(1, bold = TRUE, background = "#f0f8f0")
}
Table A.2: Agreement between each LLM and human mean overall rating, matched sample (N = 33). Human–Human row (top, bold) shows pairwise evaluator agreement on the same papers as an empirical upper bound. Spearman ρ (rank-based, robust to outliers) and MAE (in percentile points, directly interpretable) are primary metrics; Pearson r is shown for completeness. Bias = LLM − Human (positive = LLM more generous). Models sorted by Spearman ρ.
Model N Spearman ρ Pearson r Mean bias RMSE MAE
Human–Human 27 0.472 0.584 +3.5 15.6 11.3
GPT-5 Pro 33 0.553 0.396 +5.1 14.1 10.0
Claude Opus 4.6 33 0.477 0.567 -7.6 17.2 12.4
Gemini 2.0 Flash 33 0.447 0.467 -5.3 13.2 10.8
GPT-5.2 Pro 5 0.410 0.433 -6.6 19.9 15.4
GPT-4o-mini 33 0.309 0.356 -0.1 15.9 10.7
Claude Sonnet 4 8 -0.058 -0.232 -1.6 9.1 6.8

Table A.2 reports agreement metrics between each LLM and the human mean overall rating, with the Human–Human baseline as the top row. The Human–Human row shows pairwise correlation between the two evaluators assigned to the same paper: this is the empirical ceiling any model can hope to reach, since the human “mean” used as the reference is itself a noisy estimate. Where an LLM’s Spearman ρ approaches the Human–Human baseline, it is providing signal comparable to an additional expert rater. Positive mean bias indicates the model rates papers higher than humans on average; negative bias indicates harsher ratings.

Human baseline context. Human percentile ratings are a noisy reference signal, not ground truth: even expert evaluators given the same paper differ systematically, reflecting genuine uncertainty in what a “correct” score would be. Table A.3 quantifies this noise as Krippendorff’s αHH. The αHL columns show how close each LLM comes to that ceiling. A model achieving αHL ≥ αHH on a criterion is generating signal indistinguishable from an additional human evaluator. Criteria where αHH is already low indicate genuine expert disagreement; low αHL on those criteria is therefore expected, not a model failure.

Show code
# Table A.3: Krippendorff's alpha (interval metric) per criterion —
# inter-human (α_HH) vs human–LLM (α_HL) agreement on the Opus sample.
# The whole table is gated on the optional `irr` package.
if (requireNamespace("irr", quietly = TRUE)) {

  library(irr)

  criteria_list <- c("overall", "claims", "methods", "adv_knowledge",
                     "logic_comms", "open_sci", "gp_relevance")

  criterion_labels <- c(
    overall = "Overall", claims = "Claims", methods = "Methods",
    adv_knowledge = "Adv. Knowledge", logic_comms = "Logic & Comms",
    open_sci = "Open Science", gp_relevance = "Global Relevance"
  )

  # Compute human-human alpha (each evaluator as separate rater)
  # Rows = evaluators, columns = papers; kripp.alpha tolerates the NAs
  # left where an evaluator did not rate a particular paper.
  compute_hh_alpha <- function(criterion) {
    human_wide <- metrics_human |>
      filter(criteria == criterion, paper %in% opus_papers) |>
      select(paper, evaluator, mid) |>
      distinct() |>
      pivot_wider(names_from = paper, values_from = mid)

    # Need at least 2 raters and 2 papers (ncol includes the evaluator col).
    if (nrow(human_wide) < 2 || ncol(human_wide) < 3) return(NA_real_)

    M <- as.matrix(human_wide[, -1, drop = FALSE])
    tryCatch({
      irr::kripp.alpha(M, method = "interval")$value
    }, error = function(e) NA_real_)
  }

  # Compute human-LLM alpha for a specific model
  # Treats the human mean and the LLM as two raters over shared papers.
  compute_hl_alpha <- function(criterion, model_name) {
    human_mean <- metrics_human |>
      filter(criteria == criterion, paper %in% opus_papers) |>
      group_by(paper) |>
      summarise(Human = mean(mid, na.rm = TRUE), .groups = "drop")

    llm_rating <- llm_metrics |>
      filter(criteria == criterion, model == model_name, paper %in% opus_papers) |>
      select(paper, midpoint) |>
      rename(LLM = midpoint)

    combined <- inner_join(human_mean, llm_rating, by = "paper")
    if (nrow(combined) < 3) return(NA_real_)

    M <- rbind(Human = combined$Human, LLM = combined$LLM)
    tryCatch({
      irr::kripp.alpha(M, method = "interval")$value
    }, error = function(e) NA_real_)
  }

  # Get unique LLM models
  llm_models <- unique(llm_metrics$model)

  # Build comparison table
  baseline_results <- tibble(Criterion = criterion_labels[criteria_list]) |>
    mutate(criterion_key = criteria_list) |>
    rowwise() |>
    mutate(
      `α_HH` = compute_hh_alpha(criterion_key)
    ) |>
    ungroup()

  # Add columns for each LLM model
  # (dynamic column name injected with !! / := tidy-eval)
  for (mod in llm_models) {
    col_name <- paste0("α_HL (", mod, ")")
    baseline_results <- baseline_results |>
      rowwise() |>
      mutate(!!col_name := compute_hl_alpha(criterion_key, mod)) |>
      ungroup()
  }

  # Format for display — put α_HH immediately after Criterion column
  # (NAs become em dashes; numbers fixed to two decimals)
  baseline_display <- baseline_results |>
    select(-criterion_key) |>
    mutate(across(where(is.numeric), ~ ifelse(is.na(.x), "\u2014", sprintf("%.2f", .x))))

  alpha_col_order <- c("Criterion", "\u03b1_HH",
                       sort(grep("^\u03b1_HL", names(baseline_display), value = TRUE)))
  if (all(alpha_col_order %in% names(baseline_display))) {
    baseline_display <- baseline_display |> select(all_of(alpha_col_order))
  }

  knitr::kable(baseline_display, align = c("l", rep("r", ncol(baseline_display) - 1))) |>
    kableExtra::column_spec(2, bold = TRUE)
}
Table A.3: Krippendorff’s αHH (inter-human agreement, bold) vs αHL (human–LLM agreement) by criterion, matched sample (N = 33 papers). αHH is the empirical ceiling: an LLM with αHL ≥ αHH performs at or above the human inter-rater level for that criterion. Criteria where αHH is already low indicate genuine evaluator disagreement.
Criterion α_HH α_HL (Claude Opus 4.6) α_HL (Claude Sonnet 4) α_HL (Gemini 2.0 Flash) α_HL (GPT-4o-mini) α_HL (GPT-5 Pro) α_HL (GPT-5.2 Pro)
Overall 0.58 0.47 -0.15 0.37 0.36 0.33 0.45
Claims 0.07 0.26 -0.17 0.42 0.25 0.33 0.49
Methods 0.29 0.39 -0.30 0.27 0.31 0.32 0.21
Adv. Knowledge 0.13 0.32 0.10 0.10 0.22 0.23 0.20
Logic & Comms 0.47 0.09 0.14 -0.04 0.21 -0.12 -0.19
Open Science 0.27 -0.18 0.33 0.08 -0.08 -0.07 -0.52
Global Relevance 0.41 0.18 0.02 0.21 0.05 0.25 0.69

Where αHL ≥ αHH, the LLM is performing at or above the ceiling set by human inter-rater variability — its ratings are as consistent with the human reference as a second human evaluator would be. Criteria where αHH is already low (indicating genuine evaluator disagreement) naturally cap how well any model can do, and low αHL on those criteria reflects the difficulty of the task rather than a model failure. Extended agreement analysis — including CI coverage, bootstrap confidence intervals, and mutual information — is available in the supplementary material.

Criteria-level patterns. Figure A.1 displays the mean rating each evaluator type assigns across the seven criteria. Criteria where all models and humans converge (similar colour intensity) suggest robust agreement on that dimension; criteria with divergent columns indicate systematic differences in how humans and LLMs weigh that aspect of quality.

Show code
# Figure A.1: mean rating by evaluator type and criterion (heatmap).
metric_labels <- c(
  overall = "Overall", claims = "Claims", methods = "Methods",
  adv_knowledge = "Adv. Knowledge", logic_comms = "Logic & Comms",
  open_sci = "Open Science", gp_relevance = "Global Relevance"
)

# One row per (evaluator type, criterion): mean rating across matched papers.
criteria_means <- bind_rows(
  # Humans: average the per-evaluator midpoints.
  metrics_human |>
    filter(criteria %in% names(metric_labels), paper %in% matched_papers) |>
    group_by(criteria) |>
    summarise(mean_rating = mean(mid, na.rm = TRUE), .groups = "drop") |>
    mutate(model = "Human"),
  # LLMs: average midpoints per model and criterion.
  llm_metrics |>
    filter(criteria %in% names(metric_labels), paper %in% matched_papers) |>
    group_by(model, criteria) |>
    summarise(mean_rating = mean(midpoint, na.rm = TRUE), .groups = "drop")
) |>
  mutate(criteria_label = factor(metric_labels[criteria], levels = metric_labels))

if (nrow(criteria_means) > 0) {
  ggplot(criteria_means, aes(x = criteria_label, y = model, fill = mean_rating)) +
    geom_tile(color = "white", linewidth = 0.5) +
    geom_text(aes(label = round(mean_rating, 0)), size = 4.2, color = "white", fontface = "bold") +
    # Fix: oob = scales::squish actually clamps out-of-range means to the
    # 40–80 limits (as the caption states). Without it, values outside the
    # limits fall off the scale and render as the grey NA colour.
    scale_fill_gradient(low = "#f0f0f0", high = UJ_ORANGE, limits = c(40, 80),
                        oob = scales::squish, name = "Mean rating") +
    labs(x = NULL, y = NULL) +
    theme_uj() +
    theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 11),
          axis.text.y = element_text(size = 11),
          panel.grid = element_blank())
}
Figure A.1: Mean rating by evaluator type (rows) and criterion (columns), averaged across all matched papers. Tile colour scales from light (lower mean) to vivid orange (higher mean), clamped to 40–80. Criteria where all rows show similar colour intensity indicate broad agreement; divergent columns flag criteria-level systematic differences.

Additional models. Figure A.2 shows overall ratings for the two lightweight models—GPT-4o-mini and Gemini 2.0 Flash—against human mean ratings. These models cost 50–200× less per paper than reasoning-capable models (see Table A.1) but may sacrifice rating accuracy.

Show code
# Figure A.2: lightweight models' overall ratings vs human means.
other_models <- c("GPT-4o-mini", "Gemini 2.0 Flash")

# Join each lightweight model's overall rating to the per-paper human mean,
# keeping only pairs where both sides are present.
scatter_other <- llm_metrics |>
  filter(criteria == "overall", model %in% other_models) |>
  inner_join(human_avg, by = "paper") |>
  filter(!is.na(human_mid), !is.na(midpoint)) |>
  mutate(
    diff = midpoint - human_mid,
    paper_short = str_trunc(paper, 25),
    # Fall back to the point estimate when an interval bound is missing.
    human_lo = coalesce(human_lo, human_mid),
    human_hi = coalesce(human_hi, human_mid),
    lower_bound = coalesce(lower_bound, midpoint),
    upper_bound = coalesce(upper_bound, midpoint)
  )

if (nrow(scatter_other) == 0) {
  cat("No matching data for lightweight model scatter plot.\n")
} else {
  fig <- ggplot(scatter_other, aes(x = human_mid, y = midpoint, color = model)) +
    # Identity line first so points and labels draw over it.
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
    geom_point(size = 3.5, alpha = 0.7) +
    ggrepel::geom_text_repel(aes(label = paper_short), size = 2.4,
                             max.overlaps = 8, show.legend = FALSE) +
    scale_color_manual(values = MODEL_COLORS) +
    coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) +
    facet_wrap(~model, ncol = 2) +
    labs(x = "Human mean rating (0\u2013100)", y = "LLM rating (0\u2013100)",
         color = NULL) +
    theme_uj() +
    theme(strip.text = element_text(size = 13, face = "bold"))
  fig
}
Figure A.2: Human mean overall rating (x-axis) vs LLM overall rating (y-axis) for the two lightweight models GPT-4o-mini and Gemini 2.0 Flash. Each point is one paper; the dashed diagonal is the identity line. These models cost 50–200× less per paper than reasoning-capable models (see cost table above).

For per-paper scatter plots, rank comparisons, and the gap heatmap showing Human − LLM differences by paper and criterion, see the main Results chapter. Extended reasoning traces are available in Appendix C: LLM Traces.


  1. 0–100 percentiles relative to a reference group of ~‘all serious work you have read in this area in the last three years’, overall (global quality/impact), claims_evidence (claim clarity/support), methods (design/identification/robustness), advancing_knowledge (contribution to field/practice), logic_communication (internal consistency and exposition), open_science (reproducibility, data/code availability), and global_relevance (decision and priority usefulness). Journal tiers: (0–5, continuous) capture where the work should publish vs. will publish.↩︎