ARFBench Multimodal Time Series Reasoning Leaderboard

ARFBench (Anomaly Reasoning Framework Benchmark) is a multimodal time-series reasoning benchmark of 750 question-answer (QA) pairs constructed from real-world incident data collected at Datadog, a leading observability platform.

The benchmark evaluates models across eight aspects of time-series anomaly reasoning (a hypothetical example item is sketched after the list):

  • Presence: Detecting whether anomalies exist in the data
  • Identification: Identifying specific anomalous metrics
  • Start Time: Determining when anomalies began
  • End Time: Determining when anomalies ended
  • Magnitude: Assessing the severity of anomalies
  • Categorization: Classifying anomaly types
  • Correlation: Understanding relationships between anomalies
  • Indicator: Identifying leading indicators
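
To make these aspects concrete, below is a minimal sketch of what a single ARFBench-style QA item might look like. The schema, field names, metric name, and answer options are all illustrative assumptions and not the benchmark's actual data format (the Random Choice and Per-category Frequent Choice baselines in the leaderboard below suggest categorical answer options, but the exact format is likewise an assumption).

```python
# Hypothetical illustration of a single ARFBench-style QA item.
# All field names and values below are assumptions made for illustration;
# they do not reproduce the benchmark's actual schema.

example_item = {
    # One of the eight reasoning aspects listed above.
    "aspect": "start_time",
    # The time-series input; in a multimodal setting this might also be
    # rendered as a chart image for vision-language models.
    "series": {
        "metric": "service.request.latency_p99",   # hypothetical metric name
        "timestamps": [
            "2024-05-01T12:00:00Z",
            "2024-05-01T12:01:00Z",
            "2024-05-01T12:02:00Z",
        ],
        "values": [118.0, 121.5, 240.3],            # truncated toy values
    },
    "question": "At what time did the anomaly in the latency metric begin?",
    "choices": ["12:00", "12:01", "12:02", "No anomaly is present"],
    "answer": "12:02",
}

if __name__ == "__main__":
    print(example_item["question"])
    print("Gold answer:", example_item["answer"])
```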
Leaderboard (accuracy and F1 reported overall and per tier; all values in %):

| Model | Type | Overall Acc. | Tier I Acc. | Tier II Acc. | Tier III Acc. | Overall F1 | Tier I F1 | Tier II F1 | Tier III F1 |
|---|---|---|---|---|---|---|---|---|---|
| Model-Expert Oracle | Baseline | 87.2 | 96.4 | 80.3 | 90.5 | 82.8 | 89.0 | 77.1 | 86.3 |
| Domain Experts (n=2) | Baseline | 72.7 | 89.3 | 67.7 | 71.4 | 64.6 | 76.1 | 64.5 | 60.9 |
| Non-domain Experts (n=2) | Baseline | 69.7 | 80.4 | 63.2 | 72.0 | 60.7 | 68.0 | 59.9 | 59.0 |
| Toto-1.0-QA-Experimental 32B (TSFM-VLM) | Post-trained TSFM | 63.9 | 84.7 | 55.6 | 64.6 | 48.9 | 66.3 | 48.4 | 43.5 |
| GPT-5 | VLM | 62.7 | 82.0 | 55.9 | 62.5 | 51.9 | 66.9 | 51.2 | 47.5 |
| GPT-5.4 | VLM | 61.3 | 81.1 | 54.2 | 61.3 | 51.4 | 62.6 | 50.4 | 48.4 |
| Gemini 3 Pro | VLM | 58.1 | 82.9 | 51.0 | 56.5 | 49.6 | 67.8 | 49.7 | 43.4 |
| Qwen3-VL 32B (post-trained) | Post-trained TSFM | 56.9 | 84.7 | 50.3 | 53.8 | 46.6 | 69.8 | 44.9 | 40.5 |
| GPT-5 (text) | LLM | 56.4 | 82.6 | 45.2 | 57.9 | 43.8 | 66.1 | 39.6 | 40.3 |
| Claude Opus 4.6 | VLM | 54.8 | 88.3 | 52.3 | 45.9 | 46.7 | 65.8 | 49.1 | 38.2 |
| Qwen3-VL 32B | VLM | 52.8 | 80.2 | 46.7 | 49.2 | 45.1 | 65.1 | 41.9 | 41.3 |
| Toto-1.0-Qwen3 32B (TSFM-LLM) | Post-trained TSFM | 48.8 | 82.9 | 47.4 | 38.7 | 33.9 | 60.0 | 43.6 | 16.4 |
| Qwen3 32B | LLM | 47.9 | 80.9 | 35.1 | 48.6 | 36.1 | 55.7 | 31.5 | 33.8 |
| GPT-4.1 | VLM | 47.9 | 80.2 | 50.3 | 34.8 | 44.0 | 65.1 | 48.0 | 33.1 |
| Claude Sonnet 4.5 | VLM | 47.2 | 83.8 | 43.5 | 38.4 | 37.9 | 63.2 | 40.6 | 26.9 |
| GPT-4o | VLM | 47.2 | 79.3 | 49.0 | 34.8 | 42.4 | 64.2 | 43.8 | 33.8 |
| Qwen3-VL 8B | VLM | 45.3 | 80.2 | 40.8 | 37.8 | 34.7 | 63.5 | 36.1 | 23.6 |
| Per-category Frequent Choice | Baseline | 45.1 | 84.7 | 30.1 | 45.6 | 17.3 | 45.9 | 12.3 | 12.5 |
| ChatTS 8B (TS-LLM) | Post-trained TSFM | 31.1 | 60.4 | 26.5 | 25.5 | 22.1 | 48.1 | 20.0 | 15.4 |
| Random Choice | Baseline | 24.5 | 50.0 | 20.0 | 20.0 | 22.5 | 45.6 | 20.5 | 16.8 |
| OpenTSLM 1B (TS-LLM) | Post-trained TSFM | 0.8 | 0.0 | 2.0 | 0.0 | 1.2 | 0.0 | 3.0 | 0.0 |
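
The leaderboard reports accuracy and an F1 score, both overall and broken out by tier. As a rough illustration of how such numbers could be aggregated from per-question results, the sketch below assumes each question carries a tier label and a categorical gold answer, and assumes macro-averaged F1; the benchmark's official scoring code and its exact F1 definition are not reproduced here.

```python
# Minimal sketch of aggregating leaderboard-style metrics from per-question
# results. Assumes each record has a tier label ("I", "II", "III"), a gold
# answer, and a model prediction, all categorical. Illustrative only; this is
# not ARFBench's official scorer.
from collections import defaultdict


def f1_macro(gold, pred):
    """Macro-averaged F1 over the label set seen in the gold answers."""
    labels = set(gold)
    scores = []
    for label in labels:
        tp = sum(1 for g, p in zip(gold, pred) if g == label and p == label)
        fp = sum(1 for g, p in zip(gold, pred) if g != label and p == label)
        fn = sum(1 for g, p in zip(gold, pred) if g == label and p != label)
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        denom = precision + recall
        scores.append(2 * precision * recall / denom if denom else 0.0)
    return sum(scores) / len(scores)


def score(records):
    """Return overall and per-tier accuracy / macro-F1 as percentages."""
    by_tier = defaultdict(list)
    for r in records:
        by_tier[r["tier"]].append(r)
    results = {}
    for name, group in {"overall": records, **by_tier}.items():
        gold = [r["answer"] for r in group]
        pred = [r["prediction"] for r in group]
        acc = 100.0 * sum(g == p for g, p in zip(gold, pred)) / len(gold)
        results[name] = {
            "accuracy": round(acc, 1),
            "f1": round(100.0 * f1_macro(gold, pred), 1),
        }
    return results


if __name__ == "__main__":
    # Toy records with hypothetical field values.
    demo = [
        {"tier": "I", "answer": "yes", "prediction": "yes"},
        {"tier": "I", "answer": "no", "prediction": "yes"},
        {"tier": "II", "answer": "cpu.usage", "prediction": "cpu.usage"},
        {"tier": "III", "answer": "spike", "prediction": "level_shift"},
    ]
    print(score(demo))
```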