ARFBench Multimodal Time Series Reasoning Leaderboard
ARFBench (Anomaly Reasoning Framework Benchmark) is a multimodal time-series reasoning benchmark consisting of 750 question-answer (QA) pairs composed from real-world incident data collected at Datadog, a leading observability platform.
The benchmark evaluates models across various aspects of time-series anomaly reasoning:
- Presence: Detecting if anomalies exist in the data
- Identification: Identifying specific anomalous metrics
- Start Time: Determining when anomalies began
- End Time: Determining when anomalies ended
- Magnitude: Assessing the severity of anomalies
- Categorization: Classifying anomaly types
- Correlation: Understanding relationships between anomalies
- Indicator: Identifying leading indicators
{
  "headers": [
    "model",
    "model_type",
    "accuracy",
    "tier_i_accuracy",
    "tier_ii_accuracy",
    "tier_iii_accuracy",
    "overall_f1",
    "tier_i_f1",
    "tier_ii_f1",
    "tier_iii_f1"
  ],
  "data": [
    ["Model-Expert Oracle", "Baseline", 87.2, 96.4, 80.3, 90.5, 82.8, 89, 77.1, 86.3],
    ["Domain Experts (n=2)", "Baseline", 72.7, 89.3, 67.7, 71.4, 64.6, 76.1, 64.5, 60.9],
    ["Non-domain Experts (n=2)", "Baseline", 69.7, 80.4, 63.2, 72, 60.7, 68, 59.9, 59],
    ["Toto-1.0-QA-Experimental 32B (TSFM-VLM)", "Post-trained TSFM", 63.9, 84.7, 55.6, 64.6, 48.9, 66.3, 48.4, 43.5],
    ["GPT-5", "VLM", 62.7, 82, 55.9, 62.5, 51.9, 66.9, 51.2, 47.5],
    ["GPT-5.4", "VLM", 61.3, 81.1, 54.2, 61.3, 51.4, 62.6, 50.4, 48.4],
    ["Gemini 3 Pro", "VLM", 58.1, 82.9, 51, 56.5, 49.6, 67.8, 49.7, 43.4],
    ["Qwen3-VL 32B (post-trained)", "Post-trained TSFM", 56.9, 84.7, 50.3, 53.8, 46.6, 69.8, 44.9, 40.5],
    ["GPT-5 (text)", "LLM", 56.4, 82.6, 45.2, 57.9, 43.8, 66.1, 39.6, 40.3],
    ["Claude Opus 4.6", "VLM", 54.8, 88.3, 52.3, 45.9, 46.7, 65.8, 49.1, 38.2],
    ["Qwen3-VL 32B", "VLM", 52.8, 80.2, 46.7, 49.2, 45.1, 65.1, 41.9, 41.3],
    ["Toto-1.0-Qwen3 32B (TSFM-LLM)", "Post-trained TSFM", 48.8, 82.9, 47.4, 38.7, 33.9, 60, 43.6, 16.4],
    ["Qwen3 32B", "LLM", 47.9, 80.9, 35.1, 48.6, 36.1, 55.7, 31.5, 33.8],
    ["GPT-4.1", "VLM", 47.9, 80.2, 50.3, 34.8, 44, 65.1, 48, 33.1],
    ["Claude Sonnet 4.5", "VLM", 47.2, 83.8, 43.5, 38.4, 37.9, 63.2, 40.6, 26.9],
    ["GPT-4o", "VLM", 47.2, 79.3, 49, 34.8, 42.4, 64.2, 43.8, 33.8],
    ["Qwen3-VL 8B", "VLM", 45.3, 80.2, 40.8, 37.8, 34.7, 63.5, 36.1, 23.6],
    ["Per-category Frequent Choice", "Baseline", 45.1, 84.7, 30.1, 45.6, 17.3, 45.9, 12.3, 12.5],
    ["ChatTS 8B (TS-LLM)", "Post-trained TSFM", 31.1, 60.4, 26.5, 25.5, 22.1, 48.1, 20, 15.4],
    ["Random Choice", "Baseline", 24.5, 50, 20, 20, 22.5, 45.6, 20.5, 16.8],
    ["OpenTSLM 1B (TS-LLM)", "Post-trained TSFM", 0.8, 0, 2, 0, 1.2, 0, 3, 0]
  ],
  "metadata": null
}
{
  "headers": [
    "model",
    "model_type",
    "overall_f1",
    "presence",
    "identification",
    "start_time",
    "end_time",
    "magnitude",
    "categorization",
    "correlation",
    "indicator"
  ],
  "data": [
    ["Model-Expert Oracle", "Baseline", 82.8, 89, 68.3, 83.4, 1, 67, 75.6, 94.4, 77.8],
    ["Domain Experts (n=2)", "Baseline", 64.6, 76.1, 77.5, 74.2, 72.6, 51.8, 67.3, 64.1, 57.6],
    ["Non-domain Experts (n=2)", "Baseline", 61.3, 68, 79, 67.4, 67.2, 40.3, 61.2, 58.4, 62.4],
    ["GPT-5", "VLM", 51.9, 66.8, 32.8, 44.2, 47.8, 59.1, 57, 49, 45.9],
    ["GPT-5.4", "VLM", 51.4, 62.6, 29.6, 53.3, 55.1, 51.7, 54.1, 47.7, 49.1],
    ["Gemini 3 Pro", "VLM", 49.6, 67.8, 38.6, 43.3, 57.1, 50.3, 54.5, 57, 29.2],
    ["Toto-1.0-QA-Experimental 32B (TSFM-VLM)", "Post-trained TSFM", 48.9, 66.3, 46.9, 23, 48.8, 54.1, 58.4, 44.2, 42.7],
    ["Claude Opus 4.6", "VLM", 46.7, 65.8, 34.3, 36.1, 45.1, 53.8, 59.2, 51.6, 24.1],
    ["Qwen3-VL 32B (post-trained)", "Post-trained TSFM", 46.6, 69.7, 40.5, 37.2, 36.7, 48.9, 50.3, 46.8, 33.9],
    ["Qwen3-VL 32B", "VLM", 45.1, 65.1, 25, 30.8, 46.7, 46.9, 49, 47.5, 34.7],
    ["GPT-4.1", "VLM", 44, 65.1, 29.2, 33.5, 32.7, 63.7, 55.9, 42.9, 23.3],
    ["GPT-5 (text)", "LLM", 43.8, 66.1, 38.1, 27.9, 27, 44.8, 47.6, 38, 42.4],
    ["GPT-4o", "VLM", 42.4, 64.2, 34.6, 30.3, 36.1, 51.8, 50.8, 40.1, 27.2],
    ["Claude Sonnet 4.5", "VLM", 37.9, 63.2, 16.8, 33.2, 31.3, 49.3, 49.8, 33.8, 19.8],
    ["Qwen3 32B", "LLM", 36.1, 55.7, 28.4, 26.6, 26.9, 31.4, 36.8, 32.3, 35.4],
    ["Qwen3-VL 8B", "VLM", 34.7, 63.5, 28.6, 21.8, 23.5, 47, 42.8, 33.1, 13.8],
    ["Toto-1.0-Qwen3 32B (TSFM-LLM)", "Post-trained TSFM", 33.9, 59.9, 17.5, 41.3, 23, 35.9, 66.2, 18.6, 14.1],
    ["Random Choice", "Baseline", 22.5, 45.6, 21.2, 18.9, 18.2, 20.4, 21.7, 15.8, 17.8],
    ["ChatTS 8B (TS-LLM)", "Post-trained TSFM", 22.1, 48.1, 22.2, 15, 14.4, 27.9, 17.9, 21.4, 9.2],
    ["Per-category Frequent Choice", "Baseline", 17.3, 45.9, 10.8, 16.3, 14.1, 6, 14.6, 12, 13.1],
    ["OpenTSLM 1B (TS-LLM)", "Post-trained TSFM", 1.2, 0, 8.2, 2.7, 0, 6, 0, 0, 0]
  ],
  "metadata": null
}
{
  "headers": [
    "model",
    "model_type",
    "overall_accuracy",
    "presence",
    "identification",
    "start_time",
    "end_time",
    "magnitude",
    "categorization",
    "correlation",
    "indicator"
  ],
  "data": [
    ["Model-Expert Oracle", "Baseline", 87.2, 96.4, 77.8, 78.6, 100, 68.4, 84.6, 95.4, 85.4],
    ["Domain Experts (n=2)", "Baseline", 72.7, 89.3, 77.8, 67.9, 75, 60.5, 72.4, 74.4, 68.3],
    ["Non-domain Experts (n=2)", "Baseline", 69.7, 80.4, 66.7, 64.3, 68.8, 60.5, 61.5, 72.1, 72],
    ["Toto-1.0-QA-Experimental (TSFM-VLM)", "Post-trained TSFM", 63.9, 84.7, 47.4, 26.8, 59.4, 64.5, 66.3, 68.8, 60.1],
    ["GPT-5", "VLM", 62.7, 82, 31.6, 44.6, 68.8, 65.8, 59.6, 63.5, 61.3],
    ["GPT-5.4", "VLM", 61.3, 81.1, 31.6, 63.6, 65.6, 57.9, 56.7, 61.8, 60.7],
    ["Gemini 3 Pro", "VLM", 58.1, 82.9, 28.9, 44.6, 62.5, 56.7, 54.8, 71.2, 41.1],
    ["Qwen3-VL 32B (post-trained)", "Post-trained TSFM", 56.9, 84.7, 36.8, 41.1, 43.8, 63.2, 52.9, 67.6, 39.3],
    ["GPT-5 (text)", "LLM", 56.4, 82.6, 47.4, 29.6, 38.7, 51.4, 50, 56.9, 59],
    ["Claude Opus 4.6", "VLM", 54.8, 88.3, 31.6, 37.5, 53.1, 57.9, 63.5, 65.9, 25.2],
    ["Qwen3-VL 32B", "VLM", 52.8, 80.2, 23.7, 33.9, 56.3, 59.2, 50, 61.8, 36.2],
    ["Toto-1.0-Qwen3 (TSFM-LLM)", "Post-trained TSFM", 48.8, 82.9, 10.5, 35.7, 34.4, 47.4, 71.2, 41.8, 35.6],
    ["Qwen3 32B (text)", "LLM", 47.9, 80.9, 28.9, 27.3, 35.5, 37.3, 39.8, 50.9, 46.3],
    ["GPT-4.1", "VLM", 47.9, 80.2, 28.9, 33.9, 40.6, 68.4, 56.7, 45.9, 23.3],
    ["Claude Sonnet 4.5", "VLM", 47.2, 83.8, 18.4, 30.4, 37.5, 53.9, 53.8, 58.8, 17.2],
    ["GPT-4o", "VLM", 47.2, 79.3, 39.5, 35.7, 43.8, 61.8, 51.9, 45.3, 23.9],
    ["Qwen3-VL 8B", "VLM", 45.3, 80.2, 26.3, 25, 31.3, 57.9, 45.2, 57.1, 17.8],
    ["Per-category Frequent Choice", "Baseline", 45.1, 84.7, 36.8, 35.7, 34.4, 17.1, 32.7, 42.9, 48.5],
    ["ChatTS (TS-LLM)", "Post-trained TSFM", 31.1, 59.5, 15.8, 16.1, 15.6, 28.9, 20.2, 40, 14.7],
    ["Random Choice", "Baseline", 24.5, 50, 20, 20, 20, 20, 20, 20, 20],
    ["OpenTSLM (TS-LLM)", "Post-trained TSFM", 0.8, 0, 0, 3.6, 0, 5.3, 0, 0, 0]
  ],
  "metadata": null
}
For more details on the benchmark, refer to the ARFBench dataset card.
Reproducibility
See the ARFBench repository for more details on how to reproduce the benchmark.