🏆 CapArena-Auto Leaderboard 🏆
💬 Metric Explanations
CapArena-Auto is an arena-style automated evaluation benchmark for detailed captioning. It includes 600 evaluation images and assesses model performance through pairwise battles with three baseline models. The final score is calculated by GPT-4o-as-a-Judge.
{
  "headers": [
    "",
    "Model",
    "Score_Avg ⬇️",
    "Score_GPT",
    "Score_COG",
    "Score_CPM",
    "Length_Avg",
    "Available on the hub"
  ],
  "data": [
    [
      1,
      "<a target=\"_blank\" href=\"https://ai.google.dev/gemini-api/docs/models/gemini\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Gemini-1.5-pro-002</a>",
      56.17,
      29,
      61,
      78.5,
      168.56,
      false
    ],
    [
      2,
      "<a target=\"_blank\" href=\"https://platform.openai.com/docs/models/gpt-4o\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 GPT-4o-0806</a>",
      44,
      0,
      55.5,
      76.5,
      115.8,
      false
    ],
    [
      3,
      "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Qwen2.5VL-72B</a>",
      35.33,
      -1,
      49,
      58,
      163.67,
      true
    ],
    [
      4,
      "<a target=\"_blank\" href=\"https://ai.google.dev/gemini-api/docs/models/gemini\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Gemini-2.0-flash-exp</a>",
      30.83,
      -2,
      39.5,
      55,
      416.99,
      false
    ],
    [
      5,
      "<a target=\"_blank\" href=\"https://huggingface.co/AIDC-AI/Ovis2-34B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Ovis-2-34b</a>",
      27,
      -15,
      33.5,
      62.5,
      120.2,
      true
    ],
    [
      6,
      "<a target=\"_blank\" href=\"https://www.anthropic.com/news/claude-3-5-sonnet\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Claude-3.5-Sonnet-0620</a>",
      21.5,
      -14,
      30,
      48.5,
      147.93,
      false
    ],
    [
      7,
      "<a target=\"_blank\" href=\"https://huggingface.co/OpenGVLab/InternVL2-26B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 InternVL2-26B</a>",
      13,
      -38.5,
      20,
      57.5,
      236.32,
      true
    ],
    [
      8,
      "<a target=\"_blank\" href=\"https://platform.openai.com/docs/models/gpt-4o\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 GPT-4o-mini-0718</a>",
      9.33,
      -36,
      17,
      47,
      139.83,
      false
    ],
    [
      9,
      "<a target=\"_blank\" href=\"https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-27B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Ovis-1_6-27b</a>",
      3,
      -49.5,
      14.5,
      44,
      94.16,
      true
    ],
    [
      10,
      "<a target=\"_blank\" href=\"https://github.com/THUDM/GLM-4\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 GLM-4V-Plus</a>",
      -0.17,
      -51.5,
      13,
      38,
      109.27,
      false
    ],
    [
      11,
      "<a target=\"_blank\" href=\"https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 CogVLM2-llama3-chat-19B</a>",
      -8.5,
      -56.5,
      0,
      31,
      115.87,
      true
    ],
    [
      12,
      "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Qwen2-VL-72B-Instruct</a>",
      -9,
      -50.5,
      -4.5,
      28,
      114.45,
      true
    ],
    [
      13,
      "<a target=\"_blank\" href=\"https://huggingface.co/llava-hf/llava-onevision-qwen2-72b-ov-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 LLaVA-OV-72b</a>",
      -12.33,
      -57.5,
      -6,
      26.5,
      200.88,
      true
    ],
    [
      14,
      "<a target=\"_blank\" href=\"https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 LLama-3.2-90B</a>",
      -25.67,
      -72,
      -13,
      8,
      160.25,
      true
    ],
    [
      15,
      "<a target=\"_blank\" href=\"https://cloud.tencent.com/document/product/1729/101832\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Hunyuan-standard-vision</a>",
      -26,
      -63,
      -19,
      4,
      354.1,
      false
    ],
    [
      16,
      "<a target=\"_blank\" href=\"https://huggingface.co/OpenGVLab/InternVL2_5-8B\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Internvl2-5-8b</a>",
      -29.83,
      -71,
      -29,
      10.5,
      117.77,
      true
    ],
    [
      17,
      "<a target=\"_blank\" href=\"https://huggingface.co/openbmb/MiniCPM-V-2_6\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 MiniCPM-V2.6-8B</a>",
      -38,
      -80,
      -34,
      0,
      106.74,
      true
    ],
    [
      18,
      "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Qwen2-VL-2B-Instruct</a>",
      -48.67,
      -86,
      -49.5,
      -10.5,
      116.84,
      true
    ],
    [
      19,
      "<a target=\"_blank\" href=\"https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 Qwen2-VL-7B-Instruct</a>",
      -49,
      -78,
      -59,
      -10,
      97.81,
      true
    ],
    [
      20,
      "<a target=\"_blank\" href=\"https://huggingface.co/llava-hf/llava-v1.6-34b-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 LLaVA-1.6-34B</a>",
      -67.5,
      -92,
      -53.5,
      -57,
      124.81,
      true
    ],
    [
      21,
      "<a target=\"_blank\" href=\"https://huggingface.co/nyu-visionx/cambrian-34b\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 cambrian-34b</a>",
      -75,
      -93,
      -76,
      -56,
      120.23,
      true
    ],
    [
      22,
      "<a target=\"_blank\" href=\"https://huggingface.co/llava-hf/llava-1.5-7b-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">🔗 LLaVA-1.5-7B</a>",
      -94,
      -99.5,
      -92,
      -90.5,
      74.38,
      true
    ]
  ],
  "metadata": null
}