{
  "links": {
    "bibtex": "https://inspirehep.net/api/literature/3137091?format=bibtex",
    "latex-eu": "https://inspirehep.net/api/literature/3137091?format=latex-eu",
    "latex-us": "https://inspirehep.net/api/literature/3137091?format=latex-us",
    "json": "https://inspirehep.net/api/literature/3137091?format=json",
    "json-expanded": "https://inspirehep.net/api/literature/3137091?format=json-expanded",
    "cv": "https://inspirehep.net/api/literature/3137091?format=cv",
    "citations": "https://inspirehep.net/api/literature/?q=refersto%3Arecid%3A3137091"
  },
  "updated": "2026-04-02T04:27:05.998824+00:00",
  "id": "3137091",
  "revision_id": 4,
  "uuid": "a1982e2a-41e7-4a6e-aef7-db37b0c6d817",
  "created": "2026-03-31T04:08:20.110430+00:00",
  "metadata": {
    "citation_count_without_self_citations": 2,
    "citation_count": 2,
    "report_numbers": [
      {
        "value": "RISE-AGI-2026-002",
        "source": "arXiv"
      }
    ],
    "core": true,
    "titles": [
      {
        "title": "PRBench: End-to-end Paper Reproduction in Physics Research",
        "source": "arXiv"
      }
    ],
    "$schema": "https://inspirehep.net/schemas/records/hep.json",
    "authors": [
      {
        "uuid": "829ba7a2-a1e7-42d8-ad5a-7a2024fc726c",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1893028"
        },
        "full_name": "Qiu, Shi"
      },
      {
        "uuid": "c4b26eb8-9d81-449c-9b88-172f8bd4c6d7",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2204047"
        },
        "full_name": "Deng, Junyi"
      },
      {
        "uuid": "156be35a-df64-4015-85c5-ad442b79bb85",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1914968"
        },
        "full_name": "Deng, Yiwei"
      },
      {
        "uuid": "ee913d4d-766d-45eb-b03f-50735e57b9e3",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2039127"
        },
        "full_name": "Dong, Haoran"
      },
      {
        "uuid": "599709db-2385-4ecd-98c5-293831688f59",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2183425"
        },
        "full_name": "Fu, Jieyu"
      },
      {
        "uuid": "023359b9-31c8-4928-847b-22aed79878c8",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1914139"
        },
        "full_name": "Li, Mao"
      },
      {
        "uuid": "613c18d1-1f71-4abf-b34b-a71be23fff88",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1887338"
        },
        "full_name": "Li, Zeyu"
      },
      {
        "uuid": "3a23297f-00ec-4cbd-83bb-7c36ce01526f",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2747087"
        },
        "full_name": "Zhang, Zhaolong",
        "curated_relation": true
      },
      {
        "uuid": "9b49600d-7e7d-4e2b-a97d-a93fb17a48db",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1925956"
        },
        "full_name": "Zheng, Huiwen"
      },
      {
        "uuid": "a065b22b-0552-4ebe-b462-5a7e0b7f2c89",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/3112707"
        },
        "full_name": "Bao, Leidong"
      },
      {
        "uuid": "4ea6c6ab-4cc3-4328-849a-4d2c8b073f02",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/3137093"
        },
        "full_name": "Lv, Anqi"
      },
      {
        "uuid": "d2f57c94-6a1e-49d7-8dbf-e699035969fe",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2739740"
        },
        "full_name": "Mo, Zihan"
      },
      {
        "uuid": "84469af5-df33-4fd3-bd92-edd25e61bcaa",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1877903"
        },
        "full_name": "Niu, Yadi"
      },
      {
        "uuid": "7a1edc92-deb6-41f4-a53c-fd09d2602b98",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1889123"
        },
        "full_name": "Peng, Yiyang"
      },
      {
        "uuid": "c8edf894-71f3-4117-ade2-d5ca78dd1cc4",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1233818"
        },
        "full_name": "Tian, Yu"
      },
      {
        "uuid": "71191b24-ea40-4dae-a1e6-74fc67420bb8",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1873641"
        },
        "full_name": "Wang, Yili"
      },
      {
        "uuid": "3e630ca7-bfe1-468c-bb48-d1da857dbd7f",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2128983"
        },
        "full_name": "Wang, Ziyu"
      },
      {
        "uuid": "45595e3e-2123-4456-bb7e-6d87fb98505c",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1961592"
        },
        "full_name": "Wang, Zi-Yu"
      },
      {
        "uuid": "2972105f-cc60-4c6c-9239-d84c16f7bef1",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2621660"
        },
        "full_name": "Wei, Jiashen"
      },
      {
        "uuid": "07159b27-b52d-4346-8801-6b4c96fb38de",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1911747"
        },
        "full_name": "Wu, Liuheng"
      },
      {
        "uuid": "020c859d-16c8-4d37-a88e-6d4dd0d49dbc",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/3137095"
        },
        "full_name": "Xue, Aoran"
      },
      {
        "uuid": "91c7b74c-d8da-40bc-802b-912e5a081352",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2704792"
        },
        "full_name": "Yang, Leyi"
      },
      {
        "uuid": "7d9ad545-ad2c-4cab-9034-414b939d7f3e",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2573134"
        },
        "full_name": "Yuan, Guanglu"
      },
      {
        "uuid": "86c96d9f-8759-42d3-9c30-a4ae0e8f446e",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1041189"
        },
        "full_name": "Zhan, Xiarui"
      },
      {
        "uuid": "faab2a67-2fd2-4cc8-857d-a687e5d5e0ee",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2011069"
        },
        "full_name": "Zhang, Jingjun"
      },
      {
        "uuid": "2c214063-344a-4c00-9037-8409118996da",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2747071"
        },
        "full_name": "Zheng, Zifan"
      },
      {
        "uuid": "195c1490-881f-4a5f-bba8-1cb8593de18c",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2158903"
        },
        "full_name": "Liu, Pengfei"
      },
      {
        "uuid": "082acf88-00e1-4681-ae1a-a6271b3af962",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/3137096"
        },
        "full_name": "Zhen, Linrui"
      },
      {
        "uuid": "1412d6cf-8759-4a93-9dea-41d01d3bd77a",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1000352"
        },
        "full_name": "Li, Kaiyang"
      },
      {
        "uuid": "cae8fa82-77e7-4b0c-8770-3d3637c0fac9",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1987060"
        },
        "full_name": "Li, Qichang"
      },
      {
        "uuid": "3e1a9e69-1821-438f-b50e-5068a57da7d0",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/982084"
        },
        "full_name": "Zhou, Ziheng"
      },
      {
        "uuid": "d7c8379f-4873-4c17-b779-c73cb5187ada",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2631479"
        },
        "full_name": "Nian, Guo-En"
      },
      {
        "uuid": "29aeda16-eabf-43d6-a53d-b19454425a03",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2124621"
        },
        "full_name": "Xiao, Yunwei"
      },
      {
        "uuid": "2d093113-977d-4f06-a61f-f48949260ed1",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1034325"
        },
        "full_name": "Cao, Qing-Hong"
      },
      {
        "uuid": "0b15c334-ee38-4a2d-ab02-2efba502a0d2",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1273310"
        },
        "full_name": "Dai, Linjie"
      },
      {
        "uuid": "43b18003-960b-4226-b340-aa8b58ca821b",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1040303"
        },
        "full_name": "Feng, Xu"
      },
      {
        "uuid": "1a6994b8-9e75-42aa-ba98-001260a47979",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2142479"
        },
        "full_name": "Gao, Peng"
      },
      {
        "uuid": "4b84dc30-9214-466a-9ab3-5dbf1bb21024",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1041058"
        },
        "full_name": "Gu, Ying"
      },
      {
        "uuid": "79986802-f4a3-4fbf-ae56-9aa54c290c1a",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1913207"
        },
        "full_name": "Liu, Chang"
      },
      {
        "uuid": "6d5421d5-2b46-45d8-bb4a-af96784a1366",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2130561"
        },
        "full_name": "Liu, Jia"
      },
      {
        "uuid": "b23a11d2-f0ae-4a48-bc8a-9396bb76d420",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/999547"
        },
        "full_name": "Luo, Ming-xing"
      },
      {
        "uuid": "117e090a-9aa3-4fc2-9d0c-a91935fe5e7c",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1056966"
        },
        "full_name": "Ma, Yan-Qing"
      },
      {
        "uuid": "64a655b7-2347-4838-a646-e7a580d4b27a",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1923165"
        },
        "full_name": "Peng, Liang-You"
      },
      {
        "uuid": "51a372cc-93cc-492c-b761-ae532a36a262",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1029601"
        },
        "full_name": "Song, Huichao"
      },
      {
        "uuid": "fb4eb73e-034c-42e0-be80-2b7dadcedf52",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1969799"
        },
        "full_name": "Wang, Shufeng"
      },
      {
        "uuid": "9de85a85-6968-4572-8d1c-a9e3e4ca3b5a",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1905143"
        },
        "full_name": "Wang, Chenxu"
      },
      {
        "uuid": "4dcf520c-ad2d-474f-bad9-c8849fe7b724",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1315761"
        },
        "full_name": "Wang, Tao"
      },
      {
        "uuid": "b09e670e-7f60-486a-a0cb-ac785d390fb0",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1066122"
        },
        "full_name": "Wang, Yi-Nan"
      },
      {
        "uuid": "ef6eceea-67f9-4836-b8cf-16027fc4e660",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/2082932"
        },
        "full_name": "Wu, Chengyin"
      },
      {
        "uuid": "9739bbcc-c4b7-4915-8524-0670930658b7",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1932646"
        },
        "full_name": "Zhao, Pengwei"
      },
      {
        "uuid": "38c315da-f7e5-4ba8-8dba-84c159c6b7c5",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1061019"
        },
        "full_name": "Zhu, Hua Xing"
      }
    ],
    "curated": false,
    "figures": [
      {
        "key": "e6b47e74a57c07639b56d7efdf80e7ed",
        "url": "https://inspirehep.net/files/e6b47e74a57c07639b56d7efdf80e7ed",
        "label": "fig:placeholder",
        "source": "arxiv",
        "caption": "Overview of paper reproduction. As stated, reproducing computational results from a published paper is a comprehensive and demanding task.",
        "filename": "overview_fig1.png",
        "material": "preprint"
      },
      {
        "key": "678ecbf56447e62db911a4ef2925c3ac",
        "url": "https://inspirehep.net/files/678ecbf56447e62db911a4ef2925c3ac",
        "label": "fig:curation_pipeline",
        "source": "arxiv",
        "caption": "Overview of the PRBench task curation pipeline. The process consists of four stages: paper selection, expert reference reproduction, task specification, and independent verification, ensuring that each task is executable, scientifically grounded, and suitable for rigorous evaluation.",
        "filename": "fig2.png",
        "material": "preprint"
      },
      {
        "key": "fdd24555cfc55961677275ddf2c18629",
        "url": "https://inspirehep.net/files/fdd24555cfc55961677275ddf2c18629",
        "label": "fig:eval-pipeline",
        "source": "arxiv",
        "caption": "Overview of the PRBench evaluation pipeline. A green agent orchestrates the process and performs grading, while a white agent executes the task inside a sandboxed Docker environment.",
        "filename": "fig1-4.png",
        "material": "preprint"
      }
    ],
    "license": [
      {
        "url": "http://creativecommons.org/licenses/by/4.0/",
        "license": "CC BY 4.0",
        "material": "preprint"
      }
    ],
    "texkeys": [
      "Qiu:2026vrx"
    ],
    "citeable": true,
    "abstracts": [
      {
        "value": "AI agents powered by large language models exhibit strong reasoning and problem-solving capabilities, enabling them to assist scientific research tasks such as formula derivation and code generation. However, whether these agents can reliably perform end-to-end reproduction from real scientific papers remains an open question. We introduce PRBench, a benchmark of 30 expert-curated tasks spanning 11 subfields of physics. Each task requires an agent to comprehend the methodology of a published paper, implement the corresponding algorithms from scratch, and produce quantitative results matching the original publication. Agents are provided only with the task instruction and paper content, and operate in a sandboxed execution environment. All tasks are contributed by domain experts from over 20 research groups at the School of Physics, Peking University, each grounded in a real published paper and validated through end-to-end reproduction with verified ground-truth results and detailed scoring rubrics. Using an agentified assessment pipeline, we evaluate a set of coding agents on PRBench and analyze their capabilities across key dimensions of scientific reasoning and execution. The best-performing agent, OpenAI Codex powered by GPT-5.3-Codex, achieves a mean overall score of 34%. All agents exhibit a zero end-to-end callback success rate, with particularly poor performance in data accuracy and code correctness. We further identify systematic failure modes, including errors in formula implementation, inability to debug numerical simulations, and fabrication of output data. Overall, PRBench provides a rigorous benchmark for evaluating progress toward autonomous scientific research.",
        "source": "arXiv"
      }
    ],
    "references": [
      {
        "raw_refs": [
          {
            "value": "[1] Trieu H Trinh, Yuhuai Wu, Quoc V Le, He He, and Thang Luong. Solving olympiad geometry without human demonstrations. Nature, 625:476–482, 2024.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Yuhuai Wu, Quoc V Le",
            "He He, and Thang Luong. Solving olympiad geometry without human demonstrations"
          ],
          "label": "1",
          "texkey": "trinh2024alphageometry",
          "authors": [
            {
              "full_name": "Trinh, Trieu H."
            }
          ],
          "publication_info": {
            "year": 2024,
            "page_end": "482",
            "page_start": "476",
            "journal_title": "Nature",
            "journal_volume": "625"
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[2] Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen, Jinyi Hu, Xu Han, Yujie Huang, Yuxiang Zhang, Jie Liu, Lei Qi, Zhiyuan Liu, and Maosong Sun. Olympiadbench: A challenging benchmark for promoting agi with olympiad-level bilingual multimodal scientific problems. 2024. URL https://arxiv.org/abs/2402.14008.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen, Jinyi Hu, Xu Han, Yujie Huang, Yuxiang Zhang, Jie Liu, Lei Qi, Zhiyuan Liu, and Maosong Sun. Olympiadbench: A challenging benchmark for promoting agi with olympiad-level bilingual multimodal scientific problems.. URL"
          ],
          "label": "2",
          "texkey": "he2024olympiadbench",
          "arxiv_eprint": "2402.14008",
          "publication_info": {
            "year": 2024
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[3] Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher Hesse, Andrew N. Carr, Jan Leike, Josh Achiam, Vedant Misra, Evan Morikawa, Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, and Wojciech Zaremba. Evaluating large language models trained on code. 2021. URL https://arxiv.org/abs/2107.03374.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher Hesse",
            "Jan Leike, Josh Achiam, Vedant Misra, Evan Morikawa, Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, and Wojciech Zaremba. Evaluating large language models trained on code.. URL"
          ],
          "label": "3",
          "texkey": "chen2021codex",
          "authors": [
            {
              "full_name": "Carr, Andrew N."
            }
          ],
          "arxiv_eprint": "2107.03374",
          "publication_info": {
            "year": 2021
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[4] Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Romain Sauvestre, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, and Gabriel Synnaeve. Code llama: Open foundation models for code. 2024. URL https://arxiv.org/abs/2308.12950.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Romain Sauvestre, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, and Gabriel Synnaeve. Code llama: Open foundation models for code.. URL"
          ],
          "label": "4",
          "texkey": "roziere2023code",
          "arxiv_eprint": "2308.12950",
          "publication_info": {
            "year": 2024
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[5] Daniil A Boiko, Robert MacKnight, Ben Kline, and Gabe Gomes. Autonomous chemical research with large language models. Nature, 624:570–578, 2023.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Robert MacKnight, Ben Kline, and Gabe Gomes. Autonomous chemical research with large language models"
          ],
          "label": "5",
          "texkey": "boiko2023autonomous",
          "authors": [
            {
              "full_name": "Boiko, Daniil A."
            }
          ],
          "publication_info": {
            "year": 2023,
            "page_end": "578",
            "page_start": "570",
            "journal_title": "Nature",
            "journal_volume": "624"
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[6] Bernardino Romera-Paredes, Mohammadamin Barekatain, Alexander Novikov, Matej Balog, M Pawan Kumar, Emilien Dupont, Francisco JR Ruiz, Jordan S Ellenberg, Pengming Wang, Omar Fawzi, et al. Mathematical discoveries from program search with large language models. Nature, 625(7995):468–475, 2024.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Bernardino Romera-Paredes, Mohammadamin Barekatain, Alexander Novikov, Matej",
            "Pawan Kumar, Emilien Dupont, Francisco JR Ruiz, Jordan S Ellenberg",
            "Pengming Wang, Omar Fawzi, et al",
            "Mathematical discoveries from program search with large language models. Nature, 625(7995):468-475"
          ],
          "label": "6",
          "texkey": "romera2024mathematical",
          "authors": [
            {
              "full_name": "Balog, M."
            }
          ],
          "publication_info": {
            "year": 2024
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[7] Alexander Novikov, Ngân Vũ, Marvin Eisenberger, Emilien Dupont, Po-Sen Huang, Adam Zsolt Wagner, Sergey Shirobokov, Borislav Kozlovskii, Francisco J. R. Ruiz, Abbas Mehrabian, M. Pawan Kumar, Abigail See, Swarat Chaudhuri, George Holland, Alex Davies, Sebastian Nowozin, Pushmeet Kohli, and Matej Balog. Alphaevolve: A coding agent for scientific and algorithmic discovery. 2025. URL https://arxiv.org/abs/2506.13131.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Alexander Novikov, Ngân Vũ, Marvin Eisenberger, Emilien Dupont, Po-Sen Huang, Adam Zsolt Wagner, Sergey Shirobokov, Borislav Kozlovskii",
            "Abbas Mehrabian, M",
            "Pawan Kumar, Abigail See, Swarat Chaudhuri, George Holland, Alex Davies, Sebastian Nowozin, Pushmeet Kohli, and Matej Balog. Alphaevolve: A coding agent for scientific and algorithmic discovery.. URL"
          ],
          "label": "7",
          "texkey": "novikov2025alphaevolve",
          "authors": [
            {
              "full_name": "Ruiz, Francisco J.R."
            }
          ],
          "arxiv_eprint": "2506.13131",
          "publication_info": {
            "year": 2025
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[8] Miles Wang, Robi Lin, Kat Hu, Joy Jiao, Neil Chowdhury, Ethan Chang, and Tejal Patwardhan. Frontierscience: Evaluating AI’s ability to perform expert-level scientific tasks, 2026. URL https://arxiv.org/abs/2601.21165.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Miles Wang, Robi Lin, Kat Hu, Joy Jiao, Neil Chowdhury, Ethan Chang, and Tejal Patwardhan. Frontierscience: Evaluating AI’s ability to perform expert-level scientific tasks,. URL"
          ],
          "label": "8",
          "texkey": "wang2026frontierscience",
          "arxiv_eprint": "2601.21165",
          "publication_info": {
            "year": 2026
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[9] Ziru Chen, Shijie Chen, Yuting Ning, Qianheng Zhang, Boshi Wang, Botao Yu, Yifei Li, Zeyi Liao, Chen Wei, Zitong Lu, Vishal Dey, Mingyi Xue, Frazier N. Baker, Benjamin Burns, Daniel Adu-Ampratwum, Xuhui Huang, Xia Ning, Song Gao, Yu Su, and Huan Sun. Scienceagentbench: Toward rigorous assessment of language agents for data-driven scientific discovery. 2025. URL https://arxiv.org/abs/2410.05080.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Ziru Chen, Shijie Chen, Yuting Ning, Qianheng Zhang, Boshi Wang, Botao Yu, Yifei Li, Zeyi Liao, Chen Wei, Zitong Lu, Vishal Dey, Mingyi Xue",
            "Burns, Daniel Adu-Ampratwum, Xuhui Huang, Xia Ning, Song Gao, Yu Su, and Huan Sun. Scienceagentbench: Toward rigorous assessment of language agents for data-driven scientific discovery.. URL"
          ],
          "label": "9",
          "texkey": "chen2024scienceagentbench",
          "authors": [
            {
              "full_name": "Baker, Frazier N."
            }
          ],
          "imprint": {
            "publisher": "Benjamin"
          },
          "arxiv_eprint": "2410.05080",
          "publication_info": {
            "year": 2025
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[10] Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan. Swe-bench: Can language models resolve real-world github issues? 2024. URL https://arxiv.org/abs/2310.06770.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan. Swe-bench: Can language models resolve real-world github issues?. URL"
          ],
          "label": "10",
          "texkey": "jimenez2024swebench",
          "authors": [
            {
              "full_name": "Jimenez, Carlos E."
            }
          ],
          "arxiv_eprint": "2310.06770",
          "publication_info": {
            "year": 2024
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[11] Minyang Tian, Luyu Gao, Shizhuo Dylan Zhang, Xinan Chen, Cunwei Fan, Xuefei Guo, Roland Haas, Pan Ji, Kittithat Krongchon, Yao Li, Shengyan Liu, Di Luo, Yutao Ma, Hao Tong, Kha Trinh, Chenyu Tian, Zihan Wang, Bohao Wu, Yanyu Xiong, Shengzhu Yin, Minhui Zhu, Kilian Lieret, Yanxin Lu, Genglin Liu, Yufeng Du, Tianhua Tao, Ofir Press, Jamie Callan, Eliu Huerta, and Hao Peng. Scicode: A research coding benchmark curated by scientists. 2024. URL https://arxiv.org/abs/2407.13168.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Minyang Tian, Luyu Gao, Shizhuo Dylan Zhang, Xinan Chen, Cunwei Fan, Xuefei Guo, Roland Haas, Pan Ji, Kittithat Krongchon, Yao Li, Shengyan Liu, Di Luo, Yutao Ma, Hao Tong, Kha Trinh, Chenyu Tian, Zihan Wang, Bohao Wu, Yanyu Xiong, Shengzhu Yin, Minhui Zhu, Kilian Lieret, Yanxin Lu, Genglin Liu, Yufeng Du, Tianhua Tao, Ofir Press, Jamie Callan, Eliu Huerta, and Hao Peng. Scicode: A research coding benchmark curated by scientists.. URL"
          ],
          "label": "11",
          "texkey": "tian2024scicode",
          "arxiv_eprint": "2407.13168",
          "publication_info": {
            "year": 2024
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[12] Shi Qiu, Shaoyang Guo, Zhuo-Yang Song, Yunbo Sun, Zeyu Cai, Jiashen Wei, Tianyu Luo, Yixuan Yin, Haoxu Zhang, Yi Hu, Chenyang Wang, Chencheng Tang, Haoling Chang, Qi Liu, Ziheng Zhou, Tianyu Zhang, Jingtian Zhang, Zhangyi Liu, Minghao Li, Yuku Zhang, Boxuan Jing, Xianqi Yin, Yutong Ren, Zizhuo Fu, Jiaming Ji, Weike Wang, Xudong Tian, Anqi Lv, Laifu Man, Jianxiang Li, Feiyu Tao, Qihua Sun, Zhou Liang, Yushu Mu, Zhongxuan Li, JingJun Zhang, Shutao Zhang, Xiaotian Li, Xingqi Xia, Jiawei Lin, Zheyu Shen, Jiahang Chen, Qiuhao Xiong, Binran Wang, Fengyuan Wang, Ziyang Ni, Bohan Zhang, Fan Cui, Changkun Shao, Qing-Hong Cao, Ming xing Luo, Yaodong Yang, Muhan Zhang, and Hua Xing Zhu. Phybench: Holistic evaluation of physical perception and reasoning in large language models, 2025. URL https://arxiv.org/abs/2504.16074.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Shi Qiu, Shaoyang Guo, Zhuo-Yang Song, Yunbo Sun, Zeyu Cai, Jiashen Wei, Tianyu Luo, Yixuan Yin, Haoxu Zhang, Yi Hu, Chenyang Wang, Chencheng Tang, Haoling Chang, Qi Liu, Ziheng Zhou, Tianyu Zhang, Jingtian Zhang, Zhangyi Liu, Minghao Li, Yuku Zhang, Boxuan Jing, Xianqi Yin, Yutong Ren, Zizhuo Fu, Jiaming Ji, Weike Wang, Xudong Tian, Anqi Lv, Laifu Man, Jianxiang Li, Feiyu Tao, Qihua Sun, Zhou Liang, Yushu Mu, Zhongxuan Li, JingJun Zhang, Shutao Zhang, Xiaotian Li, Xingqi Xia, Jiawei Lin, Zheyu Shen, Jiahang Chen, Qiuhao Xiong, Binran Wang, Fengyuan Wang, Ziyang Ni, Bohan Zhang, Fan Cui, Changkun Shao, Qing-Hong Cao, Ming xing Luo, Yaodong Yang, Muhan Zhang, and Hua Xing Zhu. Phybench: Holistic evaluation of physical perception and reasoning in large language models,. URL"
          ],
          "label": "12",
          "texkey": "qiu2025phybenchholisticevaluationphysical",
          "arxiv_eprint": "2504.16074",
          "publication_info": {
            "year": 2025
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[13] AgentBeats Team. Agentbeats: An open platform for standardized and reproducible agent evaluation. https://docs.agentbeats.org/, 2025. Documentation website.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "AgentBeats Team. Agentbeats: An open platform for standardized and reproducible agent evaluation",
            "Documentation website"
          ],
          "urls": [
            {
              "value": "https://docs.agentbeats.org/"
            }
          ],
          "label": "13",
          "texkey": "agentbeats",
          "publication_info": {
            "year": 2025
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[14] John Jumper, Richard Evans, Alexander Pritzel, Tim Green, Michael Figurnov, Olaf Ronneberger, Kathryn Tunyasuvunakool, Russ Bates, Augustin Žídek, Anna Potapenko, et al. Highly accurate protein structure prediction with AlphaFold. Nature, 596:583–589, 2021.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "John Jumper, Richard Evans, Alexander Pritzel, Tim Green, Michael Figurnov, Olaf Ronneberger, Kathryn Tunyasuvunakool, Russ Bates, Augustin Žídek",
            "Highly accurate protein structure prediction with AlphaFold"
          ],
          "label": "14",
          "texkey": "jumper2021alphafold",
          "authors": [
            {
              "full_name": "Potapenko, Anna"
            }
          ],
          "publication_info": {
            "year": 2021,
            "page_end": "589",
            "page_start": "583",
            "journal_title": "Nature",
            "journal_volume": "596"
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[15] Amil Merchant, Simon Batzner, Samuel S Schoenholz, Muratahan Aykol, Gowoon Cheon, and Ekin Dogus Cubuk. Scaling deep learning for materials discovery. Nature, 624:80–85, 2023.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Amil Merchant, Simon Batzner",
            "Muratahan Aykol, Gowoon Cheon, and Ekin Dogus Cubuk. Scaling deep learning for materials discovery"
          ],
          "label": "15",
          "texkey": "merchant2023scaling",
          "authors": [
            {
              "full_name": "Schoenholz, Samuel S."
            }
          ],
          "publication_info": {
            "year": 2023,
            "page_end": "85",
            "page_start": "80",
            "journal_title": "Nature",
            "journal_volume": "624"
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[16] Remi Lam, Alvaro Sanchez-Gonzalez, Matthew Willson, Peter Wirnsberger, Meire Fortunato, Ferran Alet, Suman Ravuri, Timo Ewalds, Zach Eaton-Rosen, Weihua Hu, Alexander Merose, Stephan Hoyer, George Holland, Oriol Vinyals, Jacklynn Stott, Alexander Pritzel, Shakir Mohamed, and Peter Battaglia. Graphcast: Learning skillful medium-range global weather forecasting. Science, 382:1416–1421, 2023.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Remi Lam, Alvaro Sanchez-Gonzalez, Matthew Willson, Peter Wirnsberger, Meire Fortunato, Ferran Alet, Suman Ravuri, Timo Ewalds, Zach Eaton-Rosen, Weihua Hu, Alexander Merose, Stephan Hoyer, George Holland, Oriol Vinyals, Jacklynn Stott, Alexander Pritzel, Shakir Mohamed, and Peter Battaglia. Graphcast: Learning skillful medium-range global weather forecasting"
          ],
          "label": "16",
          "texkey": "lam2023graphcast",
          "publication_info": {
            "year": 2023,
            "page_end": "1421",
            "page_start": "1416",
            "journal_title": "Science",
            "journal_volume": "382"
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[17] The impact of large language models on scientific title=The Impact of Large Language Models on Scientific Discovery: a Preliminary Study using GPT-4, author=Microsoft Research AI4Science and Microsoft Azure Quantum, year=2023, eprint=2311.07361, archiveprefix=arXiv, primaryclass=cs.CL, url=https://arxiv.org/abs/2311.07361,.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "Microsoft Research AI4Science and Microsoft Azure Quantum. The Impact of Large Language Models on Scientific Discovery: a Preliminary Study using GPT-4",
            "arXiv preprint, primary class cs.CL"
          ],
          "label": "17",
          "texkey": "ai4science2023impact",
          "arxiv_eprint": "2311.07361",
          "publication_info": {
            "year": 2023
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[18] David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, and Samuel R. Bowman. Gpqa: A graduate-level google-proof q&a benchmark. 2023. URL https://arxiv.org/abs/2311.12022.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, and Samuel R. Bowman. Gpqa: A graduate-level google-proof q&a benchmark"
          ],
          "label": "18",
          "texkey": "rein2024gpqa",
          "arxiv_eprint": "2311.12022",
          "publication_info": {
            "year": 2023
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[19] A2A Project Contributors. Agent2agent (A2A) protocol. https://github.com/a2aprojec t/A2A, 2025. Technical specification repository.",
            "schema": "text",
            "source": "arXiv"
          }
        ],
        "reference": {
          "misc": [
            "A2A Project Contributors. Agent2agent (A2A) protocol",
            "Technical specification repository"
          ],
          "urls": [
            {
              "value": "https://github.com/a2aproject/A2A"
            }
          ],
          "label": "19",
          "texkey": "google2025a2a",
          "publication_info": {
            "year": 2025
          }
        }
      }
    ],
    "public_notes": [
      {
        "value": "17 pages, 3 figures",
        "source": "arXiv"
      }
    ],
    "arxiv_eprints": [
      {
        "value": "2603.27646",
        "categories": [
          "cs.CL",
          "hep-lat",
          "hep-ph",
          "physics.comp-ph",
          "physics.optics"
        ]
      }
    ],
    "document_type": [
      "article"
    ],
    "preprint_date": "2026-03-29",
    "control_number": 3137091,
    "number_of_pages": 17,
    "inspire_categories": [
      {
        "term": "Computing",
        "source": "arxiv"
      },
      {
        "term": "Lattice",
        "source": "arxiv"
      },
      {
        "term": "Phenomenology-HEP",
        "source": "arxiv"
      },
      {
        "term": "General Physics",
        "source": "arxiv"
      }
    ]
  }
}