{
  "revision_id": 13,
  "uuid": "27b4db51-ca45-4514-b349-401189017f72",
  "created": "2022-03-15T02:54:30.889071+00:00",
  "links": {
    "bibtex": "https://inspirehep.net/api/literature/2050810?format=bibtex",
    "latex-eu": "https://inspirehep.net/api/literature/2050810?format=latex-eu",
    "latex-us": "https://inspirehep.net/api/literature/2050810?format=latex-us",
    "json": "https://inspirehep.net/api/literature/2050810?format=json",
    "json-expanded": "https://inspirehep.net/api/literature/2050810?format=json-expanded",
    "cv": "https://inspirehep.net/api/literature/2050810?format=cv",
    "citations": "https://inspirehep.net/api/literature/?q=refersto%3Arecid%3A2050810"
  },
  "id": "2050810",
  "updated": "2025-08-04T18:02:42.961934+00:00",
  "metadata": {
    "documents": [
      {
        "key": "0e256b3416d2c701c308aae8a2e550a5",
        "url": "https://inspirehep.net/files/0e256b3416d2c701c308aae8a2e550a5",
        "filename": "document",
        "fulltext": true
      }
    ],
    "citation_count_without_self_citations": 32,
    "publication_info": [
      {
        "cnum": "C21-07-26.3",
        "year": 2022,
        "artid": "035",
        "page_start": "035",
        "journal_title": "PoS",
        "journal_record": {
          "$ref": "https://inspirehep.net/api/journals/1213080"
        },
        "journal_volume": "LATTICE2021",
        "conference_record": {
          "$ref": "https://inspirehep.net/api/conferences/1898790"
        }
      },
      {
        "cnum": "C21-07-26.3",
        "conference_record": {
          "$ref": "https://inspirehep.net/api/conferences/1898790"
        }
      }
    ],
    "citation_count": 48,
    "core": true,
    "dois": [
      {
        "value": "10.22323/1.396.0035",
        "source": "SISSA"
      }
    ],
    "titles": [
      {
        "title": "Grid: OneCode and FourAPIs",
        "source": "SISSA"
      },
      {
        "title": "Grid: OneCode and FourAPIs",
        "source": "arXiv"
      }
    ],
    "$schema": "https://inspirehep.net/schemas/records/hep.json",
    "authors": [
      {
        "uuid": "f41b0f45-2dbc-4e19-a221-0660f737b2dd",
        "emails": [
          "ayamaguc@staffmail.ed.ac.uk"
        ],
        "record": {
          "$ref": "https://inspirehep.net/api/authors/982924"
        },
        "full_name": "Yamaguchi, Azusa",
        "affiliations": [
          {
            "value": "Edinburgh U.",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/902787"
            },
            "curated_relation": true
          }
        ],
        "signature_block": "YANAGACa"
      },
      {
        "uuid": "eb091525-7c6e-4ef8-8c4c-eda0efd83b87",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1015525"
        },
        "full_name": "Boyle, Peter",
        "affiliations": [
          {
            "value": "Brookhaven",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/902689"
            },
            "curated_relation": true
          },
          {
            "value": "Edinburgh U.",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/902787"
            },
            "curated_relation": true
          }
        ],
        "signature_block": "BYLp"
      },
      {
        "uuid": "41e85519-9766-484d-892b-79bca848d7db",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1045359"
        },
        "full_name": "Cossu, Guido",
        "affiliations": [
          {
            "value": "Unlisted, JP",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/912184"
            },
            "curated_relation": true
          }
        ],
        "signature_block": "CASg"
      },
      {
        "uuid": "7b431a80-dd4f-46fd-8d0c-82b24794736f",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1397574"
        },
        "full_name": "Filaci, Gianluca",
        "affiliations": [
          {
            "value": "Edinburgh U.",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/902787"
            },
            "curated_relation": true
          }
        ],
        "signature_block": "FALACg"
      },
      {
        "uuid": "d94fc419-907d-448f-a16f-4ffdd7c3b862",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1063281"
        },
        "full_name": "Lehner, Christoph",
        "affiliations": [
          {
            "value": "Regensburg U.",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/903753"
            },
            "curated_relation": true
          }
        ],
        "signature_block": "LANARc"
      },
      {
        "uuid": "96b9dbae-82c3-4fd6-b7e4-f0796c26203a",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1242536"
        },
        "full_name": "Portelli, Antonin",
        "affiliations": [
          {
            "value": "Edinburgh U.",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/902787"
            },
            "curated_relation": true
          }
        ],
        "signature_block": "PARTALa"
      }
    ],
    "curated": true,
    "figures": [
      {
        "key": "c31f481a91d7b08e87a2849a9ef2be76",
        "url": "https://inspirehep.net/files/c31f481a91d7b08e87a2849a9ef2be76",
        "label": "fig_cuda_sycl",
        "source": "arxiv",
        "caption": "Macro implementation of kernel offload for CUDA and SYCL. Grid and user code use consistently the accelerator\\_for construct. We emphasise that most user code uses either Grid functions or expression template engine and only expert kernels use the accelerator\\_for . This is an internal implementation detail that may be useful to others developing independent GPU codes.",
        "filename": "CUDA_SYCL.png",
        "material": "preprint"
      },
      {
        "key": "e4ffc86fb6cd84e2b0d05c884a782c24",
        "url": "https://inspirehep.net/files/e4ffc86fb6cd84e2b0d05c884a782c24",
        "label": "fig_hip_openmp",
        "source": "arxiv",
        "caption": "Macro implementation of kernel offload for HIP and OpenMP. Grid and user code use consistently the accelerator\\_for construct. We emphasise that most user code uses either Grid functions or expression template engine and only expert kernels use the accelerator\\_for. This is an internal implementation detail that may be useful to others developing independent GPU codes.",
        "filename": "HIP_OPENMP.png",
        "material": "preprint"
      },
      {
        "key": "4d147645082ae0f6557c1e0be795740c",
        "url": "https://inspirehep.net/files/4d147645082ae0f6557c1e0be795740c",
        "label": "perf",
        "source": "arxiv",
        "caption": "We show the performance per node on two recent systems comprising 4 x Nvidia A100 GPUs per node. The Atos Sequana ``Tursa'' system in Edinburgh (an identical technology to the Juelich Booster system), and the phase one Perlmutter system at NERSC, LBNL. The Perlmutter should be upgraded in phase 2 and is anticipated to give significantly upgraded performance. With current GPU's a ratio of 200Gbit/s interconnect per 4TF/s seems a sweet spot.",
        "filename": "performance.png",
        "material": "preprint"
      },
      {
        "key": "34af4498a8012bd33ded6f4db4d4291a",
        "url": "https://inspirehep.net/files/34af4498a8012bd33ded6f4db4d4291a",
        "label": "network",
        "source": "arxiv",
        "caption": "ATOS_network.pdf",
        "filename": "ATOS_network.png",
        "material": "preprint"
      },
      {
        "key": "f120ba74b35895313aa1c7129bb5e096",
        "url": "https://inspirehep.net/files/f120ba74b35895313aa1c7129bb5e096",
        "label": "nsys",
        "source": "arxiv",
        "caption": "We show the CUDA Nsight-sys profile of our code running on node zero of a 16 node job on Tursa. The communication and computation are perfectly overlapped and this system is well balanced for QCD. After continued optimisation of our code 16 multi-GPU nodes using 64 GPU's deliver that same performance as 1024 (substantially cheaper) nodes of the previous system it replaces. All kernels in the sequence (including face assembly) have been profiled and verified to obtain around 80\\% of the peak memory bandwidth.",
        "filename": "Dslash_opt.png",
        "material": "preprint"
      },
      {
        "key": "fde981d13f66d3693988255aa9b8d457",
        "url": "https://inspirehep.net/files/fde981d13f66d3693988255aa9b8d457",
        "label": "ncu",
        "source": "arxiv",
        "caption": "We show the CUDA Nsight-compute profile of the main kernel DWF code. All kernels in the sequence (including face assembly) have been profiled and verified to obtain around 80\\% of the peak memory bandwidth. This kernel is typical, but as it is floating point rich it is also seen to obtain 39\\% utilisation of the floating point pipeline and a high fraction of the available cache bandwidth. Communication and computation are being efficiently overlapped while this kernel runs.",
        "filename": "Dslash_ncu.png",
        "material": "preprint"
      }
    ],
    "license": [
      {
        "url": "https://creativecommons.org/licenses/by-nc-nd/4.0/",
        "license": "CC-BY-NC-ND-4.0",
        "imposing": "SISSA"
      },
      {
        "url": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
        "license": "arXiv nonexclusive-distrib 1.0",
        "material": "preprint"
      }
    ],
    "texkeys": [
      "Yamaguchi:2022feu",
      "Boyle:2022nef"
    ],
    "citeable": true,
    "imprints": [
      {
        "date": "2022-07-07"
      }
    ],
    "keywords": [
      {
        "value": "Grid computing",
        "schema": "INSPIRE"
      },
      {
        "value": "interface",
        "schema": "INSPIRE"
      },
      {
        "value": "performance",
        "schema": "INSPIRE"
      },
      {
        "value": "programming",
        "schema": "INSPIRE"
      },
      {
        "value": "lattice field theory",
        "schema": "INSPIRE"
      }
    ],
    "refereed": false,
    "abstracts": [
      {
        "value": "We discuss a substantial update to the Grid software library for Lattice QCD,\n enabling it to port to multiple GPU architectures while retaining CPU vectorisation and\n SIMD execution within OpenMP threads. The GPU environments supported include vendor specific Nvidia CUDA and AMD HIP environments and a (mostly) standards based SyCL implementation. This is performed by an internal abstraction interface giving single source cross-platform performance portability across all number of planned Exascale architectures, and all those planned by the US Department of Energy.",
        "source": "SISSA"
      },
      {
        "value": "We discuss a substantial update to the Grid software library for Lattice QCD, enabling it to port to multiple GPU architectures while retaining CPU vectorisation and SIMD execution within OpenMP threads. The GPU environments supported include vendor specific Nvidia CUDA and AMD HIP environments and a (mostly) standards based SYCL implementation. This is performed by an internal abstraction interface giving single source cross-platform performance portability across all number of planned Exascale architectures, and all those planned by the US Department of Energy.",
        "source": "arXiv"
      }
    ],
    "references": [
      {
        "raw_refs": [
          {
            "value": "[1] “GridDocumentation”https://github.com/paboyle/Grid/blob/develop/documentation/Grid.pdf",
            "schema": "text",
            "source": "desy"
          }
        ],
        "reference": {
          "misc": [
            "\"GridDocumentation"
          ],
          "urls": [
            {
              "value": "https://github.com/paboyle/Grid/blob/develop/documentation/Grid.pdf"
            }
          ],
          "label": "1",
          "texkey": "GridManual"
        }
      },
      {
        "record": {
          "$ref": "https://inspirehep.net/api/literature/1409303"
        },
        "raw_refs": [
          {
            "value": "[2] P. A. Boyle, G. Cossu, A. Yamaguchi and A. Portelli, “Grid: A next generation data parallel C++ QCD library,” PoS LATTICE2015 (2016), 023 doi:10.22323/1.251.0023",
            "schema": "text",
            "source": "desy"
          }
        ],
        "reference": {
          "dois": [
            "10.22323/1.251.0023"
          ],
          "label": "2",
          "title": {
            "title": "Grid: A next generation data parallel C++ QCD library"
          },
          "texkey": "Boyle:2016lbp",
          "authors": [
            {
              "full_name": "Boyle, P.A."
            },
            {
              "full_name": "Cossu, G."
            },
            {
              "full_name": "Yamaguchi, A."
            },
            {
              "full_name": "Portelli, A."
            }
          ],
          "publication_info": {
            "year": 2016,
            "artid": "023",
            "page_start": "023",
            "journal_title": "PoS",
            "journal_volume": "LATTICE2015"
          }
        }
      },
      {
        "record": {
          "$ref": "https://inspirehep.net/api/literature/1850747"
        },
        "raw_refs": [
          {
            "value": "[3] P. Boyle and A. Yamaguchi, “Comparison of Domain Wall Fermion Multigrid Methods,” [arXiv:2103.05034 [hep-lat]].",
            "schema": "text",
            "source": "desy"
          }
        ],
        "reference": {
          "label": "3",
          "title": {
            "title": "Comparison of Domain Wall Fermion Multigrid Methods"
          },
          "texkey": "Boyle:2021wcf",
          "authors": [
            {
              "full_name": "Boyle, P."
            },
            {
              "full_name": "Yamaguchi, A."
            }
          ],
          "arxiv_eprint": "2103.05034"
        }
      },
      {
        "raw_refs": [
          {
            "value": "[4] https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html",
            "schema": "text",
            "source": "desy"
          }
        ],
        "reference": {
          "urls": [
            {
              "value": "https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html"
            }
          ],
          "label": "4",
          "texkey": "SYCL"
        }
      },
      {
        "raw_refs": [
          {
            "value": "[5] Trott, Christian R. and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S. and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah, IEEE Transactions on Parallel and Distributed Systems “Kokkos 3: Programming Model Extensions for the Exascale Era” 2022, Vol 33 No 4, 805-817. doi 10.1109/TPDS.2021.3097283",
            "schema": "text",
            "source": "desy"
          }
        ],
        "reference": {
          "dois": [
            "10.1109/TPDS.2021.3097283"
          ],
          "misc": [
            "Trott",
            "and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S",
            "and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah, IEEE Transactions on Parallel and Distributed Systems",
            "Vol 33 No 4, 805-817"
          ],
          "label": "5",
          "title": {
            "title": "Kokkos 3: Programming Model Extensions for the Exascale Era"
          },
          "texkey": "Kokkos",
          "authors": [
            {
              "full_name": "R., Christian"
            }
          ],
          "publication_info": {
            "year": 2022
          }
        }
      },
      {
        "raw_refs": [
          {
            "value": "[6] RAJA Performance Portability Layer. https://github.com/LLNL/RAJA",
            "schema": "text",
            "source": "desy"
          }
        ],
        "reference": {
          "misc": [
            "RAJA Performance Portability Layer"
          ],
          "urls": [
            {
              "value": "https://github.com/LLNL/RAJA"
            }
          ],
          "label": "6",
          "texkey": "Raja"
        }
      }
    ],
    "public_notes": [
      {
        "value": "10 pages, 6 figures",
        "source": "arXiv"
      }
    ],
    "arxiv_eprints": [
      {
        "value": "2203.06777",
        "categories": [
          "hep-lat"
        ]
      }
    ],
    "document_type": [
      "conference paper"
    ],
    "preprint_date": "2022-03-13",
    "control_number": 2050810,
    "number_of_pages": 10,
    "inspire_categories": [
      {
        "term": "Lattice",
        "source": "arxiv"
      }
    ],
    "accelerator_experiments": [
      {
        "record": {
          "$ref": "https://inspirehep.net/api/experiments/2957096"
        },
        "legacy_name": "Grid",
        "curated_relation": true
      }
    ]
  }
}