{
  "revision_id": 97,
  "updated": "2025-08-04T18:01:28.963774+00:00",
  "links": {
    "bibtex": "https://inspirehep.net/api/literature/1409303?format=bibtex",
    "latex-eu": "https://inspirehep.net/api/literature/1409303?format=latex-eu",
    "latex-us": "https://inspirehep.net/api/literature/1409303?format=latex-us",
    "json": "https://inspirehep.net/api/literature/1409303?format=json",
    "json-expanded": "https://inspirehep.net/api/literature/1409303?format=json-expanded",
    "cv": "https://inspirehep.net/api/literature/1409303?format=cv",
    "citations": "https://inspirehep.net/api/literature/?q=refersto%3Arecid%3A1409303"
  },
  "metadata": {
    "citation_count": 193,
    "documents": [
      {
        "key": "eb39e0a215ce392f6c68d7ad1d57d8cd",
        "url": "https://inspirehep.net/files/eb39e0a215ce392f6c68d7ad1d57d8cd",
        "source": "PoS",
        "filename": "PoS(LATTICE 2015)023.pdf",
        "fulltext": true
      }
    ],
    "citation_count_without_self_citations": 149,
    "publication_info": [
      {
        "cnum": "C15-07-14",
        "year": 2016,
        "artid": "023",
        "page_start": "023",
        "journal_title": "PoS",
        "parent_record": {
          "$ref": "https://inspirehep.net/api/literature/1391574"
        },
        "journal_record": {
          "$ref": "https://inspirehep.net/api/journals/1213080"
        },
        "journal_volume": "LATTICE2015",
        "conference_record": {
          "$ref": "https://inspirehep.net/api/conferences/1323575"
        }
      }
    ],
    "core": true,
    "dois": [
      {
        "value": "10.22323/1.251.0023"
      }
    ],
    "urls": [
      {
        "value": "http://pos.sissa.it/archive/conferences/251/023/LATTICE%202015_023.pdf",
        "description": "PoS server"
      }
    ],
    "titles": [
      {
        "title": "Grid: A next generation data parallel C++ QCD library",
        "source": "arXiv"
      },
      {
        "title": "Grid: A next generation data parallel C++ QCD library"
      }
    ],
    "$schema": "https://inspirehep.net/schemas/records/hep.json",
    "authors": [
      {
        "uuid": "e2834f5b-3dff-4db2-9a8f-dc3a1487c5db",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1015525"
        },
        "full_name": "Boyle, Peter",
        "affiliations": [
          {
            "value": "Edinburgh U.",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/902787"
            }
          }
        ],
        "signature_block": "BYLp",
        "curated_relation": true,
        "raw_affiliations": [
          {
            "value": "School of Physics, The University of Edinburgh, Edinburgh EH9 3JZ, UK"
          }
        ]
      },
      {
        "uuid": "58a1dc23-6085-41b9-ad46-b972805834fc",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1045359"
        },
        "full_name": "Cossu, Guido",
        "affiliations": [
          {
            "value": "KEK, Tsukuba",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/902916"
            }
          }
        ],
        "signature_block": "CASg",
        "curated_relation": true
      },
      {
        "uuid": "fc342404-7909-4a85-94e5-6d18fbf6bc8d",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/982924"
        },
        "full_name": "Yamaguchi, Azusa",
        "affiliations": [
          {
            "value": "Edinburgh U.",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/902787"
            }
          }
        ],
        "signature_block": "YANAGACa"
      },
      {
        "uuid": "46e2741b-c9a3-477e-898d-21e0988b668e",
        "record": {
          "$ref": "https://inspirehep.net/api/authors/1242536"
        },
        "full_name": "Portelli, Antonin",
        "affiliations": [
          {
            "value": "Edinburgh U.",
            "record": {
              "$ref": "https://inspirehep.net/api/institutions/902787"
            }
          }
        ],
        "signature_block": "PARTALa",
        "curated_relation": true
      }
    ],
    "curated": true,
    "figures": [
      {
        "key": "4b9529fd8572451026c3c6d85459a80d",
        "url": "https://inspirehep.net/files/4b9529fd8572451026c3c6d85459a80d",
        "source": "arxiv",
        "caption": "Two different approaches to SIMD accelerating matrix-vector products. The left approach suffers from long latency operations when horizonally adding a summation register (in addition to the fact that increasing vector lengths do not typically neatly divide the index ranks of QCD fields), while the right hand approach to performing many of these operations does not.",
        "filename": "mvecbad.png"
      },
      {
        "key": "013e1db859122d29672830ac972e541c",
        "url": "https://inspirehep.net/files/013e1db859122d29672830ac972e541c",
        "source": "arxiv",
        "caption": "Two different approaches to SIMD accelerating matrix-vector products. The left approach suffers from long latency operations when horizonally adding a summation register (in addition to the fact that increasing vector lengths do not typically neatly divide the index ranks of QCD fields), while the right hand approach to performing many of these operations does not.",
        "filename": "mvecgood.png"
      },
      {
        "key": "d4c700f8e78e22ba2fdf11df2e28f293",
        "url": "https://inspirehep.net/files/d4c700f8e78e22ba2fdf11df2e28f293",
        "source": "arxiv",
        "caption": "A key idea is to overdecompose and then interleave elements from different virtual nodes in adjacent SIMD lanes. After transformation it is simple to generate code such that one node performs the work on $N=4$ virtual nodes, each with a correspondingly reduced local volume.",
        "filename": "overdecompose.png"
      },
      {
        "key": "8e2b4845dc323bc7976c0ea6b900478a",
        "url": "https://inspirehep.net/files/8e2b4845dc323bc7976c0ea6b900478a",
        "source": "arxiv",
        "caption": "Permutation, and all SIMD lane crossing overhead, is suppressed by the surface to volume ratio in our interleaved virtual node scheme; we illustrate here a one-dimensional example of a cshift operation. Only the interior three steps are performed in grid since this is the native, internal data layout -- the exterior diagrammatic representation as unpacked into virtual nodes is illustrative only.",
        "filename": "cshift1.png"
      },
      {
        "key": "69f9478976d7d8dc4ae63d3dda5b0b6a",
        "url": "https://inspirehep.net/files/69f9478976d7d8dc4ae63d3dda5b0b6a",
        "source": "arxiv",
        "caption": "Elemental rules are implemented for our scalar matrix and vector classes. We may use these and modern C++11 syntax to infer the rules for composite types in arbitrary tensor combinations. For clarity, if there were five index types in composition the C++ 98 approach would require the $9^5$ cases to be enumerated in inline functions in header files and this is a combinatorially large saving in code volume.",
        "filename": "StencilInit.png"
      },
      {
        "key": "591772e54391f9506420b617f5c1510c",
        "url": "https://inspirehep.net/files/591772e54391f9506420b617f5c1510c",
        "source": "arxiv",
        "caption": "Elemental rules are implemented for our scalar matrix and vector classes. We may use these and modern C++11 syntax to infer the rules for composite types in arbitrary tensor combinations. For clarity, if there were five index types in composition the C++ 98 approach would require the $9^5$ cases to be enumerated in inline functions in header files and this is a combinatorially large saving in code volume.",
        "filename": "Stencil.png"
      },
      {
        "key": "e6ea0918bc8c456d6655b267e5ac3481",
        "url": "https://inspirehep.net/files/e6ea0918bc8c456d6655b267e5ac3481",
        "source": "arxiv",
        "caption": "We compare the performance of Grid (red) on SU(3)$\\times$SU(3) matrix multiplication to peak (blue), the limit imposed by memory bandwidth (purple), and to that of the QDP++ code system (green).",
        "filename": "SU3SU3.png"
      },
      {
        "key": "5be9cfcb0841946be5c986a699048563",
        "url": "https://inspirehep.net/files/5be9cfcb0841946be5c986a699048563",
        "source": "arxiv",
        "caption": "We compare the SU(3)$\\times$SU(3) performance (Gflop/s) versus footprint (bytes) under AVX-1 instructions of a slightly slower clocked quad-core Haswell (Crystalwell) to a quad-core Ivybridge. The effect of 128MB integrated on-package eDRAM cache is clearly visible.",
        "filename": "crystalwell.png"
      },
      {
        "key": "24305d89f4388a7d237d9b6e99a4bce2",
        "url": "https://inspirehep.net/files/24305d89f4388a7d237d9b6e99a4bce2",
        "source": "arxiv",
        "caption": "{\\bf Left panel:} comparison of the code generation quality for domain wall dslash of different compilers. A fixed $N_c=3$ version involves hand unrolled using the Grid SIMD data types, while $N_c$ loops variants work for any $N_c$ but are rather slower for the Intel V16.0 icpc compiler. The Clang++ compiler performs reasonably even without unrolling. {\\bf Right panel:} The corresponding plot using g++ v4.9 on a 24 core Archer XC30 node. Cache reuse is greater for Domain Wall fermions than for Wilson Fermions since the gauge field is reused $L_s$ times, while the input vector in the stencil operation is reused $8$ times. Performance counters suggest the Domain Wall code is very much cache bound and the main memory transfers are roughly 1/8th of the lowest level of cache accesses as expected.",
        "filename": "compilers.png"
      },
      {
        "key": "87475dd1a40ecf4ea94f6be49e751c35",
        "url": "https://inspirehep.net/files/87475dd1a40ecf4ea94f6be49e751c35",
        "source": "arxiv",
        "caption": "{\\bf Left panel:} comparison of the code generation quality for domain wall dslash of different compilers. A fixed $N_c=3$ version involves hand unrolled using the Grid SIMD data types, while $N_c$ loops variants work for any $N_c$ but are rather slower for the Intel V16.0 icpc compiler. The Clang++ compiler performs reasonably even without unrolling. {\\bf Right panel:} The corresponding plot using g++ v4.9 on a 24 core Archer XC30 node. Cache reuse is greater for Domain Wall fermions than for Wilson Fermions since the gauge field is reused $L_s$ times, while the input vector in the stencil operation is reused $8$ times. Performance counters suggest the Domain Wall code is very much cache bound and the main memory transfers are roughly 1/8th of the lowest level of cache accesses as expected.",
        "filename": "xc30.png"
      }
    ],
    "license": [
      {
        "url": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
        "imposing": "arXiv"
      },
      {
        "license": "CC-BY-NC-SA",
        "imposing": "SISSA"
      }
    ],
    "texkeys": [
      "Boyle:2015tjk",
      "Boyle:2016lbp"
    ],
    "citeable": true,
    "imprints": [
      {
        "date": "2016-07-15",
        "publisher": "SISSA"
      }
    ],
    "keywords": [
      {
        "value": "Grid computing",
        "schema": "INSPIRE"
      },
      {
        "value": "computer: performance",
        "schema": "INSPIRE"
      },
      {
        "value": "programming",
        "schema": "INSPIRE"
      },
      {
        "value": "quantum chromodynamics: lattice",
        "schema": "INSPIRE"
      },
      {
        "value": "new physics",
        "schema": "INSPIRE"
      },
      {
        "value": "energy: high",
        "schema": "INSPIRE"
      },
      {
        "value": "numerical methods: performance",
        "schema": "INSPIRE"
      },
      {
        "value": "quantum chromodynamics",
        "schema": "INSPIRE"
      },
      {
        "value": "lattice field theory",
        "schema": "INSPIRE"
      }
    ],
    "abstracts": [
      {
        "value": "In this proceedings we discuss the motivation, implementation details, and performance of a new physics code base called Grid. It is intended to be more performant, more general, but similar in spirit to QDP++\\cite{QDP}. Our approach is to engineer the basic type system to be consistently fast, rather than bolt on a few optimised routines, and we are attempt to write all our optimised routines directly in the Grid framework. It is hoped this will deliver best known practice performance across the next generation of supercomputers, which will provide programming challenges to traditional scalar codes. We illustrate the programming patterns used to implement our goals, and advances in productivity that have been enabled by using new features in C++11.",
        "source": "arXiv"
      },
      {
        "value": "In this proceedings we discuss the motivation, implementation details, and performance of a new physics code base called Grid. It is intended to be more performant, more general, but similar in spirit to QDP++[ 6 ]. Our approach is to engineer the basic type system to be consistently fast, rather than bolt on a few optimised routines, and we are attempt to write all our optimised routines directly in the Grid framework. It is hoped this will deliver best known practice performance across the next generation of supercomputers, which will provide programming challenges to traditional scalar codes. We illustrate the programming patterns used to implement our goals, and advances in productivity that have been enabled by using new features in C++11"
      }
    ],
    "references": [
      {
        "reference": {
          "urls": [
            {
              "value": "http://www.agner.org/optimize/"
            }
          ],
          "label": "1"
        }
      },
      {
        "reference": {
          "misc": [
            "github.com/paboyle/Grid"
          ],
          "label": "2"
        }
      },
      {
        "reference": {
          "misc": [
            "HotChips"
          ],
          "label": "3",
          "title": {
            "title": "Knights Landing (KNL): 2nd Generation Intel Xeon Phi Processor"
          },
          "authors": [
            {
              "full_name": "Sodani, A."
            }
          ],
          "publication_info": {
            "year": 2015
          }
        }
      },
      {
        "record": {
          "$ref": "https://inspirehep.net/api/literature/1297738"
        },
        "reference": {
          "misc": [
            "[RBC and UKQCD Collaborations]"
          ],
          "label": "4",
          "title": {
            "title": "Overview of Columbia Physics System"
          },
          "authors": [
            {
              "full_name": "Jung, C."
            }
          ],
          "publication_info": {
            "year": 2014,
            "artid": "417",
            "page_start": "417",
            "journal_title": "PoS",
            "journal_volume": "LATTICE2013"
          }
        },
        "curated_relation": false
      },
      {
        "record": {
          "$ref": "https://inspirehep.net/api/literature/1189318"
        },
        "reference": {
          "label": "5",
          "title": {
            "title": "The BAGEL assembler generation library"
          },
          "authors": [
            {
              "full_name": "Boyle, P.A."
            }
          ],
          "publication_info": {
            "year": 2009,
            "artid": "2739",
            "page_start": "2739",
            "journal_title": "Comput.Phys.Commun.",
            "journal_volume": "180"
          }
        },
        "curated_relation": false
      },
      {
        "record": {
          "$ref": "https://inspirehep.net/api/literature/658088"
        },
        "reference": {
          "misc": [
            "[SciDAC and LHPC and UKQCD Collaborations]"
          ],
          "label": "6",
          "title": {
            "title": "The Chroma software system for lattice QCD"
          },
          "authors": [
            {
              "full_name": "Edwards, R.G."
            }
          ],
          "arxiv_eprint": "hep-lat/0409003",
          "publication_info": {
            "year": 2005,
            "artid": "832",
            "page_start": "832",
            "journal_title": "Nucl.Phys.B Proc.Suppl.",
            "journal_volume": "140"
          }
        },
        "curated_relation": false
      },
      {
        "record": {
          "$ref": "https://inspirehep.net/api/literature/1216422"
        },
        "reference": {
          "label": "7",
          "title": {
            "title": "The BlueGene/Q supercomputer"
          },
          "authors": [
            {
              "full_name": "Boyle, P.A."
            }
          ],
          "publication_info": {
            "year": 2012,
            "artid": "020",
            "page_start": "020",
            "journal_title": "PoS",
            "journal_volume": "LATTICE2012"
          }
        },
        "curated_relation": false
      },
      {
        "reference": {
          "urls": [
            {
              "value": "http://jhnet.co.uk/articles/cpp_magic"
            }
          ],
          "label": "8"
        }
      },
      {
        "reference": {
          "urls": [
            {
              "value": "https://software.intel.com/sites/landingpage/IntrinsicsGuide/"
            }
          ],
          "label": "9"
        }
      },
      {
        "reference": {
          "misc": [
            "and others, IEEE Micro, Volume 32 Issue 2, pp48-60"
          ],
          "label": "10",
          "title": {
            "title": "The ibm blue gene/q compute chip"
          },
          "authors": [
            {
              "full_name": "Haring, R."
            },
            {
              "full_name": "Ohmacht, M."
            }
          ],
          "publication_info": {
            "year": 2012
          }
        }
      },
      {
        "reference": {
          "misc": [
            "Veldhuizen, Todd C++ Report 7.5 : 26-31"
          ],
          "label": "11",
          "title": {
            "title": "Expression templates."
          },
          "publication_info": {
            "year": 1995
          }
        }
      },
      {
        "reference": {
          "urls": [
            {
              "value": "http://people.csail.mit.edu/bradley/cm5docs/nov06/ConnectionMachineModelCM2TechnicalSummary.pdf"
            }
          ],
          "label": "12"
        }
      },
      {
        "reference": {
          "misc": [
            "and"
          ],
          "urls": [
            {
              "value": "http://www.boost.org"
            },
            {
              "value": "https://github.com/boostorg/hana"
            }
          ],
          "label": "13"
        }
      }
    ],
    "public_notes": [
      {
        "value": "14 pages, Lattice 2015",
        "source": "arXiv"
      }
    ],
    "arxiv_eprints": [
      {
        "value": "1512.03487",
        "categories": [
          "hep-lat",
          "cs.DC",
          "cs.MS"
        ]
      }
    ],
    "document_type": [
      "conference paper"
    ],
    "preprint_date": "2015-12-10",
    "control_number": 1409303,
    "legacy_version": "20190409154914.0",
    "deleted_records": [
      {
        "$ref": "https://inspirehep.net/api/literature/1476875"
      }
    ],
    "number_of_pages": 14,
    "inspire_categories": [
      {
        "term": "Lattice"
      },
      {
        "term": "Computing"
      }
    ],
    "legacy_creation_date": "2016-07-19",
    "accelerator_experiments": [
      {
        "record": {
          "$ref": "https://inspirehep.net/api/experiments/2957096"
        },
        "legacy_name": "Grid",
        "curated_relation": true
      }
    ]
  },
  "id": "1409303",
  "uuid": "1bae45e9-d16f-440d-8be0-531f7d087652",
  "created": "2016-07-19T00:00:00+00:00"
}