/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/communicator/SharedMemory.cc Copyright (C) 2015 Author: Peter Boyle Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #define Mheader "SharedMemoryMpi: " #include #include #ifdef GRID_CUDA #include #endif #ifdef GRID_HIP #include #endif #ifdef GRID_SYCL #ifdef ACCELERATOR_AWARE_MPI #define GRID_SYCL_LEVEL_ZERO_IPC #define SHM_SOCKETS #else #endif #include #endif #include #include NAMESPACE_BEGIN(Grid); #ifdef SHM_SOCKETS /* * Barbaric extra intranode communication route in case we need sockets to pass FDs * Forced by level_zero not being nicely designed */ static int sock; static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d"; static char sock_path[256]; class UnixSockets { public: static void Open(int rank) { int errnum; sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0); struct sockaddr_un sa_un = { 0 }; sa_un.sun_family = AF_UNIX; snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank); unlink(sa_un.sun_path); if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) { perror("bind failure"); exit(EXIT_FAILURE); } } static int RecvFileDescriptor(void) { int n; int fd; char buf[1]; struct iovec iov; struct msghdr msg; struct cmsghdr *cmsg; char cms[CMSG_SPACE(sizeof(int))]; iov.iov_base = buf; iov.iov_len = 1; memset(&msg, 0, sizeof msg); msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_control = (caddr_t)cms; msg.msg_controllen = sizeof cms; if((n=recvmsg(sock, &msg, 0)) < 0) { perror("recvmsg failed"); return -1; } if(n == 0){ perror("recvmsg returned 0"); return -1; } cmsg = CMSG_FIRSTHDR(&msg); memmove(&fd, CMSG_DATA(cmsg), sizeof(int)); return fd; } static void SendFileDescriptor(int fildes,int xmit_to_rank) { struct msghdr msg; struct iovec iov; struct cmsghdr *cmsg = NULL; char ctrl[CMSG_SPACE(sizeof(int))]; char data = ' '; memset(&msg, 0, sizeof(struct msghdr)); memset(ctrl, 0, CMSG_SPACE(sizeof(int))); iov.iov_base = &data; iov.iov_len = sizeof(data); sprintf(sock_path,sock_path_fmt,xmit_to_rank); struct sockaddr_un sa_un = { 0 }; sa_un.sun_family = AF_UNIX; snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank); msg.msg_name = (void *)&sa_un; msg.msg_namelen = sizeof(sa_un); msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_controllen = CMSG_SPACE(sizeof(int)); msg.msg_control = ctrl; cmsg = CMSG_FIRSTHDR(&msg); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; cmsg->cmsg_len = CMSG_LEN(sizeof(int)); *((int *) CMSG_DATA(cmsg)) = fildes; sendmsg(sock, &msg, 0); }; }; #endif /*Construct from an MPI communicator*/ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) { assert(_ShmSetup==0); WorldComm = comm; MPI_Comm_rank(WorldComm,&WorldRank); MPI_Comm_size(WorldComm,&WorldSize); // WorldComm, WorldSize, WorldRank ///////////////////////////////////////////////////////////////////// // Split into groups that can share memory ///////////////////////////////////////////////////////////////////// #ifndef GRID_MPI3_SHM_NONE MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); #else MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm); #endif MPI_Comm_rank(WorldShmComm ,&WorldShmRank); MPI_Comm_size(WorldShmComm ,&WorldShmSize); if ( WorldRank == 0) { std::cout << Mheader " World communicator of size " < world_ranks(WorldSize); for(int r=0;r MyGroup; MyGroup.resize(WorldShmSize); for(int rank=0;rank()); int myleader = MyGroup[0]; std::vector leaders_1hot(WorldSize,0); std::vector leaders_group(WorldNodes,0); leaders_1hot [ myleader ] = 1; /////////////////////////////////////////////////////////////////// // global sum leaders over comm world /////////////////////////////////////////////////////////////////// int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm); assert(ierr==0); /////////////////////////////////////////////////////////////////// // find the group leaders world rank /////////////////////////////////////////////////////////////////// int group=0; for(int l=0;l>1)^binary; return gray; } int Log2Size(int TwoToPower,int MAXLOG2) { int log2size = -1; for(int i=0;i<=MAXLOG2;i++){ if ( (0x1< HyperCubeCoords(maxhdim,0); std::vector RootHyperCubeCoords(maxhdim,0); int R; int I; int N; const int namelen = _POSIX_HOST_NAME_MAX; char name[namelen]; // Parse ICE-XA hostname to get hypercube location gethostname(name,namelen); int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ; assert(nscan==3); int nlo = N%9; int nhi = N/9; uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ; uint32_t rootcoor = hypercoor; ////////////////////////////////////////////////////////////////// // Print debug info ////////////////////////////////////////////////////////////////// for(int d=0;d>d)&0x1; } std::string hname(name); // std::cout << "hostname "<=0); ////////////////////////////////////// // Printing ////////////////////////////////////// for(int d=0;d>d)&0x1; } //////////////////////////////////////////////////////////////// // Identify subblock of ranks on node spreading across dims // in a maximally symmetrical way //////////////////////////////////////////////////////////////// int ndimension = processors.size(); Coordinate processor_coor(ndimension); Coordinate WorldDims = processors; Coordinate ShmDims (ndimension); Coordinate NodeDims (ndimension); Coordinate ShmCoor (ndimension); Coordinate NodeCoor (ndimension); Coordinate WorldCoor(ndimension); Coordinate HyperCoor(ndimension); GetShmDims(WorldDims,ShmDims); SHM = ShmDims; //////////////////////////////////////////////////////////////// // Establish torus of processes and nodes with sub-blockings //////////////////////////////////////////////////////////////// for(int d=0;d> bits; } //////////////////////////////////////////////////////////////// // Check processor counts match //////////////////////////////////////////////////////////////// int Nprocessors=1; for(int i=0;i shmids(WorldShmSize); if ( WorldShmRank == 0 ) { for(int r=0;r(theGridAccelerator->get_device()); auto zeContext = sycl::get_native(theGridAccelerator->get_context()); ze_ipc_mem_handle_t ihandle; clone_mem_t handle; if ( r==WorldShmRank ) { auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle); if ( err != ZE_RESULT_SUCCESS ) { std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<pw_name,WorldNode,r); shm_unlink(shm_name); int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666); if ( fd < 0 ) { perror("failed shm_open"); assert(0); } ftruncate(fd, size); int mmap_flag = MAP_SHARED; #ifdef MAP_POPULATE mmap_flag |= MAP_POPULATE; #endif #ifdef MAP_HUGETLB if (flags) mmap_flag |= MAP_HUGETLB; #endif void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); if ( ptr == (void * )MAP_FAILED ) { perror("failed mmap"); assert(0); } assert(((uint64_t)ptr&0x3F)==0); WorldShmCommBufs[r] =ptr; close(fd); } } MPI_Barrier(WorldShmComm); if ( WorldShmRank != 0 ) { for(int r=0;rpw_name,WorldNode,r); int fd=shm_open(shm_name,O_RDWR,0666); if ( fd<0 ) { perror("failed shm_open"); assert(0); } void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } assert(((uint64_t)ptr&0x3F)==0); WorldShmCommBufs[r] =ptr; close(fd); } } _ShmAlloc=1; _ShmAllocBytes = bytes; } #endif #endif // End NVCC case for GPU device buffers ///////////////////////////////////////////////////////////////////////// // Routines accessing shared memory should route through for GPU safety ///////////////////////////////////////////////////////////////////////// void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes) { #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) acceleratorMemSet(dest,0,bytes); #else bzero(dest,bytes); #endif } //void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) //{ //#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) // acceleratorCopyToDevice(src,dest,bytes); //#else // bcopy(src,dest,bytes); //#endif //} //////////////////////////////////////////////////////// // Global shared functionality finished // Now move to per communicator functionality //////////////////////////////////////////////////////// void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) { int rank, size; MPI_Comm_rank(comm,&rank); MPI_Comm_size(comm,&size); ShmRanks.resize(size); ///////////////////////////////////////////////////////////////////// // Split into groups that can share memory ///////////////////////////////////////////////////////////////////// #ifndef GRID_MPI3_SHM_NONE MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); #else MPI_Comm_split(comm, rank, 0, &ShmComm); #endif MPI_Comm_rank(ShmComm ,&ShmRank); MPI_Comm_size(ShmComm ,&ShmSize); ShmCommBufs.resize(ShmSize); ////////////////////////////////////////////////////////////////////// // Map ShmRank to WorldShmRank and use the right buffer ////////////////////////////////////////////////////////////////////// assert (GlobalSharedMemory::ShmAlloc()==1); heap_size = GlobalSharedMemory::ShmAllocBytes(); for(int r=0;r ranks(size); for(int r=0;r