/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/Accelerator.h Copyright (C) 2015 Author: Peter Boyle Author: paboyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #pragma once #include #ifdef HAVE_MALLOC_MALLOC_H #include #endif #ifdef HAVE_MALLOC_H #include #endif #ifdef HAVE_MM_MALLOC_H #include #endif #ifdef __APPLE__ // no memalign inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); } #endif #ifdef GRID_DEVICE_MEMORY_ALLOCATOR #define acceleratorAllocDevice acceleratorAllocDeviceInternal #define acceleratorFreeDevice acceleratorFreeDeviceInternal #endif NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////////////////////// // Accelerator primitives; fall back to threading if not CUDA or SYCL ////////////////////////////////////////////////////////////////////////////////// // // Function attributes // // accelerator // accelerator_inline // // Parallel looping // // accelerator_for // accelerator_forNB // uint32_t accelerator_barrier(); // device synchronise // // Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd // // uint32_t acceleratorThreads(void); // void acceleratorThreads(uint32_t); // // Warp control and info: // // acceleratorInit; // void acceleratorSynchronise(void); // synch warp etc.. // int acceleratorSIMTlane(int Nsimd); // // Memory management: // // int acceleratorIsCommunicable(void *pointer); // void *acceleratorAllocShared(size_t bytes); // void acceleratorFreeShared(void *ptr); // // void *acceleratorAllocDevice(size_t bytes); // void acceleratorFreeDevice(void *ptr); // // void *acceleratorCopyToDevice(void *from,void *to,size_t bytes); // void *acceleratorCopyFromDevice(void *from,void *to,size_t bytes); // ////////////////////////////////////////////////////////////////////////////////// uint32_t acceleratorThreads(void); void acceleratorThreads(uint32_t); void acceleratorInit(void); ////////////////////////////////////////////// // CUDA acceleration ////////////////////////////////////////////// #ifdef GRID_CUDA #include #ifdef __CUDA_ARCH__ #define GRID_SIMT #endif #define accelerator __host__ __device__ #define accelerator_inline __host__ __device__ inline extern int acceleratorAbortOnGpuError; extern cudaStream_t copyStream; extern cudaStream_t computeStream; accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT return threadIdx.x; #else return 0; #endif } // CUDA specific inline void acceleratorMem(void) { size_t free_t,total_t,used_t; cudaMemGetInfo(&free_t,&total_t); used_t=total_t-free_t; std::cout << " MemoryManager : GPU used "<>>(num1,num2,nsimd,lambda); \ } \ } #define accelerator_for6dNB(iter1, num1, \ iter2, num2, \ iter3, num3, \ iter4, num4, \ iter5, num5, \ iter6, num6, ... ) \ { \ typedef uint64_t Iterator; \ auto lambda = [=] accelerator \ (Iterator iter1,Iterator iter2, \ Iterator iter3,Iterator iter4, \ Iterator iter5,Iterator iter6) mutable { \ __VA_ARGS__; \ }; \ dim3 cu_blocks (num1,num2,num3); \ dim3 cu_threads(num4,num5,num6); \ Lambda6Apply<<>>(num1,num2,num3,num4,num5,num6,lambda); \ } template __global__ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) { // Weird permute is to make lane coalesce for large blocks uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; uint64_t z = threadIdx.x; if ( (x < num1) && (y __global__ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, uint64_t num4, uint64_t num5, uint64_t num6, lambda Lambda) { uint64_t iter1 = blockIdx.x; uint64_t iter2 = blockIdx.y; uint64_t iter3 = blockIdx.z; uint64_t iter4 = threadIdx.x; uint64_t iter5 = threadIdx.y; uint64_t iter6 = threadIdx.z; if ( (iter1 < num1) && (iter2 #include #include #include NAMESPACE_BEGIN(Grid); inline void acceleratorMem(void) { std::cout <<" SYCL acceleratorMem not implemented"<>()[2]; #else return 0; #endif } // SYCL specific #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ theGridAccelerator->submit([&](sycl::handler &cgh) { \ unsigned long nt=acceleratorThreads(); \ if(nt < 8)nt=8; \ unsigned long unum1 = num1; \ unsigned long unum2 = num2; \ unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \ sycl::range<3> local {nt,1,nsimd}; \ sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ cgh.parallel_for( \ sycl::nd_range<3>(global,local), \ [=] (sycl::nd_item<3> item) /*mutable*/ \ [[sycl::reqd_sub_group_size(16)]] \ { \ auto iter1 = item.get_global_id(0); \ auto iter2 = item.get_global_id(1); \ auto lane = item.get_global_id(2); \ { if (iter1 < unum1){ __VA_ARGS__ } }; \ }); \ }); #define accelerator_barrier(dummy) { theGridAccelerator->wait(); theGridAccelerator->wait(); } inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); theCopyAccelerator->wait(); } /////// // Asynch event interface /////// typedef sycl::event acceleratorEvent_t; inline void acceleratorEventWait(acceleratorEvent_t ev) { ev.wait(); } inline int acceleratorEventIsComplete(acceleratorEvent_t ev) { return (ev.get_info() == sycl::info::event_command_status::complete); } inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);} inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();theCopyAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) { #if 0 auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); if ( uvm = sycl::usm::alloc::shared ) return 1; else return 0; #endif return 1; } #endif ////////////////////////////////////////////// // HIP acceleration ////////////////////////////////////////////// #ifdef GRID_HIP NAMESPACE_END(Grid); #include NAMESPACE_BEGIN(Grid); #ifdef __HIP_DEVICE_COMPILE__ #define GRID_SIMT #endif #define accelerator __host__ __device__ #define accelerator_inline __host__ __device__ inline inline void acceleratorMem(void) { size_t free_t,total_t,used_t; auto discard = hipMemGetInfo(&free_t,&total_t); used_t=total_t-free_t; std::cout << " MemoryManager : GPU used "< __global__ __launch_bounds__(64,1) void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) { // Following the same scheme as CUDA for now uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; uint64_t z = threadIdx.x; if ( (x < numx) && (y __global__ __launch_bounds__(1024,1) void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) { // Following the same scheme as CUDA for now uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; uint64_t z = threadIdx.x; if ( (x < numx) && (yget_context()); #endif } ////////////////////////////////////////////// // Common on all GPU targets ////////////////////////////////////////////// #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP) // FIXME -- the non-blocking nature got broken March 30 2023 by PAB #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} ); #define accelerator_for( iter, num, nsimd, ... ) \ accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \ accelerator_barrier(dummy); #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \ accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \ accelerator_barrier(dummy); #define GRID_ACCELERATED #endif ////////////////////////////////////////////// // CPU Target - No accelerator just thread instead ////////////////////////////////////////////// #if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) #undef GRID_SIMT typedef int acceleratorEvent_t; inline void acceleratorMem(void) { /* struct rusage rusage; getrusage( RUSAGE_SELF, &rusage ); return (size_t)rusage.ru_maxrss; */ std::cout <<" system acceleratorMem not implemented"<ext_oneapi_submit_barrier(); theGridAccelerator->ext_oneapi_submit_barrier(); }; #else // Ordering within a stream guaranteed on Nvidia & AMD inline void acceleratorFenceComputeStream(void){ }; #endif /////////////////////////////////////////////////// // Synchronise across local threads for divergence resynch /////////////////////////////////////////////////// accelerator_inline void acceleratorSynchronise(void) // Only Nvidia needs { #ifdef GRID_SIMT #ifdef GRID_CUDA __syncwarp(); #endif #endif return; } accelerator_inline void acceleratorSynchroniseAll(void) { #ifdef GRID_SIMT #ifdef GRID_CUDA __syncthreads(); #endif #ifdef GRID_SYCL // No barrier call on SYCL?? // Option get __spir:: stuff to do warp barrier #endif #ifdef GRID_HIP __syncthreads(); #endif #endif return; } accelerator_inline void acceleratorFence(void) { #ifdef GRID_SIMT #ifdef GRID_CUDA __threadfence(); #endif #ifdef GRID_SYCL // FIXMEE #endif #ifdef GRID_HIP __threadfence(); #endif #endif return; } inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); acceleratorCopySynchronise(); } template void acceleratorPut(T& dev,const T&host) { acceleratorCopyToDevice((void *)&host,&dev,sizeof(T)); } template T acceleratorGet(T& dev) { T host; acceleratorCopyFromDevice(&dev,&host,sizeof(T)); return host; } NAMESPACE_END(Grid); #ifdef GRID_DEVICE_MEMORY_ALLOCATOR #undef acceleratorAllocDevice #undef acceleratorFreeDevice #endif