/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/Init.cc Copyright (C) 2015 Author: Azusa Yamaguchi Author: Peter Boyle Author: Peter Boyle Author: paboyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ /****************************************************************************/ /* pab: Signal magic. 
Processor state dump is x86-64 specific */ /****************************************************************************/ /* NOTE(review): the header name after each #include below is missing in this copy of the file; restore them from the upstream source before building. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_UNWIND #include #endif #include #ifdef __APPLE__ /* Apple stand-in for feenableexcept: the fenv-based implementation is compiled out with #if 0, so the function ignores excepts and always returns 0. */ static int feenableexcept (unsigned int excepts) { #if 0 // Fails on Apple M1 static fenv_t fenv; unsigned int new_excepts = excepts & FE_ALL_EXCEPT; unsigned int old_excepts; // previous masks int iold_excepts; // previous masks if ( fegetenv (&fenv) ) return -1; old_excepts = fenv.__control & FE_ALL_EXCEPT; // unmask fenv.__control &= ~new_excepts; fenv.__mxcsr &= ~(new_excepts << 7); iold_excepts = (int) old_excepts; return ( fesetenv (&fenv) ? -1 : iold_excepts ); #endif return 0; } #endif /* Fall back to the POSIX minimum when the platform does not define HOST_NAME_MAX. */ #ifndef HOST_NAME_MAX #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX #endif NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////// // Convenience functions to access standard command line arg // driven parallelism controls ////////////////////////////////////////////////////// static Coordinate Grid_default_latt; static Coordinate Grid_default_mpi; int GridThread::_threads =1; int GridThread::_hyperthreads=1; int GridThread::_cores=1; char hostname[HOST_NAME_MAX+1]; /* Returns a pointer to the file-level hostname buffer. */ char *GridHostname(void) { return hostname; } /* Read-only accessors for the file-static Grid_default_latt and Grid_default_mpi. */ const Coordinate &GridDefaultLatt(void) {return Grid_default_latt;}; const Coordinate &GridDefaultMpi(void) {return Grid_default_mpi;}; /* Builds a SIMD layout with dims entries. Walking from the last dimension down to the first, a dimension gets a factor of 2 (and nn is halved) while nn>=2, and a factor of 1 afterwards. Asserts that the remaining factor nn is 1 once every dimension has been visited. */ const Coordinate GridDefaultSimd(int dims,int nsimd) { Coordinate layout(dims); int nn=nsimd; for(int d=dims-1;d>=0;d--){ if ( nn>=2) { layout[d]=2; nn/=2; } else { layout[d]=1; } } assert(nn==1); return layout; } //////////////////////////////////////////////////////////// // Command line parsing assist for stock controls //////////////////////////////////////////////////////////// /* Returns the argument that follows option in [begin,end), or an empty string when option is absent or is the last argument. */ std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option) { char ** itr = 
std::find(begin, end, option); if (itr != end && ++itr != end) { std::string payload(*itr); return payload; } return std::string(""); } /* True when some argument in [begin,end) compares equal to option. */ bool GridCmdOptionExists(char** begin, char** end, const std::string& option) { return std::find(begin, end, option) != end; } // Comma separated list /* Splits str on ',' into vec. vec is emptied first; the text after the last comma (or the whole string when there is no comma) is always appended, so an empty str yields a single empty token. NOTE(review): the element type of the std::vector parameter is missing in this copy. */ void GridCmdOptionCSL(std::string str,std::vector & vec) { size_t pos = 0; std::string token; std::string delimiter(","); vec.resize(0); while ((pos = str.find(delimiter)) != std::string::npos) { token = str.substr(0, pos); vec.push_back(token); str.erase(0, pos + delimiter.length()); } token = str; vec.push_back(token); return; } /* Reads integers from str into vec (emptied first). After each integer, one following punctuation character is skipped if present; reading stops at the first extraction that fails. NOTE(review): the template parameter list after the template keyword is missing in this copy. */ template void GridCmdOptionIntVector(const std::string &str,VectorInt & vec) { vec.resize(0); std::stringstream ss(str); int i; while (ss >> i){ vec.push_back(i); if(std::ispunct(ss.peek())) ss.ignore(); } return; } /* Explicit instantiations for a std::vector and for Coordinate. */ template void GridCmdOptionIntVector(const std::string &str,std::vector & vec); template void GridCmdOptionIntVector(const std::string &str,Coordinate & vec); /* Stream-extracts one int from str into val; extraction failure is not reported to the caller. */ void GridCmdOptionInt(std::string &str,int & val) { std::stringstream ss(str); ss>>val; return; } /* Stream-extracts one double from str into val; extraction failure is not reported to the caller. */ void GridCmdOptionFloat(std::string &str,double & val) { std::stringstream ss(str); ss>>val; return; } /* Parses the stock layout options from argv. Starts from the defaults mpi={1,1,1,1} and latt={8,8,8,8}, calls GridThread::SetMaxThreads(), then lets --mpi, --grid and --threads override. */ void GridParseLayout(char **argv,int argc, Coordinate &latt_c, Coordinate &mpi_c) { auto mpi =std::vector({1,1,1,1}); auto latt=std::vector({8,8,8,8}); GridThread::SetMaxThreads(); std::string arg; if( GridCmdOptionExists(argv,argv+argc,"--mpi") ){ arg = GridCmdOptionPayload(argv,argv+argc,"--mpi"); GridCmdOptionIntVector(arg,mpi); } if( GridCmdOptionExists(argv,argv+argc,"--grid") ){ arg= GridCmdOptionPayload(argv,argv+argc,"--grid"); GridCmdOptionIntVector(arg,latt); } /* --threads takes exactly one integer; a warning is logged when the build lacks GRID_OMP. */ if( GridCmdOptionExists(argv,argv+argc,"--threads") ){ std::vector ompthreads(0); #ifndef GRID_OMP std::cout << GridLogWarning << "'--threads' option used but Grid was" << " not compiled with thread support" << std::endl; #endif arg= GridCmdOptionPayload(argv,argv+argc,"--threads"); GridCmdOptionIntVector(arg,ompthreads); 
assert(ompthreads.size()==1); GridThread::SetThreads(ompthreads[0]); } /* --accelerator-threads takes exactly one integer, forwarded to acceleratorThreads(). */ if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){ std::vector gputhreads(0); arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads"); GridCmdOptionIntVector(arg,gputhreads); assert(gputhreads.size()==1); acceleratorThreads(gputhreads[0]); } /* --cores takes one integer, forwarded to GridThread::SetCores(). */ if( GridCmdOptionExists(argv,argv+argc,"--cores") ){ int cores; arg= GridCmdOptionPayload(argv,argv+argc,"--cores"); GridCmdOptionInt(arg,cores); GridThread::SetCores(cores); } // Copy back into coordinate format /* latt and mpi must have the same number of entries; both outputs are resized to that count. */ int nd = mpi.size(); assert(latt.size()==nd); latt_c.resize(nd); mpi_c.resize(nd); for(int d=0;d /* NOTE(review): text is missing here in this copy; the copy loop, the end of GridParseLayout and the template header of GridCmdVectorIntToString are cut off. */ std::string GridCmdVectorIntToString(const VectorInt & vec_in){ int sz = vec_in.size(); std::vector vec(sz); for(int s=0;s /* NOTE(review): text is missing here in this copy; only the tail of the join into oss with a single-space separator survives. */ (oss, " ")); return oss.str(); } ///////////////////////////////////////////////////////// // Reinit guard ///////////////////////////////////////////////////////// static MemoryStats dbgMemStats; static int Grid_is_initialised; ///////////////////////////////////////////////////////// // Startup banner ///////////////////////////////////////////////////////// void GridBanner(void) { std::cout < /* NOTE(review): the body of GridBanner and whatever follows it up to the dlMap declaration are missing in this copy. */ dlMap; /* Library start-up. Asserts that initialisation has not already happened (Grid_is_initialised == 0), starts the global stopwatch, then processes command line options. */ void Grid_init(int *argc,char ***argv) { assert(Grid_is_initialised == 0); GridLogger::GlobalStopWatch.Start(); std::string arg; ////////////////////////////////////////////////////////// // Early initialisation necessities without rank knowledge ////////////////////////////////////////////////////////// acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver /* --shm gives a size in MB, stored in bytes in GlobalSharedMemory::MAX_MPI_SHM_BYTES. */ if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){ int MB; arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm"); GridCmdOptionInt(arg,MB); uint64_t MB64 = MB; GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL; } /* --shm-mpi takes an integer; its truth value is stored in Stencil_force_mpi. */ if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-mpi") ){ int forcempi; arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm-mpi"); GridCmdOptionInt(arg,forcempi); Stencil_force_mpi = (bool)forcempi; } 
/* --device-mem gives a size in MB, stored in bytes in MemoryManager::DeviceMaxBytes. */ if( GridCmdOptionExists(*argv,*argv+*argc,"--device-mem") ){ int MB; arg= GridCmdOptionPayload(*argv,*argv+*argc,"--device-mem"); GridCmdOptionInt(arg,MB); uint64_t MB64 = MB; MemoryManager::DeviceMaxBytes = MB64*1024LL*1024LL; } /* --hypercube takes an integer, stored in GlobalSharedMemory::HPEhypercube. */ if( GridCmdOptionExists(*argv,*argv+*argc,"--hypercube") ){ int enable; arg= GridCmdOptionPayload(*argv,*argv+*argc,"--hypercube"); GridCmdOptionInt(arg,enable); GlobalSharedMemory::HPEhypercube = enable; } /* --shm-hugepages is a bare flag: sets GlobalSharedMemory::Hugepages to 1. */ if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){ GlobalSharedMemory::Hugepages = 1; } /* --debug-signals is a bare flag: calls Grid_debug_handler_init(). */ if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ Grid_debug_handler_init(); } // Sleep n-seconds at end of handler /* --signal-delay takes an integer, parsed into signal_delay. */ if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay"); GridCmdOptionInt(arg,signal_delay); } // periodic wakeup with stack trace printed if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){ Grid_debug_heartbeat(); } // periodic wakeup with empty handler (interrupts some system calls) if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){ Grid_heartbeat(); } /* When A64FX is defined, --comms-overlap is rejected: a message is printed and the process exits with EXIT_FAILURE. */ #if defined(A64FX) if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ std::cout << "Option --comms-overlap currently not supported on QPACE4. Exiting." 
<< std::endl; exit(EXIT_FAILURE); } #endif ////////////////////////////////////////////////////////// // Memory manager ////////////////////////////////////////////////////////// MemoryManager::Init(); ////////////////////////////////////////////////////////// // MPI initialisation ////////////////////////////////////////////////////////// CartesianCommunicator::Init(argc,argv); /* Stop the stopwatch, barrier across the world communicator, then reset and restart it so that timing restarts after the barrier. */ GridLogger::GlobalStopWatch.Stop(); CartesianCommunicator::BarrierWorld(); GridLogger::GlobalStopWatch.Reset();// Back to zero with synchronised clock GridLogger::GlobalStopWatch.Start(); //////////////////////////////////// // Banner after MPI (unless GPU) //////////////////////////////////// /* Only world rank 0 prints the banner. */ if ( CartesianCommunicator::RankWorld() == 0 ) { GridBanner(); } ///////////////////////////////////////////////////////////////// // Rank information can be used to control who logs ///////////////////////////////////////////////////////////////// /* Without --debug-stdout, Grid_quiesce_nodes() is called. With it, a per-rank file name is assembled: the optional GRID_STDOUT_ROOT directory, then a directory named after the rank rounded down to a multiple of radix (64), then the prefix Grid.stdout. ; each directory is created with mkdir (mode S_IRWXU) as the name is built. */ if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){ Grid_quiesce_nodes(); } else { FILE *fp; std::ostringstream fname; int rank = CartesianCommunicator::RankWorld(); int radix=64; char* root = getenv("GRID_STDOUT_ROOT"); if (root) { fname << root ; mkdir(fname.str().c_str(), S_IRWXU ); fname << "/"; } fname << (rank/radix)*radix ; mkdir(fname.str().c_str(), S_IRWXU ); fname << "/"; fname<<"Grid.stdout."; fname< /* NOTE(review): text is missing here in this copy; the rest of the --debug-stdout branch and the declaration of logstreams are cut off. */ logstreams; /* Logging is first configured with the default list Error,Warning,Message; when --log is given, its comma separated payload is parsed into the same vector and GridLogConfigure is called again. */ std::string defaultLog("Error,Warning,Message"); GridCmdOptionCSL(defaultLog,logstreams); GridLogConfigure(logstreams); if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){ arg = GridCmdOptionPayload(*argv,*argv+*argc,"--log"); GridCmdOptionCSL(arg,logstreams); GridLogConfigure(logstreams); } //////////////////////////////////// // Help message //////////////////////////////////// if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){ std::cout< /* NOTE(review): a large span is missing here in this copy; the help text, the remainder of Grid_init and everything up to the range check inside sig_print_dig are cut off. */ =0 && dig< 16){ SIGLOG(digits[dig]); } } /* Prints A in decimal through sig_print_dig, one digit per DIGIT step from the 10^9 place down to the units. nz becomes nonzero once a nonzero digit has been seen, so leading zeros are not printed. */ void sig_print_uint(uint32_t A) { int dig; int nz=0; #define DIGIT(DIV) dig = (A/DIV)%10 ; if(dig|nz) sig_print_dig(dig); nz = nz|dig; 
DIGIT(1000000000); // Catches 4BN = 2^32 DIGIT(100000000); DIGIT(10000000); DIGIT(1000000); DIGIT(100000); DIGIT(10000); DIGIT(1000); DIGIT(100); DIGIT(10); DIGIT(1); /* Every digit was zero, so nothing has been printed yet: emit a single 0. */ if (nz==0) SIGLOG("0"); } /* Prints A as hexadecimal with a 0x prefix through sig_print_dig. The fifteen high nibbles go through NIBBLE, which skips leading zeros (nz tracks whether a nonzero nibble has been seen); the lowest nibble is always printed. */ void sig_print_hex(uint64_t A) { int nz=0; int dig; #define NIBBLE(A) dig = A ; if(dig|nz) sig_print_dig(dig); nz = nz|dig; SIGLOG("0x"); NIBBLE((A>>(15*4))&0xF); NIBBLE((A>>(14*4))&0xF); NIBBLE((A>>(13*4))&0xF); NIBBLE((A>>(12*4))&0xF); NIBBLE((A>>(11*4))&0xF); NIBBLE((A>>(10*4))&0xF); NIBBLE((A>>(9*4))&0xF); NIBBLE((A>>(8*4))&0xF); NIBBLE((A>>(7*4))&0xF); NIBBLE((A>>(6*4))&0xF); NIBBLE((A>>(5*4))&0xF); NIBBLE((A>>(4*4))&0xF); NIBBLE((A>>(3*4))&0xF); NIBBLE((A>>(2*4))&0xF); NIBBLE((A>>4)&0xF); sig_print_dig(A&0xF); } /* #ifdef __linux__ #ifdef __x86_64__ ucontext_t * uc= (ucontext_t *)ptr; struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); #endif #endif */ void Grid_generic_handler(int sig,siginfo_t *si,void * ptr) { SIGLOG("Signal handler on host "); SIGLOG(hostname); SIGLOG(" process id "); sig_print_uint((uint32_t)getpid()); SIGLOG("\n"); SIGLOG("FlightRecorder step "); sig_print_uint(FlightRecorder::StepLoggingCounter); SIGLOG(" stage "); SIGLOG(FlightRecorder::StepName); SIGLOG("\n"); SIGLOG("Caught signal "); sig_print_uint(si->si_signo); SIGLOG("\n"); SIGLOG(" mem address "); sig_print_hex((uint64_t)si->si_addr); SIGLOG("\n"); SIGLOG(" code "); sig_print_uint(si->si_code); SIGLOG("\n"); ucontext_t *uc= (ucontext_t *)ptr; SIGLOG("Backtrace:\n"); #ifdef HAVE_UNWIND // Debug cross check on offsets // int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE); // backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr); unw_cursor_t cursor; unw_word_t ip, off; if (!unw_init_local(&cursor, uc) ) { SIGLOG(" frame IP function\n"); int level = 0; int ret = 0; while(1) { char name[128]; if (level >= _NBACKTRACE) return; unw_get_reg(&cursor, UNW_REG_IP, &ip); sig_print_uint(level); SIGLOG(" 
"); sig_print_hex(ip); SIGLOG(" "); for(int r=0;r=dlMap[r].start) &&(ip