// a slight modification on ReadLattice class (if not inheritance) // to enable parallel reading/writing of "Gauge Connection Format" lattice data // Read the format of Gauge Connection Format // from QCDSP {load,unload}_lattice format #include #include #include #include #include #include #include #include #include #include using namespace std; CPS_START_NAMESPACE #ifdef USE_C11_RNG static void CalcRand(char *msite,int data_per_site, Float *RandSum, Float *Rand2Sum){ static int called=0; RNGSTATE *dump = (RNGSTATE*) msite; stringstream ss_dump; for(int i =0;i> temp_rng; std::normal_distribution grand; Float rn = grand(temp_rng); // VRB.Result("","CalcRand","data_per_site=%d dump[0]=%u rn=%g\n",data_per_site,dump[0],rn); *RandSum += rn; *Rand2Sum += rn*rn; } #endif #define PROFILE /*********************************************************************/ /* ParallelIO functions ***********************************************/ /*********************************************************************/ // the last three pointers used to return information when loading Lattice Random Generators int ParallelIO::load(char * data, const int data_per_site, const int site_mem, const LatHeaderBase & hd, const DataConversion & dconv, const int dimension /* 4 or 5 */, unsigned int * ptrcsum, unsigned int * ptrpdcsum, Float * rand_sum, Float * rand_2_sum) { const char *fname="load()"; if(!UniqueID() & hd.headerType() == LatHeaderBase::LATTICE_HEADER && GJP.Gparity()){ if(!doGparityReconstructUstarField()){ printf("Loading both U and U* fields\n"); }else{ printf("Loading U field and reconstructing U* field\n"); } } VRB.Result(cname,fname,"%p %d %d %p %p %d %p %p %p %p\n", data, data_per_site, site_mem, &hd, &dconv, dimension /* 4 or 5 */, ptrcsum, ptrpdcsum, rand_sum,rand_2_sum); VRB.Result(cname,fname,"hostDataSize()=%d\n",dconv.hostDataSize()); int error = 0; QioArg & rd_arg = qio_arg; // check dimensions, b.c, etc int nx = rd_arg.Xnodes() * rd_arg.XnodeSites(); int ny = rd_arg.Ynodes() * rd_arg.YnodeSites(); int nz = rd_arg.Znodes() * rd_arg.ZnodeSites(); int nt = rd_arg.Tnodes() * rd_arg.TnodeSites(); int ns = rd_arg.Snodes() * rd_arg.SnodeSites(); int chars_per_site = data_per_site * dconv.fileDataSize(); int nstacked = 1; //CK in end use only a single random generator per site. However for testing it is useful to have two //if(hd.headerType() == LatHeaderBase::LATTICE_HEADER && GJP.Gparity()) nstacked=2; //2 gauge fields consecutive in memory if(GJP.Gparity()) nstacked=2; //2 gauge fields consecutive in memory if(hd.headerType() == LatHeaderBase::LATTICE_HEADER && GJP.Gparity() && doGparityReconstructUstarField() ) nstacked = 1; //no point in saving U* field when it can be trivially calculated from U at load int64_t yblk = nx*chars_per_site; int64_t zblk = ny * yblk; int64_t tblk = nz * zblk; int64_t stkblk = tblk *nt; int64_t sblk = stkblk *nstacked; //tblk * nt; //printf("blk sizes y:%d z:%d t:%d stk:%d s:%d\n",yblk,zblk,tblk,stkblk,sblk); int xbegin = rd_arg.XnodeSites() * rd_arg.Xcoor(), xend = rd_arg.XnodeSites() * (rd_arg.Xcoor()+1); int ybegin = rd_arg.YnodeSites() * rd_arg.Ycoor(), yend = rd_arg.YnodeSites() * (rd_arg.Ycoor()+1); int zbegin = rd_arg.ZnodeSites() * rd_arg.Zcoor(), zend = rd_arg.ZnodeSites() * (rd_arg.Zcoor()+1); int tbegin = rd_arg.TnodeSites() * rd_arg.Tcoor(), tend = rd_arg.TnodeSites() * (rd_arg.Tcoor()+1); int sbegin = rd_arg.SnodeSites() * rd_arg.Scoor(), send = rd_arg.SnodeSites() * (rd_arg.Scoor()+1); // all open file and check error ifstream input(rd_arg.FileName); if ( !input.good() ) error = 1; // executed by all, sync and share error status information if(synchronize(error) != 0) ERR.FileR(cname, fname, rd_arg.FileName); // TempBufAlloc is a Mem Allocator that prevents mem leak on function exits TempBufAlloc fbuf(chars_per_site); // buffer only stores one site // these two only needed when loading LatRng TempBufAlloc rng(data_per_site * dconv.hostDataSize()); #ifndef USE_C11_RNG UGrandomGenerator * ugran = (UGrandomGenerator*)data; #endif // read in parallel manner, node 0 will assign & dispatch IO time slots uint32_t csum = 0; uint32_t pdcsum = 0; Float RandSum = 0; Float Rand2Sum = 0; int siteid = 0; char * pd = data; VRB.Result(cname, fname, "Parallel loading starting\n"); setConcurIONumber(rd_arg.ConcurIONumber); // setConcurIONumber(1); // getIOTimeSlot(); if (hd.dataStart()>0) input.seekg(hd.dataStart(),ios_base::beg); int64_t jump = 0; if(dimension == 5) jump = sbegin * sblk; for(int sr=sbegin; dimension==4 || sr2) printf("Node %d:read jump=%d csum=%x try_num=%d\n",UniqueID(),jump,dconv.checksum(fbuf,data_per_site),try_num); if(hd.headerType() == LatHeaderBase::LATTICE_HEADER) { for(int mat=0;mat<4;mat++) { dconv.file2host(pd, fbuf + chars_per_site/4*mat, data_per_site/4); pd += site_mem/4; } } else { // LatHeaderBase::LATRNG_HEADER // load #ifdef USE_C11_RNG dconv.file2host(pd, fbuf, data_per_site); CalcRand(pd,data_per_site,&RandSum, &Rand2Sum); pd += site_mem; #else dconv.file2host(rng,fbuf,data_per_site); ugran[siteid].load(rng.IntPtr()); // generate next rand for verification Float rn = ugran[siteid].Grand(1); RandSum += rn; Rand2Sum += rn*rn; // recover loading ugran[siteid].load(rng.IntPtr()); #endif } siteid++; } jump = (nx-xend) * chars_per_site; // "jump" restart from 0 and count } jump += (ny-yend) * yblk; } jump += (nz-zend) * zblk; if(dimension == 4 && stk==nstacked-1) VRB.Result(cname,fname, "Parallel loading: %d%% done.\n", (int)((tr-tbegin+1) * 100.0 /(tend-tbegin))); } jump += (nt-tend) * tblk; //after nt * tblk is start of next stacked field } if(dimension == 4) break; //jump += (nt-tend) * tblk; VRB.Result(cname,fname, "Parallel loading: %d%% done.\n",(int)((sr-sbegin+1) * 100.0 /(send-sbegin))); } VRB.Flow(cname,fname, "This Group Done!\n"); sync_error: finishIOTimeSlot(); // input.close(); if ( !input.good() ) error = 1; if(synchronize(error) != 0) ERR.FileR(cname, fname, rd_arg.FileName); // This 3 lines differ from unloading part // these nodes participate in loading but not in summation if(dimension == 4 && rd_arg.Scoor()!=0) { csum = pdcsum = 0; RandSum = Rand2Sum = 0; } VRB.Result(cname,fname,"Parallel Loading done!\n"); if(ptrcsum) *ptrcsum = csum; if(ptrpdcsum) *ptrpdcsum = pdcsum; if(rand_sum) *rand_sum = RandSum; if(rand_2_sum) *rand_2_sum = Rand2Sum; return 1; } int ParallelIO::store(iostream & output, char * data, const int data_per_site, const int site_mem, LatHeaderBase & hd, const DataConversion & dconv, const int dimension /* 4 or 5 */, unsigned int * ptrcsum, unsigned int * ptrpdcsum, Float * rand_sum, Float * rand_2_sum) { const char * fname = "store()"; // printf("Node %d: ParallelIO::store()\n",UniqueID()); int error = 0; // const int data_per_site = wt_arg.ReconRow3 ? 4*12 : 4*18; const int chars_per_site = data_per_site * dconv.fileDataSize(); QioArg & wt_arg = qio_arg; int xbegin = wt_arg.XnodeSites() * wt_arg.Xcoor(), xend = wt_arg.XnodeSites() * (wt_arg.Xcoor()+1); int ybegin = wt_arg.YnodeSites() * wt_arg.Ycoor(), yend = wt_arg.YnodeSites() * (wt_arg.Ycoor()+1); int zbegin = wt_arg.ZnodeSites() * wt_arg.Zcoor(), zend = wt_arg.ZnodeSites() * (wt_arg.Zcoor()+1); int tbegin = wt_arg.TnodeSites() * wt_arg.Tcoor(), tend = wt_arg.TnodeSites() * (wt_arg.Tcoor()+1); int sbegin = wt_arg.SnodeSites() * wt_arg.Scoor(), send = wt_arg.SnodeSites() * (wt_arg.Scoor()+1); int nx = wt_arg.XnodeSites() * wt_arg.Xnodes(); int ny = wt_arg.YnodeSites() * wt_arg.Ynodes(); int nz = wt_arg.ZnodeSites() * wt_arg.Znodes(); int nt = wt_arg.TnodeSites() * wt_arg.Tnodes(); int ns = wt_arg.SnodeSites() * wt_arg.Snodes(); int nstacked = 1; //CK in end use only a single random generator per site. However for testing it is useful to have two //if(hd.headerType() == LatHeaderBase::LATTICE_HEADER && GJP.Gparity()) nstacked=2; //2 gauge fields consecutive in memory if(GJP.Gparity()) nstacked=2; //2 gauge fields consecutive in memory if(hd.headerType() == LatHeaderBase::LATTICE_HEADER && GJP.Gparity() && doGparityReconstructUstarField() ) nstacked = 1; //no point in saving U* field when it can be trivially calculated from U at load int64_t yblk = nx*chars_per_site; int64_t zblk = yblk * ny; int64_t tblk = zblk * nz; int64_t stkblk = tblk *nt; int64_t sblk = stkblk *nstacked; //tblk * nt; // TempBufAlloc is a mem allocator that prevents mem leak on function exits TempBufAlloc fbuf(chars_per_site); TempBufAlloc fbuf2(chars_per_site); // buffer only stores one site // these two only need when doing LatRng unloading TempBufAlloc rng(data_per_site * dconv.hostDataSize()); #ifndef USE_C11_RNG UGrandomGenerator * ugran = (UGrandomGenerator*)data; #endif // start parallel writing uint32_t csum = 0, pdcsum = 0; Float RandSum=0, Rand2Sum=0; char * pd = data; int siteid=0; VRB.Result(cname, fname, "Parallel unloading starting\n"); setConcurIONumber(wt_arg.ConcurIONumber); VRB.Result(cname, fname, "ConcurIONumber=%d\n",wt_arg.ConcurIONumber); // setConcurIONumber(1); getIOTimeSlot(); int retry=0; do { if(dimension ==5 || wt_arg.Scoor() == 0) { // this line differs from read() if (hd.dataStart()>0) output.seekp(hd.dataStart(),ios_base::beg); int64_t jump=0; streampos r_pos=0, w_pos; if(dimension==5) jump = sbegin * sblk; for(int sr=sbegin; dimension==4 || sr0) ERR.FileW(cname,fname,wt_arg.FileName); // printf("Node %d: synchronize done!\n",UniqueID()); // cps::sync(); VRB.Result(cname,fname,"Parallel Unloading done!\n"); // printf("Node %d: Parallel Unloading done!\n",UniqueID()); // cps::sync(); if(ptrcsum) *ptrcsum = csum; if(ptrpdcsum) *ptrpdcsum = pdcsum; if(rand_sum) *rand_sum = RandSum; if(rand_2_sum) *rand_2_sum = Rand2Sum; return 1; } #if 1 /*********************************************************************/ /* SerialIO functions ***********************************************/ /*********************************************************************/ int SerialIO::load(char * data, const int data_per_site, const int site_mem, const LatHeaderBase & hd, const DataConversion & dconv, const int dimension /* 4 or 5 */, unsigned int * ptrcsum, unsigned int * ptrpdcsum, Float * rand_sum, Float * rand_2_sum) { const char * fname = "load()"; VRB.Func(cname,fname); // only node 0 is responsible for writing int error = 0; QioArg & rd_arg = qio_arg; // check dimensions, b.c, etc int nx = rd_arg.Xnodes() * rd_arg.XnodeSites(); int ny = rd_arg.Ynodes() * rd_arg.YnodeSites(); int nz = rd_arg.Znodes() * rd_arg.ZnodeSites(); int nt = rd_arg.Tnodes() * rd_arg.TnodeSites(); int ns = rd_arg.Snodes() * rd_arg.SnodeSites(); // int data_per_site = hd.recon_row_3? 4*12 : 4*18; int chars_per_site = data_per_site * dconv.fileDataSize(); ifstream input; if(isNode0()) { input.open(rd_arg.FileName); if ( !input.good() ) error = 1; } VRB.Result(cname,fname,"%s opened sizeof(streamoff)=%d\n",rd_arg.FileName,sizeof(streamoff)); // executed by all, sync and share error status information if(synchronize(error) != 0) ERR.FileR(cname, fname, rd_arg.FileName); VRB.Result(cname,fname,"error = %d\n",error); // TempBufAlloc is a Mem Allocator that prevents mem leak on function exits TempBufAlloc fbuf(chars_per_site); VRB.Result(cname,fname,"fbuf done\n"); TempBufAlloc rng(data_per_site * dconv.hostDataSize()); VRB.Result(cname,fname,"rng done\n"); VRB.Result(cname,fname,"Node %d: dataStart() = %d\n",UniqueID(),(streamoff)hd.dataStart()); if(isNode0()) if ((streamoff)hd.dataStart()>0) { input.seekg(hd.dataStart(),ios_base::beg); VRB.Result(cname,fname,"Node %d: pos = %d\n",UniqueID(),(streamoff)input.tellg()); } int nstacked = 1; if(GJP.Gparity()) nstacked=2; //2 fields consecutive in memory if(hd.headerType() == LatHeaderBase::LATTICE_HEADER && GJP.Gparity() && doGparityReconstructUstarField() ) nstacked = 1; //no point in saving U* field when it can be trivially calculated from U at load int global_id = 0; unsigned int csum = 0; unsigned int pdcsum = 0; Float RandSum = 0; Float Rand2Sum = 0; #ifndef USE_C11_RNG UGrandomGenerator * ugran = (UGrandomGenerator*)data; #endif VRB.Result(cname, fname, "Serial loading starting\n"); for(int sc=0; dimension==4 || sc0) output.seekp(hd.dataStart(), ios_base::beg); VRB.Result(cname,fname,"Node %d: pos = %d\n",UniqueID(),(streamoff)output.tellp()); int nstacked = 1; //CK in end use only a single random generator per site. However for testing it is useful to have two //if(hd.headerType() == LatHeaderBase::LATTICE_HEADER && GJP.Gparity()) nstacked=2; //2 gauge fields consecutive in memory if(GJP.Gparity()) nstacked=2; //2 gauge fields consecutive in memory if(hd.headerType() == LatHeaderBase::LATTICE_HEADER && GJP.Gparity() && doGparityReconstructUstarField() ) nstacked = 1; //no point in saving U* field when it can be trivially calculated from U at load VRB.Result(cname, fname, "Serial unloading starting\n"); for(int sc=0; dimension==4 || sc0) ERR.FileW(cname,fname,wt_arg.FileName); VRB.Result(cname, fname, "Serial Unloading done!\n"); if(ptrcsum) *ptrcsum = csum; if(ptrpdcsum) *ptrpdcsum = pdcsum; if(rand_sum) *rand_sum = RandSum; if(rand_2_sum) *rand_2_sum = Rand2Sum; VRB.Func(cname,fname); return 1; } bool SerialIO::dbl_latt_storemode(false); //In this version of the function we write out the two stacked fields as a single field of double the size in the //G-parity direction int SerialIO::storeGparityInterleaved(iostream & output, char * data, const int data_per_site, const int site_mem, LatHeaderBase & hd, const DataConversion & dconv, const int dimension /* 4 or 5 */, unsigned int * ptrcsum, unsigned int * ptrpdcsum, Float * rand_sum, Float * rand_2_sum) { const char * fname = "storeGparityInterleaved()"; if(!GJP.Gparity()) ERR.General(cname, fname, "Function only for use with G-parity boundary conditions\n"); int gparity_dir; int ngp(0); for(int mu=0;mu<3;mu++){ if(GJP.Bc(mu) == BND_CND_GPARITY){ gparity_dir = mu; ngp++; } } if(ngp!=1 || gparity_dir!=0){ ERR.General(cname, fname, "Function only designed for G-parity in X-direction\n"); } int error = 0; QioArg & wt_arg = qio_arg; // const int data_per_site = wt_arg.ReconRow3 ? 4*12 : 4*18; const int chars_per_site = data_per_site * dconv.fileDataSize(); int nx = wt_arg.XnodeSites() * wt_arg.Xnodes(); int ny = wt_arg.YnodeSites() * wt_arg.Ynodes(); int nz = wt_arg.ZnodeSites() * wt_arg.Znodes(); int nt = wt_arg.TnodeSites() * wt_arg.Tnodes(); int ns = wt_arg.SnodeSites() * wt_arg.Snodes(); TempBufAlloc fbuf(chars_per_site); TempBufAlloc rng(data_per_site * dconv.hostDataSize()); // start serial writing unsigned int csum = 0, pdcsum = 0; Float RandSum = 0, Rand2Sum = 0; #ifndef USE_C11_RNG UGrandomGenerator * ugran; #endif //if(hd.headerType() != LatHeaderBase::LATTICE_HEADER) // printf("Node %d: ugran[0]=%e\n",UniqueID(),ugran[0].Urand(0,1)); int global_id = 0; output.seekp(hd.dataStart(), ios_base::beg); VRB.Result(cname, fname, "Double latt Serial unloading starting\n"); for(int sc=0; dimension==4 || sc0) ERR.FileW(cname,fname,wt_arg.FileName); VRB.Result(cname, fname, "Serial Unloading done!\n"); if(ptrcsum) *ptrcsum = csum; if(ptrpdcsum) *ptrpdcsum = pdcsum; if(rand_sum) *rand_sum = RandSum; if(rand_2_sum) *rand_2_sum = Rand2Sum; VRB.Func(cname,fname); return 1; } //This version of the function is applicable to G-parity in 2 directions (XY hardcoded currently): //we write out the two stacked fields as a single field of double the size in the //X and Y directions in the following checkerboard form: /* | U* | U | ----------- | U | U* | Here the x and y axes correspond to the X and Y lattice directions */ int SerialIO::storeGparityXYInterleaved(iostream & output, char * data, const int data_per_site, const int site_mem, LatHeaderBase & hd, const DataConversion & dconv, const int dimension /* 4 or 5 */, unsigned int * ptrcsum, unsigned int * ptrpdcsum, Float * rand_sum, Float * rand_2_sum) { const char * fname = "storeGparityXYInterleaved()"; if(!GJP.Gparity()) ERR.General(cname, fname, "Function only for use with G-parity boundary conditions\n"); if( !(GJP.Bc(0) == BND_CND_GPARITY && GJP.Bc(1) == BND_CND_GPARITY) || GJP.Bc(2) == BND_CND_GPARITY ) ERR.General(cname, fname, "Function only for use with G-parity boundary conditions in both X and Y directions\n"); int error = 0; QioArg & wt_arg = qio_arg; // const int data_per_site = wt_arg.ReconRow3 ? 4*12 : 4*18; const int chars_per_site = data_per_site * dconv.fileDataSize(); int nx = wt_arg.XnodeSites() * wt_arg.Xnodes(); int ny = wt_arg.YnodeSites() * wt_arg.Ynodes(); int nz = wt_arg.ZnodeSites() * wt_arg.Znodes(); int nt = wt_arg.TnodeSites() * wt_arg.Tnodes(); int ns = wt_arg.SnodeSites() * wt_arg.Snodes(); TempBufAlloc fbuf(chars_per_site); TempBufAlloc rng(data_per_site * dconv.hostDataSize()); // start serial writing unsigned int csum = 0, pdcsum = 0; Float RandSum = 0, Rand2Sum = 0; #ifndef USE_C11_RNG UGrandomGenerator * ugran; #endif //if(hd.headerType() != LatHeaderBase::LATTICE_HEADER) // printf("Node %d: ugran[0]=%e\n",UniqueID(),ugran[0].Urand(0,1)); int global_id = 0; output.seekp(hd.dataStart(), ios_base::beg); VRB.Result(cname, fname, "Quad latt Serial unloading starting at file position %d\n",output.tellg()); for(int sc=0; dimension==4 || sc0) ERR.FileW(cname,fname,wt_arg.FileName); VRB.Result(cname, fname, "Serial Unloading done! Finish at file position %d\n",output.tellg()); if(ptrcsum) *ptrcsum = csum; if(ptrpdcsum) *ptrpdcsum = pdcsum; if(rand_sum) *rand_sum = RandSum; if(rand_2_sum) *rand_2_sum = Rand2Sum; return 1; } // NOTE: !!! // the x-shift is a little different from y-, z-, & t-shift. // x-shift shift 1 NODE, while others shift a SITE void SerialIO::xShiftNode(char * data, const int xblk, const int dir) const { if(qio_arg.Xnodes() <= 1) return; if(isRow0 ()) { // x rotation only apply to nodes (0,0,0,x) // const SCUDir pos_dir[] = { SCU_XP, SCU_YP, SCU_ZP, SCU_TP }; // const SCUDir neg_dir[] = { SCU_XM, SCU_YM, SCU_ZM, SCU_TM }; char * sendbuf = data; // VRB.Func(cname,"xShift"); int fsize = xblk/sizeof(IFloat); if (xblk%sizeof(IFloat)>0) fsize++; // int *tmp_p = (int *)sendbuf; TempBufAlloc recvbuf (fsize*sizeof(IFloat)); if(dir>0) getMinusData(recvbuf.FPtr(),(IFloat *)sendbuf,fsize,0); else getPlusData(recvbuf.FPtr(),(IFloat *)sendbuf,fsize,0); // tmp_p = (int *)recvbuf.FPtr(); memcpy(sendbuf, recvbuf.FPtr(), xblk); } } void SerialIO::yShift(char * data, const int xblk, const int dir) const { int useSCU = 1; if(qio_arg.Ynodes() <= 1) useSCU = 0; // VRB.Func(cname,"yShift"); if(isFace0()) { int fsize = xblk/sizeof(IFloat); if (xblk%sizeof(IFloat)>0) fsize++; TempBufAlloc sendbuf (fsize*sizeof(IFloat)); TempBufAlloc recvbuf (fsize*sizeof(IFloat)); if(dir>0){ memcpy(sendbuf, data + xblk * (qio_arg.YnodeSites() - 1), xblk); getMinusData(recvbuf.FPtr(),sendbuf.FPtr(),fsize,1); } else { memcpy(sendbuf, data, xblk); getPlusData(recvbuf.FPtr(),sendbuf.FPtr(),fsize,1); } #if 0 SCUDirArg send, recv; if(dir>0) { if(useSCU) { memcpy(sendbuf, data + xblk * (qio_arg.YnodeSites() - 1), xblk); send.Init(sendbuf, SCU_YP, SCU_SEND, xblk); recv.Init(recvbuf, SCU_YM, SCU_REC, xblk); } else { memcpy(recvbuf, data + xblk * (qio_arg.YnodeSites() - 1), xblk); } } else { if(useSCU) { memcpy(sendbuf, data, xblk); send.Init(sendbuf, SCU_YM, SCU_SEND, xblk); recv.Init(recvbuf, SCU_YP, SCU_REC, xblk); } else { memcpy(recvbuf, data, xblk); } } if(useSCU) { SCUTrans(&send); SCUTrans(&recv); } #endif // doing memory move at the same time if(dir>0) { for(int i=qio_arg.YnodeSites()-1;i>0;i--) memcpy(data + i*xblk, data + (i-1)*xblk, xblk); // memmove(data + xblk, data, xblk * (qio_arg.YnodeSites()-1)); } else{ for(int i=0;i0) { memcpy(data, recvbuf, xblk); } else{ memcpy(data + xblk*(qio_arg.YnodeSites()-1), recvbuf, xblk); } } } void SerialIO::zShift(char * data, const int xblk, const int dir) const { int useSCU = 1; if(qio_arg.Znodes() <= 1) useSCU = 0; // VRB.Func(cname,"zShift"); if(isCube0()) { int yblk = xblk * qio_arg.YnodeSites(); int fsize = xblk/sizeof(IFloat); if (xblk%sizeof(IFloat)>0) fsize++; TempBufAlloc sendbuf (fsize*sizeof(IFloat)); TempBufAlloc recvbuf (fsize*sizeof(IFloat)); #if 0 TempBufAlloc sendbuf(xblk); TempBufAlloc recvbuf(xblk); SCUDirArg send, recv; #endif char * loface = data; char * hiface = data + yblk * (qio_arg.ZnodeSites()-1); for(int yc=0;yc0){ memcpy(sendbuf, hiface + yc*xblk, xblk); getMinusData(recvbuf.FPtr(),sendbuf.FPtr(),fsize,2); } else { memcpy(sendbuf, loface + yc*xblk, xblk); getPlusData(recvbuf.FPtr(),sendbuf.FPtr(),fsize,2); } #if 0 if(dir>0) { if(useSCU) { memcpy(sendbuf, hiface + yc*xblk, xblk); send.Init(sendbuf, SCU_ZP, SCU_SEND, xblk); recv.Init(recvbuf, SCU_ZM, SCU_REC, xblk); } else { memcpy(recvbuf, hiface + yc*xblk, xblk); } } else { if(useSCU) { memcpy(sendbuf, loface + yc*xblk, xblk); send.Init(sendbuf, SCU_ZM, SCU_SEND, xblk); recv.Init(recvbuf, SCU_ZP, SCU_REC, xblk); } else { memcpy(recvbuf, loface + yc*xblk, xblk); } } if(useSCU) { SCUTrans(&send); SCUTrans(&recv); } #endif if(dir > 0) { for(int zc=qio_arg.ZnodeSites()-1;zc>0;zc--) { memcpy(data + zc*yblk + yc*xblk, data + (zc-1)*yblk + yc*xblk, xblk); } } else { for(int zc=0;zc0) { memcpy(loface + yc*xblk, recvbuf, xblk); } else{ memcpy(hiface + yc*xblk, recvbuf, xblk); } } } } void SerialIO::tShift(char * data, const int xblk, const int dir) const { int useSCU = 1; if(qio_arg.Tnodes() <= 1) useSCU = 0; // VRB.Func(cname,"tShift"); if(isSdim0()) { int yblk = xblk * qio_arg.YnodeSites(); int zblk = yblk * qio_arg.ZnodeSites(); int fsize = xblk/sizeof(IFloat); if (xblk%sizeof(IFloat)>0) fsize++; TempBufAlloc sendbuf (fsize*sizeof(IFloat)); TempBufAlloc recvbuf (fsize*sizeof(IFloat)); #if 0 TempBufAlloc sendbuf(xblk); TempBufAlloc recvbuf(xblk); SCUDirArg send, recv; #endif char * locube = data; char * hicube = data + zblk * (qio_arg.TnodeSites()-1); for(int zc=0;zc0){ memcpy(sendbuf, hiface + yc*xblk, xblk); getMinusData(recvbuf.FPtr(),sendbuf.FPtr(),fsize,3); } else { memcpy(sendbuf, loface + yc*xblk, xblk); getPlusData(recvbuf.FPtr(),sendbuf.FPtr(),fsize,3); } #if 0 if(dir>0) { if(useSCU) { memcpy(sendbuf, hiface + yc*xblk, xblk); send.Init(sendbuf, SCU_TP, SCU_SEND, xblk); recv.Init(recvbuf, SCU_TM, SCU_REC, xblk); } else { memcpy(recvbuf, hiface + yc*xblk, xblk); } } else { if(useSCU) { memcpy(sendbuf, loface + yc*xblk, xblk); send.Init(sendbuf, SCU_TM, SCU_SEND, xblk); recv.Init(recvbuf, SCU_TP, SCU_REC, xblk); } else { memcpy(recvbuf, loface + yc*xblk, xblk); } } if(useSCU) { SCUTrans(&send); SCUTrans(&recv); } #endif if(dir > 0) { for(int tc=qio_arg.TnodeSites()-1;tc>0;tc--) { memcpy(data + tc*zblk + zc*yblk + yc*xblk, data + (tc-1)*zblk + zc*yblk + yc*xblk, xblk); } } else { for(int tc=0;tc0) { memcpy(loface + yc*xblk, recvbuf, xblk); } else{ memcpy(hiface + yc*xblk, recvbuf, xblk); } } } } } void SerialIO::sShift(char * data, const int xblk, const int dir) const { if(qio_arg.Snodes() * qio_arg.SnodeSites() == 1) return; int useSCU = 1; if(qio_arg.Snodes() <= 1) useSCU = 0; // VRB.Func(cname,"sShift"); int yblk = xblk * qio_arg.YnodeSites(); int zblk = yblk * qio_arg.ZnodeSites(); int tblk = zblk * qio_arg.TnodeSites(); int nstacked = 1; if(GJP.Gparity()) nstacked = 2; //2 stacked 4d volumes at each s //in final implementation, just check the header type to only skip double volume on lattice save, not RNG int stkblk = nstacked * tblk; int fsize = xblk/sizeof(IFloat); if (xblk%sizeof(IFloat)>0) fsize++; TempBufAlloc sendbuf (fsize*sizeof(IFloat)); TempBufAlloc recvbuf (fsize*sizeof(IFloat)); #if 0 TempBufAlloc sendbuf(xblk); TempBufAlloc recvbuf(xblk); SCUDirArg send, recv; #endif char * lodblhypcb = data; char * hidblhypcb = data + stkblk * (qio_arg.SnodeSites()-1); for(int stk=0;stk0){ memcpy(sendbuf, hiface + yc*xblk, xblk); getMinusData(recvbuf.FPtr(),sendbuf.FPtr(),fsize,4); } else { memcpy(sendbuf, loface + yc*xblk, xblk); getPlusData(recvbuf.FPtr(),sendbuf.FPtr(),fsize,4); } #if 0 if(dir>0) { if(useSCU) { memcpy(sendbuf, hiface + yc*xblk, xblk); send.Init(sendbuf, SCU_SP, SCU_SEND, xblk); recv.Init(recvbuf, SCU_SM, SCU_REC, xblk); } else { memcpy(recvbuf, hiface + yc*xblk, xblk); } } else { if(useSCU) { memcpy(sendbuf, loface + yc*xblk, xblk); send.Init(sendbuf, SCU_SM, SCU_SEND, xblk); recv.Init(recvbuf, SCU_SP, SCU_REC, xblk); } else { memcpy(recvbuf, loface + yc*xblk, xblk); } } if(useSCU) { SCUTrans(&send); SCUTrans(&recv); } #endif if(dir > 0) { for(int sc=qio_arg.SnodeSites()-1;sc>0;sc--) { memcpy(data + sc*stkblk +stk*tblk + tc*zblk + zc*yblk + yc*xblk, data + (sc-1)*stkblk + stk*tblk + tc*zblk + zc*yblk + yc*xblk, xblk); } } else { for(int sc=0;sc0) { memcpy(loface + yc*xblk, recvbuf, xblk); } else{ memcpy(hiface + yc*xblk, recvbuf, xblk); } } } } } } #if TARGET == QCDOC void SerialIO::sSpread(char * data, const int datablk) const { const char *fname = "sSpread()"; // clone the lattice from s==0 nodes to all s>0 nodes // only used in loading process if(qio_arg.Snodes() <= 1) return; // eg. 8 nodes // step: i ii iii iv // 0 --> 1 --> 2 --> 3 // | // \/ // 7 --> 6 --> 5 --> 4 SCUDirArg socket; VRB.Flow(cname, fname, "Spread on S dimension:: 0 ==> %d\n", qio_arg.Snodes()-1); if(qio_arg.Scoor() == 0) { socket.Init(data, SCU_SM, SCU_SEND, datablk); SCUTrans(&socket); SCUTransComplete(); } else if(qio_arg.Scoor() == qio_arg.Snodes()-1) { socket.Init(data, SCU_SP, SCU_REC, datablk); SCUTrans(&socket); SCUTransComplete(); } // spread simultaneously along +S and -S directions int sender[2] = { 0, qio_arg.Snodes()-1}; int receiver[2] = { 1, sender[1]-1 }; while(receiver[0] < receiver[1]) { // send until two directions converge synchronize(); VRB.Flow(cname, fname, "Spread on S dimension:: %d ==> %d\n", sender[0], receiver[0]); VRB.Flow(cname, fname, "Spread on S dimension:: %d ==> %d\n", sender[1], receiver[1]); if(qio_arg.Scoor() == sender[0]) { socket.Init(data, SCU_SP, SCU_SEND, datablk); SCUTrans(&socket); SCUTransComplete(); } else if(qio_arg.Scoor() == receiver[0]) { socket.Init(data, SCU_SM, SCU_REC, datablk); SCUTrans(&socket); SCUTransComplete(); } else if(qio_arg.Scoor() == sender[1]) { socket.Init(data, SCU_SM, SCU_SEND, datablk); SCUTrans(&socket); SCUTransComplete(); } else if(qio_arg.Scoor() == receiver[1]) { socket.Init(data, SCU_SP, SCU_REC, datablk); SCUTrans(&socket); SCUTransComplete(); } sender[0] = receiver[0]; sender[1] = receiver[1]; receiver[0]++; receiver[1]--; } } #else void SerialIO::sSpread(char * data, const int datablk) const { const char *fname = "sSpread()"; // clone the lattice from s==0 nodes to all s>0 nodes // only used in loading process if(qio_arg.Snodes() <= 1) return; // eg. 8 nodes // step: i ii iii iv // 0 --> 1 --> 2 --> 3 // | // \/ // 7 --> 6 --> 5 --> 4 TempBufAlloc snd_buf(datablk); TempBufAlloc rcv_buf(datablk); int s_nodes=qio_arg.Snodes(); int s_coor=qio_arg.Scoor(); if(s_coor == 0) memcpy(snd_buf.FPtr(), data, datablk); for(int i =0;i