#ifndef CPS_FIELD_UTILS_H #define CPS_FIELD_UTILS_H CPS_START_NAMESPACE inline void compareFermion(const CPSfermion5D &A, const CPSfermion5D &B, const std::string &descr = "Ferms", const double tol = 1e-9){ double fail = 0.; for(int i=0;i tol){ printf("Fail: (%d,%d,%d,%d,%d; %d; %d) A %g B %g rat_A_B %g fracdiff %g\n",x[0],x[1],x[2],x[3],x[4],f,sc,vbfm,vgrid,rat_grid_bfm,diff_rat); fail = 1.0; }//else printf("Pass: (%d,%d,%d,%d,%d; %d; %d) A %g B %g rat_A_B %g fracdiff %g\n",x[0],x[1],x[2],x[3],x[4],f,sc,vbfm,vgrid,rat_grid_bfm,diff_rat); } } } glb_max(&fail); if(fail!=0.0){ if(!UniqueID()){ printf("Failed %s check\n", descr.c_str()); fflush(stdout); } exit(-1); }else{ if(!UniqueID()){ printf("Passed %s check\n", descr.c_str()); fflush(stdout); } } } template::type, complex_double_or_float_mark>::value,int>::type = 0> inline void compareField(const FieldType &A, const FieldType &B, const std::string &descr = "Field", const double tol = 1e-9, bool print_all = false){ typedef typename FieldType::FieldSiteType::value_type value_type; double fail = 0.; for(int xf=0;xf tol || print_all){ if(!print_all) std::cout << "Fail: ("; else std::cout << "Pass: ("; for(int xx=0;xx &into, Fermion_t from, bfm_evo &dwf, int cb, bool singleprec_evec = false){ Fermion_t zero_a = dwf.allocFermion(); #pragma omp parallel { dwf.set_zero(zero_a); } Fermion_t etmp = dwf.allocFermion(); Fermion_t tmp[2]; tmp[!cb] = zero_a; if(singleprec_evec){ const int len = 24 * dwf.node_cbvol * (1 + dwf.gparity) * dwf.cbLs; #pragma omp parallel for for(int j = 0; j < len; j++) { ((double*)etmp)[j] = ((float*)(from))[j]; } tmp[cb] = etmp; }else tmp[cb] = from; dwf.cps_impexFermion(into.ptr(),tmp,0); dwf.freeFermion(zero_a); dwf.freeFermion(etmp); } #endif #ifdef USE_GRID template inline void exportGridcb(CPSfermion5D &into, typename GridPolicies::GridFermionField &from, typename GridPolicies::FgridFclass &latg){ Grid::GridCartesian *FGrid = latg.getFGrid(); typename GridPolicies::GridFermionField tmp_g(FGrid); tmp_g = Grid::zero; setCheckerboard(tmp_g, from); latg.ImportFermion((Vector*)into.ptr(), tmp_g); } #endif #ifdef USE_QMP //Cyclic permutation of *4D* CPSfield with std::complex type and FourDpolicy dimension policy //Conventions are direction of *data flow*: For shift n in direction +1 f'(x) = f(x-\hat i) so data is sent in the +x direction. #define CONDITION _equal::type, complex_double_or_float_mark>::value && (_equal::value || _equal::value) template< typename mf_Complex, int SiteSize, typename DimensionPolicy, typename FlavorPolicy, typename AllocPolicy> void cyclicPermute(CPSfield &to, const CPSfield &from, const int dir, const int pm, const int n, typename my_enable_if::type dummy = 0){ enum {Dimension = DimensionPolicy::EuclideanDimension}; assert(dir < Dimension); assert(n < GJP.NodeSites(dir)); assert(pm == 1 || pm == -1); if(&to == &from){ if(n==0) return; CPSfield tmpfrom(from); return cyclicPermute(to,tmpfrom,dir,pm,n); } if(n == 0){ to = from; return; } QMP_barrier(); //Prepare face to send. If we send in the + direction we need to collect the slice starting {L-n ... L-1} (inclusive), and if we send in the - dir we collect the slice {0... n-1} int bsites = n; //sites on boundary int bsizes[Dimension]; bsizes[dir] = n; int boff[Dimension]; boff[dir] = (pm == 1 ? GJP.NodeSites(dir)-n : 0); for(int i=0;i::type, grid_vector_complex_mark>::value && (_equal::value || _equal::value) //Version with SIMD vectorized data template< typename mf_Complex, int SiteSize, typename DimensionPolicy, typename FlavorPolicy, typename AllocPolicy> void cyclicPermute(CPSfield &to, const CPSfield &from, const int dir, const int pm, const int n, typename my_enable_if::type dummy = 0){ enum {Dimension = DimensionPolicy::EuclideanDimension}; assert(dir < Dimension); assert(n < GJP.NodeSites(dir)); assert(pm == 1 || pm == -1); if(&to == &from){ if(n==0) return; CPSfield tmpfrom(from); return cyclicPermute(to,tmpfrom,dir,pm,n); } if(n == 0){ to = from; return; } const int nsimd = mf_Complex::Nsimd(); //Use notation c (combined index), o (outer index) i (inner index) int bcsites = n; //sites on boundary int bcsizes[Dimension]; bcsizes[dir] = n; int bcoff[Dimension]; bcoff[dir] = (pm == 1 ? GJP.NodeSites(dir)-n : 0); int bcoff_postcomms[Dimension]; bcoff_postcomms[dir] = (pm == 1 ? 0 : GJP.NodeSites(dir)-n); for(int i=0;i::scalar_type scalarType; int bufsz = bcsites * SiteSize * nf; QMP_mem_t *recv_mem = QMP_allocate_memory(bufsz * sizeof(scalarType)); scalarType *recv_buf = (scalarType *)QMP_get_memory_pointer(recv_mem); QMP_mem_t *send_mem = QMP_allocate_memory(bufsz * sizeof(scalarType)); scalarType *send_buf = (scalarType *)QMP_get_memory_pointer(send_mem); int osites = from.nsites(); std::vector to_oi_buf_map(nf * osites * nsimd); //map from outer and inner index of destination site to offset within buffer, used *after* comms. //map i + nsimd*(o + osites*f) as index #pragma omp parallel for for(int c=0;c ounpacked(nsimd); for(int f=0;f > lane_offsets(nsimd, std::vector(Dimension) ); for(int i=0;i= GJP.NodeSites(dir)){ from_lane[lane] = -1; //indicates data is in recv_buf from_osite_idx[lane] = to_oi_buf_map[ lane + nsimd*oto ]; //here is for flavor 0 - remember to offset for second flav }else{ from_lane[lane] = from.SIMDmap(offrom_coor); from_osite_idx[lane] = from.siteMap(offrom_coor); } } //Now loop over flavor and element within the site as well as SIMD lanes of the destination vector and gather what we need to poke - then poke it Grid::Vector towrite(nsimd); Grid::Vector unpack(nsimd); for(int f=0;f::type, complex_double_or_float_mark>::value && (_equal::value || _equal::value) template< typename mf_Complex, int SiteSize, typename DimensionPolicy, typename FlavorPolicy, typename AllocPolicy> void cyclicPermute(CPSfield &to, const CPSfield &from, const int dir, const int pm, const int n, typename my_enable_if::type dummy = 0){ enum {Dimension = DimensionPolicy::EuclideanDimension}; assert(dir < Dimension); assert(n < GJP.NodeSites(dir)); assert(pm == 1 || pm == -1); if(&to == &from){ if(n==0) return; CPSfield tmpfrom(from); return cyclicPermute(to,tmpfrom,dir,pm,n); } if(n == 0){ to = from; return; } const int nodes = GJP.Xnodes()*GJP.Ynodes()*GJP.Znodes()*GJP.Tnodes()*GJP.Snodes(); if(nodes != 1) ERR.General("","cyclicPermute","Parallel implementation requires QMP\n"); #pragma omp parallel for for(int i=0;i::type, grid_vector_complex_mark>::value && (_equal::value || _equal::value) //Version with SIMD vectorized data template< typename mf_Complex, int SiteSize, typename DimensionPolicy, typename FlavorPolicy, typename AllocPolicy> void cyclicPermute(CPSfield &to, const CPSfield &from, const int dir, const int pm, const int n, typename my_enable_if::type dummy = 0){ enum {Dimension = DimensionPolicy::EuclideanDimension}; assert(dir < Dimension); assert(n < GJP.NodeSites(dir)); assert(pm == 1 || pm == -1); if(&to == &from){ if(n==0) return; CPSfield tmpfrom(from); return cyclicPermute(to,tmpfrom,dir,pm,n); } if(n == 0){ to = from; return; } const int nodes = GJP.Xnodes()*GJP.Ynodes()*GJP.Znodes()*GJP.Tnodes()*GJP.Snodes(); if(nodes != 1) ERR.General("","cyclicPermute","Parallel implementation requires QMP\n"); const int nsimd = mf_Complex::Nsimd(); typedef typename mf_Complex::scalar_type scalar_type; const int nthr = omp_get_max_threads(); scalar_type* tmp_store_thr[nthr]; for(int i=0;i 0 ? +1 : -1; } //Invoke multiple independent permutes to offset field by vector 'shift' assuming field is periodic template void shiftPeriodicField(FieldType &to, const FieldType &from, const std::vector &shift){ int nd = shift.size(); //assume ascending: x,y,z,t int nshift_dirs = 0; for(int i=0;i void fft(CPSfieldType &into, const CPSfieldType &from, const bool* do_dirs, const bool inverse_transform = false, typename my_enable_if<_equal::type, complex_double_or_float_mark>::value, const int>::type = 0 ){ typedef typename LocalToGlobalInOneDirMap::type DimPolGlobalInOneDir; typedef CPSfieldGlobalInOneDir CPSfieldTypeGlobalInOneDir; int dcount = 0; for(int mu=0;mu void fft(CPSfieldType &into, const CPSfieldType &from, const bool* do_dirs, const bool inverse_transform = false, typename my_enable_if<_equal::type, grid_vector_complex_mark>::value, const int>::type = 0 ){ typedef typename Grid::GridTypeMapper::scalar_type ScalarType; typedef typename CPSfieldType::FieldDimensionPolicy::EquivalentScalarPolicy ScalarDimPol; typedef CPSfield ScalarFieldType; NullObject null_obj; ScalarFieldType tmp_in(null_obj); ScalarFieldType tmp_out(null_obj); tmp_in.importField(from); fft(tmp_out, tmp_in, do_dirs, inverse_transform); tmp_out.exportField(into); } #endif template void fft(CPSfieldType &fftme, const bool* do_dirs){ fft(fftme,fftme,do_dirs); } template void fft_opt(CPSfieldType &into, const CPSfieldType &from, const bool* do_dirs, const bool inverse_transform = false, typename my_enable_if<_equal::type, complex_double_or_float_mark>::value, const int>::type = 0 ){ #ifndef USE_MPI fft(into,from,do_dirs,inverse_transform); #else enum { Dimension = CPSfieldType::FieldDimensionPolicy::EuclideanDimension }; int ndirs_fft = 0; for(int i=0;i node_map; getMPIrankMap(node_map); CPSfieldType tmp(from.getDimPolParams()); //we want the last fft to end up in 'into'. Intermediate FFTs cycle between into and tmp as temp storage. Thus for odd ndirs_fft, the first fft should output to 'into', for even it should output to 'tmp' CPSfieldType *tmp1, *tmp2; if(ndirs_fft % 2 == 1){ tmp1 = &into; tmp2 = &tmp; }else{ tmp1 = &tmp; tmp2 = &into; } CPSfieldType* src = tmp2; CPSfieldType* out = tmp1; int fft_count = 0; for(int mu=0; mu void fft_opt_mu(CPSfieldType &into, const CPSfieldType &from, const int mu, const std::vector &node_map, const bool inverse_transform, typename my_enable_if<_equal::type, complex_double_or_float_mark>::value, const int>::type = 0 ){ enum {SiteSize = CPSfieldType::FieldSiteSize, Dimension = CPSfieldType::FieldDimensionPolicy::EuclideanDimension }; typedef typename CPSfieldType::FieldSiteType ComplexType; typedef typename ComplexType::value_type FloatType; typedef typename FFTWwrapper::complexType FFTComplex; const int nf = from.nflavors(); const int foff = from.flav_offset(); const int nthread = omp_get_max_threads(); //Eg for fft in X-direction, divide up Y,Z,T work over nodes in X-direction doing linear FFTs. const int munodesites = GJP.NodeSites(mu); const int munodes = GJP.Nodes(mu); const int mutotalsites = munodesites*munodes; const int munodecoor = GJP.NodeCoor(mu); const int n_orthdirs = Dimension - 1; FloatType Lmu(mutotalsites); int orthdirs[n_orthdirs]; //map of orthogonal directions to mu int total_work_munodes = 1; //sites orthogonal to FFT direction int o=0; for(int i=0;i< Dimension;i++) if(i!=mu){ total_work_munodes *= GJP.NodeSites(i); orthdirs[o++] = i; } //Divvy up work over othogonal directions int munodes_work[munodes]; int munodes_off[munodes]; for(int i=0;i plan_f_base[Dimension]; //destructors deallocate plans static FFTplanContainer plan_f_base_p1[Dimension]; static int plan_howmany[Dimension]; static bool plan_init = false; static int plan_fft_phase; if(!plan_init || plan_howmany[mu] != howmany || fft_phase != plan_fft_phase){ if(!plan_init) for(int i=0;i::complexType *tmp_f; //I don't think it actually does anything with this plan_fft_phase = fft_phase; const int fft_work_per_musite = howmany_per_thread_base; const int musite_stride = howmany; //stride between musites plan_f_base[mu].setPlan(1, &mutotalsites, fft_work_per_musite, tmp_f, NULL, musite_stride, 1, tmp_f, NULL, musite_stride, 1, plan_fft_phase, FFTW_ESTIMATE); plan_f_base_p1[mu].setPlan(1, &mutotalsites, fft_work_per_musite+1, tmp_f, NULL, musite_stride, 1, tmp_f, NULL, musite_stride, 1, plan_fft_phase, FFTW_ESTIMATE); plan_init = true; //other mu's will still init later } FFTComplex*fftw_mem = (FFTComplex*)recv_buf; #pragma omp parallel { assert(nthread == omp_get_num_threads()); //plans will be messed up if not true const int me = omp_get_thread_num(); int thr_work, thr_off; thread_work(thr_work, thr_off, howmany, me, nthread); const FFTplanContainer* thr_plan_ptr; if(thr_work == howmany_per_thread_base) thr_plan_ptr = &plan_f_base[mu]; else if(thr_work == howmany_per_thread_base + 1) thr_plan_ptr = &plan_f_base_p1[mu]; else assert(0); //catch if logic for thr_work changes FFTWwrapper::execute_dft(thr_plan_ptr->getPlan(), fftw_mem + thr_off, fftw_mem + thr_off); } wret = MPI_Waitall(munodes,send_req,status); assert(wret == MPI_SUCCESS); //Send back out. Reuse the old send buffers as receive buffers and vice versa for(int i=0;i void fft_opt(CPSfieldType &into, const CPSfieldType &from, const bool* do_dirs, const bool inverse_transform = false, typename my_enable_if<_equal::type, grid_vector_complex_mark>::value, const int>::type = 0 ){ //we can avoid the copies below but with some effort - do at some point # ifdef USE_MPI fft(into,from,do_dirs,inverse_transform); # else typedef typename Grid::GridTypeMapper::scalar_type ScalarType; typedef typename CPSfieldType::FieldDimensionPolicy::EquivalentScalarPolicy ScalarDimPol; typedef CPSfield ScalarFieldType; NullObject null_obj; ScalarFieldType tmp_in(null_obj); ScalarFieldType tmp_out(null_obj); tmp_in.importField(from); fft_opt(tmp_out, tmp_in, do_dirs, inverse_transform); tmp_out.exportField(into); # endif } #endif CPS_END_NAMESPACE #endif