//Periodically calculate true residual of single precision multi-mass solve using same parameters as the 32^3 gparity run, only using the 16^3 mobius+DSDR configs on a 128 node machine //Local volume will not be identical because 32^3 job has 8^4 local volume whereas 16^3 has 8x4x4x8 //However we might be able to determine the optimum single precision residual #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_BFM #include #endif //------------------------------------------------------------- USING_NAMESPACE_CPS using namespace std; const char *cname = ""; DoArg do_arg; RationalQuotientRemezArg rq_arg; #define decode_vml(arg_name) \ if(!UniqueID()) printf("Decoding %s.vml\n",#arg_name); fflush(stdout); \ do{ \ if ( ! arg_name.Decode(#arg_name".vml", #arg_name) ) \ ERR.General(cname, fname, "Bad " #arg_name ".vml.\n"); \ } while(0) void decode_vml_all(void) { char *fname = "decode_vml_all()"; //decode_vml(b_arg); decode_vml(do_arg); decode_vml(rq_arg); } void setup_bfmargs(bfmarg &dwfa, const Float &residual, const int &nthreads){ if(!UniqueID()) printf("Setting up bfmargs\n"); omp_set_num_threads(nthreads); dwfa.node_latt[0] = GJP.XnodeSites(); dwfa.node_latt[1] = GJP.YnodeSites(); dwfa.node_latt[2] = GJP.ZnodeSites(); dwfa.node_latt[3] = GJP.TnodeSites(); multi1d ncoor(4); multi1d procs(4); for(int i=0;i<4;i++){ ncoor[i] = GJP.NodeCoor(i); procs[i] = GJP.Nodes(i); } if(GJP.Gparity()){ dwfa.gparity = 1; if(!UniqueID()) printf("G-parity directions: "); for(int d=0;d<3;d++) if(GJP.Bc(d) == BND_CND_GPARITY){ dwfa.gparity_dir[d] = 1; if(!UniqueID()) printf("%d ",d); } else dwfa.gparity_dir[d] = 0; for(int d=0;d<4;d++){ dwfa.nodes[d] = procs[d]; dwfa.ncoor[d] = ncoor[d]; } if(!UniqueID()) printf("\n"); } dwfa.verbose=1; dwfa.reproduce=0; bfmarg::Threads(nthreads); bfmarg::Reproduce(0); bfmarg::ReproduceChecksum(0); bfmarg::ReproduceMasterCheck(0); bfmarg::Verbose(1); for(int mu=0;mu<4;mu++){ if ( procs[mu]>1 ) { dwfa.local_comm[mu] = 0; if(!UniqueID()) printf("Non-local comms in direction %d\n",mu); } else { dwfa.local_comm[mu] = 1; if(!UniqueID()) printf("Local comms in direction %d\n",mu); } } dwfa.precon_5d = 0; //mobius uses 4d preconditioning dwfa.mobius_scale = 32.0/12.0; dwfa.Ls = GJP.SnodeSites(); dwfa.solver = HmCayleyTanh; dwfa.M5 = toDouble(GJP.DwfHeight()); dwfa.mass = toDouble(0.0001); dwfa.Csw = 0.0; dwfa.max_iter = 100000; dwfa.residual = residual; if(!UniqueID()) printf("Finished setting up bfmargs\n"); } static int multi_shift_trueresid(Fermion_t psi[], Fermion_t src, double mass[], double alpha[], int nshift, double mresidual[], int single, bfm_evo &bfm_f, bfm_evo &bfm_d, int report_freq = 100) { //NOTE: Assumes bfm_d comms are active int me = bfm_d.thread_barrier(); // Per shift fields Fermion_t ps [nshift]; // search directions double bs [nshift]; double rsq[nshift]; double z[nshift][2]; int converged[nshift]; const int primary =0; //Primary shift fields CG iteration double a,b,c,d; double cp,bp; //prev Fermion_t p = bfm_f.threadedAllocFermion(mem_slow); Fermion_t r = bfm_f.threadedAllocFermion(mem_slow); mixed_cg::switch_comm(bfm_f, bfm_d); //Single precision copy of src Fermion_t src_f = bfm_f.threadedAllocFermion(mem_slow); mixed_cg::threaded_convFermion(src_f, src, bfm_f, bfm_d); // Matrix mult fields in single and double prec Fermion_t tmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t tmp2 = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t tmp_d = bfm_d.threadedAllocFermion(mem_slow); Fermion_t mp_d = bfm_d.threadedAllocFermion(mem_slow); Fermion_t mmp_d = bfm_d.threadedAllocFermion(mem_slow); Fermion_t r_d = bfm_d.threadedAllocFermion(mem_slow); Fermion_t p_d = bfm_d.threadedAllocFermion(mem_slow); // Check lightest mass for(int s=0;s nrow(Nd); for(int i = 0; i< Nd; ++i) nrow[i] = GJP.Sites(i); Layout::setLattSize(nrow); Layout::create(); } static int multi_shift_dp_spmprec (Fermion_t psi[], Fermion_t src, double mass[], double alpha[], int nshift, double mresidual[], int single, bfm_evo &bfm_f, bfm_evo &bfm_d) { //Assumes bfm_d comms active! int me = bfm_d.thread_barrier(); // Per shift fields Fermion_t ps [nshift]; // search directions double bs [nshift]; double rsq[nshift]; double z[nshift][2]; int converged[nshift]; const int primary =0; //Primary shift fields CG iteration double a,b,c,d; double cp,bp; //prev Fermion_t p ; Fermion_t r ; Fermion_t p_f ; // Matrix mult fields Fermion_t tmp = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mp = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mmp = bfm_d.threadedAllocFermion(mem_fast); Fermion_t tmp_f = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mp_f = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mmp_f = bfm_f.threadedAllocFermion(mem_fast); // Check lightest mass for(int s=0;s &bfm_f, bfm_evo &bfm_d, int switch_iter, int report_freq = -1) { //Assumes bfm_d comms active! int me = bfm_d.thread_barrier(); // Per shift fields Fermion_t ps [nshift], ps_f [nshift]; // search directions double bs [nshift]; double rsq[nshift]; double z[nshift][2]; int converged[nshift]; const int primary =0; //Primary shift fields CG iteration double a,b,c,d; double cp,bp; //prev Fermion_t p, p_f ; Fermion_t r, r_f ; // Matrix mult fields Fermion_t tmp = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mp = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mmp = bfm_d.threadedAllocFermion(mem_fast); Fermion_t tmp_f = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mp_f = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mmp_f = bfm_f.threadedAllocFermion(mem_fast); // Check lightest mass for(int s=0;s &bfm_f, bfm_evo &bfm_d, int update_freq = 100, int report_freq = -1) { //NOTE: Assumes bfm_d comms are active //update_freq is the frequency at which the reliable update step is performed //report_freq prints the double precision true residual when k % report_freq = 0. Use -1 to disable int me = bfm_d.thread_barrier(); double bs [nshift]; double rsq[nshift]; double z[nshift][2]; int converged[nshift]; const int primary =0; //Primary shift fields CG iteration double a,b,c,d; double cp,bp; //prev //Single precision fields Fermion_t r = bfm_f.threadedAllocFermion(mem_slow); //residual vector, single precision Fermion_t tmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t p = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t src_f = bfm_f.threadedAllocFermion(mem_slow); mixed_cg::threaded_convFermion(src_f, src, bfm_f, bfm_d); //Double precision fields Fermion_t p_d = bfm_d.threadedAllocFermion(mem_fast); //search direction, double precision Fermion_t tmp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mmp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t ps_d [nshift]; // search directions (double precision) for(int i=0;i &bfm_f, bfm_evo &bfm_d, int report_freq = -1) { double f; double cp,c,a,d,b; int me = bfm_f.thread_barrier(); //Assumes bfm_d comms active mixed_cg::switch_comm(bfm_f, bfm_d); Fermion_t src = bfm_f.threadedAllocFermion(mem_slow); mixed_cg::threaded_convFermion(src, src_d, bfm_f, bfm_d); Fermion_t psi = bfm_f.threadedAllocFermion(mem_slow); mixed_cg::threaded_convFermion(psi, psi_d, bfm_f, bfm_d); Fermion_t p = bfm_f.threadedAllocFermion(mem_fast); Fermion_t tmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t r = bfm_f.threadedAllocFermion(mem_fast); Fermion_t tmp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mmp_d = bfm_d.threadedAllocFermion(mem_fast); #define DEALLOCATE_ALL \ bfm_f.threadedFreeFermion(tmp); \ bfm_f.threadedFreeFermion(p); \ bfm_f.threadedFreeFermion(mp); \ bfm_f.threadedFreeFermion(mmp); \ bfm_f.threadedFreeFermion(r); \ bfm_f.threadedFreeFermion(psi); \ bfm_f.threadedFreeFermion(src); \ bfm_d.threadedFreeFermion(tmp_d); \ bfm_d.threadedFreeFermion(mp_d); \ bfm_d.threadedFreeFermion(mmp_d) //Initial residual computation & set up double guess = bfm_f.norm(psi); d= bfm_f.Mprec(psi,mp,tmp,DaggerNo); b= bfm_f.Mprec(mp,mmp,tmp,DaggerYes); bfm_f.axpy (r, mmp, src,-1.0); bfm_f.axpy (p, mmp, src,-1.0); a = bfm_f.norm(p); cp= bfm_f.norm(r); Float ssq = bfm_f.norm(src); if ( bfm_f.isBoss() && !me ) { printf("bfmbase::CGNE_prec gues %le \n",guess); printf("bfmbase::CGNE_prec src %le \n",ssq); printf("bfmbase::CGNE_prec mp %le \n",d); printf("bfmbase::CGNE_prec mmp %le \n",b); printf("bfmbase::CGNE_prec r %le \n",cp); printf("bfmbase::CGNE_prec p %le \n",a); } Float rsq = bfm_f.residual* bfm_f.residual*ssq; //Check if guess is really REALLY good :) if ( cp <= rsq ) { if ( bfm_f.isBoss() && !me ) { printf("bfmbase::CGNE_prec k=0 converged - suspiciously nice guess %le %le\n",cp,rsq); } mixed_cg::threaded_convFermion(psi_d, psi, bfm_d, bfm_f); mixed_cg::switch_comm(bfm_d, bfm_f); DEALLOCATE_ALL; return 0; } if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec k=0 residual %le rsq %le\n",cp,rsq); struct timeval start,stop; gettimeofday(&start,NULL); for (int k=1;k<=bfm_f.max_iter;k++){ c=cp; d = bfm_f.Mprec(p,mp,tmp,0,1); a = c/d; double qq= bfm_f.Mprec(mp,mmp,tmp,1); double b_pred = a*(a*qq-d)/c; cp = bfm_f.axpy_norm(r,mmp,r,-a); b = cp/c; bfm_f.axpy(psi,p,psi,a); bfm_f.axpy(p,p,r,b); if ( k%100 == 0 ){ if ( bfm_f.isBoss() && !me ) { printf("bfmbase::CGNE_prec: k=%d r^2=%le %le\n",k,cp,sqrt(cp/ssq)); } } // Stopping condition if ( cp <= rsq ) { gettimeofday(&stop,NULL); struct timeval diff; timersub(&stop,&start,&diff); if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec converged in %d iterations\n",k); if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec converged in %d.%6.6d s\n",diff.tv_sec,diff.tv_usec); //Calculate double precision true resid mixed_cg::threaded_convFermion(psi_d, psi, bfm_d, bfm_f); mixed_cg::switch_comm(bfm_d, bfm_f); bfm_d.Mprec(psi_d,mp_d,tmp_d,0); bfm_d.Mprec(mp_d,mmp_d,tmp_d,1); bfm_d.axpy(tmp_d,src_d,mmp_d,-1.0); double true_residual = sqrt(bfm_d.norm(tmp_d)/bfm_d.norm(src_d)); if ( bfm_d.isBoss() && !me ) printf("bfmbase::CGNE_prec: true residual is %le \n",true_residual); DEALLOCATE_ALL; return k; }else if(report_freq != -1 && k % report_freq == 0){ mixed_cg::threaded_convFermion(psi_d, psi, bfm_d, bfm_f); mixed_cg::switch_comm(bfm_d, bfm_f); bfm_d.Mprec(psi_d,mp_d,tmp_d,0); bfm_d.Mprec(mp_d,mmp_d,tmp_d,1); bfm_d.axpy(tmp_d,src_d,mmp_d,-1.0); double true_residual = sqrt(bfm_d.norm(tmp_d)/bfm_d.norm(src_d)); if ( bfm_d.isBoss() && !me ) printf("bfmbase::CGNE_prec: iter %d, double prec true residual is %.12le, running true residual is %.12le\n",k,true_residual,sqrt(cp/rsq)*bfm_f.residual); mixed_cg::switch_comm(bfm_f, bfm_d); } } if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec: CG not converged \n"); DEALLOCATE_ALL; mixed_cg::switch_comm(bfm_d, bfm_f); return -1; } #undef DEALLOCATE_ALL static int CGNE_singleprec_trueresid_reliable_update(Fermion_t psi_d, Fermion_t src_d, bfm_evo &bfm_f, bfm_evo &bfm_d, int update_freq = 10, int report_freq = -1) { double f; double cp,c,a,d,b; int me = bfm_f.thread_barrier(); //Assumes bfm_d comms active mixed_cg::switch_comm(bfm_f, bfm_d); Fermion_t src = bfm_f.threadedAllocFermion(mem_slow); mixed_cg::threaded_convFermion(src, src_d, bfm_f, bfm_d); Fermion_t psi = bfm_f.threadedAllocFermion(mem_slow); mixed_cg::threaded_convFermion(psi, psi_d, bfm_f, bfm_d); Fermion_t p = bfm_f.threadedAllocFermion(mem_fast); Fermion_t tmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t r = bfm_f.threadedAllocFermion(mem_fast); Fermion_t tmp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mmp_d = bfm_d.threadedAllocFermion(mem_fast); #define DEALLOCATE_ALL \ bfm_f.threadedFreeFermion(tmp); \ bfm_f.threadedFreeFermion(p); \ bfm_f.threadedFreeFermion(mp); \ bfm_f.threadedFreeFermion(mmp); \ bfm_f.threadedFreeFermion(r); \ bfm_f.threadedFreeFermion(psi); \ bfm_f.threadedFreeFermion(src); \ bfm_d.threadedFreeFermion(tmp_d); \ bfm_d.threadedFreeFermion(mp_d); \ bfm_d.threadedFreeFermion(mmp_d) //Initial residual computation & set up double guess = bfm_f.norm(psi); d= bfm_f.Mprec(psi,mp,tmp,DaggerNo); b= bfm_f.Mprec(mp,mmp,tmp,DaggerYes); bfm_f.axpy (r, mmp, src,-1.0); bfm_f.axpy (p, mmp, src,-1.0); a = bfm_f.norm(p); cp= bfm_f.norm(r); Float ssq = bfm_f.norm(src); if ( bfm_f.isBoss() && !me ) { printf("bfmbase::CGNE_prec gues %le \n",guess); printf("bfmbase::CGNE_prec src %le \n",ssq); printf("bfmbase::CGNE_prec mp %le \n",d); printf("bfmbase::CGNE_prec mmp %le \n",b); printf("bfmbase::CGNE_prec r %le \n",cp); printf("bfmbase::CGNE_prec p %le \n",a); } Float rsq = bfm_f.residual* bfm_f.residual*ssq; //Check if guess is really REALLY good :) if ( cp <= rsq ) { if ( bfm_f.isBoss() && !me ) { printf("bfmbase::CGNE_prec k=0 converged - suspiciously nice guess %le %le\n",cp,rsq); } mixed_cg::threaded_convFermion(psi_d, psi, bfm_d, bfm_f); mixed_cg::switch_comm(bfm_d, bfm_f); DEALLOCATE_ALL; return 0; } if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec k=0 residual %le rsq %le\n",cp,rsq); struct timeval start,stop; gettimeofday(&start,NULL); for (int k=1;k<=bfm_f.max_iter;k++){ c=cp; // if(k % update_freq == 0){ // //Replace r with true residual // mixed_cg::switch_comm(bfm_d, bfm_f); // mixed_cg::threaded_convFermion(psi_d, psi, bfm_d, bfm_f); // bfm_d.Mprec(psi_d,mp_d,tmp_d,0,1); // bfm_d.Mprec(mp_d,mmp_d,tmp_d,1); // double c_old = c; // c = bfm_d.axpy_norm(tmp_d,src_d,mmp_d,-1.0); // printf("bfmbase::CGNE_prec: reliable update iter %d, replaced |r|^2 = %.12le with |r|^2 = %.12le\n",k,c_old,c); // mixed_cg::threaded_convFermion(r, tmp_d, bfm_f, bfm_d); // mixed_cg::switch_comm(bfm_f, bfm_d); // } d = bfm_f.Mprec(p,mp,tmp,0,1); a = c/d; bfm_f.axpy(psi,p,psi,a); double qq= bfm_f.Mprec(mp,mmp,tmp,1); double b_pred = a*(a*qq-d)/c; if(k % update_freq == 0){ double cp_sp = bfm_f.axpy_norm(r,mmp,r,-a); //Replace r with true residual mixed_cg::switch_comm(bfm_d, bfm_f); mixed_cg::threaded_convFermion(psi_d, psi, bfm_d, bfm_f); bfm_d.Mprec(psi_d,mp_d,tmp_d,0,1); bfm_d.Mprec(mp_d,mmp_d,tmp_d,1); cp = bfm_d.axpy_norm(tmp_d,mmp_d,src_d,-1.0); if( bfm_d.isBoss() && !me) printf("bfmbase::CGNE_prec: reliable update iter %d, replaced |r|^2 = %.12le with |r|^2 = %.12le\n",k,cp_sp,cp); mixed_cg::threaded_convFermion(r, tmp_d, bfm_f, bfm_d); //mixed_cg::threaded_convFermion(p, tmp_d, bfm_f, bfm_d); //make next search direction equal to residual mixed_cg::switch_comm(bfm_f, bfm_d); }else{ cp = bfm_f.axpy_norm(r,mmp,r,-a); //b = cp/c; //bfm_f.axpy(p,p,r,b); } b = cp/c; bfm_f.axpy(p,p,r,b); if ( k%100 == 0 ){ if ( bfm_f.isBoss() && !me ) { printf("bfmbase::CGNE_prec: k=%d r^2=%le %le\n",k,cp,sqrt(cp/ssq)); } } // Stopping condition if ( cp <= rsq ) { gettimeofday(&stop,NULL); struct timeval diff; timersub(&stop,&start,&diff); if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec converged in %d iterations\n",k); if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec converged in %d.%6.6d s\n",diff.tv_sec,diff.tv_usec); //Calculate double precision true resid mixed_cg::threaded_convFermion(psi_d, psi, bfm_d, bfm_f); mixed_cg::switch_comm(bfm_d, bfm_f); bfm_d.Mprec(psi_d,mp_d,tmp_d,0); bfm_d.Mprec(mp_d,mmp_d,tmp_d,1); bfm_d.axpy(tmp_d,src_d,mmp_d,-1.0); double true_residual = sqrt(bfm_d.norm(tmp_d)/bfm_d.norm(src_d)); if ( bfm_d.isBoss() && !me ) printf("bfmbase::CGNE_prec: true residual is %le \n",true_residual); DEALLOCATE_ALL; return k; }else if(report_freq != -1 && k % report_freq == 0){ mixed_cg::threaded_convFermion(psi_d, psi, bfm_d, bfm_f); mixed_cg::switch_comm(bfm_d, bfm_f); bfm_d.Mprec(psi_d,mp_d,tmp_d,0); bfm_d.Mprec(mp_d,mmp_d,tmp_d,1); bfm_d.axpy(tmp_d,src_d,mmp_d,-1.0); double true_residual = sqrt(bfm_d.norm(tmp_d)/bfm_d.norm(src_d)); if ( bfm_d.isBoss() && !me ) printf("bfmbase::CGNE_prec: iter %d, double prec true residual is %.12le, running true residual is %.12le (|r|^2=%.12le)\n",k,true_residual,sqrt(cp/rsq)*bfm_f.residual,cp); mixed_cg::switch_comm(bfm_f, bfm_d); } } if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec: CG not converged \n"); DEALLOCATE_ALL; mixed_cg::switch_comm(bfm_d, bfm_f); return -1; } #undef DEALLOCATE_ALL static int CGNE_singleprec_trueresid_reliable_update_dp_vects(Fermion_t psi_d, Fermion_t src_d, bfm_evo &bfm_f, bfm_evo &bfm_d, int update_freq = 10, int report_freq = -1) { //Maintain seach vector in double precision double f; double cp,c,a,d,b; int me = bfm_d.thread_barrier(); //Assumes bfm_d comms active Fermion_t src = bfm_f.threadedAllocFermion(mem_slow); mixed_cg::threaded_convFermion(src, src_d, bfm_f, bfm_d); Fermion_t psi = bfm_f.threadedAllocFermion(mem_slow); mixed_cg::threaded_convFermion(psi, psi_d, bfm_f, bfm_d); Fermion_t p = bfm_f.threadedAllocFermion(mem_fast); Fermion_t tmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t r = bfm_f.threadedAllocFermion(mem_fast); Fermion_t p_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t tmp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mmp_d = bfm_d.threadedAllocFermion(mem_fast); #define DEALLOCATE_ALL \ bfm_f.threadedFreeFermion(p); \ bfm_f.threadedFreeFermion(tmp); \ bfm_f.threadedFreeFermion(mp); \ bfm_f.threadedFreeFermion(mmp); \ bfm_f.threadedFreeFermion(r); \ bfm_f.threadedFreeFermion(psi); \ bfm_f.threadedFreeFermion(src); \ bfm_d.threadedFreeFermion(p_d); \ bfm_d.threadedFreeFermion(tmp_d); \ bfm_d.threadedFreeFermion(mp_d); \ bfm_d.threadedFreeFermion(mmp_d) //Initial residual computation & set up mixed_cg::switch_comm(bfm_f, bfm_d); double guess = bfm_f.norm(psi); d= bfm_f.Mprec(psi,mp,tmp,DaggerNo); b= bfm_f.Mprec(mp,mmp,tmp,DaggerYes); cp = bfm_f.axpy_norm (r, mmp, src,-1.0); mixed_cg::threaded_convFermion(tmp_d, mmp, bfm_d, bfm_f); mixed_cg::switch_comm(bfm_d, bfm_f); a = bfm_d.axpy_norm (p_d, tmp_d, src_d,-1.0); mixed_cg::switch_comm(bfm_f, bfm_d); Float ssq = bfm_f.norm(src); if ( bfm_f.isBoss() && !me ) { printf("bfmbase::CGNE_prec gues %le \n",guess); printf("bfmbase::CGNE_prec src %le \n",ssq); printf("bfmbase::CGNE_prec mp %le \n",d); printf("bfmbase::CGNE_prec mmp %le \n",b); printf("bfmbase::CGNE_prec r %le \n",cp); printf("bfmbase::CGNE_prec p %le \n",a); } Float rsq = bfm_f.residual* bfm_f.residual*ssq; //Check if guess is really REALLY good :) if ( cp <= rsq ) { if ( bfm_f.isBoss() && !me ) { printf("bfmbase::CGNE_prec k=0 converged - suspiciously nice guess %le %le\n",cp,rsq); } mixed_cg::switch_comm(bfm_d, bfm_f); DEALLOCATE_ALL; return 0; } if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec k=0 residual %le rsq %le\n",cp,rsq); struct timeval start,stop; gettimeofday(&start,NULL); for (int k=1;k<=bfm_f.max_iter;k++){ c=cp; mixed_cg::threaded_convFermion(p, p_d, bfm_f, bfm_d); d = bfm_f.Mprec(p,mp,tmp,0,1); double qq= bfm_f.Mprec(mp,mmp,tmp,1); a = c/d; double b_pred = a*(a*qq-d)/c; bfm_d.axpy(psi_d,p_d,psi_d,a); if(k % update_freq == 0){ double cp_sp = bfm_f.axpy_norm(r,mmp,r,-a); //Replace r with true residual mixed_cg::switch_comm(bfm_d, bfm_f); bfm_d.Mprec(psi_d,mp_d,tmp_d,0,1); bfm_d.Mprec(mp_d,mmp_d,tmp_d,1); cp = bfm_d.axpy_norm(tmp_d,mmp_d,src_d,-1.0); if( bfm_d.isBoss() && !me) printf("bfmbase::CGNE_prec: reliable update iter %d, replaced |r|^2 = %.12le with |r|^2 = %.12le\n",k,cp_sp,cp); mixed_cg::threaded_convFermion(r, tmp_d, bfm_f, bfm_d); mixed_cg::switch_comm(bfm_f, bfm_d); }else{ cp = bfm_f.axpy_norm(r,mmp,r,-a); mixed_cg::threaded_convFermion(tmp_d, r, bfm_d, bfm_f); } //Update search direction b = cp/c; bfm_d.axpy(p_d,p_d,tmp_d,b); if ( k%100 == 0 ){ if ( bfm_f.isBoss() && !me ) { printf("bfmbase::CGNE_prec: k=%d r^2=%le %le\n",k,cp,sqrt(cp/ssq)); } } // Stopping condition if ( cp <= rsq ) { gettimeofday(&stop,NULL); struct timeval diff; timersub(&stop,&start,&diff); if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec converged in %d iterations\n",k); if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec converged in %d.%6.6d s\n",diff.tv_sec,diff.tv_usec); //Calculate double precision true resid mixed_cg::switch_comm(bfm_d, bfm_f); bfm_d.Mprec(psi_d,mp_d,tmp_d,0); bfm_d.Mprec(mp_d,mmp_d,tmp_d,1); bfm_d.axpy(tmp_d,src_d,mmp_d,-1.0); double true_residual = sqrt(bfm_d.norm(tmp_d)/bfm_d.norm(src_d)); if ( bfm_d.isBoss() && !me ) printf("bfmbase::CGNE_prec: true residual is %le \n",true_residual); DEALLOCATE_ALL; return k; }else if(report_freq != -1 && k % report_freq == 0){ mixed_cg::switch_comm(bfm_d, bfm_f); bfm_d.Mprec(psi_d,mp_d,tmp_d,0); bfm_d.Mprec(mp_d,mmp_d,tmp_d,1); bfm_d.axpy(tmp_d,src_d,mmp_d,-1.0); double true_residual = sqrt(bfm_d.norm(tmp_d)/bfm_d.norm(src_d)); if ( bfm_d.isBoss() && !me ) printf("bfmbase::CGNE_prec: iter %d, double prec true residual is %.12le, running true residual is %.12le (|r|^2=%.12le)\n",k,true_residual,sqrt(cp/rsq)*bfm_f.residual,cp); mixed_cg::switch_comm(bfm_f, bfm_d); } } if ( bfm_f.isBoss() && !me ) printf("bfmbase::CGNE_prec: CG not converged \n"); DEALLOCATE_ALL; mixed_cg::switch_comm(bfm_d, bfm_f); return -1; } #undef DEALLOCATE_ALL Float* rand_5d_canonical_fermion(Lattice &lat){ long f_size = (long)24 * GJP.VolNodeSites() * GJP.SnodeSites(); if(GJP.Gparity()) f_size*=2; Float *v1 = (Float *)pmalloc(sizeof(Float) * f_size); if(!UniqueID()) printf("Making random gaussian 5d vector\n"); lat.RandGaussVector((Vector*)v1, 0.5, 2, CANONICAL, FIVE_D); if(!UniqueID()) printf("Finished making random gaussian vector\n"); return v1; } int main(int argc, char *argv[]) { const char *fname = "main()"; if(argc<3){ if(!UniqueID()) printf("Usage: \n"); exit(0); } int test_no; { std::stringstream ss; ss << argv[2]; ss >> test_no; } if(test_no == 9) do_arg.start_conf_kind = START_CONF_ORD; setup(&argc, &argv); Float residual = 1e-10; int report_freq = 100; bool skip_test1 = false; int switch_iter = 90; int rel_up_freq = 10; bool resid_from_file = false; char* resid_file; int nthreads = 1; #if TARGET == BGQ nthreads = 64; #endif int i=3; while(i> residual; i+=2; }else if( strncmp(cmd,"-resid_file",15) == 0){ resid_from_file = true; resid_file = argv[i+1]; i+=2; }else if( strncmp(cmd,"-report_freq",20) == 0){ std::stringstream ss; ss << argv[i+1]; ss >> report_freq; i+=2; }else if( strncmp(cmd,"-switch_iter",20) == 0){ std::stringstream ss; ss << argv[i+1]; ss >> switch_iter; i+=2; }else if( strncmp(cmd,"-rel_up_freq",20) == 0){ std::stringstream ss; ss << argv[i+1]; ss >> rel_up_freq; i+=2; }else if( strncmp(cmd,"-threads",20) == 0){ std::stringstream ss; ss << argv[i+1]; ss >> nthreads; i+=2; }else{ ERR.General("","main","Unknown argument %s\n",cmd); } } // VRB.Result( "CPS", "main", "omp_get_num_threads[1] -> %d", omp_get_num_threads() ); // #pragma omp parallel // { // if ( UniqueID() == 0 && omp_get_thread_num() == 0 ) { // VRB.Result( "CPS", "main", "omp_get_num_threads[2] -> %d", omp_get_num_threads() ); // } // } // VRB.Result( "CPS", "main", "omp_get_num_threads[3] -> %d", omp_get_num_threads() ); GnoneFdwf lattice; //Get a RemezArg from RationalQuotientRemezArg. We only use the first fermion MD here RemezArg &remez_arg = rq_arg.frm_md.frm_md_val[0]; Float* src_cps = rand_5d_canonical_fermion(lattice); lattice.BondCond(); Float* gauge = (Float*) lattice.GaugeField(); bfmarg dwfa; setup_bfmargs(dwfa,residual,nthreads); if(!UniqueID()){ printf("sizeof float %d double %d\n",sizeof(float),sizeof(double)); fflush(stdout); } bfm_evo bfm_d; bfm_d.init(dwfa); // sync(); if(!UniqueID()){ printf("Importing gauge\n"); fflush(stdout); } bfm_d.cps_importGauge(gauge); //Init the single prec instance bfm_d.comm_end(); bfm_evo bfm_f; bfm_f.init(dwfa); bfm_f.cps_importGauge(gauge); bfm_f.comm_end(); bfm_d.comm_init(); // sync(); if(!UniqueID()){ printf("Allocating fermions\n"); fflush(stdout); } Fermion_t src_bfm[2] = {bfm_d.allocFermion(), bfm_d.allocFermion()}; //odd/even // sync(); if(!UniqueID()){ printf("Impexing random vector to bfm src vectors\n"); fflush(stdout); } bfm_d.cps_impexFermion(src_cps,src_bfm,1); //Allocate npoles Fermion vectors in double precision // sync(); if(!UniqueID()){ printf("Allocating solution vectors\n"); fflush(stdout); } Fermion_t sol[remez_arg.degree]; for(int i=0;i