// -*- mode:c++; c-basic-offset: 4 -*- #ifndef INCLUDED_BFM_MIXED_SOLVER_MULTI_H #define INCLUDED_BFM_MIXED_SOLVER_MULTI_H #include #include #include #include #include #include #include #include //disabling. Seems to have been manually copied from C. Kelly #if 0 namespace mixed_cg { // Convert between single/double precision bfm fermions // CK: And do it quickly! The original takes as long as an Mprec! template inline void threaded_convFermion_fast(Fermion_t out, Fermion_t in, bfm_evo &bfm_out, bfm_evo &bfm_in) { // Simple copy, this shouldn't be called, right? if(sizeof(Float_out) == sizeof(Float_in)) { return bfm_out.copy(out, in); } #if 0 const static int nspinco = 12; int me,thrlen,throff; int work = (bfm_out.gparity? 2:1)*bfm_out.cbLs*bfm_out.simd_cbvol*bfm_out.nsimd*nspinco*2; bfm_out.thread_work(work,me,thrlen,throff); Float_in *x = (Float_in *) in + throff; Float_out *y = (Float_out *) out + throff; for(int s=0;s1){ toadd *= sigma_sum_recurse(i,j+1,N,shifts,nprod_remaining-1); } out += toadd; } return out; } inline double sigma_prod(const int &i, const int &n, const int &N, double shifts[]){ if(n==N-1) return 1.0; int prod_size = N-1-n; return sigma_sum_recurse(i,0,N,shifts,prod_size); } /*CK: Implementation of multi-shift with guesses from J.~C.~Osborn, ``Initial guesses for multi-shift solvers,'' PoS LATTICE {\bf 2008} (2008) 029 [arXiv:0810.1081 [hep-lat]]. */ template int CGNE_prec_MdagM_multi_shift_with_guesses(Fermion_t psi[], Fermion_t src, Fermion_t guesses[], double mass[], double alpha[], int nshift, const double mresidual_in[], int single, bfm_evo &bfm) { //'single' appears to sum all the solutions into psi[0] int me = bfm.thread_barrier(); //Threads take a local copy of mresidual_in[] so wno worries about race conditions double mresidual[nshift]; for(int i=0;ij, k<=N-1} shift[k] ) B^{N-3}w // +... //The coefficient of a term B^n is the sum of all the unique products of (N-1)-n elements of the set of shifts excluding shift[i]: { shift[0], shift[1],.. excl shift[i].... shift[N-2], shift[N-1] } //We take a product of size zero to have value 1.0 (as we would with factorials). Indices all start from 0 and end at N-1. //For example //n=N-1 has coefficent 1.0, which is the product of (N-1)-(N-1)=0 shifts //n=N-2 is the sum of all products comprising 1 shift: sum_{j!=i, j<=N-1} shift[j], which //n=N-3 is the sum of all products of 2 shifts: shift[0]*(shift[1] + shift[2] + ... shift[N-1]) + shift[1]*(shift[2] + shift[3] + ... shift[N-1]) + ... + shift[N-2]*shift[N-1] // = sum_{j!=i, j<=N-2} shift[j] * ( sum_{k!=i,k>j;k<=N-1} shift[k] ) //etc... //n=0 is the product of all shifts bar i //Note, we contain the running product of MMdag in w: w_n+1 = MMdag w_n and definining w_0 = w Fermion_t tmp = bfm.threadedAllocFermion(mem_fast); Fermion_t tmp2 = bfm.threadedAllocFermion(mem_fast); for(int n=0;n1){ toadd *= sigma_sum_recurse_2(i,k,j+1,N,shifts,nprod_remaining-1); } out += toadd; } return out; } inline double sigma_prod_2(const int &i, const int &k, const int &n, const int &N, double shifts[]){ if(n==N-2) return 1.0; int prod_size = N-2-n; return sigma_sum_recurse_2(i,k,0,N,shifts,prod_size); } //CK: Multi-mass with multiple sources, using the method in the Osborn paper cited above template int CGNE_prec_MdagM_multi_shift_multi_src(Fermion_t psi[], Fermion_t src[], double mass[], double alpha[], int nshift, const double mresidual_in[], int single, bfm_evo &bfm) { //'single' appears to sum all the solutions into psi[0] int me = bfm.thread_barrier(); //Not know if mresidual_in is shared or not. To be safe , threads each take a local copy of the residuals that they can //all simultaneously modify without worrying about race conditions double mresidual[nshift]; for(int i=0;ij, l\neq i,k} \sigma_l + ..... for(int m=0;m inline void MdagMplusShift(Fermion_t in, Fermion_t out, const double &shift, Fermion_t tmp1, Fermion_t tmp2, bfm_evo &bfm){ bfm.Mprec(in, tmp1, tmp2, DaggerNo); bfm.Mprec(tmp1, out, tmp2, DaggerYes); bfm.axpy(out, in, out, shift); } //CK: mixed precision multi-mass using multiple single precision restarted inner loop. // Does not work very well because the rediduals get very large such that the required stopping conditions // are less than single precision accuracy // Both sol and src are double precision fermions. Single precision // solver is only used internally. // mass, alpha and nshift as usual // dresidual are the target residuals // fresidual are the initial single precision residuals, which are dynamically modified during the solve // // Things to be set before using this function: // // double precision solver mass, max iteration // number. // // single precision solver mass, max iteration // number. // // Gauge field must be initialized for both double and single prec // solvers. // // the communication subsystem must be ready for bfm_d to use (due to // the way bfmcommspi is written, one must reinitialize the // communication object when switching between single and double // precisions). // // max_cycle: the maximum number of restarts will be performed. inline int threaded_cg_mixed_restarted_multi_shift_MdagM(Fermion_t sol[], Fermion_t src, double mass[], double alpha[], int nshift, const double dresidual[], const double fresidual_in[], int single, bfm_evo &bfm_d, bfm_evo &bfm_f, int max_cycle){ int me = bfm_d.thread_barrier(); double fresidual[nshift]; for(int i=0;i(sol[n], tv1_d, mass[n], src_d, tv2_d, bfm_d); // tv1_d = (MdagM + mass[n]) * sol[n] double norm = bfm_d.axpy_norm(src_d, tv1_d, src, -1.); //ad hoc stopping condition from cg_mixed implementation if(norm < 100. * stop[n]) finished[n] = 1; if(bfm_f.isBoss() && !me) printf("threaded_cg_mixed_restarted_multi_shift_MdagM: iter = %d shift = %d rsd = %17.10e(d) stop = %17.10e(d) finished = %d\n", i,n, norm, stop[n],finished[n]); while(norm * fresidual[n] * fresidual[n] < stop[n]) fresidual[n] *= 2; threaded_convFermion(src_f[n], src_d, bfm_f, bfm_d); } fin_count += finished[n]; } if(fin_count == nshift) break; //stop when all have finished switch_comm(bfm_f, bfm_d); iter += CGNE_prec_MdagM_multi_shift_multi_src(sol_f, src_f, mass, alpha, nshift, fresidual, 0, bfm_f); switch_comm(bfm_d, bfm_f); for(int n=0;n &bfm_d, bfm_evo &bfm_f){ int me = bfm_d.thread_barrier(); //Source and solution locations for single precision input and output Fermion_t sol_f[nshift]; for(int n=0;n int threaded_CGNE_MdagM_plus_shift(Fermion_t psi, Fermion_t src, Float shift, bfm_evo &bfm) { //Standard CG algorithm from BFM: //(Use subscript to label iteration) //r_1 = MMdag psi - src, p_1 = MMdag psi - src, c_1 = |r_1|^2 = |p_1|^2 //Iteration: //d_k = |M p_k|^2 = p_k^dag M^dag M p_k //a_k = c_k / d_k //r_k+1 = MMdag p_k - a_k r_k //c_k+1 = |r_k+1|^2 //b_k = c_k+1 / c_k //psi = a_k p_k + psi //p_k+1 = b_k p_k + r_k+1 //Note: norm(vec) is actually |vec|^2 //Shift modified version should look similar: //r_1 = (MMdag+shift) psi - src, p_1 = (MMdag+shift) psi - src, c_1 = |r_1|^2 = |p_1|^2 //Iteration: //d_k = p_k^dag M^dag M p_k + shift * p_k^dag p_k //a_k = c_k / d_k //r_k+1 = (MMdag+shift) p_k - a_k r_k //c_k+1 = |r_k+1|^2 //b_k = c_k+1 / c_k //psi = a_k p_k + psi //p_k+1 = b_k p_k + r_k+1 int me = bfm.thread_barrier(); int verbose = bfm.verbose; double f; double cp,c,a,d,b; double residual = bfm.residual; int max_iter = bfm.max_iter; if ( bfm.isBoss() && (!me) ) { bfm.InverterEnter(); } Fermion_t p = bfm.threadedAllocFermion(mem_fast); Fermion_t tmp = bfm.threadedAllocFermion(mem_fast); Fermion_t mp = bfm.threadedAllocFermion(mem_fast); Fermion_t mmp = bfm.threadedAllocFermion(mem_fast); Fermion_t r = bfm.threadedAllocFermion(mem_fast); //Initial residual computation & set up double guess = bfm.norm(psi); d = bfm.Mprec(psi,mp,tmp,DaggerNo); bfm.Mprec(mp,mmp,tmp,DaggerYes); b = bfm.axpy_norm(mmp,psi,mmp,shift); //MMdag psi + shift*psi cp= bfm.axpy_norm (r, mmp, src,-1.0); a = bfm.axpy_norm (p, mmp, src,-1.0); //a = bfm.norm(p); //cp= bfm.norm(r); //r_1 = (MMdag+shift) psi - src, p_1 = (MMdag+shift) psi - src, c_1 = |r_1|^2 = |p_1|^2 Float ssq = bfm.norm(src); if ( verbose && bfm.isBoss() && !me ) { printf("mixed_cg::CGNE_MdagM_plus_shift gues %le \n",guess); printf("mixed_cg::CGNE_MdagM_plus_shift src %le \n",ssq); printf("mixed_cg::CGNE_MdagM_plus_shift Mp %le \n",d); printf("mixed_cg::CGNE_MdagM_plus_shift (MMdag + shift)p %le \n",b); printf("mixed_cg::CGNE_MdagM_plus_shift r %le \n",cp); printf("mixed_cg::CGNE_MdagM_plus_shift p %le \n",a); } Float rsq = residual* residual*ssq; //Check if guess is really REALLY good :) if ( cp <= rsq ) { if ( verbose && bfm.isBoss() && !me ) { printf("mixed_cg::CGNE_MdagM_plus_shift k=0 converged - suspiciously nice guess %le %le\n",cp,rsq); } bfm.threadedFreeFermion(tmp); bfm.threadedFreeFermion(p); bfm.threadedFreeFermion(mp); bfm.threadedFreeFermion(mmp); bfm.threadedFreeFermion(r); if ( bfm.isBoss() && (!me) ) { bfm.InverterExit(); } return 0; } if ( verbose && bfm.isBoss() && !me ) printf("mixed_cg::CGNE_MdagM_plus_shift k=0 residual %le rsq %le\n",cp,rsq); if ( bfm.isBoss() && !me ) { if ( bfm.watchfile ) { printf("mixed_cg::CGNE_MdagM_plus_shift watching file \"%s\"\n",bfm.watchfile); } } struct timeval start,stop; if(bfm.isBoss() && !me) gettimeofday(&start,NULL); for (int k=1;k<=max_iter;k++){ bfm.iter=k; uint64_t t_iter_1=GetTimeBase(); c=cp; uint64_t t_mprec_1=GetTimeBase(); //d_k = p_k^dag M^dag M p_k + shift * p_k^dag p_k d = bfm.Mprec(p,mp,tmp,0,1); double norm_p = bfm.norm(p); d += shift * norm_p; uint64_t t_mprec_2=GetTimeBase(); a = c/d; uint64_t t_mprec_3=GetTimeBase(); bfm.Mprec(mp,mmp,tmp,1); bfm.axpy(mmp,p,mmp,shift); // mmp = MMdag p + shift * p uint64_t t_mprec_4=GetTimeBase(); uint64_t tr1=GetTimeBase(); cp = bfm.axpy_norm(r,mmp,r,-a); //r_k+1 = (MMdag+shift) p_k - a_k r_k b = cp/c; uint64_t tr2=GetTimeBase(); uint64_t tpsi1=GetTimeBase(); bfm.axpy(psi,p,psi,a); uint64_t tpsi2=GetTimeBase(); // New (conjugate/M-orthogonal) search direction uint64_t tp1=GetTimeBase(); bfm.axpy(p,p,r,b); uint64_t tp2=GetTimeBase(); uint64_t t_iter_2=GetTimeBase(); // verbose nonsense if ( (bfm.iter==bfm.time_report_iter) && bfm.isBoss() && (!me) && verbose) { int lx = bfm.node_latt[0]; int ly = bfm.node_latt[1]; int lz = bfm.node_latt[2]; int lt = bfm.node_latt[3]; int cb4dsites = (lx*ly*lz*lt)/2; printf("fermionCacheFootprint: %ld \n",7*bfm.axpyBytes()/3); printf("gauge CacheFootprint: %ld \n",2*18*8*cb4dsites*2); printf("fermionVecBytes : %ld \n",bfm.axpyBytes()/3); printf("axpyBytes : %ld \n",bfm.axpyBytes()); printf("axpy (soln) : %ld cyc %le MB/s\n",(tpsi2-tpsi1),(double)bfm.axpyBytes()*1600./(tpsi2-tpsi1)); printf("axpy_norm (residual) : %ld cyc %le MB/s\n",(tr2-tr1),(double)bfm.axpyBytes()*1600./(tr2-tr1)); printf("axpy (search) : %ld cyc %le MB/s\n",(tp2-tp1),(double)bfm.axpyBytes()*1600./(tp2-tp1)); printf("Iter time : %ld cyc\n",t_iter_2-t_iter_1); printf("linalg time : %ld cyc\n",t_iter_2-t_iter_1-(t_mprec_2-t_mprec_1)-(t_mprec_4-t_mprec_3)); printf("Mprec time : %ld cyc\n",t_mprec_2-t_mprec_1); printf("Mprec time : %ld cyc\n",t_mprec_4-t_mprec_3); fflush(stdout); } if ( ((k%100 == 0) && (verbose!=0)) || (verbose > 10)){ if ( bfm.isBoss() && !me ) { printf("mixed_cg::CGNE_MdagM_plus_shift: k=%d r^2=%le %le %lx\n",k,cp,sqrt(cp/ssq),&bfm); } } // Stopping condition if ( cp <= rsq ) { //I did not update the flops count so I have commented them out struct timeval diff; if ( bfm.isBoss() && !me ){ gettimeofday(&stop,NULL); timersub(&stop,&start,&diff); } if ( bfm.isBoss() && !me ) printf("mixed_cg::CGNE_MdagM_plus_shift converged in %d iterations\n",k); if ( bfm.isBoss() && !me ) printf("mixed_cg::CGNE_MdagM_plus_shift converged in %d.%6.6d s\n",diff.tv_sec,diff.tv_usec); //double flops = mprecFlops()*2.0 + 2.0*axpyNormFlops() + axpyFlops()*2.0; //flops = flops * k; //double t = diff.tv_sec*1.0E6 + diff.tv_usec; // if ( isBoss()&& !me ) // printf("mixed_cg::CGNE_MdagM_plus_shift: %d mprec flops/site\n",mprecFlopsPerSite()); // if ( isBoss()&& !me ) printf("mixed_cg::CGNE_MdagM_plus_shift: %le flops\n",flops); // if ( isBoss()&& !me ) printf("mixed_cg::CGNE_MdagM_plus_shift: %le mflops per node\n",flops/t); if ( bfm.isBoss() && !me ){ printf("mixed_cg::CGNE_MdagM_plus_shift calculating true resid. V0\n"); fflush(stdout); } //DEBUG bfm.Mprec(psi,mp,tmp,0); bfm.Mprec(mp,mmp,tmp,1); bfm.axpy(mmp,psi,mmp,shift); double resid = bfm.axpy_norm(tmp,src,mmp,-1.0); double src_norm = bfm.norm(src); double true_residual = sqrt(resid/src_norm); if ( bfm.isBoss() && !me ) printf("mixed_cg::CGNE_MdagM_plus_shift: true residual is %le \n",true_residual); if ( bfm.isBoss() && !me ){ printf("mixed_cg::CGNE_MdagM_plus_shift cleaning up\n"); fflush(stdout); } //DEBUG bfm.threadedFreeFermion(tmp); bfm.threadedFreeFermion(p); bfm.threadedFreeFermion(mp); bfm.threadedFreeFermion(mmp); bfm.threadedFreeFermion(r); #ifdef LIST_ENGINE if ( bfm.list_engine) bfm.L1P_PatternUnconfigure(); #endif if ( bfm.isBoss() && (!me) ) { bfm.InverterExit(); } return k; } } if ( bfm.isBoss() && !me ) printf("mixed_cg::CGNE_MdagM_plus_shift: CG not converged \n"); bfm.threadedFreeFermion(tmp); bfm.threadedFreeFermion(p); bfm.threadedFreeFermion(mp); bfm.threadedFreeFermion(mmp); bfm.threadedFreeFermion(r); #ifdef LIST_ENGINE if (bfm.list_engine ) bfm.L1P_PatternUnconfigure(); #endif if ( bfm.isBoss() && (!me) ) { bfm.InverterExit(); } return -1; } //CK: Single precision solve followed by defect correction loop using single shift solver independently // for each shift // Both sol and src are double precision fermions. Single precision // solver is only used internally. // // Things to be set before using this function: // // double precision solver mass, stopping condition, max iteration // number. // // single precision solver mass, stopping condition, max iteration // number. // // Gauge field must be initialized for both double and single prec // solvers. // // the communication subsystem must be ready for bfm_d to use (due to // the way bfmcommspi is written, one must reinitialize the // communication object when switching between single and double // precisions). // // max_cycle: the maximum number of restarts will be performed. //fresidual are the residuals used for the initial single precision solve //min_fp_resid: the smallest value for the residual of the initial single-precision multi-mass solve inline int threaded_cg_mixed_defect_correction_multi_shift_MdagM(Fermion_t sol[], Fermion_t src, double mass[], double alpha[], bfm_evo &bfm_d, bfm_evo &bfm_f, int nshift, double mresidual[], double fresidual[], int single, int max_cycle){ int me = bfm_d.thread_barrier(); double frsd = bfm_f.residual; //save original residual for later restoration //First we perform the multi-mass inversion using the single-precision solver Fermion_t src_f = bfm_f.threadedAllocFermion(); Fermion_t sol_f[nshift]; for(int i=0;i(sol_f[shift],src_f,mass[shift],bfm_f); switch_comm(bfm_d, bfm_f); threaded_convFermion(tv1_d, sol_f[shift], bfm_d, bfm_f); bfm_d.axpy(sol[shift], tv1_d, sol[shift], 1.); } bfm_f.residual = frsd; //restore original single precision residual at end of each step } bfm_d.threadedFreeFermion(src_d); bfm_d.threadedFreeFermion(tv1_d); bfm_d.threadedFreeFermion(tv2_d); for(int i=0;i(sol[shift],src,mass[shift],bfm_d); bfm_d.residual = restore_resid; //bfm_d.thread_barrier(); //Not needed because barrier in next call double sol_norm = bfm_d.norm(sol[shift]); if(bfm_d.isBoss() && !me) printf("threaded_cg_mixed_defect_correction_multi_shift_MdagM: final sol[%d] norm = %17.10e\n", shift,sol_norm); } if ( single ) { for(int s=1; s < nshift; s++) { bfm_d.axpy(sol[0],sol[s],sol[0],1.0); } } return iter; } //CK 2014: The version below performs the multi-shift with the matrix multiplication in single precision. The residual is stored in single precision, but the search directions and solution are //stored in double precision. Every update_freq iterations the residual is corrected in double precision. //Note that the final double precision residuals may not be as good as desired, so you may want to perform defect correction on each pole afterwards. I have added a version that does this extra step below. inline int threaded_cg_mixed_multi_shift_MdagM_sp_relup_dp(Fermion_t psi[], Fermion_t src, double mass[], double alpha[], int nshift, double mresidual[], int single, bfm_evo &bfm_f, bfm_evo &bfm_d, int update_freq = 100, int report_freq = -1) { //NOTE: Assumes bfm_d comms are active //update_freq is the frequency at which the reliable update step is performed //report_freq prints the double precision true residual when k % report_freq = 0. Use -1 to disable int me = bfm_d.thread_barrier(); double bs [nshift]; double rsq[nshift]; double z[nshift][2]; int converged[nshift]; const int primary =0; //Primary shift fields CG iteration double a,b,c,d; double cp,bp; //prev //Single precision fields Fermion_t r = bfm_f.threadedAllocFermion(mem_slow); //residual vector, single precision Fermion_t tmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t p = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t mmp = bfm_f.threadedAllocFermion(mem_fast); Fermion_t src_f = bfm_f.threadedAllocFermion(mem_slow); mixed_cg::threaded_convFermion_fast(src_f, src, bfm_f, bfm_d); //Double precision fields Fermion_t p_d = bfm_d.threadedAllocFermion(mem_fast); //search direction, double precision Fermion_t tmp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t mmp_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t ps_d [nshift]; // search directions (double precision) for(int i=0;i &bfm_f, bfm_evo &bfm_d, int update_freq = 100, int report_freq = -1, int max_cycle = 10){ int me = bfm_d.thread_barrier(); double frsd = bfm_f.residual; //save original residual for later restoration if (bfm_d.isBoss() && !me) printf("threaded_cg_mixed_multi_shift_MdagM_sp_relup_dp_defect_correction: hello!!!!! nshift = %d\n", nshift); // FIXME: Greg test: tighten residuals for all but lightest pole by a factor of ten during single precision solve //for (int i = 1; i < nshift; i++) mresidual[i] /= 10.0; struct timeval tstart, tstop, tdiff; gettimeofday(&tstart,NULL); int iter_multi = threaded_cg_mixed_multi_shift_MdagM_sp_relup_dp(psi,src,mass,alpha,nshift,mresidual,0,bfm_f,bfm_d,update_freq,report_freq); gettimeofday(&tstop,NULL); timersub(&tstop,&tstart,&tdiff); // FIXME: Greg test: tighten residuals for all but lightest pole by a factor of ten during single precision solve //for (int i = 1; i < nshift; i++) mresidual[i] *= 10.0; if (bfm_d.isBoss() && !me) printf("threaded_cg_mixed_multi_shift_MdagM_sp_relup_dp_defect_correction: Initial multi-shift iter = %d, time %d.%6.6d s\n", iter_multi, tdiff.tv_sec, tdiff.tv_usec); gettimeofday(&tstart,NULL); Fermion_t src_f = bfm_f.threadedAllocFermion(); Fermion_t sol_f = bfm_f.threadedAllocFermion(); Fermion_t src_d = bfm_d.threadedAllocFermion(); Fermion_t tv1_d = bfm_d.threadedAllocFermion(mem_fast); Fermion_t tv2_d = bfm_d.threadedAllocFermion(mem_fast); double src_norm = bfm_d.norm(src); int iter = 0; for(int shift=0;shift(sol_f,src_f,mass[shift],bfm_f); switch_comm(bfm_d, bfm_f); threaded_convFermion_fast(tv1_d, sol_f, bfm_d, bfm_f); bfm_d.axpy(psi[shift], tv1_d, psi[shift], 1.); } bfm_f.residual = frsd; //restore original single precision residual at end of each step } gettimeofday(&tstop,NULL); timersub(&tstop,&tstart,&tdiff); if(bfm_d.isBoss() && !me) printf("threaded_cg_mixed_multi_shift_MdagM_sp_relup_dp_defect_correction: defect correction time %d.%6.6d s\n", tdiff.tv_sec,tdiff.tv_usec); gettimeofday(&tstart,NULL); for(int shift=0;shift(psi[shift],src,mass[shift],bfm_d); bfm_d.residual = restore_resid; } if ( single ) { for(int s=1; s < nshift; s++) { bfm_d.axpy(psi[0],psi[s],psi[0],1.0); } } bfm_d.threadedFreeFermion(src_d); bfm_f.threadedFreeFermion(src_f); bfm_d.threadedFreeFermion(tv1_d); bfm_d.threadedFreeFermion(tv2_d); bfm_f.threadedFreeFermion(sol_f); gettimeofday(&tstop,NULL); timersub(&tstop,&tstart,&tdiff); if(bfm_d.isBoss() && !me) printf("threaded_cg_mixed_multi_shift_MdagM_sp_relup_dp_defect_correction: finishing up time %d.%6.6d s\n", tdiff.tv_sec,tdiff.tv_usec); return iter; } }; #endif #endif