#ifndef _MULT_VV_IMPL_H #define _MULT_VV_IMPL_H //#define MULT_LR_BASIC #define MULT_LR_GSL template class lA2Afield, template class rA2Afield, typename ComplexClass > class _mult_lr_impl_v{ }; template class lA2Afield, template class rA2Afield > class _mult_lr_impl_v{ public: typedef typename mf_Policies::ScalarComplexType ScalarComplexType; #if defined(MULT_LR_BASIC) static void mult(CPSspinColorFlavorMatrix &out, const lA2Afield &l, const rA2Afield &r, const int &xop, const int &top, const bool &conj_l, const bool &conj_r){ typedef typename lA2Afield::DilutionType iLeftDilutionType; typedef typename rA2Afield::DilutionType iRightDilutionType; out = 0.0; int top_glb = top+GJP.TnodeSites()*GJP.TnodeCoor(); //Precompute index mappings ModeContractionIndices i_ind(l); modeIndexSet ilp, irp; ilp.time = top_glb; irp.time = top_glb; int site4dop = l.getMode(0).threeToFour(xop, top); for(int sl=0;sl<4;sl++){ for(int sr=0;sr<4;sr++){ for(int cl=0;cl<3;cl++){ ilp.spin_color = cl + 3*sl; for(int cr=0;cr<3;cr++){ irp.spin_color = cr + 3*sr; for(int fl=0;fl<2;fl++){ ilp.flavor = fl; for(int fr=0;fr<2;fr++){ irp.flavor = fr; for(int i=0;i &out, const lA2Afield &l, const rA2Afield &r, const int &xop, const int &top, const bool &conj_l, const bool &conj_r){ typedef typename lA2Afield::DilutionType iLeftDilutionType; typedef typename rA2Afield::DilutionType iRightDilutionType; out.zero(); int top_glb = top+GJP.TnodeSites()*GJP.TnodeCoor(); const StandardIndexDilution &std_idx = l; int nv = std_idx.getNmodes(); //Precompute index mappings modeIndexSet ilp, irp; ilp.time = top_glb; irp.time = top_glb; //We want to treat this as a matrix mult of a 24 * nv matrix with an nv * 24 matrix, where nv is the number of fully undiluted indices. First implementation uses regular GSL, but could perhaps be done better with sparse matrices int site4dop = l.getMode(0).threeToFour(xop,top); const static int nscf = 2*3*4; //Pull out the components we need into packed GSL vectors typedef gsl_wrapper gw; typename gw::matrix_complex *lgsl = gw::matrix_complex_alloc(nscf, nv); typename gw::matrix_complex *rgsl = gw::matrix_complex_alloc(nv,nscf); gw::matrix_complex_set_zero(lgsl); gw::matrix_complex_set_zero(rgsl); assert(sizeof(typename gw::complex) == sizeof(ScalarComplexType) ); for(int s=0;s<4;s++){ for(int c=0;c<3;c++){ ilp.spin_color = irp.spin_color = c + 3*s; for(int f=0;f<2;f++){ ilp.flavor = irp.flavor = f; int scf = f+2*(c+3*s); std::vector lmap; std::vector lnon_zeroes; l.getIndexMapping(lmap,lnon_zeroes,ilp); for(int i=0;i lval_tmp = l.nativeElem(lmap[i], site4dop, ilp.spin_color, f); // if(conj_l) lval_tmp = std::conj(lval_tmp); gw::matrix_complex_set(lgsl, scf, i, reinterpret_cast(lval_tmp)); } std::vector rmap; std::vector rnon_zeroes; r.getIndexMapping(rmap,rnon_zeroes,irp); for(int i=0;i rval_tmp = r.nativeElem(rmap[i], site4dop, irp.spin_color, f); // if(conj_r) rval_tmp = std::conj(rval_tmp); gw::matrix_complex_set(rgsl, i, scf, reinterpret_cast(rval_tmp)); } } } } if(conj_l) for(int i=1;idata[i] = -lgsl->data[i]; if(conj_r) for(int i=1;idata[i] = -rgsl->data[i]; typename gw::matrix_complex *ogsl = gw::matrix_complex_alloc(nscf, nscf); typename gw::complex one; GSL_SET_COMPLEX(&one,1.0,0.0); typename gw::complex zero; GSL_SET_COMPLEX(&zero,0.0,0.0); #ifndef MEMTEST_MODE gw::blas_gemm(CblasNoTrans, CblasNoTrans, one, lgsl, rgsl, zero, ogsl); #endif for(int sl=0;sl<4;sl++){ for(int sr=0;sr<4;sr++){ for(int cl=0;cl<3;cl++){ for(int cr=0;cr<3;cr++){ for(int fl=0;fl<2;fl++){ int scfl = fl+2*(cl+3*sl); for(int fr=0;fr<2;fr++){ int scfr = fr+2*(cr+3*sr); typename ScalarComplexType::value_type (&ol)[2] = reinterpret_cast(out(sl,sr)(cl,cr)(fl,fr)); typename gw::complex* gg = gw::matrix_complex_ptr(ogsl, scfl, scfr); ol[0] = gg->dat[0]; ol[1] = gg->dat[1]; } } } } } } gw::matrix_complex_free(lgsl); gw::matrix_complex_free(rgsl); gw::matrix_complex_free(ogsl); } #else #error "mult_vv_impl.h NO MULT IMPLEMENTATION DEFINED" #endif static void mult_slow(CPSspinColorFlavorMatrix &out, const lA2Afield &l, const rA2Afield &r, const int &xop, const int &top, const bool &conj_l, const bool &conj_r){ assert( l.paramsEqual(r) ); int site4dop = xop + GJP.VolNodeSites()/GJP.TnodeSites()*top; A2Aparams i_params(l); StandardIndexDilution idil(i_params); int ni = idil.getNmodes(); out = 0.0; for(int sl=0;sl<4;sl++){ for(int cl=0;cl<3;cl++){ for(int fl=0;fl<2;fl++){ for(int sr=0;sr<4;sr++){ for(int cr=0;cr<3;cr++){ for(int fr=0;fr<2;fr++){ for(int i=0;i class lA2Afield, template class rA2Afield > class _mult_lr_impl_v{ public: typedef typename mf_Policies::ScalarComplexType ScalarComplexType; typedef typename mf_Policies::ComplexType VectorComplexType; //#define BASIC_MULT_GRID #ifdef BASIC_MULT_GRID static void mult(CPSspinColorFlavorMatrix &out, const lA2Afield &l, const rA2Afield &r, const int &xop, const int &top, const bool &conj_l, const bool &conj_r){ typedef typename lA2Afield::DilutionType iLeftDilutionType; typedef typename rA2Afield::DilutionType iRightDilutionType; out.zero(); assert(l.getMode(0).SIMDlogicalNodes(3) == 1); int top_glb = top+GJP.TnodeSites()*GJP.TnodeCoor(); //Precompute index mappings ModeContractionIndices i_ind(l); modeIndexSet ilp, irp; ilp.time = top_glb; irp.time = top_glb; int site4dop = l.getMode(0).threeToFour(xop, top); for(int sl=0;sl<4;sl++){ for(int sr=0;sr<4;sr++){ for(int cl=0;cl<3;cl++){ ilp.spin_color = cl + 3*sl; for(int cr=0;cr<3;cr++){ irp.spin_color = cr + 3*sr; for(int fl=0;fl<2;fl++){ ilp.flavor = fl; for(int fr=0;fr<2;fr++){ irp.flavor = fr; for(int i=0;i &out, const lA2Afield &l, const rA2Afield &r, const int &xop, const int &top, const bool &conj_l, const bool &conj_r){ typedef typename lA2Afield::DilutionType iLeftDilutionType; typedef typename rA2Afield::DilutionType iRightDilutionType; out.zero(); assert(l.getMode(0).SIMDlogicalNodes(3) == 1); int top_glb = top+GJP.TnodeSites()*GJP.TnodeCoor(); //Precompute index mappings modeIndexSet ilp, irp; ilp.time = top_glb; irp.time = top_glb; int site4dop = l.getMode(0).threeToFour(xop, top); const StandardIndexDilution &std_idx = l; int nv = std_idx.getNmodes(); const static int nscf = 2*3*4; //Why doesn't this work? // VectorComplexType zro; zeroit(zro); // std::vector > lcp(nscf, Grid::Vector(nv, zro) ); // std::vector > rcp(nv, Grid::Vector(nscf, zro) ); std::vector > lcp(nscf, Grid::Vector(nv) ); std::vector > rcp(nv, Grid::Vector(nscf) ); std::vector > lnon_zeroes_all(nscf); std::vector > rnon_zeroes_all(nscf); for(int s=0;s<4;s++){ for(int c=0;c<3;c++){ ilp.spin_color = irp.spin_color = c + 3*s; for(int f=0;f<2;f++){ ilp.flavor = irp.flavor = f; int scf = f+2*(c+3*s); std::vector lmap; std::vector &lnon_zeroes = lnon_zeroes_all[scf]; l.getIndexMapping(lmap,lnon_zeroes,ilp); for(int i=0;i rmap; std::vector &rnon_zeroes = rnon_zeroes_all[scf]; r.getIndexMapping(rmap,rnon_zeroes,irp); for(int i=0;i &out, const lA2Afield &l, const rA2Afield &r, const int &xop, const int &top, const bool &conj_l, const bool &conj_r){ assert( l.paramsEqual(r) ); int site4dop = xop + GJP.VolNodeSites()/GJP.TnodeSites()*top; A2Aparams i_params(l); StandardIndexDilution idil(i_params); int ni = idil.getNmodes(); out = 0.0; for(int sl=0;sl<4;sl++){ for(int cl=0;cl<3;cl++){ for(int fl=0;fl<2;fl++){ for(int sr=0;sr<4;sr++){ for(int cr=0;cr<3;cr++){ for(int fr=0;fr<2;fr++){ for(int i=0;i class lA2Afield, template class rA2Afield > void mult(CPSspinColorFlavorMatrix &out, const lA2Afield &l, const rA2Afield &r, const int &xop, const int &top, const bool &conj_l, const bool &conj_r){ _mult_lr_impl_v::type >::mult(out,l,r,xop,top,conj_l,conj_r); } template class lA2Afield, template class rA2Afield > void mult_slow(CPSspinColorFlavorMatrix &out, const lA2Afield &l, const rA2Afield &r, const int &xop, const int &top, const bool &conj_l, const bool &conj_r){ _mult_lr_impl_v::type >::mult_slow(out,l,r,xop,top,conj_l,conj_r); } #endif