#ifndef MESON_FIELD_IMPL #define MESON_FIELD_IMPL #include #include #include #include #include #include #include template class A2AfieldL, template class A2AfieldR> void A2AmesonField::plus_equals(const A2AmesonField &with, const bool parallel){ if(nmodes_l != with.nmodes_l || nmodes_r != with.nmodes_r || !lindexdilution.paramsEqual(with.lindexdilution) || !rindexdilution.paramsEqual(with.rindexdilution) ){ ERR.General("A2AmesonField","plus_equals(..)","Second meson field must have the same underlying parameters\n"); } if(parallel){ #pragma omp_parallel for for(int i=0;i class A2AfieldL, template class A2AfieldR> void A2AmesonField::times_equals(const ScalarComplexType f,const bool parallel){ if(parallel){ #pragma omp_parallel for for(int i=0;i inline std::complex complexAvg(const std::complex&a, const std::complex &b){ return (a+b)/T(2.0); } //Replace this meson field with the average of this and a second field, 'with' template class A2AfieldL, template class A2AfieldR> void A2AmesonField::average(const A2AmesonField &with, const bool parallel){ if(nmodes_l != with.nmodes_l || nmodes_r != with.nmodes_r || !lindexdilution.paramsEqual(with.lindexdilution) || !rindexdilution.paramsEqual(with.rindexdilution) ){ ERR.General("A2AmesonField","average(..)","Second meson field must have the same underlying parameters\n"); } if(parallel){ #pragma omp_parallel for for(int i=0;i class A2AfieldL, template class A2AfieldR> void A2AmesonField::rowReorder(A2AmesonField &into, const int idx_map[], int map_size, bool parallel) const{ into.setup(lindexdilution, rindexdilution, tl, tr); #define DOIT \ int irow = idx_map[i]; \ for(int j=0;j class A2AfieldL, template class A2AfieldR> void A2AmesonField::colReorder(A2AmesonField &into, const int idx_map[], int map_size, bool parallel) const{ into.setup(lindexdilution, rindexdilution, tl, tr); #define DOIT \ for(int j=0;j class A2AfieldL, template class A2AfieldR> typename gsl_wrapper::matrix_complex * A2AmesonField::GSLpackedColReorder(const int idx_map[], int map_size, bool rowidx_used[], typename gsl_wrapper::matrix_complex *reuse ) const{ typedef gsl_wrapper gw; assert(sizeof(typename gw::complex) == sizeof(ScalarComplexType)); int rows = nmodes_l; int cols = nmodes_r; int nrows_used = 0; for(int i_full=0;i_fullsize1 = nrows_used; M_packed->size2 = M_packed->tda = map_size; }else M_packed = gw::matrix_complex_alloc(nrows_used,map_size); //Look for contiguous blocks in the idx_map we can take advantage of std::vector > blocks; find_contiguous_blocks(blocks,idx_map,map_size); int i_packed = 0; for(int i_full=0;i_full class A2AfieldL, template class A2AfieldR> void A2AmesonField::splatPackedColReorder(Grid::Vector &into, const int idx_map[], int map_size, bool rowidx_used[], bool do_resize) const{ typedef typename mf_Policies::ComplexType VectorComplexType; int full_rows = nmodes_l; int full_cols = nmodes_r; int nrows_used = 0; for(int i_full=0;i_full > blocks; find_contiguous_blocks(blocks,idx_map,map_size); int i_packed = 0; for(int i_full=0;i_full class A2AfieldL, template class A2AfieldR> void A2AmesonField::scalarPackedColReorder(Grid::Vector &into, const int idx_map[], int map_size, bool rowidx_used[], bool do_resize) const{ int full_rows = nmodes_l; int full_cols = nmodes_r; int nrows_used = 0; for(int i_full=0;i_full > blocks; find_contiguous_blocks(blocks,idx_map,map_size); int i_packed = 0; for(int i_full=0;i_full class A2AfieldL, template class A2AfieldR> void A2AmesonField::transpose(A2AmesonField &into) const{ assert( (void*)this != (void*)&into ); into.setup(rindexdilution, lindexdilution, tr, tl); #pragma omp parallel for for(int i=0;i class lA2AfieldL, template class lA2AfieldR, template class rA2AfieldL, template class rA2AfieldR > typename mf_Policies::ScalarComplexType trace(const A2AmesonField &l, const A2AmesonField &r){ //Check the indices match if(! l.getRowParams().paramsEqual( r.getColParams() ) || ! l.getColParams().paramsEqual( r.getRowParams() ) ) ERR.General("","trace(const A2AmesonField &, const A2AmesonField &)","Illegal matrix product: underlying vector parameters must match\n"); typedef typename mf_Policies::ScalarComplexType ScalarComplexType; ScalarComplexType into(0,0); typedef typename A2AmesonField::LeftDilutionType DilType0; typedef typename A2AmesonField::RightDilutionType DilType1; typedef typename A2AmesonField::LeftDilutionType DilType2; typedef typename A2AmesonField::RightDilutionType DilType3; ModeContractionIndices i_ind(l.getRowParams()); ModeContractionIndices j_ind(r.getRowParams()); int times[4] = { l.getRowTimeslice(), l.getColTimeslice(), r.getRowTimeslice(), r.getColTimeslice() }; //W * W is only non-zero when the timeslice upon which we evaluate them are equal const int n_threads = omp_get_max_threads(); std::vector ret_vec(n_threads,(0.,0.)); modeIndexSet lip; lip.time = times[0]; modeIndexSet rip; rip.time = times[3]; modeIndexSet ljp; ljp.time = times[1]; modeIndexSet rjp; rjp.time = times[2]; const int &ni = i_ind.getNindices(lip,rip); //how many indices to loop over const int &nj = j_ind.getNindices(ljp,rjp); #ifndef MEMTEST_MODE #pragma omp parallel for for(int i = 0; i < ni; i++){ int id = omp_get_thread_num(); int li = i_ind.getLeftIndex(i,lip,rip); int ri = i_ind.getRightIndex(i,lip,rip); for(int j = 0; j < nj; j++){ int lj = j_ind.getLeftIndex(j,ljp,rjp); int rj = j_ind.getRightIndex(j,ljp,rjp); ret_vec[id] += l(li,lj) * r(rj,ri); } } for(int i=0;i class lA2AfieldL, template class lA2AfieldR, template class rA2AfieldL, template class rA2AfieldR > void trace(fMatrix &into, const std::vector > &l, const std::vector > &r){ //Distribute load over all nodes int lsize = l.size(); int rsize = r.size(); into.resize(lsize,rsize); int nodes = 1; for(int i=0;i<5;i++) nodes *= GJP.Nodes(i); int work = lsize*rsize; bool do_work = true; if(nodes > work){ nodes = work; if(UniqueID() >= work) do_work = false; //too many nodes, at least for this parallelization. Might want to consider parallelizing in a different way! } int node_work = work/nodes; if(node_work * nodes < work && !UniqueID()) node_work += work - node_work * nodes; //node 0 mops up remainder int node_off = UniqueID()*node_work; #ifndef MEMTEST_MODE if(do_work){ for(int tt=node_off; tt class A2AfieldL, template class A2AfieldR> void A2AmesonField::nodeDistribute(int node_uniqueid){ if(node_uniqueid == -1) node_uniqueid = nodeDistributeCounter::getNext(); //draw the next node index from the pool int nodes = 1; for(int i=0;i<5;i++) nodes *= GJP.Nodes(i); if(node_uniqueid < 0 || node_uniqueid >= nodes) ERR.General("A2AmesonField","nodeDistribute","Invalid node rank %d\n", node_uniqueid); #ifndef USE_MPI if(nodes > 1) ERR.General("A2AmesonField","nodeDistribute","Implementation requires MPI\n"); //For one node, don't do anything #else //if(!UniqueID()){ printf("nodeDistribute start with destination node UniqueID %d\n",node_uniqueid); fflush(stdout); } MPI_Barrier(MPI_COMM_WORLD); //MPI rank and CPS UniqueID may not be the same. Translate int my_rank; int ret = MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); if(ret != MPI_SUCCESS) ERR.General("A2AmesonField","nodeDistribute","Comm_rank failed\n"); if(mf == NULL){ printf("Error nodeDistribute: Rank %d has null pointer!\n",my_rank); fflush(stdout); exit(-1); } //printf("UniqueID %d -> MPI rank %d\n",UniqueID(),my_rank); fflush(stdout); int rank_tmp = (UniqueID() == node_uniqueid ? my_rank : 0); ret = MPI_Allreduce(&rank_tmp,&node_mpi_rank, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); //node is now the MPI rank corresponding to UniqueID == _node if(ret != MPI_SUCCESS) ERR.General("A2AmesonField","nodeDistribute","Reduce failed\n"); MPI_Barrier(MPI_COMM_WORLD); //printf("UniqueID %d (MPI rank %d) got storage rank %d\n",UniqueID(),my_rank,node_mpi_rank); fflush(stdout); if(UniqueID() != node_uniqueid){ //printf("UniqueID %d (MPI rank %d) free'd memory\n",UniqueID(),my_rank); fflush(stdout); free(mf); mf = NULL; }//else{ printf("UniqueID %d (MPI rank %d) is storage node, not freeing\n",UniqueID(),my_rank); fflush(stdout); } //if(!UniqueID()) printf("A2AmesonField::nodeDistribute %f MB stored on node %d (MPI rank %d)\n",(double)byte_size()/(1024.*1024.), node_uniqueid, node_mpi_rank); //if(my_rank == node_mpi_rank) printf("A2AmesonField::nodeDistribute I am node with MPI rank %d and I have UniqueID %d, my first elem remains %f\n",my_rank,UniqueID(),mf[0]); #endif } #ifdef USE_MPI template struct getMPIdataType{ }; template<> struct getMPIdataType{ static MPI_Datatype doit(){ return MPI_DOUBLE; } }; template<> struct getMPIdataType{ static MPI_Datatype doit(){ return MPI_FLOAT; } }; #endif //Get back the data. After the call, all nodes will have a complete copy template class A2AfieldL, template class A2AfieldR> void A2AmesonField::nodeGet(){ typedef typename ScalarComplexType::value_type mf_Float; if(node_mpi_rank == -1) return; //already on all nodes #ifndef USE_MPI int nodes = 1; for(int i=0;i<5;i++) nodes *= GJP.Nodes(i); if(nodes > 1) ERR.General("A2AmesonField","nodeGet","Implementation requires MPI\n"); #else int mpi_rank; //get this node's MPI rank int ret = MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); if(ret != MPI_SUCCESS) ERR.General("A2AmesonField","nodeGet","Comm_rank failed\n"); if(mpi_rank != node_mpi_rank){ //if(mf != NULL) printf("rank %d pointer should be NULL but it isn't!\n",mpi_rank); fflush(stdout); mf = (ScalarComplexType*)malloc(byte_size()); if(mf == NULL){ printf("rank %d failed to allocate memory!\n",mpi_rank); fflush(stdout); exit(-1); } //printf("rank %d allocated memory\n",mpi_rank); fflush(stdout); }//else{ printf("rank %d is root, first element of data %f\n",mpi_rank,mf[0]); fflush(stdout); } MPI_Datatype dtype = getMPIdataType::doit(); int dsize; MPI_Type_size(dtype,&dsize); if(sizeof(mf_Float) != dsize){ if(!UniqueID()){ printf("MPI size mismatch, want %d, got %d\n",sizeof(mf_Float),dsize); fflush(stdout); } MPI_Barrier(MPI_COMM_WORLD); exit(-1); } MPI_Barrier(MPI_COMM_WORLD); ret = MPI_Bcast((void*)mf, 2*fsize, dtype, node_mpi_rank, MPI_COMM_WORLD); if(ret != MPI_SUCCESS){ printf("rank %d Bcast fail\n",mpi_rank,mf[0]); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD); exit(-1); } //printf("rank %d completed Bcast, first element %f\n",mpi_rank,mf[0]); fflush(stdout); //if(mpi_rank == node_mpi_rank) printf("A2AmesonField::nodeGet %f MB gathered from node %d (MPI rank %d)\n",(double)byte_size()/(1024.*1024.), UniqueID(), node_mpi_rank); node_mpi_rank = -1; //now on all nodes MPI_Barrier(MPI_COMM_WORLD); #endif } #endif