//Implementation of basic matrix, vector types with underlying SIMD parallelization via Grid //Note: I don't put them in CPS namespace to avoid conflicts. #ifdef USE_GRID #ifndef _GRID_SIMDLINALG_H #define _GRID_SIMDLINALG_H #include namespace Grid_A2A{ //Basic vector dot product for testing template T dot_basic(const std::vector &A, const std::vector &B){ assert(A.size() == B.size()); T out = 0.0; for(uint i=0;i struct SIMDvtype{}; template<> struct SIMDvtype< std::complex >{ typedef Grid::vComplexD Vtype; }; template<> struct SIMDvtype< std::complex >{ typedef Grid::vComplexF Vtype; }; //SIMD paralellized vector template class SIMDvector{ public: typedef typename SIMDvtype::Vtype Vtype; private: std::vector > v; const int nsimd; //how many T can be fit into a packed Vtype uint _size_packed; uint _size; inline void alloc(uint n){ //n is the unpacked size _size = n; bool overspill = (n % nsimd != 0); //size doesn't divide evenly over SIMD vectors, add a final SIMD vector padded with zeroes to mop up _size_packed = n/nsimd + (overspill ? 1:0); v.resize(_size_packed); } public: SIMDvector(): v(0), nsimd(Vtype::Nsimd()), _size(0){} SIMDvector(uint n): nsimd(Vtype::Nsimd()){ alloc(n); } SIMDvector(SIMDvector &&r): nsimd(Vtype::Nsimd()){ _size = r._size; _size_packed = r._size_packed; v = std::move(r.v); } SIMDvector(const SIMDvector &r){ alloc(r._size); for(int i=0;i<_size_packed;i++) v[i] = r.v[i]; } SIMDvector &operator=(const SIMDvector &r){ if(r._size != _size){ alloc(r._size); } for(int i=0;i<_size_packed;i++) v[i] = r.v[i]; return *this; } inline uint size() const{ return _size; } inline uint packedSize() const{ return _size_packed; } //sz is the unpacked size! inline void resize(uint sz){ alloc(sz); } void import(const T* from, const size_t size){ if(size!=_size){ alloc(size); } int overspill = _size % nsimd; uint to = _size_packed - (overspill > 0 ? 1:0); T* d = const_cast(from); for(int i=0;i &from){ import(&from[0],from.size()); } //Access to packed members inline Vtype & operator[](uint i){ return v[i]; } inline const Vtype & operator[](uint i) const{ return v[i]; } }; //SIMD parallelized vector dot product template T dot_simd(const SIMDvector &A, const SIMDvector &B){ typedef typename SIMDvector::Vtype Vtype; assert(A.size() == B.size()); T out = 0.0; Vtype tmp; for(uint i=0;i T dot_simd(const std::vector &A, const std::vector &B){ assert(A.size() == B.size()); SIMDvector Av; Av.import(A); SIMDvector Bv; Bv.import(B); return dot_simd(Av,Bv); } //A basic matrix implementation for testing template class Matrix{ T* m; uint _rows; uint _cols; uint size; inline void alloc(uint r, uint c){ _rows = r; _cols = c; size = r*c; m = (T*)malloc(size * sizeof(T)); } inline void freeme(){ if(m!=NULL) free(m); } public: Matrix(): m(NULL){} Matrix(uint r, uint c){ alloc(r,c); } ~Matrix(){ freeme(); } Matrix(Matrix &&r){ _rows = r._rows; _cols = r._cols; size = r.size; m = r.m; r.m = NULL; } Matrix(const Matrix &r){ alloc(r._rows,r._cols); for(int i=0;i &r){ if(r._rows != _rows || r._cols != _cols){ freeme(); alloc(r._rows,r._cols); } for(int i=0;i > &A, const Matrix > &B, double tol=1e-9){ if(A.rows()!=B.rows() || A.cols()!=B.cols()) return false; for(int i=0;i tol){ printf("WARNING: Difference at elem %d real part: val A %g val B %g diff %g\n",i, std::real(A[i]), std::real(B[i]), rd); return false; } rd = fabs( std::imag(A[i]) - std::imag(B[i]) ); if(rd > tol){ printf("WARNING: Difference at elem %d imag part: val A %g val B %g diff %g\n",i, std::imag(A[i]), std::imag(B[i]), rd); return false; } } return true; } //Basic implementation of matrix multiplication template Matrix mult_basic(const Matrix &A, const Matrix &B){ assert(A.cols() == B.rows()); Matrix out(A.rows(),B.cols()); for(uint i=0;i class SIMDrowMatrix{ std::vector< SIMDvector > _rows; typedef typename SIMDvector::Vtype Vtype; public: void import(const Matrix &from){ _rows.resize(from.rows()); int nsimd = Vtype::Nsimd(); //number of Ts in a V :) for(int r=0;r<_rows.size();r++){ _rows[r].resize(from.cols()); T* block_base = &from(r,0); //row major so column elements are consecutive for(int c_packed = 0; c_packed < _rows[r].packedSize(); c_packed++){ vset(_rows[r][c_packed], block_base); block_base += nsimd; } } } inline const SIMDvector & getRow(uint i) const{ return _rows[i]; } inline SIMDvector & getRow(uint i){ return _rows[i]; } uint rows() const{ return _rows.size(); } uint cols() const{ return _rows[0].size(); } }; //A matrix with SIMD vectors as columns template class SIMDcolMatrix{ std::vector< SIMDvector > _cols; typedef typename SIMDvector::Vtype Vtype; public: void import(const Matrix &from){ _cols.resize(from.cols()); int nsimd = Vtype::Nsimd(); //number of Ts in a V :) T tmp[nsimd]; int row_off = from.cols(); //row major for(int c=0;c<_cols.size();c++){ _cols[c].resize(from.rows()); T* col_p = &from(0,c); for(int r_packed = 0; r_packed < _cols[c].packedSize(); r_packed++){ for(int i=0;i & getCol(uint i) const{ return _cols[i]; } inline SIMDvector & getCol(uint i){ return _cols[i]; } uint rows() const{ return _cols[0].size(); } uint cols() const{ return _cols.size(); } }; //SIMD implementation of matrix multiplication template Matrix mult_simd(const SIMDrowMatrix &A, const SIMDcolMatrix &B){ Matrix out(A.rows(),B.cols()); for(uint i=0;i gridTrace(const SpinColorFlavorMatrix& a, const SpinColorFlavorMatrix& b){ // typedef typename SIMDvtype >::Vtype Vtype; // const int scf_size = 24; // const int nsimd = Vtype::Nsimd(); // assert( scf_size % nsimd == 0 ); //works for AVX1 and AVX2 (2 complex per SIMD vector, nblocks = 12) and AVX512 (4 complex per SIMD vector, nblocks = 6), will also work with AVX1024 when it comes around // const int nblocks = scf_size / nsimd; // //In-place transpose of b so rows are contiguous // std::complex bT[scf_size][scf_size]; // for(int i=0;i