/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/simd/Grid_vector_types.h Copyright (C) 2015 Author: Peter Boyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #pragma once NAMESPACE_BEGIN(Grid); template class Grid_simd2 { public: typedef typename RealPart::type Real; typedef Vector_type vector_type; typedef Scalar_type scalar_type; typedef union conv_t_union { Vector_type v; Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)]; accelerator_inline conv_t_union(){}; } conv_t; static constexpr int nvec=2; Vector_type v[nvec]; static accelerator_inline constexpr int Nsimd(void) { static_assert( (sizeof(Vector_type) / sizeof(Scalar_type) >= 1), " size mismatch " ); return nvec*sizeof(Vector_type) / sizeof(Scalar_type); } accelerator_inline Grid_simd2 &operator=(const Grid_simd2 &&rhs) { for(int n=0;n accelerator_inline Grid_simd2(const typename std::enable_if::value, S>::type a) { vsplat(*this, a); }; ///////////////////////////// // Constructors ///////////////////////////// accelerator_inline Grid_simd2 & operator=(const Zero &z) { vzero(*this); return (*this); } /////////////////////////////////////////////// // mac, mult, sub, add, adj /////////////////////////////////////////////// friend accelerator_inline void mac(Grid_simd2 *__restrict__ y, const Grid_simd2 *__restrict__ a, const Grid_simd2 *__restrict__ x) { *y = (*a) * (*x) + (*y); }; friend accelerator_inline void mult(Grid_simd2 *__restrict__ y, const Grid_simd2 *__restrict__ l, const Grid_simd2 *__restrict__ r) { *y = (*l) * (*r); } friend accelerator_inline void sub(Grid_simd2 *__restrict__ y, const Grid_simd2 *__restrict__ l, const Grid_simd2 *__restrict__ r) { *y = (*l) - (*r); } friend accelerator_inline void add(Grid_simd2 *__restrict__ y, const Grid_simd2 *__restrict__ l, const Grid_simd2 *__restrict__ r) { *y = (*l) + (*r); } friend accelerator_inline void mac(Grid_simd2 *__restrict__ y, const Scalar_type *__restrict__ a, const Grid_simd2 *__restrict__ x) { *y = (*a) * (*x) + (*y); }; friend accelerator_inline void mult(Grid_simd2 *__restrict__ y, const Scalar_type *__restrict__ l, const Grid_simd2 *__restrict__ r) { *y = (*l) * (*r); } friend accelerator_inline void sub(Grid_simd2 *__restrict__ y, const Scalar_type *__restrict__ l, const Grid_simd2 *__restrict__ r) { *y = (*l) - (*r); } friend accelerator_inline void add(Grid_simd2 *__restrict__ y, const Scalar_type *__restrict__ l, const Grid_simd2 *__restrict__ r) { *y = (*l) + (*r); } friend accelerator_inline void mac(Grid_simd2 *__restrict__ y, const Grid_simd2 *__restrict__ a, const Scalar_type *__restrict__ x) { *y = (*a) * (*x) + (*y); }; friend accelerator_inline void mult(Grid_simd2 *__restrict__ y, const Grid_simd2 *__restrict__ l, const Scalar_type *__restrict__ r) { *y = (*l) * (*r); } friend accelerator_inline void sub(Grid_simd2 *__restrict__ y, const Grid_simd2 *__restrict__ l, const Scalar_type *__restrict__ r) { *y = (*l) - (*r); } friend accelerator_inline void add(Grid_simd2 *__restrict__ y, const Grid_simd2 *__restrict__ l, const Scalar_type *__restrict__ r) { *y = (*l) + (*r); } //////////////////////////////////////////////////////////////////////// // FIXME: gonna remove these load/store, get, set, prefetch //////////////////////////////////////////////////////////////////////// friend accelerator_inline void vset(Grid_simd2 &ret, Scalar_type *a) { for(int n=0;n friend accelerator_inline Grid_simd2 SimdApply(const functor &func, const Grid_simd2 &v) { Grid_simd2 ret; for(int n=0;n friend accelerator_inline Grid_simd2 SimdApplyBinop(const functor &func, const Grid_simd2 &x, const Grid_simd2 &y) { Grid_simd2 ret; for(int n=0;n Al Bl Ah,Bh /////////////////////// friend accelerator_inline void exchange0(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){ out1.v[0] = in1.v[0]; out1.v[1] = in2.v[0]; out2.v[0] = in1.v[1]; out2.v[1] = in2.v[1]; } friend accelerator_inline void exchange1(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){ exchange0(out1.v[0],out2.v[0],in1.v[0],in2.v[0]); exchange0(out1.v[1],out2.v[1],in1.v[1],in2.v[1]); } friend accelerator_inline void exchange2(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){ exchange1(out1.v[0],out2.v[0],in1.v[0],in2.v[0]); exchange1(out1.v[1],out2.v[1],in1.v[1],in2.v[1]); } friend accelerator_inline void exchange3(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){ exchange2(out1.v[0],out2.v[0],in1.v[0],in2.v[0]); exchange2(out1.v[1],out2.v[1],in1.v[1],in2.v[1]); } friend accelerator_inline void exchange4(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){ exchange3(out1.v[0],out2.v[0],in1.v[0],in2.v[0]); exchange3(out1.v[1],out2.v[1],in1.v[1],in2.v[1]); } friend accelerator_inline void exchange(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2,int n) { if (n==3) { exchange3(out1,out2,in1,in2); } else if(n==2) { exchange2(out1,out2,in1,in2); } else if(n==1) { exchange1(out1,out2,in1,in2); } else if(n==0) { exchange0(out1,out2,in1,in2); } } //////////////////////////////////////////////////////////////////// // General permute; assumes vector length is same across // all subtypes; may not be a good assumption, but could // add the vector width as a template param for BG/Q for example //////////////////////////////////////////////////////////////////// friend accelerator_inline void permute0(Grid_simd2 &y, Grid_simd2 b) { y.v[0]=b.v[1]; y.v[1]=b.v[0]; } friend accelerator_inline void permute1(Grid_simd2 &y, Grid_simd2 b) { permute0(y.v[0],b.v[0]); permute0(y.v[1],b.v[1]); } friend accelerator_inline void permute2(Grid_simd2 &y, Grid_simd2 b) { permute1(y.v[0],b.v[0]); permute1(y.v[1],b.v[1]); } friend accelerator_inline void permute3(Grid_simd2 &y, Grid_simd2 b) { permute2(y.v[0],b.v[0]); permute2(y.v[1],b.v[1]); } friend accelerator_inline void permute4(Grid_simd2 &y, Grid_simd2 b) { permute3(y.v[0],b.v[0]); permute3(y.v[1],b.v[1]); } friend accelerator_inline void permute(Grid_simd2 &y, Grid_simd2 b, int perm) { if(perm==3) permute3(y, b); else if(perm==2) permute2(y, b); else if(perm==1) permute1(y, b); else if(perm==0) permute0(y, b); } /////////////////////////////// // Getting single lanes /////////////////////////////// accelerator_inline Scalar_type getlane(int lane) const { if(lane < vector_type::Nsimd() ) return v[0].getlane(lane); else return v[1].getlane(lane%vector_type::Nsimd()); } accelerator_inline void putlane(const Scalar_type &S, int lane){ if(lane < vector_type::Nsimd() ) v[0].putlane(S,lane); else v[1].putlane(S,lane%vector_type::Nsimd()); } }; // end of Grid_simd2 class definition /////////////////////////////// // Define available types /////////////////////////////// typedef Grid_simd2 , vComplexD> vComplexD2; typedef Grid_simd2 vRealD2; ///////////////////////////////////////// // Some traits to recognise the types ///////////////////////////////////////// template struct is_simd : public std::false_type {}; template <> struct is_simd : public std::true_type {}; template <> struct is_simd : public std::true_type {}; template <> struct is_simd : public std::true_type {}; template <> struct is_simd : public std::true_type {}; template <> struct is_simd : public std::true_type {}; template <> struct is_simd : public std::true_type {}; template <> struct is_simd : public std::true_type {}; template <> struct is_simd : public std::true_type {}; template <> struct is_simd : public std::true_type {}; template using IfSimd = Invoke::value, int> >; template using IfNotSimd = Invoke::value, unsigned> >; /////////////////////////////////////////////// // insert / extract with complex support /////////////////////////////////////////////// template accelerator_inline S getlane(const Grid_simd &in,int lane) { return in.getlane(lane); } template accelerator_inline void putlane(Grid_simd &vec,const S &_S, int lane){ vec.putlane(_S,lane); } template = 0 > accelerator_inline S getlane(const S &in,int lane) { return in; } template = 0 > accelerator_inline void putlane(S &vec,const S &_S, int lane){ vec = _S; } template accelerator_inline S getlane(const Grid_simd2 &in,int lane) { return in.getlane(lane); } template accelerator_inline void putlane(Grid_simd2 &vec,const S &_S, int lane){ vec.putlane(_S,lane); } //////////////////////////////////////////////////////////////////// // General rotate //////////////////////////////////////////////////////////////////// template accelerator_inline void vbroadcast(Grid_simd2 &ret,const Grid_simd2 &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); } template =0> accelerator_inline void rbroadcast(Grid_simd2 &ret,const Grid_simd2 &src,int lane){ typedef typename V::vector_type vector_type; S* typepun =(S*) &src; ret.v[0].v = unary(real(typepun[lane]), VsplatSIMD()); ret.v[1].v = unary(real(typepun[lane]), VsplatSIMD()); } /////////////////////// // Splat /////////////////////// // this is only for the complex version template = 0, class ABtype> accelerator_inline void vsplat(Grid_simd2 &ret, ABtype a, ABtype b) { vsplat(ret.v[0],a,b); vsplat(ret.v[1],a,b); } // overload if complex template accelerator_inline void vsplat(Grid_simd2 &ret, EnableIf, S> c) { vsplat(ret, real(c), imag(c)); } template accelerator_inline void rsplat(Grid_simd2 &ret, EnableIf, S> c) { vsplat(ret, real(c), real(c)); } // if real fill with a, if complex fill with a in the real part (first function // above) template accelerator_inline void vsplat(Grid_simd2 &ret, NotEnableIf, S> a) { vsplat(ret.v[0],a); vsplat(ret.v[1],a); } ////////////////////////// /////////////////////////////////////////////// // Initialise to 1,0,i for the correct types /////////////////////////////////////////////// // For complex types template = 0> accelerator_inline void vone(Grid_simd2 &ret) { vsplat(ret, S(1.0, 0.0)); } template = 0> accelerator_inline void vzero(Grid_simd2 &ret) { vsplat(ret, S(0.0, 0.0)); } // use xor? template = 0> accelerator_inline void vcomplex_i(Grid_simd2 &ret) { vsplat(ret, S(0.0, 1.0)); } template = 0> accelerator_inline void visign(Grid_simd2 &ret) { vsplat(ret, S(1.0, -1.0)); } template = 0> accelerator_inline void vrsign(Grid_simd2 &ret) { vsplat(ret, S(-1.0, 1.0)); } // if not complex overload here template = 0> accelerator_inline void vone(Grid_simd2 &ret) { vsplat(ret, S(1.0)); } template = 0> accelerator_inline void vzero(Grid_simd2 &ret) { vsplat(ret, S(0.0)); } // For integral types template = 0> accelerator_inline void vone(Grid_simd2 &ret) { vsplat(ret, 1); } template = 0> accelerator_inline void vzero(Grid_simd2 &ret) { vsplat(ret, 0); } template = 0> accelerator_inline void vtrue(Grid_simd2 &ret) { vsplat(ret, 0xFFFFFFFF); } template = 0> accelerator_inline void vfalse(Grid_simd2 &ret) { vsplat(ret, 0); } template accelerator_inline void zeroit(Grid_simd2 &z) { vzero(z); } /////////////////////// // Vstream /////////////////////// template = 0> accelerator_inline void vstream(Grid_simd2 &out, const Grid_simd2 &in) { vstream(out.v[0],in.v[0]); vstream(out.v[1],in.v[1]); } template = 0> accelerator_inline void vstream(Grid_simd2 &out, const Grid_simd2 &in) { vstream(out.v[0],in.v[0]); vstream(out.v[1],in.v[1]); } template = 0> accelerator_inline void vstream(Grid_simd2 &out, const Grid_simd2 &in) { vstream(out.v[0],in.v[0]); vstream(out.v[1],in.v[1]); } //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// template accelerator_inline Grid_simd2 operator+(Grid_simd2 a, Grid_simd2 b) { Grid_simd2 ret; ret.v[0] = a.v[0]+b.v[0]; ret.v[1] = a.v[1]+b.v[1]; return ret; }; template accelerator_inline Grid_simd2 operator-(Grid_simd2 a, Grid_simd2 b) { Grid_simd2 ret; ret.v[0] = a.v[0]-b.v[0]; ret.v[1] = a.v[1]-b.v[1]; return ret; }; // Distinguish between complex types and others template = 0> accelerator_inline Grid_simd2 real_mult(Grid_simd2 a, Grid_simd2 b) { Grid_simd2 ret; ret.v[0] =real_mult(a.v[0],b.v[0]); ret.v[1] =real_mult(a.v[1],b.v[1]); return ret; }; template = 0> accelerator_inline Grid_simd2 real_madd(Grid_simd2 a, Grid_simd2 b, Grid_simd2 c) { Grid_simd2 ret; ret.v[0] =real_madd(a.v[0],b.v[0],c.v[0]); ret.v[1] =real_madd(a.v[1],b.v[1],c.v[1]); return ret; }; // Distinguish between complex types and others template accelerator_inline Grid_simd2 operator*(Grid_simd2 a, Grid_simd2 b) { Grid_simd2 ret; ret.v[0] = a.v[0]*b.v[0]; ret.v[1] = a.v[1]*b.v[1]; return ret; }; // Distinguish between complex types and others template accelerator_inline Grid_simd2 operator/(Grid_simd2 a, Grid_simd2 b) { Grid_simd2 ret; ret.v[0] = a.v[0]/b.v[0]; ret.v[1] = a.v[1]/b.v[1]; return ret; }; /////////////////////// // Conjugate /////////////////////// template accelerator_inline Grid_simd2 conjugate(const Grid_simd2 &in) { Grid_simd2 ret; ret.v[0] = conjugate(in.v[0]); ret.v[1] = conjugate(in.v[1]); return ret; } template = 0> accelerator_inline Grid_simd2 adj(const Grid_simd2 &in) { return conjugate(in); } /////////////////////// // timesMinusI /////////////////////// template accelerator_inline void timesMinusI(Grid_simd2 &ret, const Grid_simd2 &in) { timesMinusI(ret.v[0],in.v[0]); timesMinusI(ret.v[1],in.v[1]); } template accelerator_inline Grid_simd2 timesMinusI(const Grid_simd2 &in) { Grid_simd2 ret; timesMinusI(ret.v[0],in.v[0]); timesMinusI(ret.v[1],in.v[1]); return ret; } /////////////////////// // timesI /////////////////////// template accelerator_inline void timesI(Grid_simd2 &ret, const Grid_simd2 &in) { timesI(ret.v[0],in.v[0]); timesI(ret.v[1],in.v[1]); } template accelerator_inline Grid_simd2 timesI(const Grid_simd2 &in) { Grid_simd2 ret; timesI(ret.v[0],in.v[0]); timesI(ret.v[1],in.v[1]); return ret; } ///////////////////// // Inner, outer ///////////////////// template accelerator_inline Grid_simd2 innerProduct(const Grid_simd2 &l,const Grid_simd2 &r) { return conjugate(l) * r; } template accelerator_inline Grid_simd2 outerProduct(const Grid_simd2 &l,const Grid_simd2 &r) { return l * conjugate(r); } template accelerator_inline Grid_simd2 trace(const Grid_simd2 &arg) { return arg; } //////////////////////////////////////////////////////////// // copy/splat complex real parts into real; // insert real into complex and zero imag; //////////////////////////////////////////////////////////// accelerator_inline void precisionChange(vComplexD2 &out,const vComplexF &in){ Optimization::PrecisionChange::StoD(in.v,out.v[0].v,out.v[1].v); } accelerator_inline void precisionChange(vComplexF &out,const vComplexD2 &in){ out.v=Optimization::PrecisionChange::DtoS(in.v[0].v,in.v[1].v); } accelerator_inline void precisionChange(vComplexD2 *out,const vComplexF *in,int nvec){ for(int m=0;m