
#ifndef randomfieldsutils_intrinsics_H
#define randomfieldsutils_intrinsics_H 1

// Include-order check: if PRINTF is already defined when this header is
// processed, compilation is aborted with the message below.
// NOTE(review): PRINTF is presumably defined by another project header that
// must be included after this one -- confirm which header that is.
#ifdef PRINTF
#error "intrinsics.h not very first"
#endif

#include<inttypes.h> // uintptr_t

#include "parallel_simd.h"
// PKG_CXXFLAGS =  $(SHLIB_OPENMP_CXXFLAGS) -mavx  or  -march=native

// SIMD capability summary.
//   SSEBITS : register width in bits of the selected instruction set
//   SSEMODE : numeric code for the same selection (30, 20, 10 or 0)
// The feature macros are tested from widest to narrowest and the first one
// that is defined wins, so each of AVX512, AVX and SSE needs its own branch.
// NOTE(review): AVX512 / AVX / SSE are presumably set by parallel_simd.h or
// by the build flags -- confirm.
#if defined (AVX512)
#define SSEBITS 512
#define SSEMODE 30
#elif defined (AVX)
#define SSEBITS 256
#define SSEMODE 20
#elif defined (SSE)
#define SSEBITS 128
#define SSEMODE 10
#else
// No SIMD feature macro defined: scalar fallback.
#define SSEBITS 64
#define SSEMODE 0
#endif


// Placeholder for advertising FMA support on non-Windows builds.
// The define is commented out, so this conditional is currently empty and
// FMA_AVAILABLE is not defined here.
#ifndef WIN32
// #define FMA_AVAILABLE __FMA__
#endif


// Condition holds for GCC 4.9.1 or newer (major > 4, or 4.minor with
// minor > 9, or 4.9 with patch level >= 1).
// The OpenMP4 define is commented out, so the conditional currently has no
// effect.
#if __GNUC__ > 4 ||							\
  (__GNUC__ == 4 && (__GNUC_MINOR__ > 9 ||				\
		     (__GNUC_MINOR__ == 9 &&  __GNUC_PATCHLEVEL__ >= 1)))
//#define OpenMP4 1
#endif


//#define ALIGNED __declspec(align(SSEBITS/8))




/*
 * SIMD block configuration.
 *
 * Depending on which of AVX / SSE2 is defined, this section selects the
 * vector types and intrinsics used for one "block" and sets BytesPerBlock to
 * the size of such a block in bytes. If neither macro is defined, only
 * BytesPerBlock (8) is provided; the type and intrinsic aliases below are
 * then not available.
 *
 * BlockType expands to ALIGNED, which is defined further down; this is fine
 * because macros are expanded at the point of use, not of definition.
 */
#if defined AVX
#include <immintrin.h>
#define BytesPerBlock 32
#define UBlockType __m256i
#define BlockType __m256i ALIGNED
#define Double __m256d
#define MAXDOUBLE _mm256_max_pd
// NOTE(review): per the Intel Intrinsics Guide _mm256_max_epi32 is an AVX2
// intrinsic -- confirm AVX2 is enabled whenever AVX is defined.
#define MAXINTEGER _mm256_max_epi32
#define LOAD _mm256_load_si256
// #define EXPDOUBLE mm256_exp_pd // only on intel compiler
#define ADDDOUBLE  _mm256_add_pd
#define SUBDOUBLE  _mm256_sub_pd
#define MULTDOUBLE _mm256_mul_pd
#define LOADuDOUBLE _mm256_loadu_pd
#define LOADDOUBLE _mm256_load_pd
#define STOREuDOUBLE _mm256_storeu_pd
#define ZERODOUBLE _mm256_setzero_pd()

#elif defined SSE2
#include <immintrin.h>
#define BytesPerBlock 16
#define UBlockType __m128i
#define BlockType __m128i ALIGNED
#define Double __m128d
#define MAXDOUBLE _mm_max_pd
// NOTE(review): per the Intel Intrinsics Guide _mm_max_epi32 is an SSE4.1
// intrinsic -- confirm SSE4.1 is enabled whenever SSE2 is defined.
#define MAXINTEGER _mm_max_epi32
#define LOAD _mm_load_si128
// #define EXPDOUBLE _mm_exp_pd  // only on intel compiler
#define ADDDOUBLE  _mm_add_pd
#define SUBDOUBLE  _mm_sub_pd
#define MULTDOUBLE _mm_mul_pd
#define LOADuDOUBLE _mm_loadu_pd
#define LOADDOUBLE _mm_load_pd
#define STOREuDOUBLE _mm_storeu_pd
#define ZERODOUBLE _mm_setzero_pd()

#else
#define BytesPerBlock 8
#endif

/*
 * Rounds the address X up to the next multiple of BytesPerBlock; an address
 * that already is a multiple is returned unchanged. The result is an
 * unsigned integer address, not a pointer.
 *
 * X is parenthesized so that expression arguments such as `p + 2` are
 * converted as a whole rather than having the cast bind to `p` only.
 */
#define algn_general(X) \
  ((1L + (uintptr_t) (((uintptr_t) (X) - 1L) / BytesPerBlock)) * BytesPerBlock)

/*
 * Typed wrappers around algn_general: return the first address >= X that is
 * a multiple of BytesPerBlock.
 *
 * Declared `static inline` so the header can be included from C as well as
 * C++: a plain C99/C11 `inline` definition emits no external symbol, so any
 * call the compiler decides not to inline would fail to link.
 */
static inline double *algn(double *X) { return (double *) algn_general(X); }
static inline int *algnInt(int *X) { return (int *) algn_general(X); }

/* Variable attribute requesting BytesPerBlock-byte alignment (GCC syntax). */
#define ALIGNED __attribute__ ((aligned (BytesPerBlock)))

/* Number of 8-byte doubles in one block. */
#define doubles (BytesPerBlock / 8)
/*
 * NOTE(review): same value as `doubles`. If this is meant to be the number
 * of 4-byte ints per block it would be BytesPerBlock / 4 -- confirm against
 * the callers before changing it.
 */
#define integers (BytesPerBlock / 8)


#endif


