Almost everything here is a straight wrapper around the SSE intrinsics. The shuffle and shuffle-mask helpers had to be written as #defines because the intrinsics require immediate (hard-coded) mask values.

There are a couple of things about this that are not so good, but I'll leave it to you to improve them (I'm already doing it myself, but I can't just give you

*everything* now, can I?). Most notably, the length and dot products break out of the vector domain, returning only a scalar float.

#include <cmath>
#include <cstdint>
#include <xmmintrin.h>
// NOTE(review): the original include targets were lost in extraction; one of
// them was presumably a project header providing f32_t and REQUIRE — restore it.

namespace math

{

typedef __m128 vec4f_t;

// Returns a vector with all four lanes set to 0.0f.
inline vec4f_t vec4f_zero()
{
    const vec4f_t zero = _mm_setzero_ps();
    return zero;
}

// True only when all four lanes of a and b compare exactly equal.
// Note: lane-exact float equality — NaN lanes always compare unequal.
inline bool vec4f_cmpeq(const vec4f_t &a, const vec4f_t &b)
{
    const __m128 lane_eq = _mm_cmpeq_ps(a, b);
    // movemask gathers the sign bit of each lane; 0xf means all four matched.
    return _mm_movemask_ps(lane_eq) == 0xf;
}

// True only when every lane of a is <= the corresponding lane of b.
inline bool vec4f_le(const vec4f_t &a, const vec4f_t &b)
{
    const __m128 lane_le = _mm_cmple_ps(a, b);
    // All four comparison lanes must be set (mask 0b1111).
    return _mm_movemask_ps(lane_le) == 0xf;
}

// Broadcast the scalar v into all four lanes.
// BUG FIX: the original passed &v (a float*) to _mm_set1_ps, which takes a
// float by value — that does not compile / is meaningless; pass v directly.
inline vec4f_t vec4f_splat(const float v)
{
    return _mm_set1_ps(v);
}

// Broadcast lane i of v into all four lanes. i must be in [0, 3];
// anything else trips the REQUIRE and falls through to a zero vector.
inline vec4f_t vec4f_splat(const vec4f_t &v, const uint32_t i)
{
    switch (i)
    {
        // _MM_SHUFFLE(n,n,n,n) selects lane n for every output lane
        // (equivalent to the raw masks 0x00 / 0x55 / 0xaa / 0xff).
        case 0: return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
        case 1: return _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
        case 2: return _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
        case 3: return _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
        default: REQUIRE(0, "Invalid index");
    }
    return _mm_setzero_ps();
}

// Build a vector from four scalars; a lands in lane 0, d in lane 3.
inline vec4f_t vec4f_load(const f32_t a, const f32_t b, const f32_t c, const f32_t d)
{
    // _mm_set_ps takes lanes high-to-low, so reverse the argument order
    // (identical result to _mm_setr_ps(a, b, c, d)).
    return _mm_set_ps(d, c, b, a);
}

// Load four contiguous floats from p. p must be 16-byte aligned;
// a misaligned pointer trips the REQUIRE (and would fault in _mm_load_ps).
inline vec4f_t vec4f_load(const f32_t * const p)
{
    REQUIRE(((uintptr_t)p & 0xf) == 0, "ALIGNMENT");
    const vec4f_t loaded = _mm_load_ps(p);
    return loaded;
}

// Store the four lanes of a to p. p must be 16-byte aligned;
// a misaligned pointer trips the REQUIRE (and would fault in _mm_store_ps).
inline void vec4f_store(const vec4f_t &a, f32_t * const p)
{
    REQUIRE(((uintptr_t)p & 0xf) == 0, "ALIGNMENT");
    _mm_store_ps(p, a);
}

// Combine the low halves of two vectors: result = { a0, a1, b0, b1 }.
inline vec4f_t vec4f_movelh(const vec4f_t &a, const vec4f_t &b)
{
    const vec4f_t low_halves = _mm_movelh_ps(a, b);
    return low_halves;
}

// Combine the high halves of two vectors: result = { a2, a3, b2, b3 }.
// The operands are deliberately reversed relative to the raw intrinsic so
// that a's high half lands in the low lanes, mirroring vec4f_movelh's
// a-first ordering. NOTE(review): confirm callers expect this ordering.
inline vec4f_t vec4f_movehl(const vec4f_t &a, const vec4f_t &b)
{
    const vec4f_t high_halves = _mm_movehl_ps(b, a);
    return high_halves;
}

// Lane-wise sum: { a0+b0, a1+b1, a2+b2, a3+b3 }.
inline vec4f_t vec4f_add(const vec4f_t &a, const vec4f_t &b)
{
    const vec4f_t sum = _mm_add_ps(a, b);
    return sum;
}

// Lane-wise difference: { a0-b0, a1-b1, a2-b2, a3-b3 }.
inline vec4f_t vec4f_sub(const vec4f_t &a, const vec4f_t &b)
{
    const vec4f_t diff = _mm_sub_ps(a, b);
    return diff;
}

// Lane-wise product: { a0*b0, a1*b1, a2*b2, a3*b3 }.
inline vec4f_t vec4f_mul(const vec4f_t &a, const vec4f_t &b)
{
    const vec4f_t prod = _mm_mul_ps(a, b);
    return prod;
}

// Lane-wise quotient: { a0/b0, a1/b1, a2/b2, a3/b3 }.
// No zero check — a zero lane in b yields +/-inf or NaN per IEEE rules.
inline vec4f_t vec4f_div(const vec4f_t &a, const vec4f_t &b)
{
    const vec4f_t quot = _mm_div_ps(a, b);
    return quot;
}

// Lane-wise square root (full precision).
inline vec4f_t vec4f_sqrt(const vec4f_t &a)
{
    const vec4f_t root = _mm_sqrt_ps(a);
    return root;
}

// Lane-wise approximate reciprocal (1/x); _mm_rcp_ps is a fast estimate,
// not a full-precision divide.
inline vec4f_t vec4f_rcp(const vec4f_t &a)
{
    const vec4f_t recip = _mm_rcp_ps(a);
    return recip;
}

// Lane-wise approximate reciprocal square root (1/sqrt(x));
// _mm_rsqrt_ps is a fast estimate, not full precision.
inline vec4f_t vec4f_rsqrt(const vec4f_t &a)
{
    const vec4f_t rroot = _mm_rsqrt_ps(a);
    return rroot;
}

// Lane-wise minimum of a and b.
inline vec4f_t vec4f_min(const vec4f_t &a, const vec4f_t &b)
{
    const vec4f_t lesser = _mm_min_ps(a, b);
    return lesser;
}

// Lane-wise maximum of a and b.
inline vec4f_t vec4f_max(const vec4f_t &a, const vec4f_t &b)
{
    const vec4f_t greater = _mm_max_ps(a, b);
    return greater;
}

// Four-component dot product, collapsed to a scalar.
// Lane-wise multiply, then a two-step horizontal reduction via shuffles
// (the shuffle constants match the original exactly, so results are
// bitwise identical).
inline f32_t vec4f_dot(const vec4f_t &a, const vec4f_t &b)
{
    const __m128 prod = _mm_mul_ps(a, b);
    // 0x4e swaps the 64-bit halves: {p2, p3, p0, p1}; adding folds pairs.
    const __m128 half_sum = _mm_add_ps(_mm_shuffle_ps(prod, prod, 0x4e), prod);
    // 0x1b reverses all lanes; after this add every lane holds the total.
    const __m128 full_sum = _mm_add_ps(half_sum, _mm_shuffle_ps(half_sum, half_sum, 0x1b));
    f32_t out;
    _mm_store_ss(&out, full_sum);
    return out;
}

// Squared length: |v|^2 = v . v (avoids the sqrt when only comparing).
inline f32_t vec4f_length_sq(const vec4f_t &v)
{
    const f32_t dot_vv = vec4f_dot(v, v);
    return dot_vv;
}

// Euclidean length of v: sqrt(v . v), returned as a scalar.
// FIX: the original called C's sqrt(double), forcing a float->double->float
// round-trip; std::sqrt selects the float overload for a float argument.
inline f32_t vec4f_length(const vec4f_t &v)
{
    return std::sqrt(vec4f_length_sq(v));
}

// Scale v to unit length by dividing each lane by |v|.
// No zero-length guard: a zero vector divides by zero (NaN lanes).
inline vec4f_t vec4f_normalize(const vec4f_t &v)
{
    const vec4f_t len = vec4f_splat(vec4f_length(v));
    return vec4f_div(v, len);
}

// These must be macros rather than inline functions: _mm_shuffle_ps requires
// its mask to be an immediate (compile-time) constant.
// vec4f_shuffle_mask takes lane selectors in source order (a -> output lane 0)
// and reverses them for _MM_SHUFFLE, which lists lanes high-to-low.
#define vec4f_shuffle_mask(a, b, c, d)\
_MM_SHUFFLE(d, c, b, a)
// Shuffle lanes of a and b using a mask built by vec4f_shuffle_mask.
#define vec4f_shuffle(a, b, mask)\
_mm_shuffle_ps(a, b, mask)

} // namespace math