Monday, October 25, 2010

Creating a Super-Fast Math Library, Part 4: Vectors with SSE

Here's my Windows specific vector implementation. I had to do some formatting changes to get it to show up decently, hopefully I didn't mess anything up in the process.

Almost everything is just a straight wrapper around the SSE intrinsics. The shuffle and shuffle mask had to be macros because the intrinsics require immediate (hard-coded) values.

There are a couple of things about this that are not so good, but I'll leave it to you to improve them (I'm already doing it myself, but I can't just give you everything now can I?). Most notably, the length and dot-product functions break out of the vector pipeline by returning a plain scalar float instead of a vector result.

#include <xmmintrin.h>

#include <cassert>
#include <cmath>
#include <cstdint>

namespace math
{
// Four packed single-precision floats in an SSE register.
typedef __m128 vec4f_t;

// The snippet uses f32_t throughout but never defined it; it is a
// 32-bit float. Remove this if the project already declares f32_t.
typedef float f32_t;

// Fallback contract check for builds where the project-wide REQUIRE
// macro is not in scope.
#ifndef REQUIRE
#define REQUIRE(cond, msg) assert((cond) && (msg))
#endif

// Returns a vector with all four lanes set to 0.0f.
inline vec4f_t vec4f_zero()
{
return _mm_setzero_ps();
}

// True only when all four lanes of a equal the matching lanes of b.
// Note: a lane holding NaN never compares equal, even to itself.
inline bool vec4f_cmpeq(const vec4f_t &a,
const vec4f_t &b)
{
return (0xf == _mm_movemask_ps(_mm_cmpeq_ps(a, b)));
}

// True only when every lane of a is <= the matching lane of b.
inline bool vec4f_le(const vec4f_t &a,
const vec4f_t &b)
{
__m128 value = _mm_cmple_ps(a, b);
return (0xf == _mm_movemask_ps(value));
}

// Broadcasts the scalar v into all four lanes.
inline vec4f_t vec4f_splat(const float v)
{
// BUG FIX: the original passed &v (a float*) to _mm_set1_ps, which
// takes its argument by value — that does not compile.
return _mm_set1_ps(v);
}

// Broadcasts lane i (0..3) of v into all four lanes.
// The shuffle mask must be an immediate, hence the switch rather
// than computing the mask at runtime.
inline vec4f_t vec4f_splat(const vec4f_t &v,
const uint32_t i)
{
switch (i)
{
case 0: return _mm_shuffle_ps(v, v, 0x00);
case 1: return _mm_shuffle_ps(v, v, 0x55);
case 2: return _mm_shuffle_ps(v, v, 0xaa);
case 3: return _mm_shuffle_ps(v, v, 0xff);
default: REQUIRE(0, "Invalid index");
}
return _mm_setzero_ps();
}

// Builds a vector as {a, b, c, d} with a in the lowest lane.
inline vec4f_t vec4f_load(const f32_t a,
const f32_t b,
const f32_t c,
const f32_t d)
{
return _mm_setr_ps(a, b, c, d);
}

// Loads four floats from p, which must be 16-byte aligned.
inline vec4f_t vec4f_load(const f32_t * const p)
{
REQUIRE(((uintptr_t)p & 0xf) == 0, "ALIGNMENT");
return _mm_load_ps(p);
}

// Stores the four lanes of a to p, which must be 16-byte aligned.
inline void vec4f_store(const vec4f_t &a,
f32_t * const p)
{
REQUIRE(((uintptr_t)p & 0xf) == 0, "ALIGNMENT");
_mm_store_ps(p, a);
}

// Returns {a0, a1, b0, b1}: the low halves of a and b.
inline vec4f_t vec4f_movelh(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_movelh_ps(a, b);
}

// Returns {a2, a3, b2, b3}: the high halves of a and b.
// The intrinsic's operands are deliberately swapped so the result's
// lane order mirrors vec4f_movelh (a's lanes first, then b's).
inline vec4f_t vec4f_movehl(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_movehl_ps(b, a);
}

// Lane-wise a + b.
inline vec4f_t vec4f_add(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_add_ps(a, b);
}

// Lane-wise a - b.
inline vec4f_t vec4f_sub(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_sub_ps(a, b);
}

// Lane-wise a * b.
inline vec4f_t vec4f_mul(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_mul_ps(a, b);
}

// Lane-wise a / b.
inline vec4f_t vec4f_div(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_div_ps(a, b);
}

// Lane-wise square root.
inline vec4f_t vec4f_sqrt(const vec4f_t &a)
{
return _mm_sqrt_ps(a);
}

// Lane-wise approximate reciprocal (low-precision SSE estimate).
inline vec4f_t vec4f_rcp(const vec4f_t &a)
{
return _mm_rcp_ps(a);
}

// Lane-wise approximate reciprocal square root (low-precision SSE
// estimate).
inline vec4f_t vec4f_rsqrt(const vec4f_t &a)
{
return _mm_rsqrt_ps(a);
}

// Lane-wise minimum of a and b.
inline vec4f_t vec4f_min(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_min_ps(a, b);
}

// Lane-wise maximum of a and b.
inline vec4f_t vec4f_max(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_max_ps(a, b);
}

// 4-component dot product, returned as a scalar.
// Multiplies lane-wise, then sums the four lanes with two
// shuffle+add steps before extracting the low lane.
inline f32_t vec4f_dot(const vec4f_t &a,
const vec4f_t &b)
{
__m128 mult = _mm_mul_ps(a, b);
__m128 shf1 = _mm_shuffle_ps(mult, mult, 0x4e); // swap 64-bit halves
__m128 add1 = _mm_add_ps(shf1, mult);
__m128 shf2 = _mm_shuffle_ps(add1, add1, 0x1b); // reverse lanes
__m128 add2 = _mm_add_ps(add1, shf2);
f32_t result;
_mm_store_ss(&result, add2);
return result;
}

// Squared length of v (avoids the square root when only comparing).
inline f32_t vec4f_length_sq(const vec4f_t &v)
{
return vec4f_dot(v, v);
}

// Euclidean length of v.
inline f32_t vec4f_length(const vec4f_t &v)
{
// std::sqrt selects the float overload; the unqualified C sqrt
// promoted to double and narrowed the result back.
return std::sqrt(vec4f_length_sq(v));
}

// Returns v scaled to unit length. Undefined for a zero vector
// (divides by zero).
inline vec4f_t vec4f_normalize(const vec4f_t &v)
{
return vec4f_div(v, vec4f_splat(vec4f_length(v)));
}

// Macros rather than functions: _mm_shuffle_ps requires the mask to
// be an immediate (compile-time) value.
#define vec4f_shuffle_mask(a, b, c, d)\
_MM_SHUFFLE(d, c, b, a)

#define vec4f_shuffle(a, b, mask)\
_mm_shuffle_ps(a, b, mask)
} // namespace math

No comments:

Post a Comment