Monday, October 25, 2010

Creating a Super-Fast Math Library, Part 4: Vectors with SSE

Here's my Windows specific vector implementation. I had to do some formatting changes to get it to show up decently, hopefully I didn't mess anything up in the process.

Almost everything is just a straight wrapper around the SSE intrinsics. The shuffle and shuffle mask had to be macros because the intrinsics require immediate (hard-coded) values.

There are a couple of things about this that are not so good, but I'll leave it to you to improve them (I'm already doing it myself, but I can't just give you everything now can I?). Most notably, the length and dot-product functions break out of the vector pipeline by returning a plain scalar float instead of a vector result.

#include <xmmintrin.h>

#include <cassert>
#include <cmath>
#include <cstdint>

namespace math
{
// Four packed single-precision floats in an SSE register.
typedef __m128 vec4f_t;

// The snippet uses f32_t throughout but never defined it; it is a
// 32-bit float. Remove this if the project already declares f32_t.
typedef float f32_t;

// Fallback contract check for builds where the project-wide REQUIRE
// macro is not in scope.
#ifndef REQUIRE
#define REQUIRE(cond, msg) assert((cond) && (msg))
#endif

// Returns a vector with all four lanes set to 0.0f.
inline vec4f_t vec4f_zero()
{
return _mm_setzero_ps();
}

// True only when all four lanes of a equal the matching lanes of b.
// Note: a lane holding NaN never compares equal, even to itself.
inline bool vec4f_cmpeq(const vec4f_t &a,
const vec4f_t &b)
{
return (0xf == _mm_movemask_ps(_mm_cmpeq_ps(a, b)));
}

// True only when every lane of a is <= the matching lane of b.
inline bool vec4f_le(const vec4f_t &a,
const vec4f_t &b)
{
__m128 value = _mm_cmple_ps(a, b);
return (0xf == _mm_movemask_ps(value));
}

// Broadcasts the scalar v into all four lanes.
inline vec4f_t vec4f_splat(const float v)
{
// BUG FIX: the original passed &v (a float*) to _mm_set1_ps, which
// takes its argument by value — that does not compile.
return _mm_set1_ps(v);
}

// Broadcasts lane i (0..3) of v into all four lanes.
// The shuffle mask must be an immediate, hence the switch rather
// than computing the mask at runtime.
inline vec4f_t vec4f_splat(const vec4f_t &v,
const uint32_t i)
{
switch (i)
{
case 0: return _mm_shuffle_ps(v, v, 0x00);
case 1: return _mm_shuffle_ps(v, v, 0x55);
case 2: return _mm_shuffle_ps(v, v, 0xaa);
case 3: return _mm_shuffle_ps(v, v, 0xff);
default: REQUIRE(0, "Invalid index");
}
return _mm_setzero_ps();
}

// Builds a vector as {a, b, c, d} with a in the lowest lane.
inline vec4f_t vec4f_load(const f32_t a,
const f32_t b,
const f32_t c,
const f32_t d)
{
return _mm_setr_ps(a, b, c, d);
}

// Loads four floats from p, which must be 16-byte aligned.
inline vec4f_t vec4f_load(const f32_t * const p)
{
REQUIRE(((uintptr_t)p & 0xf) == 0, "ALIGNMENT");
return _mm_load_ps(p);
}

// Stores the four lanes of a to p, which must be 16-byte aligned.
inline void vec4f_store(const vec4f_t &a,
f32_t * const p)
{
REQUIRE(((uintptr_t)p & 0xf) == 0, "ALIGNMENT");
_mm_store_ps(p, a);
}

// Returns {a0, a1, b0, b1}: the low halves of a and b.
inline vec4f_t vec4f_movelh(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_movelh_ps(a, b);
}

// Returns {a2, a3, b2, b3}: the high halves of a and b.
// The intrinsic's operands are deliberately swapped so the result's
// lane order mirrors vec4f_movelh (a's lanes first, then b's).
inline vec4f_t vec4f_movehl(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_movehl_ps(b, a);
}

// Lane-wise a + b.
inline vec4f_t vec4f_add(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_add_ps(a, b);
}

// Lane-wise a - b.
inline vec4f_t vec4f_sub(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_sub_ps(a, b);
}

// Lane-wise a * b.
inline vec4f_t vec4f_mul(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_mul_ps(a, b);
}

// Lane-wise a / b.
inline vec4f_t vec4f_div(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_div_ps(a, b);
}

// Lane-wise square root.
inline vec4f_t vec4f_sqrt(const vec4f_t &a)
{
return _mm_sqrt_ps(a);
}

// Lane-wise approximate reciprocal (low-precision SSE estimate).
inline vec4f_t vec4f_rcp(const vec4f_t &a)
{
return _mm_rcp_ps(a);
}

// Lane-wise approximate reciprocal square root (low-precision SSE
// estimate).
inline vec4f_t vec4f_rsqrt(const vec4f_t &a)
{
return _mm_rsqrt_ps(a);
}

// Lane-wise minimum of a and b.
inline vec4f_t vec4f_min(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_min_ps(a, b);
}

// Lane-wise maximum of a and b.
inline vec4f_t vec4f_max(const vec4f_t &a,
const vec4f_t &b)
{
return _mm_max_ps(a, b);
}

// 4-component dot product, returned as a scalar.
// Multiplies lane-wise, then sums the four lanes with two
// shuffle+add steps before extracting the low lane.
inline f32_t vec4f_dot(const vec4f_t &a,
const vec4f_t &b)
{
__m128 mult = _mm_mul_ps(a, b);
__m128 shf1 = _mm_shuffle_ps(mult, mult, 0x4e); // swap 64-bit halves
__m128 add1 = _mm_add_ps(shf1, mult);
__m128 shf2 = _mm_shuffle_ps(add1, add1, 0x1b); // reverse lanes
__m128 add2 = _mm_add_ps(add1, shf2);
f32_t result;
_mm_store_ss(&result, add2);
return result;
}

// Squared length of v (avoids the square root when only comparing).
inline f32_t vec4f_length_sq(const vec4f_t &v)
{
return vec4f_dot(v, v);
}

// Euclidean length of v.
inline f32_t vec4f_length(const vec4f_t &v)
{
// std::sqrt selects the float overload; the unqualified C sqrt
// promoted to double and narrowed the result back.
return std::sqrt(vec4f_length_sq(v));
}

// Returns v scaled to unit length. Undefined for a zero vector
// (divides by zero).
inline vec4f_t vec4f_normalize(const vec4f_t &v)
{
return vec4f_div(v, vec4f_splat(vec4f_length(v)));
}

// Macros rather than functions: _mm_shuffle_ps requires the mask to
// be an immediate (compile-time) value.
#define vec4f_shuffle_mask(a, b, c, d)\
_MM_SHUFFLE(d, c, b, a)

#define vec4f_shuffle(a, b, mask)\
_mm_shuffle_ps(a, b, mask)
} // namespace math

No comments:

Post a Comment