Code below - just don't expect it to be too neat, and beware the evil lack of commenting.
Kind of my first time mucking about with intrinsics as well, so I may actually be using them entirely incorrectly.
Code:
struct vector3D
{
union {
__m128 vector;
struct { float x,y,z,w; };
};
vector3D() : x(0.0f), y(0.0f), z(0.0f), w(1.0f) {}
vector3D(float inx, float iny, float inz) : x(inx), y(iny), z(inz), w(1.0f) {}
vector3D(const float *inarray) : x(inarray[0]), y(inarray[1]), z(inarray[2]), w(1.0f) {}
vector3D(__m128 val) : vector(val) {}
inline const vector3D &operator=(const vector3D &src) { x = src.x; y = src.y; z = src.z; return *this; }
inline const vector3D &operator=(const float *src) { x = src[0]; y = src[1]; z = src[2]; return *this; }
inline const vector3D &operator+=(const vector3D &src) { vector = _mm_add_ps(vector, src.vector); return *this; }
inline const vector3D &operator-=(const vector3D &src) { vector = _mm_sub_ps(vector, src.vector); return *this; }
inline const vector3D &operator*=(float value) { x *= value; y *= value; z *= value; return *this;}
inline const vector3D &operator/=(float value) { float inverse = 1.0f / value; x *= inverse; y *= inverse; z *= inverse; return *this;}
inline float dot(const vector3D &invec) const { vector3D result(_mm_mul_ps(vector, invec.vector)); return (result.x + result.y + result.z); }
inline void normalise() { vector3D result(_mm_mul_ps(vector, vector)); vector = _mm_div_ps(vector, _mm_set1_ps(sqrt(result.x + result.y + result.z))); }
inline float length() const { vector3D result(_mm_mul_ps(vector, vector)); return sqrt(result.x + result.y + result.z); }
inline float length_squared() const { vector3D result(_mm_mul_ps(vector, vector)); return (result.x + result.y + result.z); }
inline float angle(const vector3D &in_vec) const { return acos((dot(in_vec) / (length_squared() * in_vec.length_squared()))); }
};
inline const vector3D operator+(const vector3D &a, const vector3D &b) { vector3D rvec(_mm_add_ps(a.vector, b.vector)); return rvec;}
inline const vector3D operator-(const vector3D &a, const vector3D &b) { vector3D rvec(_mm_sub_ps(a.vector, b.vector)); return rvec; }
inline const vector3D operator*(const vector3D &a, const vector3D &b) { vector3D rvec(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); return rvec; }
inline const vector3D operator*(float value, const vector3D &a) { vector3D rvec(_mm_mul_ps(_mm_set1_ps(value), a.vector)); return rvec; }
inline const vector3D operator*(const vector3D &a, float value) { vector3D rvec(_mm_mul_ps(_mm_set1_ps(value), a.vector)); return rvec; }
inline const vector3D operator/(const vector3D &a, float value) { vector3D rvec(_mm_div_ps(a.vector, _mm_set1_ps(value))); return rvec; }