mirror of
https://github.com/thug1src/thug.git
synced 2025-01-22 05:43:47 +00:00
159 lines
6.7 KiB
C
159 lines
6.7 KiB
C
//-----------------------------------------------------------------------------
|
|
// File: SSE.h
|
|
//
|
|
// Desc: P3 SSE conversions and estimates
|
|
//
|
|
// Hist: 1.7.03 - Created
|
|
//
|
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
//-----------------------------------------------------------------------------
|
|
|
|
#ifndef P3_SSE
|
|
#define P3_SSE
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Name: Ftoi_ASM
|
|
// Desc: SSE float to int conversion. Note that no control word needs to be
|
|
// set to round down
|
|
//-----------------------------------------------------------------------------
|
|
__forceinline int Ftoi_ASM( const float f )
|
|
{
|
|
__asm cvttss2si eax, f // return int(f)
|
|
}
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Name: ReciprocalEstimate_ASM
|
|
// Desc: SSE reciprocal estimate, accurate to 12 significant bits of
|
|
// the mantissa
|
|
//-----------------------------------------------------------------------------
|
|
__forceinline float ReciprocalEstimate_ASM( const float f )
|
|
{
|
|
float rec;
|
|
__asm rcpss xmm0, f // xmm0 = rcpss(f)
|
|
__asm movss rec , xmm0 // return xmm0
|
|
return rec;
|
|
}
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Name: ReciprocalSqrtEstimate_ASM
|
|
// Desc: SSE reciprocal square root estimate, accurate to 12 significant
|
|
// bits of the mantissa
|
|
//-----------------------------------------------------------------------------
|
|
__forceinline float ReciprocalSqrtEstimate_ASM( const float f )
|
|
{
|
|
float recsqrt;
|
|
__asm rsqrtss xmm0, f // xmm0 = rsqrtss(f)
|
|
__asm movss recsqrt, xmm0 // return xmm0
|
|
return recsqrt;
|
|
}
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Name: SqrtEstimae_ASM
|
|
// Desc: SSE square root estimate, accurate to 12 significant bits of
|
|
// the mantissa. Note that a check for zero must be made since
|
|
// sqrt(0) == 0 but 1/sqrt(0) = inf
|
|
//-----------------------------------------------------------------------------
|
|
__forceinline float SqrtEstimate_ASM( const float f )
|
|
{
|
|
float recsqrt;
|
|
__asm movss xmm0,f // xmm0 = f
|
|
__asm rsqrtss xmm1, xmm0 // xmm1 = rsqrtss(f)
|
|
__asm mulss xmm1, xmm0 // xmm1 = rsqrtss(f) * f = sqrt(f)
|
|
__asm xorps xmm2, xmm2 // xmm2 = 0
|
|
__asm cmpneqss xmm2, xmm0 // xmm2 = (f != 0 ? 1s : 0s)
|
|
__asm andps xmm1, xmm2 // xmm1 = xmm1 & xmm2
|
|
__asm movss recsqrt, xmm1 // return xmm1
|
|
return recsqrt;
|
|
}
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Name: ReciprocalEstimateNR_ASM
|
|
// Desc: SSE Newton-Raphson reciprocal estimate, accurate to 23 significant
|
|
// bits of the mantissa
|
|
// One Newtown-Raphson Iteration:
|
|
// f(i+1) = 2 * rcpss(f) - f * rcpss(f) * rcpss(f)
|
|
//-----------------------------------------------------------------------------
|
|
__forceinline float ReciprocalEstimateNR_ASM( const float f )
|
|
{
|
|
float rec;
|
|
__asm rcpss xmm0, f // xmm0 = rcpss(f)
|
|
__asm movss xmm1, f // xmm1 = f
|
|
__asm mulss xmm1, xmm0 // xmm1 = f * rcpss(f)
|
|
__asm mulss xmm1, xmm0 // xmm2 = f * rcpss(f) * rcpss(f)
|
|
__asm addss xmm0, xmm0 // xmm0 = 2 * rcpss(f)
|
|
__asm subss xmm0, xmm1 // xmm0 = 2 * rcpss(f)
|
|
// - f * rcpss(f) * rcpss(f)
|
|
__asm movss rec, xmm0 // return xmm0
|
|
return rec;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Newton-Rapson square root iteration constants
|
|
//-----------------------------------------------------------------------------
|
|
const float g_SqrtNRConst[2] = {0.5f, 3.0f};
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Name: ReciprocalSqrtEstimateNR_ASM
|
|
// Desc: SSE Newton-Raphson reciprocal square root estimate, accurate to 23
|
|
// significant bits of the mantissa
|
|
// One Newtown-Raphson Iteration:
|
|
// f(i+1) = 0.5 * rsqrtss(f) * (3.0 - (f * rsqrtss(f) * rsqrtss(f))
|
|
// NOTE: rsqrtss(f) * rsqrtss(f) != rcpss(f) (presision is not maintained)
|
|
//-----------------------------------------------------------------------------
|
|
__forceinline float ReciprocalSqrtEstimateNR_ASM( const float f )
|
|
{
|
|
float recsqrt;
|
|
__asm rsqrtss xmm0, f // xmm0 = rsqrtss(f)
|
|
__asm movss xmm1, f // xmm1 = f
|
|
__asm mulss xmm1, xmm0 // xmm1 = f * rsqrtss(f)
|
|
__asm movss xmm2, g_SqrtNRConst+4 // xmm2 = 3.0f
|
|
__asm mulss xmm1, xmm0 // xmm1 = f * rsqrtss(f) * rsqrtss(f)
|
|
__asm mulss xmm0, g_SqrtNRConst // xmm0 = 0.5f * rsqrtss(f)
|
|
__asm subss xmm2, xmm1 // xmm2 = 3.0f -
|
|
// f * rsqrtss(f) * rsqrtss(f)
|
|
__asm mulss xmm0, xmm2 // xmm0 = 0.5 * rsqrtss(f)
|
|
// * (3.0 - (f * rsqrtss(f) * rsqrtss(f))
|
|
__asm movss recsqrt, xmm0 // return xmm0
|
|
return recsqrt;
|
|
}
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Name: SqrtEstimateNR_ASM
|
|
// Desc: SSE Newton-Raphson square root estimate, accurate to 23 significant
|
|
// bits of the mantissa
|
|
// NOTE: x/sqrt(x) = sqrt(x)
|
|
// One Newtown-Raphson Iteration (for 1/sqrt(x)) :
|
|
// f(i+1) = 0.5 * rsqrtss(f) * (3.0 - (f * rsqrtss(f) * rsqrtss(f))
|
|
// NOTE: rsqrtss(f) * rsqrtss(f) != rcpss(f) (presision is not maintained)
|
|
//-----------------------------------------------------------------------------
|
|
__forceinline float SqrtEstimateNR_ASM( const float f )
|
|
{
|
|
float recsqrt;
|
|
__asm rsqrtss xmm0, f // xmm0 = rsqrtss(f)
|
|
__asm movss xmm1, f // xmm1 = f
|
|
__asm mulss xmm1, xmm0 // xmm1 = f * rsqrtss(f)
|
|
__asm movss xmm2, g_SqrtNRConst+4 // xmm2 = 3.0f
|
|
__asm mulss xmm1, xmm0 // xmm1 = f * rsqrtss(f) * rsqrtss(f)
|
|
__asm mulss xmm0, g_SqrtNRConst // xmm0 = 0.5f * rsqrtss(f)
|
|
__asm subss xmm2, xmm1 // xmm2 = 3.0f -
|
|
// f * rsqrtss(f) * rsqrtss(f)
|
|
__asm mulss xmm0, xmm2 // xmm0 = 0.5 * rsqrtss(f)
|
|
// * (3.0 - (f * rsqrtss(f) * rsqrtss(f))
|
|
__asm xorps xmm1, xmm1 // xmm1 = 0
|
|
__asm mulss xmm0, f // xmm0 = sqrt(f)
|
|
__asm cmpneqss xmm1, f // xmm1 = (f != 0 ? 1s : 0s)
|
|
__asm andps xmm0, xmm1 // xmm0 = xmm1 & xmm2
|
|
__asm movss recsqrt, xmm0 // return xmm0
|
|
return recsqrt;
|
|
}
|
|
|
|
|
|
#endif // P3_SSE
|