target/i386: reimplement fyl2x using floatx80 operations

The x87 fyl2x emulation is currently based around conversion to
double. This is inherently unsuitable for a good emulation of any
floatx80 operation. Reimplement using the soft-float operations,
building on top of the reimplementation of fyl2xp1 and factoring out
code to be shared between the two instructions.

The included test assumes that the result in round-to-nearest mode
should always be one of the two closest floating-point numbers to the
mathematically exact result (including that it should be exact, in the
exact cases which cover more cases than for fyl2xp1).

Backports commit 1f18a1e6ab8368a4eab2d22894d3b2ae75250cd3 from qemu
This commit is contained in:
Joseph Myers 2021-02-25 13:46:27 -05:00 committed by Lioncash
parent ac2f3fa0f2
commit ddb2f1d4dd

View File

@ -1191,21 +1191,6 @@ void helper_f2xm1(CPUX86State *env)
merge_exception_flags(env, old_flags);
}
void helper_fyl2x(CPUX86State *env)
{
double fptemp = floatx80_to_double(env, ST0);
if (fptemp > 0.0) {
fptemp = log(fptemp) / log(2.0); /* log2(ST) */
fptemp *= floatx80_to_double(env, ST1);
ST1 = double_to_floatx80(env, fptemp);
fpop(env);
} else {
env->fpus &= ~0x4700;
env->fpus |= 0x400;
}
}
void helper_fptan(CPUX86State *env)
{
double fptemp = floatx80_to_double(env, ST0);
@ -1364,6 +1349,118 @@ void helper_fprem(CPUX86State *env)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
/*
* Compute an approximation of log2(1+arg), where 1+arg is in the
* interval [sqrt(2)/2, sqrt(2)]. It is assumed that when this
* function is called, rounding precision is set to 80 and the
* round-to-nearest mode is in effect. arg must not be exactly zero,
* and must not be so close to zero that underflow might occur.
*/
static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
uint64_t *sig0, uint64_t *sig1)
{
uint64_t arg0_sig = extractFloatx80Frac(arg);
int32_t arg0_exp = extractFloatx80Exp(arg);
bool arg0_sign = extractFloatx80Sign(arg);
bool asign;
int32_t dexp, texp, aexp;
uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
floatx80 t2, accum;
/*
* Compute an approximation of arg/(2+arg), with extra precision,
* as the argument to a polynomial approximation. The extra
* precision is only needed for the first term of the
* approximation, with subsequent terms being significantly
* smaller; the approximation only uses odd exponents, and the
* square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
*/
if (arg0_sign) {
dexp = 0x3fff;
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
} else {
dexp = 0x4000;
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
dsig0 |= 0x8000000000000000ULL;
}
texp = arg0_exp - dexp + 0x3ffe;
rsig0 = arg0_sig;
rsig1 = 0;
rsig2 = 0;
if (dsig0 <= rsig0) {
shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
++texp;
}
tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
&rsig0, &rsig1, &rsig2);
while ((int64_t) rsig0 < 0) {
--tsig0;
add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
&rsig0, &rsig1, &rsig2);
}
tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
/*
* No need to correct any estimation error in tsig1; even with
* such error, it is accurate enough. Now compute the square of
* that approximation.
*/
mul128To256(tsig0, tsig1, tsig0, tsig1,
&t2sig0, &t2sig1, &t2sig2, &t2sig3);
t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
t2sig0, t2sig1, &env->fp_status);
/* Compute the lower parts of the polynomial expansion. */
accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
/*
* The full polynomial expansion is fyl2x_coeff_0 + accum (where
* accum has much lower magnitude, and so, in particular, carry
* out of the addition is not possible), multiplied by t. (This
* expansion is only accurate to about 70 bits, not 128 bits.)
*/
aexp = extractFloatx80Exp(fyl2x_coeff_0);
asign = extractFloatx80Sign(fyl2x_coeff_0);
shift128RightJamming(extractFloatx80Frac(accum), 0,
aexp - extractFloatx80Exp(accum),
&asig0, &asig1);
bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
bsig1 = 0;
if (asign == extractFloatx80Sign(accum)) {
add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
} else {
sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
}
/* Multiply by t to compute the required result. */
mul128To256(asig0, asig1, tsig0, tsig1,
&asig0, &asig1, &asig2, &asig3);
aexp += texp - 0x3ffe;
*exp = aexp;
*sig0 = asig0;
*sig1 = asig1;
}
void helper_fyl2xp1(CPUX86State *env)
{
uint8_t old_flags = save_exception_flags(env);
@ -1431,109 +1528,19 @@ void helper_fyl2xp1(CPUX86State *env)
ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
sig0, sig1, &env->fp_status);
} else {
bool asign;
uint32_t dexp, texp, aexp;
uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
floatx80 t2, accum;
int32_t aexp;
uint64_t asig0, asig1, asig2;
FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
signed char save_prec = env->fp_status.floatx80_rounding_precision;
env->fp_status.float_rounding_mode = float_round_nearest_even;
env->fp_status.floatx80_rounding_precision = 80;
/*
* Compute an approximation of ST0/(2+ST0), with extra
* precision, as the argument to a polynomial approximation.
* The extra precision is only needed for the first term of
* the approximation, with subsequent terms being
* significantly smaller; the approximation only uses odd
* exponents, and the square of ST0/(2+ST0) is at most
* 17-12*sqrt(2) = 0.029....
*/
if (arg0_sign) {
dexp = 0x3fff;
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
} else {
dexp = 0x4000;
shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
dsig0 |= 0x8000000000000000ULL;
}
texp = arg0_exp - dexp + 0x3ffe;
rsig0 = arg0_sig;
rsig1 = 0;
rsig2 = 0;
if (dsig0 <= rsig0) {
shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
++texp;
}
tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
&rsig0, &rsig1, &rsig2);
while ((int64_t) rsig0 < 0) {
--tsig0;
add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
&rsig0, &rsig1, &rsig2);
}
tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
/*
* No need to correct any estimation error in tsig1; even with
* such error, it is accurate enough. Now compute the square
* of that approximation.
*/
mul128To256(tsig0, tsig1, tsig0, tsig1,
&t2sig0, &t2sig1, &t2sig2, &t2sig3);
t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
t2sig0, t2sig1, &env->fp_status);
/* Compute the lower parts of the polynomial expansion. */
accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
accum = floatx80_mul(accum, t2, &env->fp_status);
accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
/*
* The full polynomial expansion is fyl2x_coeff_0 + accum
* (where accum has much lower magnitude, and so, in
* particular, carry out of the addition is not possible),
* multiplied by t. (This expansion is only accurate to about
* 70 bits, not 128 bits.)
* Multiply by the second argument to compute the required
* result.
*/
aexp = extractFloatx80Exp(fyl2x_coeff_0);
asign = extractFloatx80Sign(fyl2x_coeff_0);
shift128RightJamming(extractFloatx80Frac(accum), 0,
aexp - extractFloatx80Exp(accum),
&asig0, &asig1);
bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
bsig1 = 0;
if (asign == extractFloatx80Sign(accum)) {
add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
} else {
sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
}
/*
* Multiply by t and by the second argument to compute the
* required result.
*/
mul128To256(asig0, asig1, tsig0, tsig1,
&asig0, &asig1, &asig2, &asig3);
aexp += texp - 0x3ffe;
if (arg1_exp == 0) {
normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
}
@ -1550,6 +1557,151 @@ void helper_fyl2xp1(CPUX86State *env)
merge_exception_flags(env, old_flags);
}
void helper_fyl2x(CPUX86State *env)
{
uint8_t old_flags = save_exception_flags(env);
uint64_t arg0_sig = extractFloatx80Frac(ST0);
int32_t arg0_exp = extractFloatx80Exp(ST0);
bool arg0_sign = extractFloatx80Sign(ST0);
uint64_t arg1_sig = extractFloatx80Frac(ST1);
int32_t arg1_exp = extractFloatx80Exp(ST1);
bool arg1_sign = extractFloatx80Sign(ST1);
if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_silence_nan(ST0, &env->fp_status);
} else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_silence_nan(ST1, &env->fp_status);
} else if (floatx80_invalid_encoding(ST0) ||
floatx80_invalid_encoding(ST1)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_default_nan(&env->fp_status);
} else if (floatx80_is_any_nan(ST0)) {
ST1 = ST0;
} else if (floatx80_is_any_nan(ST1)) {
/* Pass this NaN through. */
} else if (arg0_sign && !floatx80_is_zero(ST0)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_default_nan(&env->fp_status);
} else if (floatx80_is_infinity(ST1)) {
FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
&env->fp_status);
switch (cmp) {
case float_relation_less:
ST1 = floatx80_chs(ST1);
break;
case float_relation_greater:
/* Result is infinity of the same sign as ST1. */
break;
default:
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_default_nan(&env->fp_status);
break;
}
} else if (floatx80_is_infinity(ST0)) {
if (floatx80_is_zero(ST1)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_default_nan(&env->fp_status);
} else if (arg1_sign) {
ST1 = floatx80_chs(ST0);
} else {
ST1 = ST0;
}
} else if (floatx80_is_zero(ST0)) {
if (floatx80_is_zero(ST1)) {
float_raise(float_flag_invalid, &env->fp_status);
ST1 = floatx80_default_nan(&env->fp_status);
} else {
/* Result is infinity with opposite sign to ST1. */
float_raise(float_flag_divbyzero, &env->fp_status);
ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
0x8000000000000000ULL);
}
} else if (floatx80_is_zero(ST1)) {
if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
ST1 = floatx80_chs(ST1);
}
/* Otherwise, ST1 is already the correct result. */
} else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
if (arg1_sign) {
ST1 = floatx80_chs(floatx80_zero);
} else {
ST1 = floatx80_zero;
}
} else {
int32_t int_exp;
floatx80 arg0_m1;
FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
signed char save_prec = env->fp_status.floatx80_rounding_precision;
env->fp_status.float_rounding_mode = float_round_nearest_even;
env->fp_status.floatx80_rounding_precision = 80;
if (arg0_exp == 0) {
normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
}
if (arg1_exp == 0) {
normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
}
int_exp = arg0_exp - 0x3fff;
if (arg0_sig > 0xb504f333f9de6484ULL) {
++int_exp;
}
arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
&env->fp_status),
floatx80_one, &env->fp_status);
if (floatx80_is_zero(arg0_m1)) {
/* Exact power of 2; multiply by ST1. */
env->fp_status.float_rounding_mode = save_mode;
ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
ST1, &env->fp_status);
} else {
bool asign = extractFloatx80Sign(arg0_m1);
int32_t aexp;
uint64_t asig0, asig1, asig2;
helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
if (int_exp != 0) {
bool isign = (int_exp < 0);
int32_t iexp;
uint64_t isig;
int shift;
int_exp = isign ? -int_exp : int_exp;
shift = clz32(int_exp) + 32;
isig = int_exp;
isig <<= shift;
iexp = 0x403e - shift;
shift128RightJamming(asig0, asig1, iexp - aexp,
&asig0, &asig1);
if (asign == isign) {
add128(isig, 0, asig0, asig1, &asig0, &asig1);
} else {
sub128(isig, 0, asig0, asig1, &asig0, &asig1);
}
aexp = iexp;
asign = isign;
}
/*
* Multiply by the second argument to compute the required
* result.
*/
if (arg1_exp == 0) {
normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
}
mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
aexp += arg1_exp - 0x3ffe;
/* This result is inexact. */
asig1 |= 1;
env->fp_status.float_rounding_mode = save_mode;
ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
asig0, asig1, &env->fp_status);
}
env->fp_status.floatx80_rounding_precision = save_prec;
}
fpop(env);
merge_exception_flags(env, old_flags);
}
void helper_fsqrt(CPUX86State *env)
{
uint8_t old_flags = save_exception_flags(env);