target/i386: reimplement fyl2x using floatx80 operations

The x87 fyl2x emulation is currently based around conversion to double. This is inherently unsuitable for a good emulation of any floatx80 operation. Reimplement using the soft-float operations, building on top of the reimplementation of fyl2xp1 and factoring out code to be shared between the two instructions. The included test assumes that the result in round-to-nearest mode should always be one of the two closest floating-point numbers to the mathematically exact result (including that it should be exact, in the exact cases which cover more cases than for fyl2xp1). Backports commit 1f18a1e6ab8368a4eab2d22894d3b2ae75250cd3 from qemu
2024-10-19 12:18:23 +02:00 · 2021-02-25 13:46:27 -05:00 · 2021-02-25 13:46:27 -05:00 · ddb2f1d4dd
commit ddb2f1d4dd
parent ac2f3fa0f2
1 changed files with 262 additions and 110 deletions
--- a/qemu/target/i386/fpu_helper.c
+++ b/qemu/target/i386/fpu_helper.c
@ -1191,21 +1191,6 @@ void helper_f2xm1(CPUX86State *env)
    merge_exception_flags(env, old_flags);
 }

-void helper_fyl2x(CPUX86State *env)
-{
-    double fptemp = floatx80_to_double(env, ST0);
-
-    if (fptemp > 0.0) {
-        fptemp = log(fptemp) / log(2.0); /* log2(ST) */
-        fptemp *= floatx80_to_double(env, ST1);
-        ST1 = double_to_floatx80(env, fptemp);
-        fpop(env);
-    } else {
-        env->fpus &= ~0x4700;
-        env->fpus |= 0x400;
-    }
-}
-
 void helper_fptan(CPUX86State *env)
 {
    double fptemp = floatx80_to_double(env, ST0);
@ -1364,6 +1349,118 @@ void helper_fprem(CPUX86State *env)
 #define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
 #define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)

+/*
+ * Compute an approximation of log2(1+arg), where 1+arg is in the
+ * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
+ * function is called, rounding precision is set to 80 and the
+ * round-to-nearest mode is in effect.  arg must not be exactly zero,
+ * and must not be so close to zero that underflow might occur.
+ */
+static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
+                                uint64_t *sig0, uint64_t *sig1)
+{
+    uint64_t arg0_sig = extractFloatx80Frac(arg);
+    int32_t arg0_exp = extractFloatx80Exp(arg);
+    bool arg0_sign = extractFloatx80Sign(arg);
+    bool asign;
+    int32_t dexp, texp, aexp;
+    uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
+    uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
+    uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
+    floatx80 t2, accum;
+
+    /*
+     * Compute an approximation of arg/(2+arg), with extra precision,
+     * as the argument to a polynomial approximation.  The extra
+     * precision is only needed for the first term of the
+     * approximation, with subsequent terms being significantly
+     * smaller; the approximation only uses odd exponents, and the
+     * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
+     */
+    if (arg0_sign) {
+        dexp = 0x3fff;
+        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
+        sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
+    } else {
+        dexp = 0x4000;
+        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
+        dsig0 |= 0x8000000000000000ULL;
+    }
+    texp = arg0_exp - dexp + 0x3ffe;
+    rsig0 = arg0_sig;
+    rsig1 = 0;
+    rsig2 = 0;
+    if (dsig0 <= rsig0) {
+        shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
+        ++texp;
+    }
+    tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
+    mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
+    sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
+           &rsig0, &rsig1, &rsig2);
+    while ((int64_t) rsig0 < 0) {
+        --tsig0;
+        add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
+               &rsig0, &rsig1, &rsig2);
+    }
+    tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
+    /*
+     * No need to correct any estimation error in tsig1; even with
+     * such error, it is accurate enough.  Now compute the square of
+     * that approximation.
+     */
+    mul128To256(tsig0, tsig1, tsig0, tsig1,
+                &t2sig0, &t2sig1, &t2sig2, &t2sig3);
+    t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
+                                       t2sig0, t2sig1, &env->fp_status);
+
+    /* Compute the lower parts of the polynomial expansion.  */
+    accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
+
+    /*
+     * The full polynomial expansion is fyl2x_coeff_0 + accum (where
+     * accum has much lower magnitude, and so, in particular, carry
+     * out of the addition is not possible), multiplied by t.  (This
+     * expansion is only accurate to about 70 bits, not 128 bits.)
+     */
+    aexp = extractFloatx80Exp(fyl2x_coeff_0);
+    asign = extractFloatx80Sign(fyl2x_coeff_0);
+    shift128RightJamming(extractFloatx80Frac(accum), 0,
+                         aexp - extractFloatx80Exp(accum),
+                         &asig0, &asig1);
+    bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
+    bsig1 = 0;
+    if (asign == extractFloatx80Sign(accum)) {
+        add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
+    } else {
+        sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
+    }
+    /* Multiply by t to compute the required result.  */
+    mul128To256(asig0, asig1, tsig0, tsig1,
+                &asig0, &asig1, &asig2, &asig3);
+    aexp += texp - 0x3ffe;
+    *exp = aexp;
+    *sig0 = asig0;
+    *sig1 = asig1;
+}
+
 void helper_fyl2xp1(CPUX86State *env)
 {
    uint8_t old_flags = save_exception_flags(env);
@ -1431,109 +1528,19 @@ void helper_fyl2xp1(CPUX86State *env)
        ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
                                            sig0, sig1, &env->fp_status);
    } else {
-        bool asign;
-        uint32_t dexp, texp, aexp;
-        uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
-        uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
-        uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
-        floatx80 t2, accum;
+        int32_t aexp;
+        uint64_t asig0, asig1, asig2;
        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
        signed char save_prec = env->fp_status.floatx80_rounding_precision;
        env->fp_status.float_rounding_mode = float_round_nearest_even;
        env->fp_status.floatx80_rounding_precision = 80;

-        /*
-         * Compute an approximation of ST0/(2+ST0), with extra
-         * precision, as the argument to a polynomial approximation.
-         * The extra precision is only needed for the first term of
-         * the approximation, with subsequent terms being
-         * significantly smaller; the approximation only uses odd
-         * exponents, and the square of ST0/(2+ST0) is at most
-         * 17-12*sqrt(2) = 0.029....
-         */
-        if (arg0_sign) {
-            dexp = 0x3fff;
-            shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
-            sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
-        } else {
-            dexp = 0x4000;
-            shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
-            dsig0 |= 0x8000000000000000ULL;
-        }
-        texp = arg0_exp - dexp + 0x3ffe;
-        rsig0 = arg0_sig;
-        rsig1 = 0;
-        rsig2 = 0;
-        if (dsig0 <= rsig0) {
-            shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
-            ++texp;
-        }
-        tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
-        mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
-        sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
-               &rsig0, &rsig1, &rsig2);
-        while ((int64_t) rsig0 < 0) {
-            --tsig0;
-            add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
-                   &rsig0, &rsig1, &rsig2);
-        }
-        tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
-        /*
-         * No need to correct any estimation error in tsig1; even with
-         * such error, it is accurate enough.  Now compute the square
-         * of that approximation.
-         */
-        mul128To256(tsig0, tsig1, tsig0, tsig1,
-                    &t2sig0, &t2sig1, &t2sig2, &t2sig3);
-        t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
-                                           t2sig0, t2sig1, &env->fp_status);
-
-        /* Compute the lower parts of the polynomial expansion.  */
-        accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
-        accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
-        accum = floatx80_mul(accum, t2, &env->fp_status);
-        accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
-        accum = floatx80_mul(accum, t2, &env->fp_status);
-        accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
-        accum = floatx80_mul(accum, t2, &env->fp_status);
-        accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
-        accum = floatx80_mul(accum, t2, &env->fp_status);
-        accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
-        accum = floatx80_mul(accum, t2, &env->fp_status);
-        accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
-        accum = floatx80_mul(accum, t2, &env->fp_status);
-        accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
-        accum = floatx80_mul(accum, t2, &env->fp_status);
-        accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
-        accum = floatx80_mul(accum, t2, &env->fp_status);
-        accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
+        helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);

        /*
-         * The full polynomial expansion is fyl2x_coeff_0 + accum
-         * (where accum has much lower magnitude, and so, in
-         * particular, carry out of the addition is not possible),
-         * multiplied by t.  (This expansion is only accurate to about
-         * 70 bits, not 128 bits.)
+         * Multiply by the second argument to compute the required
+         * result.
         */
-        aexp = extractFloatx80Exp(fyl2x_coeff_0);
-        asign = extractFloatx80Sign(fyl2x_coeff_0);
-        shift128RightJamming(extractFloatx80Frac(accum), 0,
-                             aexp - extractFloatx80Exp(accum),
-                             &asig0, &asig1);
-        bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
-        bsig1 = 0;
-        if (asign == extractFloatx80Sign(accum)) {
-            add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
-        } else {
-            sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
-        }
-        /*
-         * Multiply by t and by the second argument to compute the
-         * required result.
-         */
-        mul128To256(asig0, asig1, tsig0, tsig1,
-                    &asig0, &asig1, &asig2, &asig3);
-        aexp += texp - 0x3ffe;
        if (arg1_exp == 0) {
            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
        }
@ -1550,6 +1557,151 @@ void helper_fyl2xp1(CPUX86State *env)
    merge_exception_flags(env, old_flags);
 }

+void helper_fyl2x(CPUX86State *env)
+{
+    uint8_t old_flags = save_exception_flags(env);
+    uint64_t arg0_sig = extractFloatx80Frac(ST0);
+    int32_t arg0_exp = extractFloatx80Exp(ST0);
+    bool arg0_sign = extractFloatx80Sign(ST0);
+    uint64_t arg1_sig = extractFloatx80Frac(ST1);
+    int32_t arg1_exp = extractFloatx80Exp(ST1);
+    bool arg1_sign = extractFloatx80Sign(ST1);
+
+    if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_silence_nan(ST0, &env->fp_status);
+    } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_silence_nan(ST1, &env->fp_status);
+    } else if (floatx80_invalid_encoding(ST0) ||
+               floatx80_invalid_encoding(ST1)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_default_nan(&env->fp_status);
+    } else if (floatx80_is_any_nan(ST0)) {
+        ST1 = ST0;
+    } else if (floatx80_is_any_nan(ST1)) {
+        /* Pass this NaN through.  */
+    } else if (arg0_sign && !floatx80_is_zero(ST0)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_default_nan(&env->fp_status);
+    } else if (floatx80_is_infinity(ST1)) {
+        FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
+                                             &env->fp_status);
+        switch (cmp) {
+        case float_relation_less:
+            ST1 = floatx80_chs(ST1);
+            break;
+        case float_relation_greater:
+            /* Result is infinity of the same sign as ST1.  */
+            break;
+        default:
+            float_raise(float_flag_invalid, &env->fp_status);
+            ST1 = floatx80_default_nan(&env->fp_status);
+            break;
+        }
+    } else if (floatx80_is_infinity(ST0)) {
+        if (floatx80_is_zero(ST1)) {
+            float_raise(float_flag_invalid, &env->fp_status);
+            ST1 = floatx80_default_nan(&env->fp_status);
+        } else if (arg1_sign) {
+            ST1 = floatx80_chs(ST0);
+        } else {
+            ST1 = ST0;
+        }
+    } else if (floatx80_is_zero(ST0)) {
+        if (floatx80_is_zero(ST1)) {
+            float_raise(float_flag_invalid, &env->fp_status);
+            ST1 = floatx80_default_nan(&env->fp_status);
+        } else {
+            /* Result is infinity with opposite sign to ST1.  */
+            float_raise(float_flag_divbyzero, &env->fp_status);
+            ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
+                                0x8000000000000000ULL);
+        }
+    } else if (floatx80_is_zero(ST1)) {
+        if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
+            ST1 = floatx80_chs(ST1);
+        }
+        /* Otherwise, ST1 is already the correct result.  */
+    } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
+        if (arg1_sign) {
+            ST1 = floatx80_chs(floatx80_zero);
+        } else {
+            ST1 = floatx80_zero;
+        }
+    } else {
+        int32_t int_exp;
+        floatx80 arg0_m1;
+        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
+        signed char save_prec = env->fp_status.floatx80_rounding_precision;
+        env->fp_status.float_rounding_mode = float_round_nearest_even;
+        env->fp_status.floatx80_rounding_precision = 80;
+
+        if (arg0_exp == 0) {
+            normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
+        }
+        if (arg1_exp == 0) {
+            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
+        }
+        int_exp = arg0_exp - 0x3fff;
+        if (arg0_sig > 0xb504f333f9de6484ULL) {
+            ++int_exp;
+        }
+        arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
+                                               &env->fp_status),
+                               floatx80_one, &env->fp_status);
+        if (floatx80_is_zero(arg0_m1)) {
+            /* Exact power of 2; multiply by ST1.  */
+            env->fp_status.float_rounding_mode = save_mode;
+            ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
+                               ST1, &env->fp_status);
+        } else {
+            bool asign = extractFloatx80Sign(arg0_m1);
+            int32_t aexp;
+            uint64_t asig0, asig1, asig2;
+            helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
+            if (int_exp != 0) {
+                bool isign = (int_exp < 0);
+                int32_t iexp;
+                uint64_t isig;
+                int shift;
+                int_exp = isign ? -int_exp : int_exp;
+                shift = clz32(int_exp) + 32;
+                isig = int_exp;
+                isig <<= shift;
+                iexp = 0x403e - shift;
+                shift128RightJamming(asig0, asig1, iexp - aexp,
+                                     &asig0, &asig1);
+                if (asign == isign) {
+                    add128(isig, 0, asig0, asig1, &asig0, &asig1);
+                } else {
+                    sub128(isig, 0, asig0, asig1, &asig0, &asig1);
+                }
+                aexp = iexp;
+                asign = isign;
+            }
+            /*
+             * Multiply by the second argument to compute the required
+             * result.
+             */
+            if (arg1_exp == 0) {
+                normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
+            }
+            mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
+            aexp += arg1_exp - 0x3ffe;
+            /* This result is inexact.  */
+            asig1 |= 1;
+            env->fp_status.float_rounding_mode = save_mode;
+            ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
+                                                asig0, asig1, &env->fp_status);
+        }
+
+        env->fp_status.floatx80_rounding_precision = save_prec;
+    }
+    fpop(env);
+    merge_exception_flags(env, old_flags);
+}
+
 void helper_fsqrt(CPUX86State *env)
 {
    uint8_t old_flags = save_exception_flags(env);