hardfloat: implement float32/64 comparison

Performance results for fp-bench:

Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
- before:
cmp-single: 110.98 MFlops
cmp-double: 107.12 MFlops
- after:
cmp-single: 506.28 MFlops
cmp-double: 524.77 MFlops

Note that flattening both eq and eq_signaling versions
would give us extra performance (695v506, 615v524 Mflops
for single/double, respectively) but this would emit two
essentially identical functions for each eq/signaling pair,
which is a waste.

Aggregate performance improvement for the last few patches:
[ all charts in png: https://imgur.com/a/4yV8p ]

1. Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz

qemu-aarch64 NBench score; higher is better
Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz

16 +-+-----------+-------------+----===-------+---===-------+-----------+-+
14 +-+..........................@@@&&.=.......@@@&&.=...................+-+
12 +-+..........................@.@.&.=.......@.@.&.=.....+befor=== +-+
10 +-+..........................@.@.&.=.......@.@.&.=.....+ad@@&& = +-+
8 +-+.......................$$$%.@.&.=.......@.@.&.=.....+ @@u& = +-+
6 +-+............@@@&&=+***##.$%.@.&.=***##$$%+@.&.=..###$$%%@i& = +-+
4 +-+.......###$%%.@.&=.*.*.#.$%.@.&.=*.*.#.$%.@.&.=+**.#+$ +@m& = +-+
2 +-+.....***.#$.%.@.&=.*.*.#.$%.@.&.=*.*.#.$%.@.&.=.**.#+$+sqr& = +-+
0 +-+-----***##$%%@@&&=-***##$$%@@&&==***##$$%@@&&==-**##$$%+cmp==-----+-+
FOURIER NEURAL NELU DECOMPOSITION gmean

qemu-aarch64 SPEC06fp (test set) speedup over QEMU 4c2c1015905
Host: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
error bars: 95% confidence interval

4.5 +-+---+-----+----+-----+-----+-&---+-----+----+-----+-----+-----+----+-----+-----+-----+-----+----+-----+---+-+
4 +-+..........................+@@+...........................................................................+-+
3.5 +-+..............%%@&.........@@..............%%@&............................................+++dsub +-+
2.5 +-+....&&+.......%%@&.......+%%@..+%%&+..@@&+.%%@&....................................+%%&+.+%@&++%%@& +-+
2 +-+..+%%&..+%@&+.%%@&...+++..%%@...%%&.+$$@&..%%@&..%%@&.......+%%&+.%%@&+......+%%@&.+%%&++$$@&++d%@& %%@&+-+
1.5 +-+**#$%&**#$@&**#%@&**$%@**#$%@**#$%&**#$@&**$%@&*#$%@**#$%@**#$%&**#%@&**$%@&*#$%@**#$%&**#$@&*+f%@&**$%@&+-+
0.5 +-+**#$%&**#$@&**#%@&**$%@**#$%@**#$%&**#$@&**$%@&*#$%@**#$%@**#$%&**#%@&**$%@&*#$%@**#$%&**#$@&+sqr@&**$%@&+-+
0 +-+**#$%&**#$@&**#%@&**$%@**#$%@**#$%&**#$@&**$%@&*#$%@**#$%@**#$%&**#%@&**$%@&*#$%@**#$%&**#$@&*+cmp&**$%@&+-+
410.bw416.gam433.434.z435.436.cac437.lesli444.447.de450.so453454.ca459.GemsF465.tont470.lb4482.sphinxgeomean

2. Host: ARM Aarch64 A57 @ 2.4GHz

qemu-aarch64 NBench score; higher is better
Host: Applied Micro X-Gene, Aarch64 A57 @ 2.4 GHz

5 +-+-----------+-------------+-------------+-------------+-----------+-+
4.5 +-+........................................@@@&==...................+-+
3 4 +-+..........................@@@&==........@.@&.=.....+before +-+
3 +-+..........................@.@&.=........@.@&.=.....+ad@@@&== +-+
2.5 +-+.....................##$$%%.@&.=........@.@&.=.....+ @m@& = +-+
2 +-+............@@@&==.***#.$.%.@&.=.***#$$%%.@&.=.***#$$%%d@& = +-+
1.5 +-+.....***#$$%%.@&.=.*.*#.$.%.@&.=.*.*#.$.%.@&.=.*.*#+$ +f@& = +-+
0.5 +-+.....*.*#.$.%.@&.=.*.*#.$.%.@&.=.*.*#.$.%.@&.=.*.*#+$+sqr& = +-+
0 +-+-----***#$$%%@@&==-***#$$%%@@&==-***#$$%%@@&==-***#$$%+cmp==-----+-+
FOURIER NEURAL NLU DECOMPOSITION gmean
This commit is contained in:
Emilio G. Cota 2018-12-19 10:45:19 -05:00 committed by Lioncash
parent f7549fc13e
commit 8276a4dc66
No known key found for this signature in database
GPG Key ID: 4E3C3CC1031BA9C7

View File

@ -2906,28 +2906,109 @@ static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
}
}
#define COMPARE(sz) \
int float ## sz ## _compare(float ## sz a, float ## sz b, \
float_status *s) \
#define COMPARE(name, attr, sz) \
static int attr \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
{ \
FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
return compare_floats(pa, pb, false, s); \
} \
int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
float_status *s) \
{ \
FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
return compare_floats(pa, pb, true, s); \
return compare_floats(pa, pb, is_quiet, s); \
}
COMPARE(16)
COMPARE(32)
COMPARE(64)
COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
#undef COMPARE
int float16_compare(float16 a, float16 b, float_status *s)
{
return soft_f16_compare(a, b, false, s);
}
int float16_compare_quiet(float16 a, float16 b, float_status *s)
{
return soft_f16_compare(a, b, true, s);
}
static int QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
union_float32 ua, ub;
ua.s = xa;
ub.s = xb;
if (QEMU_NO_HARDFLOAT) {
goto soft;
}
float32_input_flush2(&ua.s, &ub.s, s);
if (isgreaterequal(ua.h, ub.h)) {
if (isgreater(ua.h, ub.h)) {
return float_relation_greater;
}
return float_relation_equal;
}
if (likely(isless(ua.h, ub.h))) {
return float_relation_less;
}
/* The only condition remaining is unordered.
* Fall through to set flags.
*/
soft:
return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}
int float32_compare(float32 a, float32 b, float_status *s)
{
return f32_compare(a, b, false, s);
}
int float32_compare_quiet(float32 a, float32 b, float_status *s)
{
return f32_compare(a, b, true, s);
}
static int QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
union_float64 ua, ub;
ua.s = xa;
ub.s = xb;
if (QEMU_NO_HARDFLOAT) {
goto soft;
}
float64_input_flush2(&ua.s, &ub.s, s);
if (isgreaterequal(ua.h, ub.h)) {
if (isgreater(ua.h, ub.h)) {
return float_relation_greater;
}
return float_relation_equal;
}
if (likely(isless(ua.h, ub.h))) {
return float_relation_less;
}
/* The only condition remaining is unordered.
* Fall through to set flags.
*/
soft:
return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}
int float64_compare(float64 a, float64 b, float_status *s)
{
return f64_compare(a, b, false, s);
}
int float64_compare_quiet(float64 a, float64 b, float_status *s)
{
return f64_compare(a, b, true, s);
}
/* Multiply A by 2 raised to the power N. */
static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
{