unicorn/qemu/target-i386/mem_helper.c
Emilio G. Cota 3dc16ebca3
target-i386: remove helper_lock()
It's been superseded by the atomic helpers.

The use of the atomic helpers provides a significant performance and scalability
improvement. Below is the result of running the atomic_add-test microbenchmark with:
$ x86_64-linux-user/qemu-x86_64 tests/atomic_add-bench -o 5000000 -r $r -n $n
, where $n is the number of threads and $r is the allowed range for the additions.

The scenarios measured are:
- atomic: implements x86' ADDL with the atomic_add helper (i.e. this patchset)
- cmpxchg: implement x86' ADDL with a TCG loop using the cmpxchg helper
- master: before this patchset

Results sorted in ascending range, i.e. descending degree of contention.
Y axis is Throughput in Mops/s. Tests are run on an AMD machine with 64
Opteron 6376 cores.

atomic_add-bench: 5000000 ops/thread, [0,1] range

25 ++---------+----------+---------+----------+----------+----------+---++
+ atomic +-E--+ + + + + + |
|cmpxchg +-H--+ |
20 +Emaster +-N--+ ++
|| |
|++ |
|| |
15 +++ ++
|N| |
|+| |
10 ++| ++
|+|+ |
| | -+E+------ +++ ---+E+------+E+------+E+-----+E+------+E|
|+E+E+- +++ +E+------+E+-- |
5 ++|+ ++
|+N+H+--- +++ |
++++N+--+H++----+++ + +++ --++H+------+H+------+H++----+H+---+--- |
0 ++---------+-----H----+---H-----+----------+----------+----------+---H+
0 10 20 30 40 50 60
Number of threads

atomic_add-bench: 5000000 ops/thread, [0,2] range

25 ++---------+----------+---------+----------+----------+----------+---++
++atomic +-E--+ + + + + + |
|cmpxchg +-H--+ |
20 ++master +-N--+ ++
|E| |
|++ |
||E |
15 ++| ++
|N|| |
|+|| ---+E+------+E+-----+E+------+E|
10 ++| | ---+E+------+E+-----+E+--- +++ +++
||H+E+--+E+-- |
|+++++ |
| || |
5 ++|+H+-- +++ ++
|+N+ - ---+H+------+H+------ |
+ +N+--+H++----+H+---+--+H+----++H+--- + + +H+---+--+H|
0 ++---------+----------+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads

atomic_add-bench: 5000000 ops/thread, [0,8] range

40 ++---------+----------+---------+----------+----------+----------+---++
++atomic +-E--+ + + + + + |
35 +cmpxchg +-H--+ ++
| master +-N--+ ---+E+------+E+------+E+-----+E+------+E|
30 ++| ---+E+-- +++ ++
| | -+E+--- |
25 ++E ---- +++ ++
|+++++ -+E+ |
20 +E+ E-- +++ ++
|H|+++ |
|+| +H+------- |
15 ++H+ ---+++ +H+------ ++
|N++H+-- +++--- +H+------++|
10 ++ +++ - +++ ---+H+ +++ +H+
| | +H+-----+H+------+H+-- |
5 ++| +++ ++
++N+N+--+N++ + + + + + |
0 ++---------+----------+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads

atomic_add-bench: 5000000 ops/thread, [0,128] range

160 ++---------+---------+----------+---------+----------+----------+---++
+ atomic +-E--+ + + + + + |
140 +cmpxchg +-H--+ +++ +++ ++
| master +-N--+ E--------E------+E+------++|
120 ++ --| | +++ E+
| -- +++ +++ ++|
100 ++ - ++
| +++- +++ ++|
80 ++ -+E+ -+H+------+H+------H--------++
| ---- ---- +++ H|
| ---+E+-----+E+- ---+H+ ++|
60 ++ +E+--- +++ ---+H+--- ++
| --+++ ---+H+-- |
40 ++ +E+-+H+--- ++
| +H+ |
20 +EE+ ++
+N+ + + + + + + |
0 ++N-N---N--+---------+----------+---------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads

atomic_add-bench: 5000000 ops/thread, [0,1024] range

350 ++---------+---------+----------+---------+----------+----------+---++
+ atomic +-E--+ + + + + + |
300 +cmpxchg +-H--+ +++
| master +-N--+ +++ ||
| +++ | ----E|
250 ++ | ----E---- ++
| ----E--- | ---+H|
200 ++ -+E+--- +++ ---+H+--- ++
| ---- -+H+-- |
| +E+ +++ ---- +++ |
150 ++ ---+++ ---+H+- ++
| --- -+H+-- |
100 ++ ---+E+ ---- +++ ++
| +++ ---+E+-----+H+- |
| -+E+------+H+-- |
50 ++ +E+ ++
+EE+ + + + + + + |
0 ++N-N---N--+---------+----------+---------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads

hi-res: http://imgur.com/a/fMRmq

For master I stopped measuring master after 8 threads, because there is little
point in measuring the well-known performance collapse of a contended lock.

Backports commit 37b995f6e7a1cb6fa378c5cd4217b9dd9e1fc98b from qemu
2018-02-27 23:43:22 -05:00

216 lines
6.0 KiB
C

/*
* x86 memory access helpers
*
* Copyright (c) 2003 Fabrice Bellard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "qemu/int128.h"
#include "tcg.h"
#include "uc_priv.h"
void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0)
{
uintptr_t ra = GETPC();
uint64_t oldv, cmpv, newv;
int eflags;
eflags = cpu_cc_compute_all(env, CC_OP);
cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]);
newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]);
oldv = cpu_ldq_data_ra(env, a0, ra);
newv = (cmpv == oldv ? newv : oldv);
/* always do the store */
cpu_stq_data_ra(env, a0, newv, ra);
if (oldv == cmpv) {
eflags |= CC_Z;
} else {
env->regs[R_EAX] = (uint32_t)oldv;
env->regs[R_EDX] = (uint32_t)(oldv >> 32);
eflags &= ~CC_Z;
}
CC_SRC = eflags;
}
void helper_cmpxchg8b(CPUX86State *env, target_ulong a0)
{
#ifdef CONFIG_ATOMIC64
uint64_t oldv, cmpv, newv;
int eflags;
eflags = cpu_cc_compute_all(env, CC_OP);
cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]);
newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]);
#ifdef CONFIG_USER_ONLY
{
uint64_t *haddr = g2h(a0);
cmpv = cpu_to_le64(cmpv);
newv = cpu_to_le64(newv);
oldv = atomic_cmpxchg__nocheck(haddr, cmpv, newv);
oldv = le64_to_cpu(oldv);
}
#else
{
uintptr_t ra = GETPC();
int mem_idx = cpu_mmu_index(env, false);
TCGMemOpIdx oi = make_memop_idx(MO_TEQ, mem_idx);
oldv = helper_atomic_cmpxchgq_le_mmu(env, a0, cmpv, newv, oi, ra);
}
#endif
if (oldv == cmpv) {
eflags |= CC_Z;
} else {
env->regs[R_EAX] = (uint32_t)oldv;
env->regs[R_EDX] = (uint32_t)(oldv >> 32);
eflags &= ~CC_Z;
}
CC_SRC = eflags;
#else
cpu_loop_exit_atomic(ENV_GET_CPU(env), GETPC());
#endif /* CONFIG_ATOMIC64 */
}
#ifdef TARGET_X86_64
void helper_cmpxchg16b_unlocked(CPUX86State *env, target_ulong a0)
{
uintptr_t ra = GETPC();
Int128 oldv, cmpv, newv;
uint64_t o0, o1;
int eflags;
bool success;
if ((a0 & 0xf) != 0) {
raise_exception_ra(env, EXCP0D_GPF, GETPC());
}
eflags = cpu_cc_compute_all(env, CC_OP);
cmpv = int128_make128(env->regs[R_EAX], env->regs[R_EDX]);
newv = int128_make128(env->regs[R_EBX], env->regs[R_ECX]);
o0 = cpu_ldq_data_ra(env, a0 + 0, ra);
o1 = cpu_ldq_data_ra(env, a0 + 8, ra);
oldv = int128_make128(o0, o1);
success = int128_eq(oldv, cmpv);
if (!success) {
newv = oldv;
}
cpu_stq_data_ra(env, a0 + 0, int128_getlo(newv), ra);
cpu_stq_data_ra(env, a0 + 8, int128_gethi(newv), ra);
if (success) {
eflags |= CC_Z;
} else {
env->regs[R_EAX] = int128_getlo(oldv);
env->regs[R_EDX] = int128_gethi(oldv);
eflags &= ~CC_Z;
}
CC_SRC = eflags;
}
void helper_cmpxchg16b(CPUX86State *env, target_ulong a0)
{
uintptr_t ra = GETPC();
if ((a0 & 0xf) != 0) {
raise_exception_ra(env, EXCP0D_GPF, ra);
} else {
#ifndef CONFIG_ATOMIC128
cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
#else
int eflags = cpu_cc_compute_all(env, CC_OP);
Int128 cmpv = int128_make128(env->regs[R_EAX], env->regs[R_EDX]);
Int128 newv = int128_make128(env->regs[R_EBX], env->regs[R_ECX]);
int mem_idx = cpu_mmu_index(env, false);
TCGMemOpIdx oi = make_memop_idx(MO_TEQ | MO_ALIGN_16, mem_idx);
Int128 oldv = helper_atomic_cmpxchgo_le_mmu(env, a0, cmpv,
newv, oi, ra);
if (int128_eq(oldv, cmpv)) {
eflags |= CC_Z;
} else {
env->regs[R_EAX] = int128_getlo(oldv);
env->regs[R_EDX] = int128_gethi(oldv);
eflags &= ~CC_Z;
}
CC_SRC = eflags;
#endif
}
}
#endif
void helper_boundw(CPUX86State *env, target_ulong a0, int v)
{
int low, high;
low = cpu_ldsw_data_ra(env, a0, GETPC());
high = cpu_ldsw_data_ra(env, a0 + 2, GETPC());
v = (int16_t)v;
if (v < low || v > high) {
if (env->hflags & HF_MPX_EN_MASK) {
env->bndcs_regs.sts = 0;
}
raise_exception_ra(env, EXCP05_BOUND, GETPC());
}
}
void helper_boundl(CPUX86State *env, target_ulong a0, int v)
{
int low, high;
low = cpu_ldl_data_ra(env, a0, GETPC());
high = cpu_ldl_data_ra(env, a0 + 4, GETPC());
if (v < low || v > high) {
if (env->hflags & HF_MPX_EN_MASK) {
env->bndcs_regs.sts = 0;
}
raise_exception_ra(env, EXCP05_BOUND, GETPC());
}
}
#if !defined(CONFIG_USER_ONLY)
/* try to fill the TLB and return an exception if error. If retaddr is
* NULL, it means that the function was called in C code (i.e. not
* from generated code or from helper.c)
*/
/* XXX: fix it to restore all registers */
void tlb_fill(CPUState *cs, target_ulong addr, MMUAccessType access_type,
int mmu_idx, uintptr_t retaddr)
{
int ret;
ret = x86_cpu_handle_mmu_fault(cs, addr, access_type, mmu_idx);
if (ret) {
X86CPU *cpu = X86_CPU(cs->uc, cs);
CPUX86State *env = &cpu->env;
raise_exception_err_ra(env, cs->exception_index, env->error_code, retaddr);
}
}
#endif