tcg: Compress dead_temps and mem_temps into a single array

We only need two bits per temporary. Fold the two bytes into one,
and reduce the memory and cachelines required during compilation.

Backports commit c70fbf0a9938baf3b4f843355a77c17a7e945b98 from qemu
This commit is contained in:
Richard Henderson 2018-02-25 22:00:09 -05:00 committed by Lioncash
parent 690985a582
commit e973e89a57
No known key found for this signature in database
GPG Key ID: 4E3C3CC1031BA9C7

View File

@ -1359,39 +1359,42 @@ void tcg_op_remove(TCGContext *s, TCGOp *op)
#ifdef USE_LIVENESS_ANALYSIS
/* Per-temporary flag bits stored in the uint8_t temp_state array; the two
   bits may be OR-ed together in one entry. */
/* TS_DEAD: the temporary is dead at this point of the backward walk; when set
   on an op argument the analysis records DEAD_ARG for that argument. */
#define TS_DEAD 1
/* TS_MEM: the temporary should be in memory; when set on an output argument
   the analysis records SYNC_ARG for it. */
#define TS_MEM 2
/* liveness analysis: end of function: all temps are dead, and globals
should be in memory.
This hunk is shown without +/- markers: the lines using dead_temps and
mem_temps are the two-array form being replaced, and the lines using
temp_state are the single-array form that replaces them.
Each array holds one byte per temp (s->nb_temps entries); the first
s->nb_globals entries are the globals. */
static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps,
uint8_t *mem_temps)
static inline void tcg_la_func_end(TCGContext *s, uint8_t *temp_state)
{
/* two-array form: every temp dead; globals flagged in-memory, the rest not */
memset(dead_temps, 1, s->nb_temps);
memset(mem_temps, 1, s->nb_globals);
memset(mem_temps + s->nb_globals, 0, s->nb_temps - s->nb_globals);
/* single-array form: globals are dead and in memory, remaining temps are
   dead only */
memset(temp_state, TS_DEAD | TS_MEM, s->nb_globals);
memset(temp_state + s->nb_globals, TS_DEAD, s->nb_temps - s->nb_globals);
}
/* liveness analysis: end of basic block: all temps are dead, globals
and local temps should be in memory.
This hunk is shown without +/- markers: the lines using dead_temps and
mem_temps are the two-array form being replaced, and the lines using
temp_state are the single-array form that replaces them. */
static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
uint8_t *mem_temps)
static inline void tcg_la_bb_end(TCGContext *s, uint8_t *temp_state)
{
int i;
int i, n;
/* two-array form: mark every temp dead and every global in-memory, then
   copy each remaining temp's temp_local flag into mem_temps */
memset(dead_temps, 1, s->nb_temps);
memset(mem_temps, 1, s->nb_globals);
for(i = s->nb_globals; i < s->nb_temps; i++) {
mem_temps[i] = s->temps[i].temp_local;
/* single-array form: start from the end-of-function state (globals
   TS_DEAD | TS_MEM, other temps TS_DEAD) ... */
tcg_la_func_end(s, temp_state);
/* ... then additionally flag local temps as in-memory; the TS_DEAD bit
   set above is kept because TS_MEM is OR-ed in */
for (i = s->nb_globals, n = s->nb_temps; i < n; i++) {
if (s->temps[i].temp_local) {
temp_state[i] |= TS_MEM;
}
}
}
/*
 * Unicorn: liveness update used for brcond in place of the
 * end-of-basic-block update.
 *
 * Only the in-memory (TS_MEM) flag is refreshed; the dead/live bit of every
 * temp is left exactly as it was:
 *   - every global is flagged as needing to be in memory;
 *   - every other temp has TS_MEM set if it is a local temp and cleared
 *     otherwise.
 *
 * s:          TCG context; nb_globals, nb_temps and temps[].temp_local are read.
 * temp_state: one flag byte per temp (s->nb_temps entries), globals first;
 *             updated in place.
 */
static inline void tcg_la_br_end(TCGContext *s, uint8_t *temp_state)
{
    int i;

    for (i = 0; i < s->nb_globals; i++) {
        temp_state[i] |= TS_MEM;
    }
    for (i = s->nb_globals; i < s->nb_temps; i++) {
        /* Do not store temp_local directly into the combined flags: a truth
           value of 1 would read as TS_DEAD rather than TS_MEM, and the
           assignment would overwrite the temp's liveness bit. */
        if (s->temps[i].temp_local) {
            temp_state[i] |= TS_MEM;
        } else {
            temp_state[i] &= ~TS_MEM;
        }
    }
}
@ -1400,12 +1403,12 @@ static inline void tcg_la_br_end(TCGContext *s, uint8_t *mem_temps)
temporaries are removed. */
static void tcg_liveness_analysis(TCGContext *s)
{
uint8_t *dead_temps, *mem_temps;
uint8_t *temp_state;
int oi, oi_prev;
int nb_globals = s->nb_globals;
dead_temps = tcg_malloc(s, s->nb_temps);
mem_temps = tcg_malloc(s, s->nb_temps);
tcg_la_func_end(s, dead_temps, mem_temps);
temp_state = tcg_malloc(s, s->nb_temps);
tcg_la_func_end(s, temp_state);
for (oi = s->gen_op_buf[0].prev; oi != 0; oi = oi_prev) {
int i, nb_iargs, nb_oargs;
@ -1434,7 +1437,7 @@ static void tcg_liveness_analysis(TCGContext *s)
if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) {
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
if (!dead_temps[arg] || mem_temps[arg]) {
if (temp_state[arg] != TS_DEAD) {
goto do_not_remove_call;
}
}
@ -1445,34 +1448,40 @@ static void tcg_liveness_analysis(TCGContext *s)
/* output args are dead */
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
if (dead_temps[arg]) {
if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
if (mem_temps[arg]) {
if (temp_state[arg] & TS_MEM) {
arg_life |= SYNC_ARG << i;
}
dead_temps[arg] = 1;
mem_temps[arg] = 0;
temp_state[arg] = TS_DEAD;
}
if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
/* globals should be synced to memory */
memset(mem_temps, 1, s->nb_globals);
}
if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS |
TCG_CALL_NO_READ_GLOBALS))) {
/* globals should go back to memory */
memset(dead_temps, 1, s->nb_globals);
memset(temp_state, TS_DEAD | TS_MEM, nb_globals);
} else if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
/* globals should be synced to memory */
for (i = 0; i < nb_globals; i++) {
temp_state[i] |= TS_MEM;
}
}
/* input args are live */
/* record arguments that die in this helper */
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
arg = args[i];
if (arg != TCG_CALL_DUMMY_ARG) {
if (dead_temps[arg]) {
if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
dead_temps[arg] = 0;
}
}
/* input arguments are live for preceding opcodes */
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
arg = args[i];
if (arg != TCG_CALL_DUMMY_ARG) {
temp_state[arg] &= ~TS_DEAD;
}
}
}
@ -1482,8 +1491,7 @@ static void tcg_liveness_analysis(TCGContext *s)
break;
case INDEX_op_discard:
/* mark the temporary as dead */
dead_temps[args[0]] = 1;
mem_temps[args[0]] = 0;
temp_state[args[0]] = TS_DEAD;
break;
case INDEX_op_add2_i32:
@ -1504,8 +1512,8 @@ static void tcg_liveness_analysis(TCGContext *s)
the low part. The result can be optimized to a simple
add or sub. This happens often for x86_64 guest when the
cpu mode is set to 32 bit. */
if (dead_temps[args[1]] && !mem_temps[args[1]]) {
if (dead_temps[args[0]] && !mem_temps[args[0]]) {
if (temp_state[args[1]] == TS_DEAD) {
if (temp_state[args[0]] == TS_DEAD) {
goto do_remove;
}
/* Replace the opcode and adjust the args in place,
@ -1542,8 +1550,8 @@ static void tcg_liveness_analysis(TCGContext *s)
do_mul2:
nb_iargs = 2;
nb_oargs = 2;
if (dead_temps[args[1]] && !mem_temps[args[1]]) {
if (dead_temps[args[0]] && !mem_temps[args[0]]) {
if (temp_state[args[1]] == TS_DEAD) {
if (temp_state[args[0]] == TS_DEAD) {
/* Both parts of the operation are dead. */
goto do_remove;
}
@ -1551,8 +1559,7 @@ static void tcg_liveness_analysis(TCGContext *s)
op->opc = opc = opc_new;
args[1] = args[2];
args[2] = args[3];
} else if (have_opc_new2 && dead_temps[args[0]]
&& !mem_temps[args[0]]) {
} else if (temp_state[args[0]] == TS_DEAD && have_opc_new2) {
/* The low part of the operation is dead; generate the high. */
op->opc = opc = opc_new2;
args[0] = args[1];
@ -1575,8 +1582,7 @@ static void tcg_liveness_analysis(TCGContext *s)
implies side effects */
if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && nb_oargs != 0) {
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
if (!dead_temps[arg] || mem_temps[arg]) {
if (temp_state[args[i]] != TS_DEAD) {
goto do_not_remove;
}
}
@ -1587,14 +1593,13 @@ static void tcg_liveness_analysis(TCGContext *s)
/* output args are dead */
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
if (dead_temps[arg]) {
if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
if (mem_temps[arg]) {
if (temp_state[arg] & TS_MEM) {
arg_life |= SYNC_ARG << i;
}
dead_temps[arg] = 1;
mem_temps[arg] = 0;
temp_state[arg] = TS_DEAD;
}
/* if end of basic block, update */
@ -1603,27 +1608,33 @@ static void tcg_liveness_analysis(TCGContext *s)
// this causes problem because check_exit_request() inserts
// brcond instruction in the middle of the TB,
// which incorrectly flags end-of-block
if (opc != INDEX_op_brcond_i32)
tcg_la_bb_end(s, dead_temps, mem_temps);
if (opc != INDEX_op_brcond_i32) {
tcg_la_bb_end(s, temp_state);
} else {
// Unicorn: we do not touch dead temps for brcond,
// but we should refresh TCG globals In-Memory states,
// otherwise, important CPU states(especially conditional flags) might be forgotten,
// result in wrongly generated host code that run into wrong branch.
// Refer to https://github.com/unicorn-engine/unicorn/issues/287 for further information
else
tcg_la_br_end(s, mem_temps);
tcg_la_br_end(s, temp_state);
}
} else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
/* globals should be synced to memory */
memset(mem_temps, 1, s->nb_globals);
for (i = 0; i < nb_globals; i++) {
temp_state[i] |= TS_MEM;
}
}
/* input args are live */
/* record arguments that die in this opcode */
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
arg = args[i];
if (dead_temps[arg]) {
if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
dead_temps[arg] = 0;
}
/* input arguments are live for preceding opcodes */
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
temp_state[args[i]] &= ~TS_DEAD;
}
}
break;