一
前言
1.soft tlb / Softmmu/内存模拟
2.虚拟中断控制器/中断模拟
3.总线/设备模拟
4.TCG的CPU模拟
1.等价性
2.安全性
3.性能
1.可以理解像libhoudini.so这样的转码技术是如何实现的。
2.对理解应用层的虚拟机如java虚拟机中的jit技术很有帮助。
3.可以帮助理解cpu包括整个计算机体系结构是如何工作的。
4.可以帮助理解和定制二进制分析框架如unicorn/qiling,因为它们都是基于TCG。
5.某些vmp是基于unicorn来实现的,理解TCG可以基于此实现自己的vmp/加深对vmp的理解。
二
QEMU TCG
1.遇到分支指令
2.遇到系统调用
3.达到页边界/最大长度限制
/* Reset entry point: set up the stack, call the C entry, then spin. */
.global _Reset
_Reset:
/* Load the initial stack pointer from the stack_top symbol. */
LDR sp, =stack_top
/* Branch-with-link to the C entry function. */
BL c_entry
/* Branch to the current address: loop forever if c_entry returns. */
B .
volatile unsigned int * const UART0DR = (unsigned int *)0x101f1000;
/*
 * Write a NUL-terminated string to the UART, one character per store.
 *
 * Each character is widened to unsigned int and stored through UART0DR;
 * the terminating '\0' itself is not transmitted.
 */
void print_uart0(const char *s) {
    const char *cursor;

    for (cursor = s; *cursor != '\0'; cursor++) {
        *UART0DR = (unsigned int)(*cursor); /* Transmit char */
    }
}
/*
 * C entry point, reached from the _Reset startup stub via "BL c_entry".
 *
 * Prints a greeting through print_uart0() and returns to the caller.
 * Declared with an explicit (void) parameter list so the definition is a
 * real prototype rather than an old-style "unspecified arguments" form.
 */
void c_entry(void) {
    print_uart0("Hello world!\n");
}
/* Program entry symbol. */
ENTRY(_Reset)
SECTIONS
{
/* Start laying out sections at address 0x10000. */
. = 0x10000;
/* Place the .text of startup.o first, at the current location. */
.startup . : { startup.o(.text) }
.text : { *(.text) }
.data : { *(.data) }
.bss : { *(.bss COMMON) }
/* Round the location counter up to an 8-byte boundary before the stack. */
. = ALIGN(8);
. = . + 0x1000; /* 4kB of stack memory */
/* stack_top is the address just past the reserved stack region. */
stack_top = .;
}
# Compile the C source for the ARM926EJ-S core, with debug info.
arm-none-eabi-gcc -c -mcpu=arm926ej-s -g test.c -o test.o
# Assemble the startup stub.
arm-none-eabi-as -mcpu=arm926ej-s -g startup.s -o startup.o
# Link both objects using the test.ld linker script.
arm-none-eabi-ld -T test.ld test.o startup.o -o test.elf
# Convert the ELF image into a raw binary.
arm-none-eabi-objcopy -O binary test.elf test.bin
# Boot the raw binary on QEMU's versatilepb machine (128 MB RAM, no graphical display).
qemu-system-arm -M versatilepb -m 128 -kernel test.bin -nographic
/* A very small bootloader: call the board-setup code (if needed),
 * set r0-r2, then jump to the kernel.
 * If we're not calling boot setup code then we don't copy across
 * the first BOOTLOADER_NO_BOARD_SETUP_OFFSET insns in this array.
 */
/*
 * Entries holding a single value are ARM instruction words (their
 * disassembly is given in the trailing comment). Entries of the form
 * { 0, FIXUP_* } are zero-valued words tagged with a fixup kind;
 * presumably the loader patches them with the real value before use --
 * the fixup code is not shown here, TODO confirm.
 */
static const ARMInsnFixup bootloader[] = {
{ 0xe28fe004 }, /* add lr, pc, #4 */
{ 0xe51ff004 }, /* ldr pc, [pc, #-4] */
{ 0, FIXUP_BOARD_SETUP },
/* Number of leading entries (the board-setup call above) to skip when no board setup code is used. */
#define BOOTLOADER_NO_BOARD_SETUP_OFFSET 3
{ 0xe3a00000 }, /* mov r0, #0 */
{ 0xe59f1004 }, /* ldr r1, [pc, #4] */
{ 0xe59f2004 }, /* ldr r2, [pc, #4] */
{ 0xe59ff004 }, /* ldr pc, [pc, #4] */
{ 0, FIXUP_BOARDID },
{ 0, FIXUP_ARGPTR_LO },
{ 0, FIXUP_ENTRYPOINT_LO },
{ 0, FIXUP_TERMINATOR } /* end-of-table marker, judging by its name */
};
sudo apt install libcapstone-dev
qemu-system-arm -M versatilepb -m 128 -kernel test.bin -nographic -d in_asm -D in_asm.txt
qemu-system-arm -M versatilepb -m 128 -kernel test.bin -nographic -d op -D op.txt
qemu-system-arm -M versatilepb -m 128 -kernel test.bin -nographic -d out_asm -D out_asm.txt
1.普通算术逻辑运算指令如何更新Host体系结构相关寄存器
2.内存读写如何处理
3.分支指令(条件跳转、非条件跳转、返回指令)
4.目标机器没有的指令、特权指令、敏感指令
5.非普通内存读写如设备寄存器访问MMIO
6.指令执行出现了同步异常如何处理(如系统调用)
7.硬件中断如何处理
tcg_temp_new_internal:分配TEMP_EBB、TEMP_TB类型的TCGTemp变量
tcg_global_alloc:分配TEMP_GLOBAL类型的TCGTemp变量
tcg_global_reg_new_internal:分配TEMP_FIXED类型的TCGTemp变量
tcg_constant_internal:分配TEMP_CONST类型的TCGTemp变量
typedef enum TCGTempKind {
/*
* Temp is dead at the end of the extended basic block (EBB),
* the single-entry multiple-exit region that falls through
* conditional branches.
*/
TEMP_EBB,
/* Temp is live across the entire translation block, but dead at end. */
TEMP_TB,
/* Temp is live across the entire translation block, and between them. */
TEMP_GLOBAL,
/* Temp is in a fixed register. */
TEMP_FIXED,
/* Temp is a fixed constant. */
TEMP_CONST,
} TCGTempKind;
tcg_init_machine
qemu_memfd_create()
函数创建出一个匿名文件,该匿名文件的大小是根据当前Host机器的物理内存计算出来的,比如我的电脑是64G,最终计算出来的匿名文件大小为1G。(PROT_READ | PROT_WRITE)
,称之为buf_rw。(PROT_READ | PROT_EXEC)
,称之为buf_rx,buf_rw和buf_rx之间的差值由全局变量tcg_splitwx_diff
表示。tcg_init_machine
函数还会调用tcg_target_qemu_prologue
函数创建出对应于Host的prologue和epilogue,并且分别由全局变量tcg_qemu_tb_exec
和tcg_code_gen_epilogue
指向(如上图)。
//保存callee需要保存的寄存器
0x7fffac000000: 55 pushq %rbp
0x7fffac000001: 53 pushq %rbx
0x7fffac000002: 41 54 pushq %r12
0x7fffac000004: 41 55 pushq %r13
0x7fffac000006: 41 56 pushq %r14
0x7fffac000008: 41 57 pushq %r15
//第一个参数赋值给%rbp
0x7fffac00000a: 48 8b ef movq %rdi, %rbp
//预留栈空间
0x7fffac00000d: 48 81 c4 78 fb ff ff addq $-0x488, %rsp
//跳转到第二个参数地址处执行,第二个参数即为TranslationBlock.tc.ptr
0x7fffac000014: ff e6 jmpq *%rsi
//恢复栈空间及callee需要保存的寄存器
0x7fffac000016: 33 c0 xorl %eax, %eax
0x7fffac000018: 48 81 c4 88 04 00 00 addq $0x488, %rsp
0x7fffac00001f: c5 f8 77 vzeroupper
0x7fffac000022: 41 5f popq %r15
0x7fffac000024: 41 5e popq %r14
0x7fffac000026: 41 5d popq %r13
0x7fffac000028: 41 5c popq %r12
0x7fffac00002a: 5b popq %rbx
0x7fffac00002b: 5d popq %rbp
0x7fffac00002c: c3 retq
TranslationBlock
结构也是在1G的空间内分配,第一个TB紧接着epilogue,并且分配了TB以后TCGContext的code_gen_ptr将会指向TB的末端,该TB对应的Host机器码地址存放在TranslationBlock.tc.ptr
中,属于buf_rx空间。TCGContext.code_ptr
的后面再分配,TCGContext的code_buf和code_ptr则再指向第二个TB对应的Host机器码的开头和末端,此时TCGContext的code_gen_ptr则再更新为第二个TB末端的位置。tcg_qemu_tb_exec()
函数即可,该函数接受两个参数,第一个参数为CPUArchState
,第二个参数为TranslationBlock.tc.ptr
,因此TB执行逻辑为:1.prologue
2.TranslationBlock.tc.ptr
3.epilogue
mttcg_cpu_thread_fn
,执行流程为:mttcg_cpu_thread_fn:
do{
if (cpu_can_run(cpu)) {
...
tcg_cpus_exec(cpu)
cpu_exec_start(cpu)
cpu_exec(cpu)
cpu_exec_enter(cpu)
cpu_exec_setjmp(cpu, &sc)
sigsetjmp(cpu->jmp_env, 0) //设置同步异常退出点
cpu_exec_loop(cpu, sc)
cpu_exec_exit(cpu)
cpu_exec_end(cpu)
...
}
} while (!cpu->unplug || cpu_can_run(cpu));
cpu_exec_loop
中,它的执行过程为:cpu_exec_loop:
while (!cpu_handle_exception(cpu, &ret)) { //处理同步异常
while (!cpu_handle_interrupt(cpu, &last_tb)) { //处理异步中断
cpu_get_tb_cpu_state()
tb = tb_lookup() //查找tb缓存
if (tb == NULL) {
tb = tb_gen_code() //进行dbt转换
setjmp_gen_code()
gen_intermediate_code() //将Guest代码转换为IR
tcg_gen_code() //根据IR生成Host代码
}
tb_add_jump() //Direct block chaining优化
cpu_loop_exec_tb() //执行Host目标代码
}
}
tcg_optimize, reachable_code_pass,liveness_pass_0,liveness_pass_1,liveness_pass_2
等。in_asm.txt
文件中显示出0地址处的arm指令为:0x00000000: e3a00000 mov r0, #0
---- 00000000 00000000 00000000
mov_i32 loc5,$0x0 //0x0赋值给loc5变量
mov_i32 r0,loc5 //loc5再赋值给r0
0x00000004: e59f1004 ldr r1, [pc, #4]
---- 00000004 00000000 00000e04
add_i32 loc6,pc,$0x10 //loc6 = pc + 0x10
mov_i32 loc9,loc6 //loc9 = loc6
qemu_ld_i32 loc8,loc9,leul,2 //loc9处的内存加载至loc8变量,leul的含义为Little Endian unsigned long
mov_i32 r1,loc8 //loc8赋值给r1寄存器
TCG_AREG0
,它表示用哪个Host寄存器来指向Guest体系结构的CPUArchState
,对于x86_64来说TCG_AREG0为%rbp(对于arm来说TCG_AREG0为r6寄存器),也就是说通过rbp寄存器可以找到arm的CPUArchState
。qemu中有专门的TEMP_FIXED类型的TCGTemp用于表示TCG_AREG0
:ts = tcg_global_reg_new_internal(s, TCG_TYPE_PTR, TCG_AREG0, "env");
cpu_env = temp_tcgv_ptr(ts);
uint32_t regs[16];
0x00000000: e3a00000 mov r0, #0
movl $0, 0(%rbp) //rbp指向CPUArchState,更新arm CPUArchState的regs[0]即r0寄存器
ldr r2, [pc, #4]
movl %r12d, 8(%rbp) //rbp指向CPUArchState,更新arm CPUArchState的regs[2]即r2寄存器
TCG_AREG0
寄存器,x86_64指令在执行的时候可以找到CPUArchState结构从而更新所有Guest体系结构的CPU状态。include/exec/cpu-defs.h
文件中定义,其中结构体CPUTLB由ArchCPU中的CPUNegativeOffsetState neg
所引用。addr_read, addr_write, addr_code
分别对应着读写执行指令的地址,地址的构成部分注释中有描述:/* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address
bit TARGET_PAGE_BITS-1..4 : Nonzero for accesses that should not
go directly to ram.
bit 3 : indicates that the entry is invalid
bit 2..0 : zero
*/
0x00000004: e59f1004 ldr r1, [pc, #4]
---- 00000004 00000000 00000e04
add_i32 loc6,pc,$0x10
mov_i32 loc9,loc6
qemu_ld_i32 loc8,loc9,leul,2
mov_i32 r1,loc8
-- guest addr 0x00000004
0x7ff9c0000119: 41 8b fc movl %r12d, %edi
0x7ff9c000011c: c1 ef 05 shrl $5, %edi
0x7ff9c000011f: 23 bd 10 ff ff ff andl -0xf0(%rbp), %edi
0x7ff9c0000125: 48 03 bd 18 ff ff ff addq -0xe8(%rbp), %rdi
0x7ff9c000012c: 41 8d 74 24 03 leal 3(%r12), %esi
0x7ff9c0000131: 81 e6 00 fc ff ff andl $0xfffffc00, %esi
0x7ff9c0000137: 3b 37 cmpl 0(%rdi), %esi
0x7ff9c0000139: 41 8b f4 movl %r12d, %esi
0x7ff9c000013c: 0f 85 9c 00 00 00 jne 0x7ff9c00001de
0x7ff9c0000142: 48 03 77 10 addq 0x10(%rdi), %rsi
0x7ff9c0000146: 44 8b 26 movl 0(%rsi), %r12d
tcg/i386/tcg-target.c.inc
文件的tcg_out_qemu_ld
函数。
-- guest addr 0x00000004
//r12寄存器包含着要读取的地址的低位部分addrlo(这里要读取的地址为0x10),赋值给edi,edi为x86平台函数调用的第一个参数寄存器
0x7ff9c0000119: 41 8b fc movl %r12d, %edi
//地址右移 (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS) = 5 位
0x7ff9c000011c: c1 ef 05 shrl $5, %edi
//-0xf0为偏移量,rbp为CPUArchState,-0xf0分为两步计算:首先获取neg.tlb.f[IDX]在CPUArchState中的偏移,再获取CPUTLBDescFast结构中mask成员的偏移,因此-0xf0就为CPUTLBDescFast结构中mask成员的偏移,因此这条指令等于是执行了一个函数叫tlb_index(CPUArchState *env, uintptr_t mmu_idx,target_ulong addr)
0x7ff9c000011f: 23 bd 10 ff ff ff andl -0xf0(%rbp), %edi
//-0xe8为CPUTLBDescFast结构中的table成员的偏移,因此这条指令等于是执行了一个函数叫tlb_entry(CPUArchState *env, uintptr_t mmu_idx,target_ulong addr)
0x7ff9c0000125: 48 03 bd 18 ff ff ff addq -0xe8(%rbp), %rdi
//addrlo + (s_mask - a_mask)赋值给%esi,esi为x86平台函数调用的第二个参数寄存器
0x7ff9c000012c: 41 8d 74 24 03 leal 3(%r12), %esi
//地址 & (TARGET_PAGE_MASK | a_mask),这样提取出地址中除了页偏移以外的其他部分
0x7ff9c0000131: 81 e6 00 fc ff ff andl $0xfffffc00, %esi
//0(%rdi)的值为对应CPUTLBEntry的addr_read成员变量的值,和要取的地址进行比较
0x7ff9c0000137: 3b 37 cmpl 0(%rdi), %esi
//原始地址赋值给%esi
0x7ff9c0000139: 41 8b f4 movl %r12d, %esi
//如果CPUTLBEntry的addr_read成员变量的值和要取的地址不相等则表示tlb不命中,跳转至tlb慢路径地址0x7ff9c00001de处执行
0x7ff9c000013c: 0f 85 9c 00 00 00 jne 0x7ff9c00001de
//如果没有进入tlb慢路径表示tlb命中,0x10(%rdi)的值为CPUTLBEntry的addend成员变量的值,加上原始地址即为HVA
0x7ff9c0000142: 48 03 77 10 addq 0x10(%rdi), %rsi
//读取HVA地址处的值并赋值给%r12d
0x7ff9c0000146: 44 8b 26 movl 0(%rsi), %r12d
/*
 * Generate TB finalization at the end of block.
 *
 * Only compiled when the backend defines TCG_TARGET_NEED_LDST_LABELS.
 * A negative result from tcg_out_ldst_finalize() is returned to the caller
 * unchanged.
 */
#ifdef TCG_TARGET_NEED_LDST_LABELS
i = tcg_out_ldst_finalize(s);
if (i < 0) {
    return i;
}
#endif
-- tb slow paths + alignment
//准备好第一个参数tcg_target_call_iarg_regs[0],它的值为CPUArchState env
0x7ff9c00001de: 48 8b fd movq %rbp, %rdi
//准备好第三个参数tcg_target_call_iarg_regs[2],它的值为TCGMemOpIdx oi = 0x22
0x7ff9c00001e1: ba 22 00 00 00 movl $0x22, %edx
//准备好第四个参数tcg_target_call_iarg_regs[3],它的值为retaddr
0x7ff9c00001e6: 48 8d 0d 5c ff ff ff leaq -0xa4(%rip), %rcx
//调用函数helper_le_ldul_mmu
//helper_le_ldul_mmu(CPUArchState *env, target_ulong addr,TCGMemOpIdx oi, uintptr_t retaddr)
0x7ff9c00001ed: ff 15 4d 00 00 00 callq *0x4d(%rip)
//获取返回值
0x7ff9c00001f3: 44 8b e0 movl %eax, %r12d
//跳转回之前不命中的地方继续执行
0x7ff9c00001f6: e9 4e ff ff ff jmp 0x7ff9c0000149
helper_le_ldul_mmu
还会再检测一次tlb是否命中,如果不命中将会调用体系结构相关函数做下一步的处理。/* If the TLB entry is for a different page, reload and try again. */
if (!tlb_hit(tlb_addr, addr)) {
if (!victim_tlb_hit(env, mmu_idx, index, tlb_off,
addr & TARGET_PAGE_MASK)) {
tlb_fill(env_cpu(env), addr, size,
access_type, mmu_idx, retaddr);
index = tlb_index(env, mmu_idx, addr);
entry = tlb_entry(env, mmu_idx, addr);
}
tlb_addr = code_read ? entry->addr_code : entry->addr_read;
tlb_addr &= ~TLB_INVALID_MASK;
}
0x0000000c: e59ff004 ldr pc, [pc, #4]
1.直接执行下一个TB
2.回到qemu上下文继续编译执行
---- 0000000c 00000000 00000000
mov_i32 tmp3,$0x18 //0x18处为pc应该更新到的值即pc + 4
mov_i32 tmp7,tmp3
qemu_ld_i32 tmp6,tmp7,leul,10 //将(pc + 4)内存地址处的值取出存放于tmp6
and_i32 pc,tmp6,$0xfffffffe //这里的逻辑对应于target/arm/tcg/translate.c文件的gen_bx函数,注意SPC值发生了改变
and_i32 tmp6,tmp6,$0x1 //同样位于gen_bx函数
st_i32 tmp6,env,$0x220 //赋值给env中的thumb成员
//下面这条IR产生的原因是上面的gen_bx函数中的语句: s->base.is_jmp = DISAS_JUMP,从而退出translator_loop中的while循环,调用ops->tb_stop(db, cpu)从而调用gen_goto_ptr()产生此条IR
call lookup_tb_ptr,$0x6,$1,tmp12,env
goto_ptr tmp12
call lookup_tb_ptr,$0x6,$1,tmp12,env
goto_ptr tmp12
call lookup_tb_ptr
后面的参数是什么含义?具体可以参考tcg/tcg.c
文件的tcg_dump_ops
函数:1.lookup_tb_ptr为TCGOp对象所对应的TCGHelperInfo对象的name字段。
2.$0x6为TCGOp对象所对应的TCGHelperInfo对象的flags字段。
3.$1为TCGOp对象的param2成员,即nb_oargs, 表示输出参数的个数为1。
4.tmp12为op->args[]中输出参数的字符串表示。
5.env为op->args[]中输入参数的字符串表示。
0x7f2d53e7e1aa: 48 8b fd movq %rbp, %rdi //rbp为CPUArchState env赋值给第一个参数寄存器%rdi
//调用%rip + 0x65处存放的函数指针所指向的函数(即helper_lookup_tb_ptr函数)
0x7f2d53e7e1ad: ff 15 65 00 00 00 callq *0x65(%rip)
0x7f2d53e7e1b3: ff e0 jmpq *%rax //跳转至函数返回值处执行
accel/tcg/cpu-exec.c
:const void *HELPER(lookup_tb_ptr)(CPUArchState *env) //它的名字经扩展后为helper_lookup_tb_ptr
tcg_code_gen_epilogue
指向了tcg的epilogue处,因此如果跳转至tcg_code_gen_epilogue
执行最终结果是tcg_qemu_tb_exec(env, tb_ptr)
函数返回,从而回到了qemu tcg上下文处进行下一个TB的转换执行。lookup_tb_ptr
,它要么成功找到下一个TB的地址并跳转过去执行要么返回qemu tcg上下文执行。0x00010004: eb000017 bl #0x10068
libqemu-arm-softmmu.fa.p/decode-a32.c.inc
文件的disas_a32_extract_branch
函数,a->imm为pc相对跳转的偏移值。target/arm/tcg/translate.c
文件的trans_BL函数。add_i32 r14,pc,$0x8
add_i32 pc,pc,$0x68
goto_tb $0x0//0x7f666c000280即val的值为当前的TranslationBlock在buf_rx处的指针:
//uintptr_t val = (uintptr_t)tcg_splitwx_to_rx((void *)tb) + idx;
exit_tb $0x7f666c000280
/*
 * Adjust a branch displacement by a mode-dependent constant.
 *
 * Returns diff + 4 when s->thumb is set and diff + 8 otherwise
 * (presumably the architectural PC read-ahead for Thumb vs. ARM state).
 */
static target_long jmp_diff(DisasContext *s, target_long diff)
{
    target_long bias = 8;

    if (s->thumb) {
        bias = 4;
    }
    return diff + bias;
}
//生成的代码只是用于跳转到下一条指令
0x7fff70000397: e9 00 00 00 00 jmp 0x7fff7000039c
tcg/i386/tcg-target.c.inc
文件中的tcg_out_goto_tb
函数生成:
static void tcg_out_goto_tb(TCGContext *s, int which)
{
/*
* Jump displacement must be aligned for atomic patching;
* see if we need to add extra nops before jump
*/
int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
if (gap != 1) {
tcg_out_nopn(s, gap - 1);
}
tcg_out8(s, OPC_JMP_long); /* jmp im */
set_jmp_insn_offset(s, which);
tcg_out32(s, 0);
set_jmp_reset_offset(s, which);
}
set_jmp_insn_offset(s, which); //设置当前TB的jmp_insn_offset[0]为tcg_current_code_size(s)
set_jmp_reset_offset(s, which);//设置当前TB的jmp_reset_offset[0]为tcg_current_code_size(s)
//-0x123(%rip)的值就是0x7f666c000280,赋值给%rax
0x7fff7000039c: 48 8d 05 dd fe ff ff leaq -0x123(%rip), %rax
//0x7fff70000018的值就是tb_ret_addr,即TB epilogue
0x7fff700003a3: e9 70 fc ff ff jmp 0x7fff70000018
1.设置当前TB的jmp_insn_offset[0]和jmp_reset_offset[0]。
2.将当前TB在buf_rx处的指针(0x7f666c000280)赋值给%rax。
3.跳转至TB epilogue处即从tcg_qemu_tb_exec(env, tb_ptr)函数处返回。
if (last_tb) {
tb_add_jump(last_tb, tb_exit, tb);
}
/*
 * Point outgoing jump slot n of tb at addr.
 *
 * Records addr in tb->jmp_target_addr[n], then hands the location of the
 * jump instruction (in both the executable and the writable view) to the
 * target-specific tb_target_set_jmp_target().
 */
void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
{
    /*
     * The rx view of the structure is what the backend needs: from it the
     * executable code address is found, so a pc-relative displacement to
     * jmp_target_addr[n] can be produced.
     */
    const TranslationBlock *rx_tb = tcg_splitwx_to_rx(tb);

    /* Location of the jump instruction in the executable view ... */
    uintptr_t insn_rx = (uintptr_t)tb->tc.ptr + tb->jmp_insn_offset[n];
    /* ... and the same location in the writable view. */
    uintptr_t insn_rw = insn_rx - tcg_splitwx_diff;

    tb->jmp_target_addr[n] = addr;
    tb_target_set_jmp_target(rx_tb, n, insn_rx, insn_rw);
}
Last TB CODE
中的"e9 00 00 00 00"
进行patch让它指向Next TB CODE
,这样下次再执行Last TB CODE
时"e9 xx xx xx xx"
处将会直接跳转到Next TB CODE
处执行,无需再退出至qemu上下文。这个就叫Direct block chaining优化
。jmp_insn_offset[0]
指向的是需要patch的指令在code区域的偏移,而jmp_reset_offset[0]
则指向了需要patch指令的下一条指令,当需要断开当前TB与下一条TB的Direct block chaining
链接时,再执行patch,目标是jmp_reset_offset[0]
即可恢复当前TB的跳转。Direct block chaining
还需要解决的一个问题是自修改代码,即当代码会对代码区域作修改时,这个代码区域之前旧的翻译指令不再有效,它和其他TB之间的链接也可能不再有效。do_tb_phys_invalidate
函数从而重置TB的一些状态,其中就包括它所链接到其他TB的状态。target/xxx/helper.h
头文件中声明。//所有helpers的数组
/* Table of every helper; its entries are supplied by the header included below. */
static const TCGHelperInfo all_helpers[] = {
#include "exec/helper-tcg.h" // this header in turn includes "helper.h"
};
/*
 * Unsigned 32-bit division helper.
 *
 * Returns num / den for a non-zero divisor. For a zero divisor it calls
 * handle_possible_div0_trap() with the caller's PC (which may raise a
 * divide-by-zero exception) and, if that call returns, yields 0.
 */
uint32_t HELPER(udiv)(CPUARMState *env, uint32_t num, uint32_t den)
{
    if (den != 0) {
        return num / den;
    }

    /* Zero divisor: give the trap handler a chance to raise the exception. */
    handle_possible_div0_trap(env, GETPC());
    return 0;
}
/*
 * Helper for the guest CPUID instruction.
 *
 * After the SVM intercept check, queries cpu_x86_cpuid() with the low
 * 32 bits of the guest's EAX and ECX, then writes the four result words
 * back to the guest's EAX, EBX, ECX and EDX registers.
 */
void helper_cpuid(CPUX86State *env)
{
    /* Result words in EAX, EBX, ECX, EDX order. */
    uint32_t out[4];

    cpu_svm_check_intercept_param(env, SVM_EXIT_CPUID, 0, GETPC());

    cpu_x86_cpuid(env,
                  (uint32_t)env->regs[R_EAX],
                  (uint32_t)env->regs[R_ECX],
                  &out[0], &out[1], &out[2], &out[3]);

    env->regs[R_EAX] = out[0];
    env->regs[R_EBX] = out[1];
    env->regs[R_ECX] = out[2];
    env->regs[R_EDX] = out[3];
}
helper_le_stl_mmu
,然后进入到qemu的内存模块做下一步的处理。0x00010004: ef000000 svc #0
case DISAS_SWI:
gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb));
add_i32 pc,pc,$0x8
call exception_with_syndrome,$0x8,$0,env,$0x2,$0x46000000
//函数名称为helper_exception_with_syndrome
//excp的值为: #define EXCP_SWI 2
//syndrome的值为0x46000000,由syn_aa32_svc()函数计算得出,可以认为是常量
//target_el值为1表示执行系统调用会切换exception level至1
/*
 * TCG helper that raises an exception with a syndrome value.
 * Thin wrapper: forwards all of its arguments to raise_exception().
 */
void HELPER(exception_with_syndrome)(CPUARMState *env, uint32_t excp,
                                     uint32_t syndrome, uint32_t target_el)
{
    raise_exception(env, excp, syndrome, target_el);
}

/*
 * Record a pending exception on the CPU and leave the execution loop.
 *
 * Stores excp in the CPU state's exception_index and the (possibly
 * adjusted) syndrome and target_el in env->exception, then calls
 * cpu_loop_exit().
 */
void raise_exception(CPUARMState *env, uint32_t excp,
                     uint32_t syndrome, uint32_t target_el)
{
    CPUState *cpu = env_cpu(env);

    if (target_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) {
        /*
         * Redirect NS EL1 exceptions to NS EL2. These are reported with
         * their original syndrome register value, with the exception of
         * SIMD/FP access traps, which are reported as uncategorized
         * (see DDI0478C.a D1.10.4)
         */
        if (syn_get_ec(syndrome) == EC_ADVSIMDFPACCESSTRAP) {
            syndrome = syn_uncategorized();
        }
        target_el = 2;
    }

    assert(!excp_is_internal(excp));

    /* Publish the exception number and its details on the CPU state. */
    cpu->exception_index = excp;
    env->exception.syndrome = syndrome;
    env->exception.target_el = target_el;

    /* Request an exit from the execution loop. */
    cpu_loop_exit(cpu);
}
/*
 * Leave the currently executing code and return to the point saved in
 * cpu->jmp_env. Resets can_do_io and disables plugin memory helpers
 * first; control leaves this function through siglongjmp().
 */
void cpu_loop_exit(CPUState *cpu)
{
/* Undo the setting in cpu_tb_exec. */
cpu->can_do_io = 1;
/* Undo any setting in generated code. */
qemu_plugin_disable_mem_helpers(cpu);
siglongjmp(cpu->jmp_env, 1); // long-jump out to the execution loop
}
cpu_exec_loop(CPUState *cpu, SyncClocks *sc)
{
int ret;
/* if an exception is pending, we execute it here */
while (!cpu_handle_exception(cpu, &ret)) { //执行异常处理
TranslationBlock *last_tb = NULL;
int tb_exit = 0;
while (!cpu_handle_interrupt(cpu, &last_tb)) {
cc->tcg_ops->do_interrupt(cpu)
进入最终的异常处理,从而调用到target/arm/helper.c
文件中的函数:void arm_cpu_do_interrupt(CPUState *cs) //逻辑是addr=8; addr += A32_BANKED_CURRENT_REG_GET(env, vbar);
(Vector Base Address Register)
计算出异常处理函数的地址newpc, 并且通过take_aarch32_exception
函数将pc置为异常处理函数地址并跳转过去执行:env->regs[15] = newpc; //r15就是pc寄存器
/*
 * Kick a vCPU thread: simply delegates to cpu_exit().
 */
void mttcg_kick_vcpu_thread(CPUState *cpu)
{
    cpu_exit(cpu);
}

/*
 * Ask the given CPU to stop running generated code.
 *
 * Sets cpu->exit_request, issues a write barrier, and then sets the high
 * half of the icount decrementer to -1. The order of these three steps is
 * significant and must be kept.
 */
void cpu_exit(CPUState *cpu)
{
    /* Publish the exit request first. */
    qatomic_set(&cpu->exit_request, 1);

    /* Ensure cpu_exec will see the exit request after TCG has exited. */
    smp_wmb();

    /*
     * Set to -1 to force TCG to stop executing linked TBs for this CPU
     * and return to its top level loop (even in non-icount mode).
     */
    qatomic_set(&cpu->icount_decr_ptr->u16.high, -1);
}
cpu->icount_decr_ptr->u16.high
置为-1时就是告诉tcg线程中正在执行的tb尽快退出,回到qemu上下文进行外部中断的处理。icount_decr_ptr还涉及到qemu的一个特性叫TCG Instruction Counting
:OP:
//开头
ld_i32 loc3,env,$0xfffffffffffffff0 //对应于cpu->icount_decr_ptr->u16.high
brcond_i32 loc3,$0x0,lt,$L0 //如果cpu->icount_decr_ptr->u16.high < 0则跳转至结尾处的$L0
...
//结尾
set_label $L0
exit_tb $0x7f884c000043 //收到中断通知,退出执行循环
while (!cpu_handle_interrupt(cpu, &last_tb)) {
三
unicorn原理分析
1.只保留qemu tcg cpu模拟器的部分,移除掉其他如device,rom/bios等和系统模拟相关的代码
2.尽量维持qemu cpu模拟器部分不变,这样才容易和上游的qemu tcg代码同步
3.重构tcg的代码从而可以更好的实现线程安全性及同时运行多个unicorn实例
4.qemu tcg并非一个Instrumentation框架,而unicorn的目标是实现一个有多种语言绑定的Instrumentation框架,可以在多个级别跟踪代码的运行并执行设置好的回调函数。
tlb/softmmu/MemoryRegion
做为内存模拟的实现。uc_mem_map(*uc, code_start, code_len, UC_PROT_ALL)
//返回true表示没有开启mmu
if (regime_translation_disabled(env, mmu_idx)) {
AddressSpace,MemoryRegion,FlatView和RAMBlock
。这块的机制相当复杂,描述它就得需要一两篇博客的篇幅,这里只是简单介绍一下概念:uc_hook_add(uc, &hook, UC_HOOK_CODE, my_callback,&count, 1, 0)
uc_reg_write(uc, UC_ARM_REG_R0, &r_r0)
uc_reg_write(uc, UC_ARM_REG_R2, &r_r2)
uc_emu_start(uc, code_start, code_start + sizeof(code) - 1, 0, 0)
uc_hook_add
函数以后,其实是创建出了struct hook
对象并添加到了uc_struct这个全局对象的struct list hook[UC_HOOK_MAX]
链表中去,unicorn其实是在IR层添加了相应的代码来设置回调,比如对于mov r0, #1
uc_hook_add(uc, &hook, UC_HOOK_CODE, my_callback,&count, 1, 0)
, 调试打印OPCode:UNICORN_DEBUG=1 ./test_arm my_hook_test
insn_idx=0 ---- 00001000 00000000 00000000
1: movi_i32 pc,$0x1000
2: movi_i32 tmp3,$0x4
3: movi_i64 tmp5,$0x55e38ad72840
4: movi_i64 tmp6,$0x1000
5: movi_i64 tmp7,$0x7fff1dd92190
6: call hookcode_4_55e38928e9a9,$0x0,$0,tmp5,tmp6,tmp3,tmp7
7: ld_i32 tmp3,env,$0xfffffffffffffff0
8: movi_i32 tmp4,$0x0
9: brcond_i32 tmp3,tmp4,lt,$L0
10: movi_i32 tmp3,$0x1
11: mov_i32 r0,tmp3
// Unicorn: trace this instruction on request
if (HOOK_EXISTS_BOUNDED(s->uc, UC_HOOK_CODE, s->pc_curr)) {
// Sync PC in advance
gen_set_pc_im(s, s->pc_curr);gen_uc_tracecode(tcg_ctx, 4, UC_HOOK_CODE_IDX, s->uc, s->pc_curr);
// the callback might want to stop emulation immediately
check_exit_request(tcg_ctx);
}
gen_uc_tracecode
的逻辑就是创建出调用hookcode_4_55e38928e9a9
这个helper函数的IR,这个helper函数的实现就是设置进来的回调函数,它是动态被创建出来并且添加到helper函数的hashtable中的。
//将当前正在执行指令的地址放置于pc寄存器
1: movi_i32 pc,$0x1000
//4的值为跟踪的指令字节个数
2: movi_i32 tmp3,$0x4
//$0x55e38ad72840为uc_struct *uc指针的值
3: movi_i64 tmp5,$0x55e38ad72840
//0x1000为当前pc执行的地址
4: movi_i64 tmp6,$0x1000
//$0x7fff1dd92190的值为设置回调时传递的user_data指针值
5: movi_i64 tmp7,$0x7fff1dd92190
//调用到回调函数 : void my_callback(uc_engine *uc, uint64_t address, uint32_t size, void *user_data)
6: call hookcode_4_55e38928e9a9,$0x0,$0,tmp5,tmp6,tmp3,tmp7
/*
 * Byte-sized port input, Unicorn variant: the read of the I/O address
 * space is commented out below, and the value comes from a registered
 * UC_HOOK_INSN hook instead.
 *
 * Returns the result of the first hook that is not marked to_delete and
 * whose insn is UC_X86_INS_IN, called as callback(uc, addr, 1, user_data)
 * (the 1 is presumably the access size in bytes -- TODO confirm).
 * Returns 0 when no such hook is registered.
 */
uint8_t cpu_inb(struct uc_struct *uc, uint32_t addr)
{
// uint8_t val;
// address_space_read(&uc->address_space_io, addr, MEMTXATTRS_UNSPECIFIED,
// &val, 1);
//LOG_IOPORT("inb : %04"FMT_pioaddr" %02"PRIx8"\n", addr, val);
// Unicorn: call registered IN callbacks
struct hook *hook;
HOOK_FOREACH_VAR_DECLARE;
HOOK_FOREACH(uc, hook, UC_HOOK_INSN) {
/* Skip hooks that are flagged for deletion. */
if (hook->to_delete)
continue;
/* The first matching IN hook supplies the value; later hooks are not consulted. */
if (hook->insn == UC_X86_INS_IN)
return ((uc_cb_insn_in_t)hook->callback)(uc, addr, 1, hook->user_data);
}
/* No IN hook handled the access. */
return 0;
}
UC_HOOK_BLOCK
用于跟踪basic block的执行,因此最好的跟踪点是一个basic block在处理之前:qemu/accel/tcg/translator.c
的translator_loop函数for循环开始处:if (HOOK_EXISTS_BOUNDED(uc, UC_HOOK_BLOCK, tb->pc)) {
prev_op = tcg_last_op(tcg_ctx);
block_hook = true;
gen_uc_tracecode(tcg_ctx, 0xf8f8f8f8, UC_HOOK_BLOCK_IDX, uc, db->pc_first);
}
// Unicorn: fast path if hookmem is not enable
if (!HOOK_EXISTS(s->uc, UC_HOOK_MEM_READ) && !HOOK_EXISTS(s->uc, UC_HOOK_MEM_WRITE))
//没有回调时走之前的逻辑
tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
else
/* slow_path, so data access will go via load_helper() */
tcg_out_opc(s, OPC_JMP_long, 0, 0, 0);
load_helper()
和store_helper()
。"UNMAPPED"
访问,在uc_struct结构中的MemoryRegion **mapped_blocks
成员中存放当前所有设置过内存映射的MemoryRegion
区域,给定一个地址,调用如下函数可以快速得到对应的MemoryRegion:MemoryRegion *memory_mapping(struct uc_struct *uc, uint64_t address)
UC_HOOK_MEM_WRITE_UNMAPPED
等回调。HOOK_FOREACH(uc, hook, UC_HOOK_INTR) {
if (hook->to_delete) {
continue;
}
//cpu->exception_index即是中断号
((uc_cb_hookintr_t)hook->callback)(uc, cpu->exception_index, hook->user_data);
catched = true;
}
/*
 * Report whether interrupt/exception number intno yields true here.
 *
 * EXCP_INVSTATE additionally records UC_ERR_EXCEPTION in
 * uc->invalid_error. EXCP_UDEF and EXCP_YIELD return true with no other
 * effect. Every other number returns false.
 */
static bool arm_stop_interrupt(struct uc_struct *uc, int intno)
{
    if (intno == EXCP_INVSTATE) {
        uc->invalid_error = UC_ERR_EXCEPTION;
        return true;
    }

    return intno == EXCP_UDEF || intno == EXCP_YIELD;
}
HOOK_FOREACH(uc, hook, UC_HOOK_INSN_INVALID) {
if (hook->to_delete) {
continue;
}
catched = ((uc_cb_hookinsn_invalid_t)hook->callback)(uc, hook->user_data);
if (catched) {
break;
}
}
四
总结
看雪ID:飞翔的猫咪
https://bbs.kanxue.com/user-home-607812.htm
# 往期推荐
3、安卓加固脱壳分享
球分享
球点赞
球在看