上次课内容:
C程序如何从源代码生成指令序列(二进制可执行文件)
本次课内容:
C程序(指令序列)如何执行
我们之前接触的是具有宿主操作系统(Linux)的运行时环境
printf()的代码在哪里 😂
RTFM: C99
5.1.2.1 Freestanding environment
2 The effect of program termination in a freestanding environment is
implementation-defined.
QEMU虽然是个开源项目, 但还挺复杂, 不利于我们理解细节
让我们来设计一个面向RISC-V程序的简单freestanding运行时环境!
程序从地址0开始执行
支持两种指令: addi指令和ebreak指令
执行ebreak指令时:
a0=0时, 输出寄存器a1低8位的字符
a0=1时, 结束运行
/*
 * Raise an `ebreak` with a0 = arg0 and a1 = arg1.
 *
 * The asm loads both registers with `addi rd, x0, imm` and then executes
 * `ebreak`. Both operands use the "i" (immediate) constraint, so each
 * argument has to be a compile-time constant when the asm is emitted.
 * NOTE(review): presumably this relies on the function being inlined at -O2
 * so that callers' literals reach the asm — confirm when changing flags.
 * NOTE(review): a0/a1 are written by the asm but no clobbers are declared.
 */
static void ebreak(long arg0, long arg1) {
asm volatile("addi a0, x0, %0;"
"addi a1, x0, %1;"
"ebreak" : : "i"(arg0), "i"(arg1));
}
/* Print one character: ebreak command 0 with the character as its argument. */
static void putch(char ch)
{
    ebreak(0, ch);
}
/* Stop the program: ebreak command 1 with `code`, then spin forever. */
static void halt(int code)
{
    ebreak(1, code);
    for (;;) {
        /* never returns */
    }
}
/* Program entry point: print 'A', then halt with code 0. */
void _start() {
    putch('A');
    halt(0);
}
riscv64-linux-gnu-gcc -march=rv64g -ffreestanding -nostdlib -static -Wl,-Ttext=0 \
-O2 -o prog a.c
llvm-objdump -M no-aliases -d prog
看看反汇编中的addi和ebreak指令
但怎么让这个程序运行呢?
ISA手册定义了一个状态机
S = {<R, M>}
R = {PC, x0, x1, x2, ...}
PC = 程序计数器 = 当前执行的指令位置
M = 内存
S0 = <R0, M0>
我们只要把这个状态机实现出来, 就可以用它来执行指令了!
| | 程序 | 抽象计算机 | CPU |
|---|---|---|---|
| 状态 | {<V, PC>} | {<R, M>} | {时序逻辑电路} |
| 状态转移规则 | C语言语句的语义 | 指令的语义 | 组合逻辑电路 |
| FM | C语言标准手册 | 指令集手册 | 架构设计文档 |
#include <stdint.h>
/* Machine state <R, M>: the register file plus program counter, and memory. */
uint64_t R[32], PC; // according to the RISC-V manual
uint8_t M[64]; // 64-Byte memory
Q: 为什么不使用int64_t和int8_t?
A: C语言标准规定, 有符号数溢出是undefined behavior, 但无符号数不会溢出
指令周期(instruction cycle): 执行一条指令的步骤
状态机不断执行指令, 直到结束运行:
31 20 19 15 14 12 11 7 6 0
+---------------+-----+-----+-----+---------+
| imm[11:0] | rs1 | 000 | rd | 0010011 | ADDI
+---------------+-----+-----+-----+---------+
+---------------+-----+-----+-----+---------+
| 000000000001 |00000| 000 |00000| 1110011 | EBREAK
+---------------+-----+-----+-----+---------+
一个简单的实现:
/*
 * Execute one instruction cycle: fetch the 32-bit instruction at PC, decode
 * it, execute it, and advance PC by 4.
 *
 * Supported instructions:
 *   addi   - R[rd] = R[rs1] + sign-extended imm[11:0] (writes to x0 dropped)
 *   ebreak - R[10] == 0: print the low 8 bits of R[11]
 *            R[10] == 1: set `halt`
 * Anything else only prints a diagnostic; execution continues.
 */
void inst_cycle() {
    /* Never fetch past the end of M. */
    if (PC > sizeof(M) - 4) {
        printf("PC out of memory\n");
        halt = true;
        return;
    }

    /* Assemble the little-endian word byte by byte: no misaligned or
     * type-punned access, and independent of the host byte order. */
    uint32_t inst = (uint32_t)M[PC]
                  | ((uint32_t)M[PC + 1] << 8)
                  | ((uint32_t)M[PC + 2] << 16)
                  | ((uint32_t)M[PC + 3] << 24);

    uint32_t opcode = inst & 0x7f;
    uint32_t funct3 = (inst >> 12) & 0x7;

    if (opcode == 0x13 && funct3 == 0) { // addi
        uint32_t rd  = (inst >> 7) & 0x1f;
        uint32_t rs1 = (inst >> 15) & 0x1f;
        /* imm[11:0] is a 12-bit two's-complement value: take all 12 bits and
         * subtract 2^12 when the sign bit (inst[31]) is set. Computing in
         * int64_t makes a negative immediate sign-extend to 64 bits. */
        int64_t imm = (int64_t)((inst >> 20) & 0xfff)
                    - ((inst & 0x80000000) ? 4096 : 0);
        if (rd != 0) {
            R[rd] = R[rs1] + (uint64_t)imm;
        }
    } else if (inst == 0x00100073) { // ebreak
        if (R[10] == 0) { putchar(R[11] & 0xff); }
        else if (R[10] == 1) { halt = true; }
        else { printf("Unsupported ebreak command\n"); }
    } else { printf("Unsupported instuction\n"); }

    PC += 4;
}
RTFM: 《The RISC-V Instruction Set Manual - Volume II: Privileged Architecture》
3.4 Reset
The pc is set to an implementation-defined reset vector... All other hart state is
unspecified.
注意这里的unspecified和C语言标准的含义不同
根据手册, 初始状态如下:
R[0] = 0, 0号寄存器恒为0
PC = 0, 与自制运行时环境共同约定
M中存放程序, 与自制运行时环境共同约定, 由模拟器加载程序
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
/* Machine state: 32 general-purpose registers and the program counter. */
uint64_t R[32], PC;

/* 64-byte memory, preloaded with the program (little-endian words). */
uint8_t M[64] = {
    0x13, 0x05, 0x00, 0x00,   // 0x00: addi a0, x0, 0
    0x93, 0x05, 0x10, 0x04,   // 0x04: addi a1, x0, 0x41 ('A')
    0x73, 0x00, 0x10, 0x00,   // 0x08: ebreak            -> print a1
    0x13, 0x05, 0x10, 0x00,   // 0x0c: addi a0, x0, 1
    0x93, 0x05, 0x00, 0x00,   // 0x10: addi a1, x0, 0
    0x73, 0x00, 0x10, 0x00,   // 0x14: ebreak            -> halt
    0x6f, 0x00, 0x00, 0x00,   // 0x18: jal x0, 0 (spin); not decoded below,
                              //       never reached because halt is set first
};

/* Set by `ebreak` with a0 == 1; ends the main loop. */
bool halt = false;

/*
 * Execute one instruction cycle: fetch the 32-bit instruction at PC, decode
 * it, execute it, and advance PC by 4.
 *
 * Supported instructions:
 *   addi   - R[rd] = R[rs1] + sign-extended imm[11:0] (writes to x0 dropped)
 *   ebreak - R[10] == 0: print the low 8 bits of R[11]
 *            R[10] == 1: set `halt`
 * Anything else only prints a diagnostic; execution continues.
 */
void inst_cycle() {
    /* Never fetch past the end of M. */
    if (PC > sizeof(M) - 4) {
        printf("PC out of memory\n");
        halt = true;
        return;
    }

    /* Assemble the little-endian word byte by byte: no misaligned or
     * type-punned access, and independent of the host byte order. */
    uint32_t inst = (uint32_t)M[PC]
                  | ((uint32_t)M[PC + 1] << 8)
                  | ((uint32_t)M[PC + 2] << 16)
                  | ((uint32_t)M[PC + 3] << 24);

    uint32_t opcode = inst & 0x7f;
    uint32_t funct3 = (inst >> 12) & 0x7;

    if (opcode == 0x13 && funct3 == 0) { // addi
        uint32_t rd  = (inst >> 7) & 0x1f;
        uint32_t rs1 = (inst >> 15) & 0x1f;
        /* imm[11:0] is a 12-bit two's-complement value: take all 12 bits and
         * subtract 2^12 when the sign bit (inst[31]) is set. Computing in
         * int64_t makes a negative immediate sign-extend to 64 bits. */
        int64_t imm = (int64_t)((inst >> 20) & 0xfff)
                    - ((inst & 0x80000000) ? 4096 : 0);
        if (rd != 0) {
            R[rd] = R[rs1] + (uint64_t)imm;
        }
    } else if (inst == 0x00100073) { // ebreak
        if (R[10] == 0) { putchar(R[11] & 0xff); }
        else if (R[10] == 1) { halt = true; }
        else { printf("Unsupported ebreak command\n"); }
    } else { printf("Unsupported instuction\n"); }

    PC += 4;
}
/* Reset the machine state and run instruction cycles until `halt` is set. */
int main() {
    /* Redundant with the zero-initialization of globals; kept for clarity. */
    PC = 0;
    R[0] = 0;

    for (;;) {
        if (halt) {
            break;
        }
        inst_cycle();
    }
    return 0;
}
// ...
uint8_t M[1024];
/*
 * Load up to 1024 bytes of the file named by argv[1] into M, then run
 * instruction cycles until `halt` is set.
 * Note: this version performs no checks on argc, fopen() or fread().
 */
int main(int argc, char *argv[]) {
    PC = 0;
    R[0] = 0;

    FILE *image = fopen(argv[1], "r");
    fread(M, 1, 1024, image);
    fclose(image);

    for (;;) {
        if (halt) {
            break;
        }
        inst_cycle();
    }
    return 0;
}
YEMU很小, 可以很快定位问题; 但如何从大项目中存活?
/*
 * Deliberately obfuscated C program, reproduced verbatim as an example of
 * code that is hard to read. C(c) stringifies its argument, so the text
 * inside every C(...) call is data and is whitespace-sensitive: do not
 * reformat or insert comments inside this listing.
 */
#\
define C(c /**/)#c
/*size=3173*/#include<stdio.h>
/*crc=b7f9ecff.*/#include<stdlib.h>
/*Mile/Adele_von_Ascham*/#include<time.h>
typedef/**/int(I);I/*:3*/d,i,j,a,b,l,u[16],v
[18],w[36],x,y,z,k;char*P="\n\40(),",*p,*q,*t[18],m[4];
void/**/O(char*q){for(;*q;q++)*q>32?z=111-*q?z=(z+*q)%185,(k?
k--:(y=z%37,(x=z/37%7)?printf(*t,t[x],y?w[y-1]:95):y>14&&y<33?x
=y>15,printf(t[15+x],x?2<<y%16:l,x?(1<<y%16)-1:1):puts(t[y%28])))
,0:z+82:0;}void/**/Q(I(p),I*q){for(x=0;x<p;x++){q[x]=x;}for(;--p
>1;q[p]=y)y =q[x=rand()%-~p],q[x]=q[p];}char/**/n[999]=C(Average?!nQVQd%R>Rd%
R% %RNIPRfi#VQ}R;TtuodtsRUd%RUd%RUOSetirwf!RnruterR{RTSniamRtniQ>h.oidts<edulc
ni #V>rebmun<=NIPD-RhtiwRelipmocResaelPRrorre#QNIPRfednfi#V__ELIF__R_
Re nifed#V~-VU0V;}V{R= R][ORrahcRdengisnuRtsnocRcitatsVesle#Vfidne#V53556
. .1RfoRegnarRehtRniRre getniRnaRsiR]NIP[R erehwQQc.tuptuoR>Rtxt.tupniR
< R]NIP[R:egasuV_Redulcn i#VfednfiVfednuVenife dVfedfiVQc%Rs%#V);I/**/main(
I( f),char**e){if(f){for(i= time(NULL),p=n,q= n+997,x=18;x;p++){*p>32&&!(
*--q=*p>80&&*p<87?P[*p- 81]:* p)?t [( -- x)]=q+1:q;}if(f-2||(d=atoi
(e[1]))<1||65536<d){;O(" \""); goto O;}srand(i);Q(16,u);i=0;Q(
36,w);for(;i<36; i++){w[i] +=w [i]<26 ? 97:39; }O(C(ouoo9oBotoo%]#
ox^#oy_#ozoou#o{ a#o|b#o}c# o~d#oo-e #oo. f#oo/g#oo0h#oo1i#oo
2j#oo3k#oo4l#o p));for(j =8;EOF -(i= getchar());l+=1){a=1+
rand()%16;for(b =0;b<a||i- main (0,e);b++)x=d^d/4^d/8^d/
32,d= (d/ 2|x<<15)&65535; b|= !l<<17;Q(18,v);for(a=0;a<18;
a++ ){if( (b&(1<<(i=v[a] ))))* m=75+i,O(m),j=i<17&&j<i?i:j;}O(C(
!) ); }O(C(oqovoo97o /n!));i= 0;for(;i<8;O(m))m[2]=35,*m=56+u[i],m[1
]= 75 +i++;O(C(oA!oro oqoo9) );k=112-j*7;O(C(6o.!Z!Z#5o-!Y!Y#4~!X!X#3}
!W !W #2 |!V!V#1{!U!U#0z! T!T#/y!S!S#.x!R!R#-w!Q!Q#ooAv!P!P#+o#!O!O#*t!N!
N# oo >s!M!M#oo=r!L!L#oo<q!K!K# &pIo@:;= oUm#oo98m##oo9=8m#oo9oUm###oo9;=8m#o
o9 oUm##oo9=oUm#oo98m#### o09] #o1:^#o2;_#o3<o ou#o4=a#o5>b#o6?c#o
7@d#o8A e#o 9B f#o:Cg#o; D h#o<Ei #o=Fj#o> Gk#o?Hl#oo9os#####
));d=0 ;} O: for(x=y=0;x<8;++
x)y|= d&(1<<u[x])?
1<< x:0;return
/* :9 */
y ; }
YEMU v1.0其实也做得不够好, 让我们来改进它
不相信外界的输入/其他函数传递的参数, 通过断言提前拦截非预期情况
#include <assert.h>
// ...
/*
 * Load the program image named by argv[1] into M and run it until `halt`.
 * Every assumption about the command line and the file is checked with
 * assert() before it is relied on.
 */
int main(int argc, char *argv[]) {
    PC = 0; R[0] = 0;
    assert(argc >= 2);                      // at least one argument is required
    FILE *fp = fopen(argv[1], "rb");        // binary image: no text-mode translation
    assert(fp != NULL);                     // argv[1] must be a file that can be opened
    int ret = fseek(fp, 0, SEEK_END);
    assert(ret == 0);                       // fseek() reports failure as any nonzero value
    long fsize = ftell(fp);
    assert(fsize != -1);                    // ftell() must succeed
    rewind(fp);
    assert(fsize <= 1024);                  // a program of exactly 1024 bytes still fits in M
    size_t nread = fread(M, 1, 1024, fp);   // fread() returns size_t, not int
    assert(nread == (size_t)fsize);         // the whole program must have been read
    fclose(fp);
    while (!halt) { inst_cycle(); }
    return 0;
}
将预期的正确行为直接写到程序中
segmentation fault -> yemu.c:27: main: ...
程序中的断言足够多 -> 近似于证明了程序的正确性
IC验证教大家写SVA(SystemVerilog Assertion), 也是类似的道理
/*
 * Assert(cond, format, ...): when `cond` is false, print the printf-style
 * message (a newline is appended) to stderr and then trip assert(cond).
 * When `cond` is true nothing else is evaluated. `cond` is evaluated a second
 * time on the failure path, so it should be free of side effects.
 */
#define Assert(cond, format, ...) \
  do { \
    if (cond) { break; } \
    fprintf(stderr, format "\n", ## __VA_ARGS__); \
    assert(cond); \
  } while (0)
/*
 * Load the program image named by argv[1] into M and run it until `halt`.
 * Every assumption about the command line and the file is checked with
 * Assert(), which prints a readable message before aborting.
 */
int main(int argc, char *argv[]) {
    PC = 0; R[0] = 0;
    Assert(argc >= 2, "Program is not given");                    // at least one argument is required
    FILE *fp = fopen(argv[1], "rb");                              // binary image: no text-mode translation
    Assert(fp != NULL, "Fail to open %s", argv[1]);               // argv[1] must be a file that can be opened
    int ret = fseek(fp, 0, SEEK_END);
    Assert(ret == 0, "Fail to seek the end of the file");         // fseek() reports failure as any nonzero value
    long fsize = ftell(fp);
    Assert(fsize != -1, "Fail to return the file position");      // ftell() must succeed
    rewind(fp);
    Assert(fsize <= 1024, "Program size exceeds 1024 Bytes");     // exactly 1024 bytes still fits in M
    size_t nread = fread(M, 1, 1024, fp);                         // fread() returns size_t, not int
    Assert(nread == (size_t)fsize, "Fail to load the whole program"); // the whole program must have been read
    fclose(fp);
    while (!halt) { inst_cycle(); }
    return 0;
}
#include <string.h>
#include <errno.h>
/*
 * Perror(cond, format, ...): like Assert(), but appends ": " followed by
 * strerror(errno) to the message. Intended for checks that directly follow a
 * library call that sets errno on failure.
 */
#define Perror(cond, format, ...) \
Assert(cond, format ": %s", ## __VA_ARGS__, strerror(errno))
/*
 * Load the program image named by argv[1] into M and run it until `halt`.
 * Failures of library calls are reported with Perror() so that the errno
 * description is shown; other checks use Assert().
 */
int main(int argc, char *argv[]) {
    PC = 0; R[0] = 0;
    Assert(argc >= 2, "Program is not given");                    // at least one argument is required
    FILE *fp = fopen(argv[1], "rb");                              // binary image: no text-mode translation
    Perror(fp != NULL, "Fail to open %s", argv[1]);               // argv[1] must be a file that can be opened
    int ret = fseek(fp, 0, SEEK_END);
    Perror(ret == 0, "Fail to seek the end of the file");         // fseek() reports failure as any nonzero value
    long fsize = ftell(fp);
    Perror(fsize != -1, "Fail to return the file position");      // ftell() must succeed
    rewind(fp);
    Assert(fsize <= 1024, "Program size exceeds 1024 Bytes");     // exactly 1024 bytes still fits in M
    size_t nread = fread(M, 1, 1024, fp);                         // fread() returns size_t, not int
    Assert(nread == (size_t)fsize, "Fail to load the whole program"); // the whole program must have been read
    fclose(fp);
    while (!halt) { inst_cycle(); }
    return 0;
}
RTFM: man errno
破坏隐含依赖 = bug (例如这里改了, 那里忘了改):
随着项目规模增长, 需要分成多个文件来管理
if (((inst & 0x7f) == 0x13) && ((inst >> 12) & 0x7) == 0) { // addi
if (((inst >> 7) & 0x1f) != 0) {
R[(inst >> 7) & 0x1f] = R[(inst >> 15) & 0x1f] +
(((inst >> 20) & 0x7ff) - ((inst & 0x80000000) ? 4096 : 0));
}
} else if (((inst & 0x7f) == 0x13) && ((inst >> 12) & 0x7) == 0x4) { // xori
if (((inst >> 7) & 0x1f) != 0) {
R[(inst >> 7) & 0x1f] = R[(inst >> 15) & 0x1f] ^
(((inst >> 20) & 0x7ff) - ((inst & 0x80000000) ? 4096 : 0));
}
} else if (((inst & 0x7f) == 0x13) && ((inst >> 12) & 0x7) == 0x6) { // ori
if (((inst >> 7) & 0x1f) != 0) {
R[(inst >> 7) & 0x1f] = R[(inst >> 15) & 0x1f] |
(((inst >> 20) & 0x7ff) - ((inst & 0x80000000) ? 4096 : 0));
}
} else if (((inst & 0x7f) == 0x13) && ((inst >> 12) & 0x7) == 0x4) { // andi
if (((inst >> 7) & 0x1f) != 0) {
R[(inst >> 7) & 0x1f] = R[(inst >> 15) & 0x1f] &
(((inst >> 20) & 0x7ff) - ((inst & 0x80000000) ? 4096 : 0));
}
} else if (...) { ... }
上述代码有一处错误, 你找到了吗?
Copy-Paste = 编写相似代码时, 复制旧代码并稍作修改
上述代码不言自明本身就不怎么样, 不言自证就更难了
粘贴一时爽, 调试火葬场 😈
通过变量, 函数, 宏等消除重复/相似的代码
/* Fetch the instruction and split out the I-type fields once, up front. */
uint32_t inst = *(uint32_t *)&M[PC];
uint32_t opcode = inst & 0x7f;
uint32_t funct3 = (inst >> 12) & 0x7;
uint32_t rd = (inst >> 7 ) & 0x1f;
uint32_t rs1 = (inst >> 15) & 0x1f;
/* imm[11:0] is a 12-bit two's-complement value: keep all 12 bits and subtract
 * 2^12 when the sign bit (inst[31]) is set. The subtraction is done in
 * int64_t so a negative immediate is sign-extended to 64 bits. */
uint64_t imm = (uint64_t)((int64_t)((inst >> 20) & 0xfff) - ((inst & 0x80000000) ? 4096 : 0));
if (opcode == 0x13) {
if (funct3 == 0x0) { R[rd] = R[rs1] + imm; } // addi
else if (funct3 == 0x4) { R[rd] = R[rs1] ^ imm; } // xori
else if (funct3 == 0x6) { R[rd] = R[rs1] | imm; } // ori
else if (funct3 == 0x7) { R[rd] = R[rs1] & imm; } // andi
else { panic("Unsupported funct3 = %d", funct3); }
R[0] = 0; // 若指令写入了R[0], 此处将其重置为0
} else if (...) { ... }
PC += 4;
/*
 * One 32-bit instruction word, viewable either as raw bits (`bytes`) or as
 * decoded fields of a given format.
 *
 * The I-format fields are listed from bit 0 upwards (opcode, rd, funct3,
 * rs1, imm[11:0]) and add up to 32 bits.
 * NOTE(review): this assumes the compiler allocates bit-fields starting at
 * the least significant bit and packs the int64_t field into the same 32-bit
 * word — both are implementation-defined; confirm for the target compiler.
 */
typedef union {
struct {
uint32_t opcode : 7;
uint32_t rd : 5;
uint32_t funct3 : 3;
uint32_t rs1 : 5;
/* Signed bit-field: reading it yields the sign-extended immediate. */
int64_t imm11_0 : 12;
} I;
struct { /* ... */ } R;
uint32_t bytes;
} inst_t;
inst_t *inst = (inst_t *)&M[PC];
uint32_t rd = inst->I.rd;
uint32_t rs1 = inst->I.rs1;
uint64_t imm = (int64_t)inst->I.imm11_0;
if (inst->I.opcode == 0b0010011) {
switch (inst->I.funct3) {
case 0b000: R[rd] = R[rs1] + imm; break; // addi
case 0b100: R[rd] = R[rs1] ^ imm; break; // xori
case 0b110: R[rd] = R[rs1] | imm; break; // ori
case 0b111: R[rd] = R[rs1] & imm; break; // andi
default: panic("Unsupported funct3 = %d", inst->I.funct3);
}
R[0] = 0; // 若指令写入了R[0], 此处将其重置为0
} else if (inst->bytes == 0x00100073) { ... }
struct和位域(bit field)
union
switch-case语句
正确的代码 != 好代码
好代码的两条重要准则
使用正确的编程模式写出好代码
assert检查非预期行为
import chisel3._
import chisel3.util._
// Hardware version of YEMU: fetches one instruction per clock cycle and
// supports only addi and ebreak. Its single output, io.halt, is asserted
// while an ebreak with register 10 equal to 1 is being executed.
class YEMU extends Module {
val io = IO(new Bundle{ val halt = Output(Bool()) })
// State: 32 x 64-bit register file, 64-bit PC reset to 0, and 256 x 32-bit
// words of instruction memory (1024 bytes).
val R = Mem(32, UInt(64.W))
val PC = RegInit(0.U(64.W))
val M = Mem(1024 / 4, UInt(32.W))
// Register read helper: index 0 always reads as 0.
def Rread(idx: UInt) = Mux(idx === 0.U, 0.U(64.W), R(idx))
// I-type instruction layout, listed from the most significant field down.
val Ibundle = new Bundle {
val imm11_0 = UInt(12.W)
val rs1 = UInt( 5.W)
val funct3 = UInt( 3.W)
val rd = UInt( 5.W)
val opcode = UInt( 7.W)
}
// Sign-extend the 12-bit immediate to 64 bits by replicating bit 11.
def SignEXT(imm11_0: UInt) = Cat(Fill(52, imm11_0(11)), imm11_0)
// Fetch: M is word-addressed, so the low two PC bits are dropped.
val inst = M(PC(63, 2)).asTypeOf(Ibundle)
// Decode.
val isAddi = (inst.opcode === "b0010011".U) && (inst.funct3 === "b000".U)
val isEbreak = inst.asUInt === "x00100073".U
assert(isAddi || isEbreak, "Invalid instruction 0x%x", inst.asUInt)
// Operand read: ebreak reads registers 10 and 11; addi reads rs1 (its second
// read uses index 0, which reads as 0).
val rs1Val = Rread(Mux(isEbreak, 10.U(5.W), inst.rs1))
val rs2Val = Rread(Mux(isEbreak, 11.U(5.W), 0.U(5.W)))
// Execute / write back.
when (isAddi) { R(inst.rd) := rs1Val + SignEXT(inst.imm11_0) }
when (isEbreak && (rs1Val === 0.U)) { printf("%c", rs2Val(7,0)) }
io.halt := isEbreak && (rs1Val === 1.U)
PC := PC + 4.U
}
// Verilog for the YEMU module. The trailing comment on each line points back
// to the YEMU.scala location it corresponds to.
// NOTE(review): this looks like tool-generated output — presumably it should
// be regenerated from YEMU.scala rather than edited by hand.
module YEMU( // <stdin>:2:10
input clock,
reset,
output io_halt);
wire [31:0] _M_ext_R0_data; // YEMU.scala:8:15
wire [63:0] _R_ext_R0_data; // YEMU.scala:6:15
wire [63:0] _R_ext_R1_data; // YEMU.scala:6:15
reg [63:0] PC; // YEMU.scala:7:19
wire isAddi = _M_ext_R0_data[6:0] == 7'h13 & _M_ext_R0_data[14:12] == 3'h0; // YEMU.scala:8:15, :20:35, :21:{29,47,63}
wire isEbreak = _M_ext_R0_data == 32'h100073; // YEMU.scala:8:15, :22:30
wire [4:0] _rs1Val_T = isEbreak ? 5'hA : _M_ext_R0_data[19:15]; // YEMU.scala:8:15, :20:35, :22:30, :25:25
wire [63:0] rs1Val = _rs1Val_T == 5'h0 ? 64'h0 : _R_ext_R0_data; // YEMU.scala:6:15, :7:19, :9:{29,34}, :25:25, :26:25
wire [4:0] _rs2Val_T = isEbreak ? 5'hB : 5'h0; // YEMU.scala:22:30, :26:25
always @(posedge clock) begin
if (reset)
PC <= 64'h0; // YEMU.scala:7:19
else
PC <= PC + 64'h4; // YEMU.scala:7:19, :30:12
end // always @(posedge)
`ifndef SYNTHESIS // <stdin>:2:10
always @(posedge clock) begin // YEMU.scala:23:9
if (~reset & ~(isAddi | isEbreak)) begin // YEMU.scala:21:47, :22:30, :23:{9,17}
if (`ASSERT_VERBOSE_COND_) // YEMU.scala:23:9
$error("Assertion failed: Invalid instruction 0x%x\n at YEMU.scala:23 assert(isAddi || isEbreak, \"Invalid instruction 0x%%x\", inst.asUInt)\n", _M_ext_R0_data); // YEMU.scala:8:15, :23:9
if (`STOP_COND_) // YEMU.scala:23:9
$fatal; // YEMU.scala:23:9
end
if ((`PRINTF_COND_) & isEbreak & rs1Val == 64'h0 & ~reset) // YEMU.scala:7:19, :9:29, :22:30, :28:{29,47}
$fwrite(32'h80000002, "%c", {_rs2Val_T[3], _rs2Val_T[1:0]} == 3'h0 ? 8'h0 : _R_ext_R1_data[7:0]); // YEMU.scala:6:15, :7:19, :9:{29,34}, :21:63, :26:25, :28:47
end // always @(posedge)
`endif // not def SYNTHESIS
R_combMem R_ext ( // YEMU.scala:6:15
// ...
);
M_combMem M_ext ( // YEMU.scala:8:15
.R0_addr (PC[9:2]), // YEMU.scala:7:19, :20:15
.R0_en (1'h1), // <stdin>:2:10
.R0_clk (clock),
.R0_data (_M_ext_R0_data)
);
assign io_halt = isEbreak & rs1Val == 64'h1; // <stdin>:2:10, YEMU.scala:9:29, :22:30, :29:{23,34}
endmodule
#include <stdio.h>
#include <stdlib.h>
#include "VYEMU.h"
#include "VYEMU___024root.h"
static VYEMU *top = NULL;
// Advance the simulated design by one clock cycle: evaluate with the clock
// low, then again with the clock high.
void step() {
  for (int level = 0; level <= 1; level++) {
    top->clock = level;
    top->eval();
  }
}
// Hold the reset input high for n clock cycles, then release it.
void reset(int n) {
  top->reset = 1;
  for (; n != 0; n--) {
    step();
  }
  top->reset = 0;
}
// Copy up to 1024 bytes of the program image `bin` into the simulated
// instruction memory. Exits with a diagnostic when no path is given or the
// file cannot be opened, instead of passing a null pointer to fopen()/fread().
void load_prog(const char *bin) {
  if (bin == NULL) {
    fprintf(stderr, "Program is not given\n");
    exit(EXIT_FAILURE);
  }
  FILE *fp = fopen(bin, "rb");  // binary image: no text-mode translation
  if (fp == NULL) {
    perror(bin);
    exit(EXIT_FAILURE);
  }
  fread(&top->rootp->YEMU__DOT__M_ext__DOT__Memory, 1, 1024, fp);
  fclose(fp);
}
// Simulation driver: create the model, load the program named by argv[1],
// apply reset for 10 cycles, then clock the design until io_halt is raised.
int main(int argc, char *argv[]) {
  top = new VYEMU;
  load_prog(argv[1]);
  reset(10);
  for (;;) {
    if (top->io_halt) {
      break;
    }
    step();
  }
  return 0;
}
状态: 包含时序逻辑电路(部分组合逻辑信号和端口也用C变量来表示)
// obj_dir/VYEMU___024root.h
// PORTS
VL_IN8(clock,0,0);
VL_IN8(reset,0,0);
VL_OUT8(io_halt,0,0);
// LOCAL SIGNALS
IData/*31:0*/ YEMU__DOT___M_ext_R0_data;
QData/*63:0*/ YEMU__DOT__PC;
QData/*63:0*/ YEMU__DOT__rs1Val;
VlUnpacked<QData/*63:0*/, 32> YEMU__DOT__R_ext__DOT__Memory;
VlUnpacked<IData/*31:0*/, 256> YEMU__DOT__M_ext__DOT__Memory;
// ...
状态转移: 翻译Verilog中的组合逻辑电路
| | 举例 | 效率 | 精确度 |
|---|---|---|---|
| 指令集模拟器 | YEMU, NEMU, QEMU | +++++ | 指令集(行为正确) |
| 体系结构模拟器 | GEM5 | +++ | 性能(大致运行时间) |
| RTL仿真器 | VCS, Verilator | ++ | 微结构(IPC) |
| 晶体管仿真器 | Spice | + | 晶体管(物理特性) |
在处理器芯片设计企业中, 前三类都会使用:
后面的课程会进一步讨论
YEMU = 指令集模拟器 = 用C语言实现指令集手册定义的状态机
编写可读可维护的代码
使用正确的编程模式写出好代码
RTL仿真 = 用C++实现Verilog代码指定的电路状态机