Joseph Varghese
def factorial(int c): int = {
if (c == 1) {
return 1
} else {
return c*factorial(c-1)
}
}
def main(): int = {
printbig(factorial(5)) /* Inbuilt function */
return 0
}
0000000000000000 <main>:
push %rax
mov $0x5,%edi
callq 20 <factorial>
mov %eax,%edi
callq 12 <main+0x12>
xor %eax,%eax
pop %rcx
retq
nopw %cs:0x0(%rax,%rax,1)
0000000000000020 <factorial>:
push %rbx
sub $0x10,%rsp
mov %edi,0xc(%rsp)
cmp $0x1,%edi
jne 35 <factorial+0x15>
mov $0x1,%eax
jmp 44 <factorial+0x24>
mov 0xc(%rsp),%ebx
lea -0x1(%rbx),%edi
callq 20 <factorial>
imul %ebx,%eax
add $0x10,%rsp
pop %rbx
retq
We build a compiler to translate our language to CPU language using Ocaml and LLVM.
Input -> Lexer -> Parser -> Code Generation -> Output
def factorial(int c): int = {
if (c == 1) {
return 1
} else {
return c*factorial(c-1)
}
}
Breaks source code to tokens using regex rules.
rule token = parse
[' ' '\t' '\r'] { token lexbuf } (* Whitespace *)
| '\n' { NEWLINE }
| "def" { FUNCDEF }
| '(' { LPAREN }
| ')' { RPAREN }
| '{' { LBRACE }
| '}' { RBRACE }
| ':' { COLON }
| '+' { PLUS }
| '-' { MINUS }
| '*' { TIMES }
| '/' { DIVIDE }
| '=' { ASSIGN }
| "==" { EQ }
| "if" { IF }
| "else" { ELSE }
| "return" { RETURN }
| "int" { INT }
| "true" { TRUE }
| "false" { FALSE }
| ['0'-'9']+ as lxm { LITERAL(int_of_string lxm) } (* A number *)
| ['a'-'z' 'A'-'Z']['a'-'z' 'A'-'Z' '0'-'9' '_']* as lxm { ID(lxm) } (* A string *)
| eof { EOF }
FUNCDEF ID(factorial) LPAREN INT ID(c) RPAREN COLON INT ASSIGN LBRACE NEWLINE
IF LPAREN ID(c) EQ LITERAL RPAREN LBRACE NEWLINE RETURN LITERAL(1) NEWLINE RBRACE
ELSE LBRACE NEWLINE RETURN ID(c) TIMES ID(factorial) LPAREN ID(c) MINUS LITERAL(1)
RPAREN NEWLINE RBRACE NEWLINE RBRACE NEWLINE
Abstract Syntax Tree (AST) is source code of a program represented as a tree
A parser reads the tokens from the Lexer and builds an AST based on grammar rules.
Functions form basic block for our language.
type program = func_decl list (* A program is a list of function declarations *)
def factorial(int c): int = {
if (c == 1) {
return 1
} else {
return c*factorial(c-1)
}
}
In Ocaml can be represented as
type op = Add | Mul | Div | Sub | Equal
type typ = Bool | Int
type bind = typ * string
type func_decl = {
ftype: typ;
fname: string;
formals: bind list;
body: expr list;
}
A function body is a list of expressions.
In ocaml an expression can be represented as
expr =
Literal of int
| BoolLit of bool
| Id of string
| Binop of expr * op * expr
| Func of func_decl
| Assign of string * expr
| Call of string * expr list
| If of expr * expr list * expr list
| Return of expr
| ExprList of expr list
if (x == 1) {
return 10
} else {
return 20
}
If of
expr * expr list * expr list
a + b
Binop of
expr * op * expr
factorial(c-1)
Call of
string * expr list
def factorial(int c): int = {
if (c == 1) {
return 1
} else {
return c*factorial(c-1)
}
}
Lets combine all our grammar rules to build the complete list of rules.
func_decl:
FUNCDEF ID LPAREN formals_list RPAREN COLON typ ASSIGN expr_list { {
ftype = $7; fname = $2; formals = $4; body = $9;
} }
formals_list:
typ ID { [($1, $2)] } | formals_list COMMA typ ID { ($3, $4) :: $1}
typ:
INT { Int } | BOOL { Bool }
expr_list:
| NEWLINE {[]}
| expr_list RETURN expr NEWLINE { Return($3) :: $1}
| expr_list expr NEWLINE {$2 :: $1}
expr:
LITERAL { Literal($1) }
| TRUE { BoolLit(true) }
| FALSE { BoolLit(false) }
| ID { Id($1) }
| expr PLUS expr { Binop($1, Add, $3) }
| expr MINUS expr { Binop($1, Sub, $3) }
| expr TIMES expr { Binop($1, Mul, $3) }
| expr DIVIDE expr { Binop($1, Div, $3) }
| expr EQ expr { Binop($1, Equal, $3) }
| ID ASSIGN expr { Assign($1, $3) }
| ID LPAREN actuals_opt RPAREN { Call($1, $3) }
| IF LPAREN expr RPAREN b_expr_list ELSE b_expr_list { If($3, $5, $7) }
Walks the AST and generate LLVM bytecode
def add (int a, int b): int = {
return a+b
}
Draw picture for AST of this function Expressed in LLVM bytecode
define i32 @add(i32 %a, i32 %b) {
entry:
%a1 = alloca i32 ; allocate local var a1
store i32 %a, i32* %a1 ; store param a to a1
%b2 = alloca i32 ; allocate local var b2
store i32 %b, i32* %b2 ; store param b to b2
%a3 = load i32, i32* %a1 ; load a1 to a2
%b4 = load i32, i32* %b2 ; local b2 to b4
%tmp = add i32 %a3, %b4 ; add a3 to b4 and store in tmp
ret i32 %tmp ; return tmp
}
if (x == 1) {
return 10
} else {
return 20
}
Expressed in LLVM bytecode
let rec expr builder = function
A.Literal i -> L.const_int i32_t i
| A.BoolLit b -> L.const_int i1_t (if b then 1 else 0)
| A.Id s -> L.build_load (lookup s) s builder
| A.Binop (e1, op, e2) ->
let e1' = expr builder e1
and e2' = expr builder e2 in
(match op with
A.Add -> L.build_add
| A.Sub -> L.build_sub
| A.Mul -> L.build_mul
| A.Div -> L.build_sdiv
| A.Equal -> L.build_icmp L.Icmp.Eq
) e1' e2' "tmp" builder
| A.Assign (s, e) -> let e' = expr builder e in
ignore (L.build_store e' (lookup s) builder); e'
| A.Call (f, act) ->
let (fdef, fdecl) = StringMap.find function_decls f in
let actuals = List.rev (List.map (expr builder) (List.rev act)) in
let result = f ^ "_result") in
L.build_call fdef (Array.of_list actuals) result builder
let rec expr builder = function
| A.If (c, tl , el) ->
(* both then and else branches merge at a single point *)
let bool_val = expr builder c in
let merge_bb = L.append_block context "merge" the_function in
let then_bb = L.append_block context "then" the_function in
add_terminal (expr (L.builder_at_end context then_bb) (A.ExprList tl)
L.builder_at_end context then_bb) (L.build_br merge_bb);
let else_bb = L.append_block context "else" the_function in
add_terminal (expr (L.builder_at_end context else_bb) (A.ExprList el)
L.builder_at_end context else_bb) (L.build_br merge_bb);
let merger = L.build_cond_br bool_val then_bb else_bb builder in
L.position_at_end merge_bb builder; merger
| A.Return e ->
L.build_ret (expr builder e) builder
| A.ExprList el ->
let k = List.fold_left expr_build_returner builder el in
let g = L.block_terminator (L.insertion_block k) in
match g with
Some x -> x
| None -> L.const_int i32_t 0
declare i32 @printbig(i32)
define i32 @main() {
entry:
%factorial_result = call i32 @factorial(i32 5)
%printbig = call i32 @printbig(i32 %factorial_result)
ret i32 0
}
define i32 @factorial(i32 %c) {
entry:
%c1 = alloca i32
store i32 %c, i32* %c1
%c2 = load i32, i32* %c1
%tmp = icmp eq i32 %c2, 1
br i1 %tmp, label %then, label %else
merge: ; No predecessors!
ret i32 0
then: ; preds = %entry
ret i32 1
else: ; preds = %entry
%c3 = load i32, i32* %c1
%c4 = load i32, i32* %c1
%tmp5 = sub i32 %c4, 1
%factorial_result = call i32 @factorial(i32 %tmp5)
%tmp6 = mul i32 %c3, %factorial_result
ret i32 %tmp6
}
#include <stdio.h>
void printbig(int c)
{
printf("%d\n", c);
}
LLVM provides tools to compile LLVM bytecode to machine code
llc-3.8 genllvm.ll -filetype=obj # Compile to Object Code
gcc a.o printbig.o # Link to printbig library
Running our binary
./a.out
120
Thats all folks. We've built a compiler.
Special thanks to Niklas Heer & Ricardo Martinez