refactor: extract token
This commit is contained in:
parent
5ab22e2f68
commit
9569b20542
3 changed files with 167 additions and 112 deletions
107
lib/lexer.ml
107
lib/lexer.ml
|
@ -1,51 +1,3 @@
|
|||
(* small set of ml *)
|
||||
|
||||
type op_type =
|
||||
| Add
|
||||
| Sub
|
||||
| Mul
|
||||
| Div
|
||||
| Mod
|
||||
| Pow
|
||||
|
||||
let op2str op =
|
||||
match op with
|
||||
| Add -> "+"
|
||||
| Sub -> "-"
|
||||
| Mul -> "*"
|
||||
| Div -> "/"
|
||||
| Mod -> "%"
|
||||
| Pow -> "^"
|
||||
|
||||
type keyword_type =
|
||||
| Let
|
||||
| In
|
||||
| If
|
||||
| Then
|
||||
| Else
|
||||
| Fun
|
||||
|
||||
type token_type =
|
||||
| Eof
|
||||
| Identifier of string
|
||||
| Digit of string
|
||||
| Op of op_type
|
||||
| LParen
|
||||
| RParen
|
||||
| Equal
|
||||
| Arrow
|
||||
| Keyword of keyword_type
|
||||
| Comment of string
|
||||
| Fail of string
|
||||
|
||||
|
||||
type token = {
|
||||
(* token type *)
|
||||
token_type: token_type;
|
||||
(* start position *)
|
||||
pos: int;
|
||||
}
|
||||
|
||||
let epsilon = '\000'
|
||||
|
||||
(* Lexer is just state machine *)
|
||||
|
@ -54,8 +6,6 @@ let is_digit c = c >= '0' && c <= '9'
|
|||
let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
|
||||
let is_alnum c = is_alpha c || is_digit c
|
||||
|
||||
|
||||
|
||||
type lexer_context = {
|
||||
total: string;
|
||||
pos: int;
|
||||
|
@ -189,36 +139,33 @@ let%test "test get_digit" =
|
|||
id = "123" && ctx'.pos = 3
|
||||
|
||||
let id_to_token_type id =
|
||||
match id with
|
||||
| "let" -> Keyword Let
|
||||
| "in" -> Keyword In
|
||||
| "if" -> Keyword If
|
||||
| "then" -> Keyword Then
|
||||
| "else" -> Keyword Else
|
||||
| "fun" -> Keyword Fun
|
||||
| _ -> Identifier id
|
||||
match (Token.str2keyword id) with
|
||||
| Some keyword -> keyword
|
||||
| None -> Token.Identifier id
|
||||
|
||||
|
||||
let lex_token (ctx: lexer_context) =
|
||||
let make_token token_type pos = {token_type = token_type; pos = pos} in
|
||||
let make_token token_type pos = {Token.token_type = token_type; pos = pos} in
|
||||
let ctx = skip_spaces ctx in
|
||||
let first_ch = input_first ctx in
|
||||
let pos = ctx.pos in
|
||||
let rest = input_rest ctx in
|
||||
match first_ch with
|
||||
| '\000' -> {token_type = Eof; pos = pos}, ctx
|
||||
| '(' -> make_token LParen pos, input_rest ctx
|
||||
| ')' -> make_token RParen pos, input_rest ctx
|
||||
| '=' -> make_token Equal pos, input_rest ctx
|
||||
| '+' -> make_token (Op Add) pos, input_rest ctx
|
||||
| '\000' -> {Token.token_type = Eof; pos = pos}, ctx
|
||||
| '(' -> make_token LParen pos, rest
|
||||
| ')' -> make_token RParen pos, rest
|
||||
| '=' -> make_token Equal pos, rest
|
||||
| '+' -> make_token Add pos, rest
|
||||
| '-' ->
|
||||
let second_ch = input_first (input_rest ctx) in
|
||||
let second_ch = input_first (rest) in
|
||||
if second_ch = '>' then
|
||||
make_token Arrow pos, input_rest (input_rest ctx)
|
||||
make_token Arrow pos, input_rest rest
|
||||
else
|
||||
make_token (Op Sub) pos, input_rest ctx
|
||||
| '*' -> make_token (Op Mul) pos, input_rest ctx
|
||||
make_token Sub pos, rest
|
||||
| '*' -> make_token Mul pos, rest
|
||||
| '/' ->
|
||||
(* check comment *)
|
||||
let second_ch = input_first (input_rest ctx) in
|
||||
let second_ch = input_first (rest) in
|
||||
if second_ch = '/' then
|
||||
let rec aux ctx =
|
||||
let ch = input_first ctx in
|
||||
|
@ -231,30 +178,30 @@ let lex_token (ctx: lexer_context) =
|
|||
let comment = String.sub ctx.total pos len in
|
||||
make_token (Comment comment) pos, ctx
|
||||
else
|
||||
make_token (Op Div) pos, input_rest ctx
|
||||
| '%' -> make_token (Op Mod) pos, input_rest ctx
|
||||
| '^' -> make_token (Op Pow) pos, input_rest ctx
|
||||
make_token (Div) pos, rest
|
||||
| '%' -> make_token (Mod) pos, rest
|
||||
| '^' -> make_token (Pow) pos, rest
|
||||
| c when is_alpha c ->
|
||||
let id, ctx = get_identifier ctx in
|
||||
make_token (id_to_token_type id) pos, ctx
|
||||
| c when is_digit c ->
|
||||
let id, ctx = get_digits ctx in
|
||||
make_token (Digit id) pos, ctx
|
||||
| _ -> make_token (Fail "invalid token") pos, input_rest ctx
|
||||
| _ -> make_token (Fail "invalid token") pos, rest
|
||||
|
||||
let%test "test lex_token 1" =
|
||||
let ctx = {total = "let"; pos = 0; line_pos = [||]} in
|
||||
let token, ctx' = lex_token ctx in
|
||||
token.token_type = Keyword Let && token.pos = 0 && ctx'.pos = 3
|
||||
token.token_type = Let && token.pos = 0 && ctx'.pos = 3
|
||||
|
||||
let%test "test lex_token 2" =
|
||||
let ctx = {total = "let in"; pos = 0; line_pos = [||]} in
|
||||
let token, ctx' = lex_token ctx in
|
||||
let token', ctx'' = lex_token ctx' in
|
||||
token.token_type = Keyword Let && token.pos = 0 && ctx'.pos = 3 &&
|
||||
token'.token_type = Keyword In && token'.pos = 4 && ctx''.pos = 6
|
||||
token.token_type = Let && token.pos = 0 && ctx'.pos = 3 &&
|
||||
token'.token_type = In && token'.pos = 4 && ctx''.pos = 6
|
||||
|
||||
let lex_tokens_seq (total: string): (token * lexer_context) Seq.t =
|
||||
let lex_tokens_seq (total: string): (Token.t * lexer_context) Seq.t =
|
||||
let rec aux ctx =
|
||||
let token, next_ctx = lex_token ctx in
|
||||
if token.token_type = Eof then
|
||||
|
@ -269,8 +216,8 @@ let%test "test lex_tokens_seq" =
|
|||
let seq = seq |> Seq.map (fun (token, _) -> token) in
|
||||
let tokens = List.of_seq seq in
|
||||
let expected = [
|
||||
{token_type = Keyword Let; pos = 0};
|
||||
{token_type = Keyword In; pos = 4};
|
||||
{token_type = Eof; pos = 6}
|
||||
{Token.token_type = Let; pos = 0};
|
||||
{Token.token_type = In; pos = 4};
|
||||
{Token.token_type = Eof; pos = 6}
|
||||
] in
|
||||
tokens = expected
|
||||
|
|
104
lib/parser.ml
104
lib/parser.ml
|
@ -1,7 +1,7 @@
|
|||
open Lexer
|
||||
|
||||
type parser_context = {
|
||||
seq: (Lexer.token * Lexer.lexer_context) Seq.t;
|
||||
seq: (Token.t * Lexer.lexer_context) Seq.t;
|
||||
errors: string list;
|
||||
}
|
||||
|
||||
|
@ -32,15 +32,15 @@ let or_parser (a: 'a parser) (b: 'a parser): 'a parser = fun (ctx: parser_contex
|
|||
|
||||
let (<|>) = or_parser
|
||||
|
||||
let peek_token: token parser = fun (ctx: parser_context) ->
|
||||
let peek_token: Token.t parser = fun (ctx: parser_context) ->
|
||||
Seq.uncons ctx.seq |> Option.map (fun ((t, _),_) -> (t,ctx))
|
||||
|
||||
let next_token: token parser = fun (ctx: parser_context) ->
|
||||
let next_token: Token.t parser = fun (ctx: parser_context) ->
|
||||
Seq.uncons ctx.seq |> Option.map (fun ((t,_), s) -> (t,
|
||||
{ ctx with seq = s}
|
||||
))
|
||||
|
||||
let match_token (tt: token_type) : token parser =
|
||||
let match_token (tt: Token.token_type) : Token.t parser =
|
||||
let* t = next_token in
|
||||
if t.token_type = tt then
|
||||
return t
|
||||
|
@ -79,6 +79,36 @@ BNF:
|
|||
expr ::= let_expr | fun_expr | if_expr | level3
|
||||
*)
|
||||
|
||||
type bin_op_type =
|
||||
| Add
|
||||
| Sub
|
||||
| Mul
|
||||
| Div
|
||||
| Mod
|
||||
| Pow
|
||||
|
||||
let token2op (t: Token.token_type): bin_op_type option =
|
||||
match t with
|
||||
| Token.Add -> Some Add
|
||||
| Token.Sub -> Some Sub
|
||||
| Token.Mul -> Some Mul
|
||||
| Token.Div -> Some Div
|
||||
| Token.Mod -> Some Mod
|
||||
| Token.Pow -> Some Pow
|
||||
| _ -> None
|
||||
|
||||
let op2str (op: bin_op_type): string =
|
||||
match op with
|
||||
| Add -> "+"
|
||||
| Sub -> "-"
|
||||
| Mul -> "*"
|
||||
| Div -> "/"
|
||||
| Mod -> "%"
|
||||
| Pow -> "^"
|
||||
|
||||
type mono_op_type =
|
||||
| Neg
|
||||
|
||||
type let_expr_tree = Let of string * expr_tree * expr_tree
|
||||
and fun_expr_tree = Fun of string * expr_tree
|
||||
and if_expr_tree = If of expr_tree * expr_tree * expr_tree
|
||||
|
@ -88,8 +118,8 @@ and expr_tree =
|
|||
| FunExpr of fun_expr_tree
|
||||
| IfExpr of if_expr_tree
|
||||
| CallExpr of call_expr_tree
|
||||
| BinOpExpr of Lexer.op_type * expr_tree * expr_tree
|
||||
| MonoOpExpr of Lexer.op_type * expr_tree
|
||||
| BinOpExpr of bin_op_type * expr_tree * expr_tree
|
||||
| MonoOpExpr of bin_op_type * expr_tree
|
||||
| Identifier of string
|
||||
| Number of int
|
||||
|
||||
|
@ -101,55 +131,55 @@ let expr2str (e: expr_tree): string =
|
|||
| FunExpr (Fun (id, e)) -> Printf.sprintf "fun %s ->\n%s%s" id (tab depth) (aux e (depth+1))
|
||||
| IfExpr (If (e1, e2, e3)) -> Printf.sprintf "if %s then\n%s%selse\n%s%s" (aux e1 depth) (tab depth) (aux e2 depth) (tab depth) (aux e3 depth)
|
||||
| CallExpr (Call (e1, e2)) -> Printf.sprintf "%s %s" (aux e1 depth) (aux e2 depth)
|
||||
| BinOpExpr (op, e1, e2) -> Printf.sprintf "%s %s %s" (aux e1 depth) (Lexer.op2str op) (aux e2 depth)
|
||||
| MonoOpExpr (op, e) -> Printf.sprintf "%s %s" (Lexer.op2str op) (aux e depth)
|
||||
| BinOpExpr (op, e1, e2) -> Printf.sprintf "%s %s %s" (aux e1 depth) (op2str op) (aux e2 depth)
|
||||
| MonoOpExpr (op, e) -> Printf.sprintf "%s %s" (op2str op) (aux e depth)
|
||||
| Identifier id -> id
|
||||
| Number n -> string_of_int n in
|
||||
aux e 0
|
||||
|
||||
let rec parse_let_expr (): let_expr_tree parser =
|
||||
let* _ = match_token (Lexer.Keyword Lexer.Let) in
|
||||
let* _ = match_token ( Token.Let) in
|
||||
let* tt = next_token in
|
||||
match tt.token_type with
|
||||
Lexer.Identifier(x) ->
|
||||
Token.Identifier(x) ->
|
||||
let id = x in
|
||||
let* _ = match_token Lexer.Equal in
|
||||
let* _ = match_token Token.Equal in
|
||||
let* e1 = expr() in
|
||||
let* _ = match_token (Lexer.Keyword Lexer.In) in
|
||||
let* _ = match_token (Token.In) in
|
||||
let* e2 = expr() in
|
||||
return (Let (id, e1, e2))
|
||||
| _ -> stop
|
||||
and parse_fun_expr (): fun_expr_tree parser =
|
||||
let* _ = match_token (Lexer.Keyword Lexer.Fun) in
|
||||
let* _ = match_token (Token.Fun) in
|
||||
let* tt = next_token in
|
||||
match tt.token_type with
|
||||
Lexer.Identifier(x) ->
|
||||
Token.Identifier(x) ->
|
||||
let id = x in
|
||||
let* _ = match_token Lexer.Arrow in
|
||||
let* _ = match_token Token.Arrow in
|
||||
let* e = expr() in
|
||||
return (Fun (id, e))
|
||||
| _ -> stop
|
||||
and parse_if_expr (): if_expr_tree parser =
|
||||
let* _ = match_token (Lexer.Keyword Lexer.If) in
|
||||
let* _ = match_token (Token.If) in
|
||||
let* e1 = expr() in
|
||||
let* _ = match_token (Lexer.Keyword Lexer.Then) in
|
||||
let* _ = match_token (Token.Then) in
|
||||
let* e2 = expr() in
|
||||
let* _ = match_token (Lexer.Keyword Lexer.Else) in
|
||||
let* _ = match_token (Token.Else) in
|
||||
let* e3 = expr() in
|
||||
return (If (e1, e2, e3))
|
||||
and parse_factor (): expr_tree parser =
|
||||
let* tt = peek_token in
|
||||
match tt.token_type with
|
||||
| Lexer.Identifier x ->
|
||||
| Token.Identifier x ->
|
||||
let* _ = next_token in
|
||||
return (Identifier x)
|
||||
| Lexer.Digit x ->
|
||||
| Token.Digit x ->
|
||||
let* _ = next_token in
|
||||
return (Number (int_of_string x))
|
||||
| Lexer.LParen ->
|
||||
let* _ = match_token Lexer.LParen in
|
||||
| Token.LParen ->
|
||||
let* _ = match_token Token.LParen in
|
||||
let* e = expr() in
|
||||
let* _ = match_token Lexer.RParen in
|
||||
let* _ = match_token Token.RParen in
|
||||
return e
|
||||
| _ -> stop
|
||||
and parse_call_expr (): expr_tree parser =
|
||||
|
@ -157,7 +187,7 @@ and parse_call_expr (): expr_tree parser =
|
|||
let rec aux e1 =
|
||||
let* c = peek_token in
|
||||
match c.token_type with
|
||||
| Lexer.Identifier _ | Lexer.Digit _ | Lexer.LParen ->
|
||||
| Token.Identifier _ | Token.Digit _ | Token.LParen ->
|
||||
let* e2 = parse_factor() in
|
||||
aux (CallExpr (Call (e1, e2)))
|
||||
| _ -> return e1 in
|
||||
|
@ -166,10 +196,14 @@ and parse_level1 (): expr_tree parser =
|
|||
let* e1 = parse_call_expr() in
|
||||
let rec aux e1 =
|
||||
let* c = peek_token in
|
||||
match c.token_type with
|
||||
| Lexer.Op op when op = Lexer.Add || op = Lexer.Sub ->
|
||||
let tt = c.token_type in
|
||||
match tt with
|
||||
| Token.Add | Token.Sub ->
|
||||
let* _ = next_token in
|
||||
let* e2 = parse_call_expr() in
|
||||
let op = match token2op tt with
|
||||
| Some x -> x
|
||||
| None -> failwith "unreachable" in
|
||||
aux (BinOpExpr (op, e1, e2))
|
||||
| _ -> return e1 in
|
||||
aux e1
|
||||
|
@ -178,9 +212,12 @@ and parse_level2 (): expr_tree parser =
|
|||
let rec aux e1 =
|
||||
let* c = peek_token in
|
||||
match c.token_type with
|
||||
| Lexer.Op op when op = Lexer.Mul || op = Lexer.Div || op = Lexer.Mod ->
|
||||
| Token.Mul | Token.Div | Token.Mod ->
|
||||
let* _ = next_token in
|
||||
let* e2 = parse_level1() in
|
||||
let op = match token2op c.token_type with
|
||||
| Some x -> x
|
||||
| None -> failwith "unreachable" in
|
||||
aux (BinOpExpr (op, e1, e2))
|
||||
| _ -> return e1 in
|
||||
aux e1
|
||||
|
@ -189,9 +226,12 @@ and parse_level3 (): expr_tree parser =
|
|||
let rec aux e1 =
|
||||
let* c = peek_token in
|
||||
match c.token_type with
|
||||
| Lexer.Op op when op = Lexer.Pow ->
|
||||
| Token.Pow ->
|
||||
let* _ = next_token in
|
||||
let* e2 = parse_level3() in
|
||||
let op = match token2op c.token_type with
|
||||
| Some x -> x
|
||||
| None -> failwith "unreachable" in
|
||||
aux (BinOpExpr (op, e1, e2))
|
||||
| _ -> return e1 in
|
||||
aux e1
|
||||
|
@ -201,10 +241,10 @@ and expr (): expr_tree parser =
|
|||
(parse_if_expr() |> fmap (fun x -> IfExpr x)) <|> parse_level3() in
|
||||
return e
|
||||
|
||||
let get_expr_tree_from_tokens (tokens: (Lexer.token * Lexer.lexer_context) Seq.t): expr_tree option =
|
||||
let ntokens = Seq.filter (fun (x,_) ->
|
||||
match x.token_type with
|
||||
| Lexer.Comment(_) -> false
|
||||
let get_expr_tree_from_tokens (tokens: (Token.t * Lexer.lexer_context) Seq.t): expr_tree option =
|
||||
let ntokens = Seq.filter (fun ((token,_): Token.t * Lexer.lexer_context) ->
|
||||
match token.Token.token_type with
|
||||
| Token.Comment(_) -> false
|
||||
| _ -> true
|
||||
) tokens in
|
||||
let ctx = { seq = ntokens; errors = [] } in
|
||||
|
|
68
lib/token.ml
Normal file
68
lib/token.ml
Normal file
|
@ -0,0 +1,68 @@
|
|||
|
||||
type token_type =
|
||||
| Eof
|
||||
| Identifier of string
|
||||
| Digit of string
|
||||
| Add
|
||||
| Sub
|
||||
| Mul
|
||||
| Div
|
||||
| Mod
|
||||
| Pow
|
||||
| LParen
|
||||
| RParen
|
||||
| Equal
|
||||
| Arrow
|
||||
| Colon
|
||||
| Let
|
||||
| In
|
||||
| If
|
||||
| Then
|
||||
| Else
|
||||
| Fun
|
||||
| Comment of string
|
||||
| Fail of string
|
||||
|
||||
let op = [
|
||||
Add, "+";
|
||||
Sub, "-";
|
||||
Mul, "*";
|
||||
Div, "/";
|
||||
Mod, "%";
|
||||
Pow, "^";
|
||||
] |> List.to_seq |> Hashtbl.of_seq
|
||||
|
||||
let op2str op = Hashtbl.find op
|
||||
|
||||
let symbol = [
|
||||
LParen, "(";
|
||||
RParen, ")";
|
||||
Equal, "=";
|
||||
Arrow, "->";
|
||||
Colon, ":";
|
||||
] |> List.to_seq |> Hashtbl.of_seq
|
||||
|
||||
|
||||
let keywordTable = [
|
||||
Let, "let";
|
||||
In, "in";
|
||||
If, "if";
|
||||
Then, "then";
|
||||
Else, "else";
|
||||
Fun, "fun";
|
||||
] |> List.to_seq |> Hashtbl.of_seq
|
||||
|
||||
let str2keywordTable = keywordTable
|
||||
|> Hashtbl.to_seq
|
||||
|> Seq.map (fun (x, y) -> (y, x))
|
||||
|> Hashtbl.of_seq
|
||||
|
||||
let keyword2str keyword = Hashtbl.find keywordTable keyword
|
||||
let str2keyword str = Hashtbl.find_opt str2keywordTable str
|
||||
|
||||
type t = {
|
||||
(* token type *)
|
||||
token_type: token_type;
|
||||
(* start position *)
|
||||
pos: int;
|
||||
}
|
Loading…
Add table
Reference in a new issue