refactor: extract token

monoid 2025-01-30 01:40:22 +09:00
parent 5ab22e2f68
commit 9569b20542
3 changed files with 167 additions and 112 deletions

lib/lexer.ml

@@ -1,51 +1,3 @@
(* a small subset of ML *)
type op_type =
| Add
| Sub
| Mul
| Div
| Mod
| Pow
let op2str op =
match op with
| Add -> "+"
| Sub -> "-"
| Mul -> "*"
| Div -> "/"
| Mod -> "%"
| Pow -> "^"
type keyword_type =
| Let
| In
| If
| Then
| Else
| Fun
type token_type =
| Eof
| Identifier of string
| Digit of string
| Op of op_type
| LParen
| RParen
| Equal
| Arrow
| Keyword of keyword_type
| Comment of string
| Fail of string
type token = {
(* token type *)
token_type: token_type;
(* start position *)
pos: int;
}
let epsilon = '\000'
(* Lexer is just a state machine *)
@@ -54,8 +6,6 @@ let is_digit c = c >= '0' && c <= '9'
let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
let is_alnum c = is_alpha c || is_digit c
type lexer_context = {
total: string;
pos: int;
@@ -189,36 +139,33 @@ let%test "test get_digit" =
id = "123" && ctx'.pos = 3
let id_to_token_type id =
match id with
| "let" -> Keyword Let
| "in" -> Keyword In
| "if" -> Keyword If
| "then" -> Keyword Then
| "else" -> Keyword Else
| "fun" -> Keyword Fun
| _ -> Identifier id
match (Token.str2keyword id) with
| Some keyword -> keyword
| None -> Token.Identifier id
let lex_token (ctx: lexer_context) =
let make_token token_type pos = {token_type = token_type; pos = pos} in
let make_token token_type pos = {Token.token_type = token_type; pos = pos} in
let ctx = skip_spaces ctx in
let first_ch = input_first ctx in
let pos = ctx.pos in
let rest = input_rest ctx in
match first_ch with
| '\000' -> {token_type = Eof; pos = pos}, ctx
| '(' -> make_token LParen pos, input_rest ctx
| ')' -> make_token RParen pos, input_rest ctx
| '=' -> make_token Equal pos, input_rest ctx
| '+' -> make_token (Op Add) pos, input_rest ctx
| '\000' -> {Token.token_type = Eof; pos = pos}, ctx
| '(' -> make_token LParen pos, rest
| ')' -> make_token RParen pos, rest
| '=' -> make_token Equal pos, rest
| '+' -> make_token Add pos, rest
| '-' ->
let second_ch = input_first (input_rest ctx) in
let second_ch = input_first rest in
if second_ch = '>' then
make_token Arrow pos, input_rest (input_rest ctx)
make_token Arrow pos, input_rest rest
else
make_token (Op Sub) pos, input_rest ctx
| '*' -> make_token (Op Mul) pos, input_rest ctx
make_token Sub pos, rest
| '*' -> make_token Mul pos, rest
| '/' ->
(* check comment *)
let second_ch = input_first (input_rest ctx) in
let second_ch = input_first rest in
if second_ch = '/' then
let rec aux ctx =
let ch = input_first ctx in
@@ -231,30 +178,30 @@ let lex_token (ctx: lexer_context) =
let comment = String.sub ctx.total pos len in
make_token (Comment comment) pos, ctx
else
make_token (Op Div) pos, input_rest ctx
| '%' -> make_token (Op Mod) pos, input_rest ctx
| '^' -> make_token (Op Pow) pos, input_rest ctx
make_token Div pos, rest
| '%' -> make_token Mod pos, rest
| '^' -> make_token Pow pos, rest
| c when is_alpha c ->
let id, ctx = get_identifier ctx in
make_token (id_to_token_type id) pos, ctx
| c when is_digit c ->
let id, ctx = get_digits ctx in
make_token (Digit id) pos, ctx
| _ -> make_token (Fail "invalid token") pos, input_rest ctx
| _ -> make_token (Fail "invalid token") pos, rest
let%test "test lex_token 1" =
let ctx = {total = "let"; pos = 0; line_pos = [||]} in
let token, ctx' = lex_token ctx in
token.token_type = Keyword Let && token.pos = 0 && ctx'.pos = 3
token.token_type = Let && token.pos = 0 && ctx'.pos = 3
let%test "test lex_token 2" =
let ctx = {total = "let in"; pos = 0; line_pos = [||]} in
let token, ctx' = lex_token ctx in
let token', ctx'' = lex_token ctx' in
token.token_type = Keyword Let && token.pos = 0 && ctx'.pos = 3 &&
token'.token_type = Keyword In && token'.pos = 4 && ctx''.pos = 6
token.token_type = Let && token.pos = 0 && ctx'.pos = 3 &&
token'.token_type = In && token'.pos = 4 && ctx''.pos = 6
let lex_tokens_seq (total: string): (token * lexer_context) Seq.t =
let lex_tokens_seq (total: string): (Token.t * lexer_context) Seq.t =
let rec aux ctx =
let token, next_ctx = lex_token ctx in
if token.token_type = Eof then
@@ -269,8 +216,8 @@ let%test "test lex_tokens_seq" =
let seq = seq |> Seq.map (fun (token, _) -> token) in
let tokens = List.of_seq seq in
let expected = [
{token_type = Keyword Let; pos = 0};
{token_type = Keyword In; pos = 4};
{token_type = Eof; pos = 6}
{Token.token_type = Let; pos = 0};
{Token.token_type = In; pos = 4};
{Token.token_type = Eof; pos = 6}
] in
tokens = expected

lib/parser.ml

@@ -1,7 +1,7 @@
open Lexer
type parser_context = {
seq: (Lexer.token * Lexer.lexer_context) Seq.t;
seq: (Token.t * Lexer.lexer_context) Seq.t;
errors: string list;
}
@@ -32,15 +32,15 @@ let or_parser (a: 'a parser) (b: 'a parser): 'a parser = fun (ctx: parser_contex
let (<|>) = or_parser
let peek_token: token parser = fun (ctx: parser_context) ->
let peek_token: Token.t parser = fun (ctx: parser_context) ->
Seq.uncons ctx.seq |> Option.map (fun ((t, _),_) -> (t,ctx))
let next_token: token parser = fun (ctx: parser_context) ->
let next_token: Token.t parser = fun (ctx: parser_context) ->
Seq.uncons ctx.seq |> Option.map (fun ((t,_), s) -> (t,
{ ctx with seq = s}
))
let match_token (tt: token_type) : token parser =
let match_token (tt: Token.token_type) : Token.t parser =
let* t = next_token in
if t.token_type = tt then
return t
@@ -79,6 +79,36 @@ BNF:
expr ::= let_expr | fun_expr | if_expr | level3
*)
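(* operator tokens are mapped onto the parser's own bin_op_type below,
   so expression nodes no longer depend on the lexer's operator type *)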
type bin_op_type =
| Add
| Sub
| Mul
| Div
| Mod
| Pow
let token2op (t: Token.token_type): bin_op_type option =
match t with
| Token.Add -> Some Add
| Token.Sub -> Some Sub
| Token.Mul -> Some Mul
| Token.Div -> Some Div
| Token.Mod -> Some Mod
| Token.Pow -> Some Pow
| _ -> None
let op2str (op: bin_op_type): string =
match op with
| Add -> "+"
| Sub -> "-"
| Mul -> "*"
| Div -> "/"
| Mod -> "%"
| Pow -> "^"
type mono_op_type =
| Neg
type let_expr_tree = Let of string * expr_tree * expr_tree
and fun_expr_tree = Fun of string * expr_tree
and if_expr_tree = If of expr_tree * expr_tree * expr_tree
@@ -88,8 +118,8 @@ and expr_tree =
| FunExpr of fun_expr_tree
| IfExpr of if_expr_tree
| CallExpr of call_expr_tree
| BinOpExpr of Lexer.op_type * expr_tree * expr_tree
| MonoOpExpr of Lexer.op_type * expr_tree
| BinOpExpr of bin_op_type * expr_tree * expr_tree
| MonoOpExpr of bin_op_type * expr_tree
| Identifier of string
| Number of int
@@ -101,55 +131,55 @@ let expr2str (e: expr_tree): string =
| FunExpr (Fun (id, e)) -> Printf.sprintf "fun %s ->\n%s%s" id (tab depth) (aux e (depth+1))
| IfExpr (If (e1, e2, e3)) -> Printf.sprintf "if %s then\n%s%selse\n%s%s" (aux e1 depth) (tab depth) (aux e2 depth) (tab depth) (aux e3 depth)
| CallExpr (Call (e1, e2)) -> Printf.sprintf "%s %s" (aux e1 depth) (aux e2 depth)
| BinOpExpr (op, e1, e2) -> Printf.sprintf "%s %s %s" (aux e1 depth) (Lexer.op2str op) (aux e2 depth)
| MonoOpExpr (op, e) -> Printf.sprintf "%s %s" (Lexer.op2str op) (aux e depth)
| BinOpExpr (op, e1, e2) -> Printf.sprintf "%s %s %s" (aux e1 depth) (op2str op) (aux e2 depth)
| MonoOpExpr (op, e) -> Printf.sprintf "%s %s" (op2str op) (aux e depth)
| Identifier id -> id
| Number n -> string_of_int n in
aux e 0
let rec parse_let_expr (): let_expr_tree parser =
let* _ = match_token (Lexer.Keyword Lexer.Let) in
let* _ = match_token ( Token.Let) in
let* tt = next_token in
match tt.token_type with
Lexer.Identifier(x) ->
Token.Identifier(x) ->
let id = x in
let* _ = match_token Lexer.Equal in
let* _ = match_token Token.Equal in
let* e1 = expr() in
let* _ = match_token (Lexer.Keyword Lexer.In) in
let* _ = match_token (Token.In) in
let* e2 = expr() in
return (Let (id, e1, e2))
| _ -> stop
and parse_fun_expr (): fun_expr_tree parser =
let* _ = match_token (Lexer.Keyword Lexer.Fun) in
let* _ = match_token (Token.Fun) in
let* tt = next_token in
match tt.token_type with
Lexer.Identifier(x) ->
Token.Identifier(x) ->
let id = x in
let* _ = match_token Lexer.Arrow in
let* _ = match_token Token.Arrow in
let* e = expr() in
return (Fun (id, e))
| _ -> stop
and parse_if_expr (): if_expr_tree parser =
let* _ = match_token (Lexer.Keyword Lexer.If) in
let* _ = match_token (Token.If) in
let* e1 = expr() in
let* _ = match_token (Lexer.Keyword Lexer.Then) in
let* _ = match_token (Token.Then) in
let* e2 = expr() in
let* _ = match_token (Lexer.Keyword Lexer.Else) in
let* _ = match_token (Token.Else) in
let* e3 = expr() in
return (If (e1, e2, e3))
and parse_factor (): expr_tree parser =
let* tt = peek_token in
match tt.token_type with
| Lexer.Identifier x ->
| Token.Identifier x ->
let* _ = next_token in
return (Identifier x)
| Lexer.Digit x ->
| Token.Digit x ->
let* _ = next_token in
return (Number (int_of_string x))
| Lexer.LParen ->
let* _ = match_token Lexer.LParen in
| Token.LParen ->
let* _ = match_token Token.LParen in
let* e = expr() in
let* _ = match_token Lexer.RParen in
let* _ = match_token Token.RParen in
return e
| _ -> stop
and parse_call_expr (): expr_tree parser =
@@ -157,7 +187,7 @@ and parse_call_expr (): expr_tree parser =
let rec aux e1 =
let* c = peek_token in
match c.token_type with
| Lexer.Identifier _ | Lexer.Digit _ | Lexer.LParen ->
| Token.Identifier _ | Token.Digit _ | Token.LParen ->
let* e2 = parse_factor() in
aux (CallExpr (Call (e1, e2)))
| _ -> return e1 in
@@ -166,10 +196,14 @@ and parse_level1 (): expr_tree parser =
let* e1 = parse_call_expr() in
let rec aux e1 =
let* c = peek_token in
match c.token_type with
| Lexer.Op op when op = Lexer.Add || op = Lexer.Sub ->
let tt = c.token_type in
match tt with
| Token.Add | Token.Sub ->
let* _ = next_token in
let* e2 = parse_call_expr() in
let op = match token2op tt with
| Some x -> x
| None -> failwith "unreachable" in
aux (BinOpExpr (op, e1, e2))
| _ -> return e1 in
aux e1
@@ -178,9 +212,12 @@ and parse_level2 (): expr_tree parser =
let rec aux e1 =
let* c = peek_token in
match c.token_type with
| Lexer.Op op when op = Lexer.Mul || op = Lexer.Div || op = Lexer.Mod ->
| Token.Mul | Token.Div | Token.Mod ->
let* _ = next_token in
let* e2 = parse_level1() in
let op = match token2op c.token_type with
| Some x -> x
| None -> failwith "unreachable" in
aux (BinOpExpr (op, e1, e2))
| _ -> return e1 in
aux e1
@@ -189,9 +226,12 @@ and parse_level3 (): expr_tree parser =
let rec aux e1 =
let* c = peek_token in
match c.token_type with
| Lexer.Op op when op = Lexer.Pow ->
| Token.Pow ->
let* _ = next_token in
let* e2 = parse_level3() in
let op = match token2op c.token_type with
| Some x -> x
| None -> failwith "unreachable" in
aux (BinOpExpr (op, e1, e2))
| _ -> return e1 in
aux e1
@@ -201,10 +241,10 @@ and expr (): expr_tree parser =
(parse_if_expr() |> fmap (fun x -> IfExpr x)) <|> parse_level3() in
return e
let get_expr_tree_from_tokens (tokens: (Lexer.token * Lexer.lexer_context) Seq.t): expr_tree option =
let ntokens = Seq.filter (fun (x,_) ->
match x.token_type with
| Lexer.Comment(_) -> false
let get_expr_tree_from_tokens (tokens: (Token.t * Lexer.lexer_context) Seq.t): expr_tree option =
let ntokens = Seq.filter (fun ((token,_): Token.t * Lexer.lexer_context) ->
match token.Token.token_type with
| Token.Comment(_) -> false
| _ -> true
) tokens in
let ctx = { seq = ntokens; errors = [] } in

lib/token.ml (new file)

@@ -0,0 +1,68 @@
type token_type =
| Eof
| Identifier of string
| Digit of string
| Add
| Sub
| Mul
| Div
| Mod
| Pow
| LParen
| RParen
| Equal
| Arrow
| Colon
| Let
| In
| If
| Then
| Else
| Fun
| Comment of string
| Fail of string
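(* lookup tables mapping tokens to their source text;
   str2keywordTable reverses keywordTable so the lexer can turn
   identifier text back into keyword tokens *)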
let op = [
Add, "+";
Sub, "-";
Mul, "*";
Div, "/";
Mod, "%";
Pow, "^";
] |> List.to_seq |> Hashtbl.of_seq
let op2str o = Hashtbl.find op o
let symbol = [
LParen, "(";
RParen, ")";
Equal, "=";
Arrow, "->";
Colon, ":";
] |> List.to_seq |> Hashtbl.of_seq
let keywordTable = [
Let, "let";
In, "in";
If, "if";
Then, "then";
Else, "else";
Fun, "fun";
] |> List.to_seq |> Hashtbl.of_seq
let str2keywordTable = keywordTable
|> Hashtbl.to_seq
|> Seq.map (fun (x, y) -> (y, x))
|> Hashtbl.of_seq
let keyword2str keyword = Hashtbl.find keywordTable keyword
let str2keyword str = Hashtbl.find_opt str2keywordTable str
type t = {
(* token type *)
token_type: token_type;
(* start position *)
pos: int;
}
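
A quick round-trip check over these tables could reuse the let%test convention that lexer.ml already relies on. The sketch below is an assumption: it is not part of this commit and presumes ppx_inline_test is also enabled for token.ml.

(* sketch: round-trips each keyword string through str2keyword and keyword2str *)
let%test "keyword tables round-trip" =
  ["let"; "in"; "if"; "then"; "else"; "fun"]
  |> List.for_all (fun s ->
       match str2keyword s with
       | Some kw -> keyword2str kw = s
       | None -> false)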