refactor: extract token
parent 5ab22e2f68
commit 9569b20542
3 changed files with 167 additions and 112 deletions

lib/lexer.ml (107 changed lines)

@@ -1,51 +1,3 @@
-(* small set of ml *)
-
-type op_type =
-  | Add
-  | Sub
-  | Mul
-  | Div
-  | Mod
-  | Pow
-
-let op2str op =
-  match op with
-  | Add -> "+"
-  | Sub -> "-"
-  | Mul -> "*"
-  | Div -> "/"
-  | Mod -> "%"
-  | Pow -> "^"
-
-type keyword_type =
-  | Let
-  | In
-  | If
-  | Then
-  | Else
-  | Fun
-
-type token_type =
-  | Eof
-  | Identifier of string
-  | Digit of string
-  | Op of op_type
-  | LParen
-  | RParen
-  | Equal
-  | Arrow
-  | Keyword of keyword_type
-  | Comment of string
-  | Fail of string
-
-
-type token = {
-  (* token type *)
-  token_type: token_type;
-  (* start position *)
-  pos: int;
-}
-
 let epsilon = '\000'
 
 (* Lexer is just state machine *)
@@ -54,8 +6,6 @@ let is_digit c = c >= '0' && c <= '9'
 let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 let is_alnum c = is_alpha c || is_digit c
-
-
 
 type lexer_context = {
   total: string;
   pos: int;
@@ -189,36 +139,33 @@ let%test "test get_digit" =
   id = "123" && ctx'.pos = 3
 
 let id_to_token_type id =
-  match id with
-  | "let" -> Keyword Let
-  | "in" -> Keyword In
-  | "if" -> Keyword If
-  | "then" -> Keyword Then
-  | "else" -> Keyword Else
-  | "fun" -> Keyword Fun
-  | _ -> Identifier id
+  match (Token.str2keyword id) with
+  | Some keyword -> keyword
+  | None -> Token.Identifier id
 
 let lex_token (ctx: lexer_context) =
-  let make_token token_type pos = {token_type = token_type; pos = pos} in
+  let make_token token_type pos = {Token.token_type = token_type; pos = pos} in
   let ctx = skip_spaces ctx in
   let first_ch = input_first ctx in
   let pos = ctx.pos in
+  let rest = input_rest ctx in
   match first_ch with
-  | '\000' -> {token_type = Eof; pos = pos}, ctx
-  | '(' -> make_token LParen pos, input_rest ctx
-  | ')' -> make_token RParen pos, input_rest ctx
-  | '=' -> make_token Equal pos, input_rest ctx
-  | '+' -> make_token (Op Add) pos, input_rest ctx
+  | '\000' -> {Token.token_type = Eof; pos = pos}, ctx
+  | '(' -> make_token LParen pos, rest
+  | ')' -> make_token RParen pos, rest
+  | '=' -> make_token Equal pos, rest
+  | '+' -> make_token Add pos, rest
   | '-' ->
-    let second_ch = input_first (input_rest ctx) in
+    let second_ch = input_first (rest) in
     if second_ch = '>' then
-      make_token Arrow pos, input_rest (input_rest ctx)
+      make_token Arrow pos, input_rest rest
     else
-      make_token (Op Sub) pos, input_rest ctx
-  | '*' -> make_token (Op Mul) pos, input_rest ctx
+      make_token Sub pos, rest
+  | '*' -> make_token Mul pos, rest
   | '/' ->
     (* check comment *)
-    let second_ch = input_first (input_rest ctx) in
+    let second_ch = input_first (rest) in
     if second_ch = '/' then
       let rec aux ctx =
         let ch = input_first ctx in
@@ -231,30 +178,30 @@ let lex_token (ctx: lexer_context) =
       let comment = String.sub ctx.total pos len in
       make_token (Comment comment) pos, ctx
     else
-      make_token (Op Div) pos, input_rest ctx
-  | '%' -> make_token (Op Mod) pos, input_rest ctx
-  | '^' -> make_token (Op Pow) pos, input_rest ctx
+      make_token (Div) pos, rest
+  | '%' -> make_token (Mod) pos, rest
+  | '^' -> make_token (Pow) pos, rest
   | c when is_alpha c ->
     let id, ctx = get_identifier ctx in
     make_token (id_to_token_type id) pos, ctx
   | c when is_digit c ->
     let id, ctx = get_digits ctx in
    make_token (Digit id) pos, ctx
-  | _ -> make_token (Fail "invalid token") pos, input_rest ctx
+  | _ -> make_token (Fail "invalid token") pos, rest
 
 let%test "test lex_token 1" =
   let ctx = {total = "let"; pos = 0; line_pos = [||]} in
   let token, ctx' = lex_token ctx in
-  token.token_type = Keyword Let && token.pos = 0 && ctx'.pos = 3
+  token.token_type = Let && token.pos = 0 && ctx'.pos = 3
 
 let%test "test lex_token 2" =
   let ctx = {total = "let in"; pos = 0; line_pos = [||]} in
   let token, ctx' = lex_token ctx in
   let token', ctx'' = lex_token ctx' in
-  token.token_type = Keyword Let && token.pos = 0 && ctx'.pos = 3 &&
-  token'.token_type = Keyword In && token'.pos = 4 && ctx''.pos = 6
+  token.token_type = Let && token.pos = 0 && ctx'.pos = 3 &&
+  token'.token_type = In && token'.pos = 4 && ctx''.pos = 6
 
-let lex_tokens_seq (total: string): (token * lexer_context) Seq.t =
+let lex_tokens_seq (total: string): (Token.t * lexer_context) Seq.t =
   let rec aux ctx =
     let token, next_ctx = lex_token ctx in
     if token.token_type = Eof then
@@ -269,8 +216,8 @@ let%test "test lex_tokens_seq" =
   let seq = seq |> Seq.map (fun (token, _) -> token) in
   let tokens = List.of_seq seq in
   let expected = [
-    {token_type = Keyword Let; pos = 0};
-    {token_type = Keyword In; pos = 4};
-    {token_type = Eof; pos = 6}
+    {Token.token_type = Let; pos = 0};
+    {Token.token_type = In; pos = 4};
+    {Token.token_type = Eof; pos = 6}
   ] in
   tokens = expected
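
Note: the lexer-visible effect of the refactor is that keyword recognition becomes a table lookup instead of per-keyword match arms. A minimal self-contained sketch of the pattern (the token_type stand-in below is simplified; in the real code str2keyword lives in the new Token module):

(* Sketch: keyword lookup replacing per-keyword match arms. *)
type token_type = Let | In | Fun | Identifier of string

let str2keyword = function
  | "let" -> Some Let
  | "in" -> Some In
  | "fun" -> Some Fun
  | _ -> None

let id_to_token_type id =
  match str2keyword id with
  | Some kw -> kw           (* reserved word *)
  | None -> Identifier id   (* anything else lexes as an identifier *)

let () =
  assert (id_to_token_type "in" = In);
  assert (id_to_token_type "inner" = Identifier "inner")

Adding a keyword now means adding one table entry rather than a new arm in the lexer.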

lib/parser.ml (104 changed lines)

@@ -1,7 +1,7 @@
 open Lexer
 
 type parser_context = {
-  seq: (Lexer.token * Lexer.lexer_context) Seq.t;
+  seq: (Token.t * Lexer.lexer_context) Seq.t;
   errors: string list;
 }
 
@@ -32,15 +32,15 @@ let or_parser (a: 'a parser) (b: 'a parser): 'a parser = fun (ctx: parser_context) ->
 
 let (<|>) = or_parser
 
-let peek_token: token parser = fun (ctx: parser_context) ->
+let peek_token: Token.t parser = fun (ctx: parser_context) ->
   Seq.uncons ctx.seq |> Option.map (fun ((t, _), _) -> (t, ctx))
 
-let next_token: token parser = fun (ctx: parser_context) ->
+let next_token: Token.t parser = fun (ctx: parser_context) ->
   Seq.uncons ctx.seq |> Option.map (fun ((t, _), s) -> (t,
     { ctx with seq = s }
   ))
 
-let match_token (tt: token_type) : token parser =
+let match_token (tt: Token.token_type) : Token.t parser =
   let* t = next_token in
   if t.token_type = tt then
     return t
@@ -79,6 +79,36 @@ BNF:
 expr ::= let_expr | fun_expr | if_expr | level3
 *)
 
+type bin_op_type =
+  | Add
+  | Sub
+  | Mul
+  | Div
+  | Mod
+  | Pow
+
+let token2op (t: Token.token_type): bin_op_type option =
+  match t with
+  | Token.Add -> Some Add
+  | Token.Sub -> Some Sub
+  | Token.Mul -> Some Mul
+  | Token.Div -> Some Div
+  | Token.Mod -> Some Mod
+  | Token.Pow -> Some Pow
+  | _ -> None
+
+let op2str (op: bin_op_type): string =
+  match op with
+  | Add -> "+"
+  | Sub -> "-"
+  | Mul -> "*"
+  | Div -> "/"
+  | Mod -> "%"
+  | Pow -> "^"
+
+type mono_op_type =
+  | Neg
+
 type let_expr_tree = Let of string * expr_tree * expr_tree
 and fun_expr_tree = Fun of string * expr_tree
 and if_expr_tree = If of expr_tree * expr_tree * expr_tree
@@ -88,8 +118,8 @@ and expr_tree =
   | FunExpr of fun_expr_tree
   | IfExpr of if_expr_tree
   | CallExpr of call_expr_tree
-  | BinOpExpr of Lexer.op_type * expr_tree * expr_tree
-  | MonoOpExpr of Lexer.op_type * expr_tree
+  | BinOpExpr of bin_op_type * expr_tree * expr_tree
+  | MonoOpExpr of bin_op_type * expr_tree
   | Identifier of string
   | Number of int
 
@@ -101,55 +131,55 @@ let expr2str (e: expr_tree): string =
   | FunExpr (Fun (id, e)) -> Printf.sprintf "fun %s ->\n%s%s" id (tab depth) (aux e (depth+1))
   | IfExpr (If (e1, e2, e3)) -> Printf.sprintf "if %s then\n%s%selse\n%s%s" (aux e1 depth) (tab depth) (aux e2 depth) (tab depth) (aux e3 depth)
   | CallExpr (Call (e1, e2)) -> Printf.sprintf "%s %s" (aux e1 depth) (aux e2 depth)
-  | BinOpExpr (op, e1, e2) -> Printf.sprintf "%s %s %s" (aux e1 depth) (Lexer.op2str op) (aux e2 depth)
-  | MonoOpExpr (op, e) -> Printf.sprintf "%s %s" (Lexer.op2str op) (aux e depth)
+  | BinOpExpr (op, e1, e2) -> Printf.sprintf "%s %s %s" (aux e1 depth) (op2str op) (aux e2 depth)
+  | MonoOpExpr (op, e) -> Printf.sprintf "%s %s" (op2str op) (aux e depth)
   | Identifier id -> id
   | Number n -> string_of_int n in
   aux e 0
 
 let rec parse_let_expr (): let_expr_tree parser =
-  let* _ = match_token (Lexer.Keyword Lexer.Let) in
+  let* _ = match_token (Token.Let) in
   let* tt = next_token in
   match tt.token_type with
-    Lexer.Identifier(x) ->
+    Token.Identifier(x) ->
     let id = x in
-    let* _ = match_token Lexer.Equal in
+    let* _ = match_token Token.Equal in
     let* e1 = expr() in
-    let* _ = match_token (Lexer.Keyword Lexer.In) in
+    let* _ = match_token (Token.In) in
     let* e2 = expr() in
     return (Let (id, e1, e2))
   | _ -> stop
 and parse_fun_expr (): fun_expr_tree parser =
-  let* _ = match_token (Lexer.Keyword Lexer.Fun) in
+  let* _ = match_token (Token.Fun) in
   let* tt = next_token in
   match tt.token_type with
-    Lexer.Identifier(x) ->
+    Token.Identifier(x) ->
     let id = x in
-    let* _ = match_token Lexer.Arrow in
+    let* _ = match_token Token.Arrow in
     let* e = expr() in
     return (Fun (id, e))
   | _ -> stop
 and parse_if_expr (): if_expr_tree parser =
-  let* _ = match_token (Lexer.Keyword Lexer.If) in
+  let* _ = match_token (Token.If) in
   let* e1 = expr() in
-  let* _ = match_token (Lexer.Keyword Lexer.Then) in
+  let* _ = match_token (Token.Then) in
   let* e2 = expr() in
-  let* _ = match_token (Lexer.Keyword Lexer.Else) in
+  let* _ = match_token (Token.Else) in
   let* e3 = expr() in
   return (If (e1, e2, e3))
 and parse_factor (): expr_tree parser =
   let* tt = peek_token in
   match tt.token_type with
-  | Lexer.Identifier x ->
+  | Token.Identifier x ->
     let* _ = next_token in
     return (Identifier x)
-  | Lexer.Digit x ->
+  | Token.Digit x ->
     let* _ = next_token in
     return (Number (int_of_string x))
-  | Lexer.LParen ->
-    let* _ = match_token Lexer.LParen in
+  | Token.LParen ->
+    let* _ = match_token Token.LParen in
     let* e = expr() in
-    let* _ = match_token Lexer.RParen in
+    let* _ = match_token Token.RParen in
     return e
   | _ -> stop
 and parse_call_expr (): expr_tree parser =
@@ -157,7 +187,7 @@ and parse_call_expr (): expr_tree parser =
   let rec aux e1 =
     let* c = peek_token in
     match c.token_type with
-    | Lexer.Identifier _ | Lexer.Digit _ | Lexer.LParen ->
+    | Token.Identifier _ | Token.Digit _ | Token.LParen ->
      let* e2 = parse_factor() in
      aux (CallExpr (Call (e1, e2)))
     | _ -> return e1 in
@@ -166,10 +196,14 @@ and parse_level1 (): expr_tree parser =
   let* e1 = parse_call_expr() in
   let rec aux e1 =
     let* c = peek_token in
-    match c.token_type with
-    | Lexer.Op op when op = Lexer.Add || op = Lexer.Sub ->
+    let tt = c.token_type in
+    match tt with
+    | Token.Add | Token.Sub ->
       let* _ = next_token in
       let* e2 = parse_call_expr() in
+      let op = match token2op tt with
+        | Some x -> x
+        | None -> failwith "unreachable" in
       aux (BinOpExpr (op, e1, e2))
     | _ -> return e1 in
   aux e1
@@ -178,9 +212,12 @@ and parse_level2 (): expr_tree parser =
   let rec aux e1 =
     let* c = peek_token in
     match c.token_type with
-    | Lexer.Op op when op = Lexer.Mul || op = Lexer.Div || op = Lexer.Mod ->
+    | Token.Mul | Token.Div | Token.Mod ->
       let* _ = next_token in
       let* e2 = parse_level1() in
+      let op = match token2op c.token_type with
+        | Some x -> x
+        | None -> failwith "unreachable" in
       aux (BinOpExpr (op, e1, e2))
     | _ -> return e1 in
   aux e1
@@ -189,9 +226,12 @@ and parse_level3 (): expr_tree parser =
   let rec aux e1 =
     let* c = peek_token in
     match c.token_type with
-    | Lexer.Op op when op = Lexer.Pow ->
+    | Token.Pow ->
       let* _ = next_token in
       let* e2 = parse_level3() in
+      let op = match token2op c.token_type with
+        | Some x -> x
+        | None -> failwith "unreachable" in
       aux (BinOpExpr (op, e1, e2))
     | _ -> return e1 in
   aux e1
@@ -201,10 +241,10 @@ and expr (): expr_tree parser =
     (parse_if_expr() |> fmap (fun x -> IfExpr x)) <|> parse_level3() in
   return e
 
-let get_expr_tree_from_tokens (tokens: (Lexer.token * Lexer.lexer_context) Seq.t): expr_tree option =
-  let ntokens = Seq.filter (fun (x,_) ->
-    match x.token_type with
-    | Lexer.Comment(_) -> false
+let get_expr_tree_from_tokens (tokens: (Token.t * Lexer.lexer_context) Seq.t): expr_tree option =
+  let ntokens = Seq.filter (fun ((token,_): Token.t * Lexer.lexer_context) ->
+    match token.Token.token_type with
+    | Token.Comment(_) -> false
     | _ -> true
   ) tokens in
   let ctx = { seq = ntokens; errors = [] } in
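
Note: with Op of op_type gone, the parser matches bare operator constructors (Token.Add, Token.Mul, ...) and recovers its own bin_op_type through the partial map token2op; the failwith "unreachable" arm is safe because the enclosing match has already narrowed the token. A simplified sketch of that guard pattern (the token and op types here are stand-ins, not the real parser types):

(* Sketch: partial token->op map, made total by the caller's match. *)
type token = TAdd | TSub | TNum of int
type op = Add | Sub

let token2op = function
  | TAdd -> Some Add
  | TSub -> Some Sub
  | _ -> None

(* Called only after matching TAdd | TSub, so None cannot occur here. *)
let op_of_token t =
  match token2op t with
  | Some o -> o
  | None -> failwith "unreachable"

let () = assert (op_of_token TSub = Sub)

An alternative would be to let token2op raise directly, but returning an option keeps the map usable in contexts where a non-operator token is expected and not a bug.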

lib/token.ml (new file, 68 lines)

@@ -0,0 +1,68 @@
+
+type token_type =
+  | Eof
+  | Identifier of string
+  | Digit of string
+  | Add
+  | Sub
+  | Mul
+  | Div
+  | Mod
+  | Pow
+  | LParen
+  | RParen
+  | Equal
+  | Arrow
+  | Colon
+  | Let
+  | In
+  | If
+  | Then
+  | Else
+  | Fun
+  | Comment of string
+  | Fail of string
+
+let op = [
+  Add, "+";
+  Sub, "-";
+  Mul, "*";
+  Div, "/";
+  Mod, "%";
+  Pow, "^";
+] |> List.to_seq |> Hashtbl.of_seq
+
+let op2str o = Hashtbl.find op o
+
+let symbol = [
+  LParen, "(";
+  RParen, ")";
+  Equal, "=";
+  Arrow, "->";
+  Colon, ":";
+] |> List.to_seq |> Hashtbl.of_seq
+
+
+let keywordTable = [
+  Let, "let";
+  In, "in";
+  If, "if";
+  Then, "then";
+  Else, "else";
+  Fun, "fun";
+] |> List.to_seq |> Hashtbl.of_seq
+
+let str2keywordTable = keywordTable
+  |> Hashtbl.to_seq
+  |> Seq.map (fun (x, y) -> (y, x))
+  |> Hashtbl.of_seq
+
+let keyword2str keyword = Hashtbl.find keywordTable keyword
+let str2keyword str = Hashtbl.find_opt str2keywordTable str
+
+type t = {
+  (* token type *)
+  token_type: token_type;
+  (* start position *)
+  pos: int;
+}
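
Note: both keyword directions in token.ml are derived from one association list; str2keywordTable is built by reversing keywordTable, so the two tables cannot drift apart. A self-contained sketch of the construction (constructor set abbreviated):

(* Sketch: derive the inverse lookup table by swapping each pair. *)
type kw = Let | In | Fun

let kw2str_tbl =
  [Let, "let"; In, "in"; Fun, "fun"]
  |> List.to_seq |> Hashtbl.of_seq

let str2kw_tbl =
  kw2str_tbl
  |> Hashtbl.to_seq
  |> Seq.map (fun (k, s) -> (s, k))
  |> Hashtbl.of_seq

let () =
  assert (Hashtbl.find kw2str_tbl Fun = "fun");
  assert (Hashtbl.find_opt str2kw_tbl "let" = Some Let)

Hashtbl.of_seq and Hashtbl.to_seq require OCaml 4.07 or later; on older compilers the same two lookups can be written with List.assoc over the pair list.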