diff --git a/lib/lexer.ml b/lib/lexer.ml
index 30c82fe..71a22eb 100644
--- a/lib/lexer.ml
+++ b/lib/lexer.ml
@@ -1,51 +1,3 @@
-(* small set of ml *)
-
-type op_type =
-  | Add
-  | Sub
-  | Mul
-  | Div
-  | Mod
-  | Pow
-
-let op2str op =
-  match op with
-  | Add -> "+"
-  | Sub -> "-"
-  | Mul -> "*"
-  | Div -> "/"
-  | Mod -> "%"
-  | Pow -> "^"
-
-type keyword_type =
-  | Let
-  | In
-  | If
-  | Then
-  | Else
-  | Fun
-
-type token_type =
-  | Eof
-  | Identifier of string
-  | Digit of string
-  | Op of op_type
-  | LParen
-  | RParen
-  | Equal
-  | Arrow
-  | Keyword of keyword_type
-  | Comment of string
-  | Fail of string
-
-
-type token = {
-  (* token type *)
-  token_type: token_type;
-  (* start position *)
-  pos: int;
-}
-
 let epsilon = '\000'
 
 (* Lexer is just state machine *)
@@ -54,8 +6,6 @@ let is_digit c = c >= '0' && c <= '9'
 
 let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 let is_alnum c = is_alpha c || is_digit c
-
-
 type lexer_context = {
   total: string;
   pos: int;
@@ -189,36 +139,33 @@ let%test "test get_digit" =
   id = "123" && ctx'.pos = 3
 
 let id_to_token_type id =
-  match id with
-  | "let" -> Keyword Let
-  | "in" -> Keyword In
-  | "if" -> Keyword If
-  | "then" -> Keyword Then
-  | "else" -> Keyword Else
-  | "fun" -> Keyword Fun
-  | _ -> Identifier id
+  match (Token.str2keyword id) with
+  | Some keyword -> keyword
+  | None -> Token.Identifier id
+
 
 let lex_token (ctx: lexer_context) =
-  let make_token token_type pos = {token_type = token_type; pos = pos} in
+  let make_token token_type pos = {Token.token_type = token_type; pos = pos} in
   let ctx = skip_spaces ctx in
   let first_ch = input_first ctx in
   let pos = ctx.pos in
+  let rest = input_rest ctx in
   match first_ch with
-  | '\000' -> {token_type = Eof; pos = pos}, ctx
-  | '(' -> make_token LParen pos, input_rest ctx
-  | ')' -> make_token RParen pos, input_rest ctx
-  | '=' -> make_token Equal pos, input_rest ctx
-  | '+' -> make_token (Op Add) pos, input_rest ctx
+  | '\000' -> {Token.token_type = Eof; pos = pos}, ctx
+  | '(' -> make_token LParen pos, rest
+  | ')' -> make_token RParen pos, rest
+  | '=' -> make_token Equal pos, rest
+  | '+' -> make_token Add pos, rest
   | '-' ->
-    let second_ch = input_first (input_rest ctx) in
+    let second_ch = input_first (rest) in
     if second_ch = '>' then
-      make_token Arrow pos, input_rest (input_rest ctx)
+      make_token Arrow pos, input_rest rest
     else
-      make_token (Op Sub) pos, input_rest ctx
-  | '*' -> make_token (Op Mul) pos, input_rest ctx
+      make_token Sub pos, rest
+  | '*' -> make_token Mul pos, rest
   | '/' ->
     (* check comment *)
-    let second_ch = input_first (input_rest ctx) in
+    let second_ch = input_first (rest) in
     if second_ch = '/' then
       let rec aux ctx =
         let ch = input_first ctx in
@@ -231,30 +178,30 @@ let lex_token (ctx: lexer_context) =
       let comment = String.sub ctx.total pos len in
       make_token (Comment comment) pos, ctx
     else
-      make_token (Op Div) pos, input_rest ctx
-  | '%' -> make_token (Op Mod) pos, input_rest ctx
-  | '^' -> make_token (Op Pow) pos, input_rest ctx
+      make_token (Div) pos, rest
+  | '%' -> make_token (Mod) pos, rest
+  | '^' -> make_token (Pow) pos, rest
   | c when is_alpha c ->
     let id, ctx = get_identifier ctx in
     make_token (id_to_token_type id) pos, ctx
   | c when is_digit c ->
     let id, ctx = get_digits ctx in
    make_token (Digit id) pos, ctx
-  | _ -> make_token (Fail "invalid token") pos, input_rest ctx
+  | _ -> make_token (Fail "invalid token") pos, rest
 
 let%test "test lex_token 1" =
   let ctx = {total = "let"; pos = 0; line_pos = [||]} in
   let token, ctx' = lex_token ctx in
-  token.token_type = Keyword Let && token.pos = 0 && ctx'.pos = 3
+  token.token_type = Let && token.pos = 0 && ctx'.pos = 3
 
 let%test "test lex_token 2" =
   let ctx = {total = "let in"; pos = 0; line_pos = [||]} in
   let token, ctx' = lex_token ctx in
   let token', ctx'' = lex_token ctx' in
-  token.token_type = Keyword Let && token.pos = 0 && ctx'.pos = 3 &&
-  token'.token_type = Keyword In && token'.pos = 4 && ctx''.pos = 6
+  token.token_type = Let && token.pos = 0 && ctx'.pos = 3 &&
+  token'.token_type = In && token'.pos = 4 && ctx''.pos = 6
 
-let lex_tokens_seq (total: string): (token * lexer_context) Seq.t =
+let lex_tokens_seq (total: string): (Token.t * lexer_context) Seq.t =
   let rec aux ctx =
     let token, next_ctx = lex_token ctx in
     if token.token_type = Eof then
@@ -269,8 +216,8 @@ let%test "test lex_tokens_seq" =
   let seq = seq |> Seq.map (fun (token, _) -> token) in
   let tokens = List.of_seq seq in
   let expected = [
-    {token_type = Keyword Let; pos = 0};
-    {token_type = Keyword In; pos = 4};
-    {token_type = Eof; pos = 6}
+    {Token.token_type = Let; pos = 0};
+    {Token.token_type = In; pos = 4};
+    {Token.token_type = Eof; pos = 6}
   ] in
   tokens = expected
diff --git a/lib/parser.ml b/lib/parser.ml
index 3b5dec1..9ffaf37 100644
--- a/lib/parser.ml
+++ b/lib/parser.ml
@@ -1,7 +1,7 @@
 open Lexer
 
 type parser_context = {
-  seq: (Lexer.token * Lexer.lexer_context) Seq.t;
+  seq: (Token.t * Lexer.lexer_context) Seq.t;
   errors: string list;
 }
 
@@ -32,15 +32,15 @@ let or_parser (a: 'a parser) (b: 'a parser): 'a parser = fun (ctx: parser_contex
 
 let (<|>) = or_parser
 
-let peek_token: token parser = fun (ctx: parser_context) ->
+let peek_token: Token.t parser = fun (ctx: parser_context) ->
   Seq.uncons ctx.seq
   |> Option.map (fun ((t, _),_) -> (t,ctx))
 
-let next_token: token parser = fun (ctx: parser_context) ->
+let next_token: Token.t parser = fun (ctx: parser_context) ->
   Seq.uncons ctx.seq
   |> Option.map (fun ((t,_), s) -> (t, { ctx with seq = s} ))
 
-let match_token (tt: token_type) : token parser =
+let match_token (tt: Token.token_type) : Token.t parser =
   let* t = next_token in
   if t.token_type = tt then
     return t
@@ -79,6 +79,36 @@ BNF:
 expr ::= let_expr | fun_expr | if_expr | level3
 *)
 
+type bin_op_type =
+  | Add
+  | Sub
+  | Mul
+  | Div
+  | Mod
+  | Pow
+
+let token2op (t: Token.token_type): bin_op_type option =
+  match t with
+  | Token.Add -> Some Add
+  | Token.Sub -> Some Sub
+  | Token.Mul -> Some Mul
+  | Token.Div -> Some Div
+  | Token.Mod -> Some Mod
+  | Token.Pow -> Some Pow
+  | _ -> None
+
+let op2str (op: bin_op_type): string =
+  match op with
+  | Add -> "+"
+  | Sub -> "-"
+  | Mul -> "*"
+  | Div -> "/"
+  | Mod -> "%"
+  | Pow -> "^"
+
+type mono_op_type =
+  | Neg
+
 type let_expr_tree = Let of string * expr_tree * expr_tree
 and fun_expr_tree = Fun of string * expr_tree
 and if_expr_tree = If of expr_tree * expr_tree * expr_tree
@@ -88,8 +118,8 @@ and expr_tree =
   | FunExpr of fun_expr_tree
   | IfExpr of if_expr_tree
   | CallExpr of call_expr_tree
-  | BinOpExpr of Lexer.op_type * expr_tree * expr_tree
-  | MonoOpExpr of Lexer.op_type * expr_tree
+  | BinOpExpr of bin_op_type * expr_tree * expr_tree
+  | MonoOpExpr of bin_op_type * expr_tree
   | Identifier of string
   | Number of int
 
@@ -101,55 +131,55 @@ let expr2str (e: expr_tree): string =
     | FunExpr (Fun (id, e)) -> Printf.sprintf "fun %s ->\n%s%s" id (tab depth) (aux e (depth+1))
    | IfExpr (If (e1, e2, e3)) -> Printf.sprintf "if %s then\n%s%selse\n%s%s" (aux e1 depth) (tab depth) (aux e2 depth) (tab depth) (aux e3 depth)
     | CallExpr (Call (e1, e2)) -> Printf.sprintf "%s %s" (aux e1 depth) (aux e2 depth)
-    | BinOpExpr (op, e1, e2) -> Printf.sprintf "%s %s %s" (aux e1 depth) (Lexer.op2str op) (aux e2 depth)
-    | MonoOpExpr (op, e) -> Printf.sprintf "%s %s" (Lexer.op2str op) (aux e depth)
+    | BinOpExpr (op, e1, e2) -> Printf.sprintf "%s %s %s" (aux e1 depth) (op2str op) (aux e2 depth)
+    | MonoOpExpr (op, e) -> Printf.sprintf "%s %s" (op2str op) (aux e depth)
     | Identifier id -> id
     | Number n -> string_of_int n in
   aux e 0
 
 let rec parse_let_expr (): let_expr_tree parser =
-  let* _ = match_token (Lexer.Keyword Lexer.Let) in
+  let* _ = match_token ( Token.Let) in
   let* tt = next_token in
   match tt.token_type with
-    Lexer.Identifier(x) ->
+    Token.Identifier(x) ->
      let id = x in
-      let* _ = match_token Lexer.Equal in
+      let* _ = match_token Token.Equal in
      let* e1 = expr() in
-      let* _ = match_token (Lexer.Keyword Lexer.In) in
+      let* _ = match_token (Token.In) in
      let* e2 = expr() in
      return (Let (id, e1, e2))
   | _ -> stop
 and parse_fun_expr (): fun_expr_tree parser =
-  let* _ = match_token (Lexer.Keyword Lexer.Fun) in
+  let* _ = match_token (Token.Fun) in
   let* tt = next_token in
   match tt.token_type with
-    Lexer.Identifier(x) ->
+    Token.Identifier(x) ->
      let id = x in
-      let* _ = match_token Lexer.Arrow in
+      let* _ = match_token Token.Arrow in
      let* e = expr() in
      return (Fun (id, e))
   | _ -> stop
 and parse_if_expr (): if_expr_tree parser =
-  let* _ = match_token (Lexer.Keyword Lexer.If) in
+  let* _ = match_token (Token.If) in
   let* e1 = expr() in
-  let* _ = match_token (Lexer.Keyword Lexer.Then) in
+  let* _ = match_token (Token.Then) in
   let* e2 = expr() in
-  let* _ = match_token (Lexer.Keyword Lexer.Else) in
+  let* _ = match_token (Token.Else) in
   let* e3 = expr() in
   return (If (e1, e2, e3))
 and parse_factor (): expr_tree parser =
   let* tt = peek_token in
   match tt.token_type with
-  | Lexer.Identifier x ->
+  | Token.Identifier x ->
     let* _ = next_token in
     return (Identifier x)
-  | Lexer.Digit x ->
+  | Token.Digit x ->
     let* _ = next_token in
     return (Number (int_of_string x))
-  | Lexer.LParen ->
-    let* _ = match_token Lexer.LParen in
+  | Token.LParen ->
+    let* _ = match_token Token.LParen in
     let* e = expr() in
-    let* _ = match_token Lexer.RParen in
+    let* _ = match_token Token.RParen in
     return e
   | _ -> stop
 and parse_call_expr (): expr_tree parser =
@@ -157,7 +187,7 @@ and parse_call_expr (): expr_tree parser =
   let rec aux e1 =
     let* c = peek_token in
     match c.token_type with
-    | Lexer.Identifier _ | Lexer.Digit _ | Lexer.LParen ->
+    | Token.Identifier _ | Token.Digit _ | Token.LParen ->
       let* e2 = parse_factor() in
       aux (CallExpr (Call (e1, e2)))
     | _ -> return e1 in
@@ -166,10 +196,14 @@ and parse_level1 (): expr_tree parser =
   let* e1 = parse_call_expr() in
   let rec aux e1 =
     let* c = peek_token in
-    match c.token_type with
-    | Lexer.Op op when op = Lexer.Add || op = Lexer.Sub ->
+    let tt = c.token_type in
+    match tt with
+    | Token.Add | Token.Sub ->
       let* _ = next_token in
       let* e2 = parse_call_expr() in
+      let op = match token2op tt with
+        | Some x -> x
+        | None -> failwith "unreachable" in
       aux (BinOpExpr (op, e1, e2))
     | _ -> return e1 in
   aux e1
@@ -178,9 +212,12 @@ and parse_level2 (): expr_tree parser =
   let rec aux e1 =
     let* c = peek_token in
     match c.token_type with
-    | Lexer.Op op when op = Lexer.Mul || op = Lexer.Div || op = Lexer.Mod ->
+    | Token.Mul | Token.Div | Token.Mod ->
      let* _ = next_token in
      let* e2 = parse_level1() in
+      let op = match token2op c.token_type with
+        | Some x -> x
+        | None -> failwith "unreachable" in
      aux (BinOpExpr (op, e1, e2))
    | _ -> return e1 in
  aux e1
@@ -189,9 +226,12 @@ and parse_level3 (): expr_tree parser =
   let rec aux e1 =
     let* c = peek_token in
     match c.token_type with
-    | Lexer.Op op when op = Lexer.Pow ->
+    | Token.Pow ->
       let* _ = next_token in
       let* e2 = parse_level3() in
+      let op = match token2op c.token_type with
+        | Some x -> x
+        | None -> failwith "unreachable" in
       aux (BinOpExpr (op, e1, e2))
     | _ -> return e1 in
   aux e1
@@ -201,10 +241,10 @@ and expr (): expr_tree parser =
     (parse_if_expr() |> fmap (fun x -> IfExpr x)) <|>
     parse_level3() in
   return e
-let get_expr_tree_from_tokens (tokens: (Lexer.token * Lexer.lexer_context) Seq.t): expr_tree option =
-  let ntokens = Seq.filter (fun (x,_) ->
-    match x.token_type with
-    | Lexer.Comment(_) -> false
+let get_expr_tree_from_tokens (tokens: (Token.t * Lexer.lexer_context) Seq.t): expr_tree option =
+  let ntokens = Seq.filter (fun ((token,_): Token.t * Lexer.lexer_context) ->
+    match token.Token.token_type with
+    | Token.Comment(_) -> false
     | _ -> true
   ) tokens in
   let ctx = { seq = ntokens; errors = [] } in
diff --git a/lib/token.ml b/lib/token.ml
new file mode 100644
index 0000000..384b07b
--- /dev/null
+++ b/lib/token.ml
@@ -0,0 +1,68 @@
+
+type token_type =
+  | Eof
+  | Identifier of string
+  | Digit of string
+  | Add
+  | Sub
+  | Mul
+  | Div
+  | Mod
+  | Pow
+  | LParen
+  | RParen
+  | Equal
+  | Arrow
+  | Colon
+  | Let
+  | In
+  | If
+  | Then
+  | Else
+  | Fun
+  | Comment of string
+  | Fail of string
+
+let op = [
+  Add, "+";
+  Sub, "-";
+  Mul, "*";
+  Div, "/";
+  Mod, "%";
+  Pow, "^";
+] |> List.to_seq |> Hashtbl.of_seq
+
+let op2str o = Hashtbl.find op o
+
+let symbol = [
+  LParen, "(";
+  RParen, ")";
+  Equal, "=";
+  Arrow, "->";
+  Colon, ":";
+] |> List.to_seq |> Hashtbl.of_seq
+
+
+let keywordTable = [
+  Let, "let";
+  In, "in";
+  If, "if";
+  Then, "then";
+  Else, "else";
+  Fun, "fun";
+] |> List.to_seq |> Hashtbl.of_seq
+
+let str2keywordTable = keywordTable
+  |> Hashtbl.to_seq
+  |> Seq.map (fun (x, y) -> (y, x))
+  |> Hashtbl.of_seq
+
+let keyword2str keyword = Hashtbl.find keywordTable keyword
+let str2keyword str = Hashtbl.find_opt str2keywordTable str
+
+type t = {
+  (* token type *)
+  token_type: token_type;
+  (* start position *)
+  pos: int;
+}
\ No newline at end of file
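
Note (not part of the patch): a minimal sketch of how the refactored API fits together, assuming the three files build as modules Lexer, Parser, and Token of one library; the driver, its name, and the source string are illustrative only.

(* Hypothetical driver: lex a string into a (Token.t * lexer_context) Seq.t,
   let the parser drop Comment tokens and build a tree, then pretty-print it. *)
let () =
  let source = "let x = 1 in x + 2" in
  let tokens = Lexer.lex_tokens_seq source in
  match Parser.get_expr_tree_from_tokens tokens with
  | Some tree -> print_endline (Parser.expr2str tree)
  | None -> prerr_endline "parse error"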