let epsilon = '\000' (* Lexer is just state machine *) let is_digit c = c >= '0' && c <= '9' let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') let is_alnum c = is_alpha c || is_digit c type lexer_context = { total: string; pos: int; (* \n position array *) line_pos: int array; } let binary_search_range arr x = if Array.length arr = 0 then 0 else let rec aux low high = match compare low high with | 0 -> if arr.(low) >= x then low else low + 1 (* unreachable *) | c when c > 0 -> invalid_arg "binary_search_range" | _ -> let mid = (low + high) / 2 in if arr.(mid) >= x && ( mid = 0 || arr.(mid - 1) < x) then mid else if arr.(mid) < x then aux (mid + 1) high else aux low (mid - 1) in aux 0 (Array.length arr - 1) let get_line_and_col (line_pos: int array) (pos: int) = let line_index = binary_search_range line_pos pos in (* let _ = Printf.printf "line_index: %d\n" line_index in *) let line_start_pos = if line_index > 0 then (line_pos.(line_index - 1) + 1) else 0 in (line_index + 1, pos - (line_start_pos) + 1) let%test "test: get_line_and_col 1" = let actual = get_line_and_col [|1; 2; 3; 4; 6|] 3 in let expected = (3, 1) in (* let _ = Printf.printf "(%d,%d)\n" (fst actual) (snd actual) in *) actual = expected let%test "test: get_line_and_col 2" = let actual = get_line_and_col [|1; 2; 3; 4; 6|] 10 in let expected = (6, 4) in (* let _ = Printf.printf "(%d,%d)\n" (fst actual) (snd actual) in *) actual = expected let input_first (ctx: lexer_context) = if ctx.pos < String.length ctx.total then ctx.total.[ctx.pos] else epsilon let%test "test first" = let ctx = {total = "abc"; pos = 0; line_pos = [||]} in input_first ctx = 'a' let input_rest (ctx: lexer_context) = let ch = input_first ctx in if ch = '\n' then {ctx with pos = ctx.pos + 1; line_pos = Array.append ctx.line_pos [|ctx.pos|]} else {ctx with pos = ctx.pos + 1} let%test "test rest" = let ctx = {total = "abc"; pos = 0; line_pos = [||]} in let ctx' = input_rest ctx in ctx'.pos = 1 && ctx'.line_pos = [||] let%test "test rest with new line" = let ctx = {total = "a\nbc"; pos = 1; line_pos = [||]} in let ctx' = input_rest ctx in ctx'.pos = 2 && ctx'.line_pos = [|1|] let%test "test rest with new line 2" = let ctx = {total = "a\nb\nc"; pos = 3; line_pos = [|1|]} in let ctx' = input_rest ctx in (* let _ = Printf.printf "pos: %d, line_pos: %s" ctx'.pos (String.concat "," (List.map string_of_int (Array.to_list ctx'.line_pos))) in *) ctx'.pos = 4 && ctx'.line_pos = [|1; 3|] let rec skip_spaces (ctx: lexer_context) = let ch = input_first ctx in if ch = ' ' || ch = '\t' || ch = '\n' then skip_spaces (input_rest ctx) else ctx let%test "test skip_spaces" = let ctx = {total = " \nabc"; pos = 0; line_pos = [||]} in let ctx' = skip_spaces ctx in (* let _ = Printf.printf "pos: %d, line_pos: %s" ctx'.pos (String.concat "," (List.map string_of_int (Array.to_list ctx'.line_pos))) in *) ctx'.pos = 3 && ctx'.line_pos = [|2|] (* 1. identifier: [a-zA-Z][a-zA-Z0-9]* 2. digit: [0-9]+ 3. operator: +, -, *, /, %, ^ 4. keyword: let, in, if, then, else, fun 5. (, ), = 6. comment: //.* *) let get_identifier (ctx: lexer_context) = let rec aux ctx = let ch = input_first ctx in if is_alnum ch then aux (input_rest ctx) else ctx in let ctx' = aux ctx in let len = ctx'.pos - ctx.pos in let id = String.sub ctx'.total ctx.pos len in id, ctx' let%test "test get_identifier" = let ctx = {total = "abc"; pos = 0; line_pos = [||]} in let id, ctx' = get_identifier ctx in id = "abc" && ctx'.pos = 3 let get_digits (ctx: lexer_context) = let rec aux ctx = let ch = input_first ctx in if is_digit ch then aux (input_rest ctx) else ctx in let ctx' = aux ctx in let len = ctx'.pos - ctx.pos in let id = String.sub ctx'.total ctx.pos len in id, ctx' let%test "test get_digit" = let ctx = {total = "123"; pos = 0; line_pos = [||]} in let id, ctx' = get_digits ctx in id = "123" && ctx'.pos = 3 let id_to_token_type id = match (Token.str2keyword id) with | Some keyword -> keyword | None -> Token.Identifier id let lex_token (ctx: lexer_context) = let make_token token_type pos = {Token.token_type = token_type; pos = pos} in let ctx = skip_spaces ctx in let first_ch = input_first ctx in let pos = ctx.pos in let rest = input_rest ctx in match first_ch with | '\000' -> {Token.token_type = Eof; pos = pos}, ctx | '(' -> make_token LParen pos, rest | ')' -> make_token RParen pos, rest | '=' -> make_token Equal pos, rest | '+' -> make_token Add pos, rest | '-' -> let second_ch = input_first (rest) in if second_ch = '>' then make_token Arrow pos, input_rest rest else make_token Sub pos, rest | '*' -> make_token Mul pos, rest | '/' -> (* check comment *) let second_ch = input_first (rest) in if second_ch = '/' then let rec aux ctx = let ch = input_first ctx in if ch = '\n' then ctx else aux (input_rest ctx) in let ctx = aux ctx in let len = ctx.pos - pos in let comment = String.sub ctx.total pos len in make_token (Comment comment) pos, ctx else make_token (Div) pos, rest | '%' -> make_token (Mod) pos, rest | '^' -> make_token (Pow) pos, rest | c when is_alpha c -> let id, ctx = get_identifier ctx in make_token (id_to_token_type id) pos, ctx | c when is_digit c -> let id, ctx = get_digits ctx in make_token (Digit id) pos, ctx | _ -> make_token (Fail "invalid token") pos, rest let%test "test lex_token 1" = let ctx = {total = "let"; pos = 0; line_pos = [||]} in let token, ctx' = lex_token ctx in token.token_type = Let && token.pos = 0 && ctx'.pos = 3 let%test "test lex_token 2" = let ctx = {total = "let in"; pos = 0; line_pos = [||]} in let token, ctx' = lex_token ctx in let token', ctx'' = lex_token ctx' in token.token_type = Let && token.pos = 0 && ctx'.pos = 3 && token'.token_type = In && token'.pos = 4 && ctx''.pos = 6 let lex_tokens_seq (total: string): (Token.t * lexer_context) Seq.t = let rec aux ctx = let token, next_ctx = lex_token ctx in if token.token_type = Eof then Seq.Cons ((token, next_ctx), fun () -> Seq.Nil) else Seq.Cons ((token, next_ctx), fun () -> aux next_ctx) in fun () -> aux {total = total; pos = 0; line_pos = [||]} let%test "test lex_tokens_seq" = let total = "let in" in let seq = lex_tokens_seq total in let seq = seq |> Seq.map (fun (token, _) -> token) in let tokens = List.of_seq seq in let expected = [ {Token.token_type = Let; pos = 0}; {Token.token_type = In; pos = 4}; {Token.token_type = Eof; pos = 6} ] in tokens = expected