small-set-of-ml/lib/lexer.ml

277 lines
7.6 KiB
OCaml
Raw Normal View History

2025-01-29 17:17:22 +09:00
(* small set of ml *)
type op_type =
| Add
| Sub
| Mul
| Div
| Mod
| Pow
let op2str op =
match op with
| Add -> "+"
| Sub -> "-"
| Mul -> "*"
| Div -> "/"
| Mod -> "%"
| Pow -> "^"
type keyword_type =
| Let
| In
| If
| Then
| Else
| Fun
type token_type =
| Eof
| Identifier of string
| Digit of string
| Op of op_type
| LParen
| RParen
| Equal
| Arrow
| Keyword of keyword_type
| Comment of string
| Fail of string
2025-01-29 17:56:00 +09:00
2025-01-29 17:17:22 +09:00
type token = {
(* token type *)
token_type: token_type;
(* start position *)
pos: int;
2025-01-29 17:56:00 +09:00
}
2025-01-29 17:17:22 +09:00
2025-01-29 17:56:00 +09:00
let epsilon = '\000'
2025-01-29 17:17:22 +09:00
(* Lexer is just state machine *)
let is_digit c = c >= '0' && c <= '9'
let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
let is_alnum c = is_alpha c || is_digit c
type lexer_context = {
total: string;
pos: int;
(* \n position array *)
line_pos: int array;
2025-01-29 17:56:00 +09:00
}
2025-01-29 17:17:22 +09:00
let binary_search_range arr x =
if Array.length arr = 0 then 0
else
let rec aux low high =
match compare low high with
| 0 -> if arr.(low) >= x then low else low + 1
(* unreachable *)
2025-01-29 17:56:00 +09:00
| c when c > 0 -> invalid_arg "binary_search_range"
2025-01-29 17:17:22 +09:00
| _ ->
let mid = (low + high) / 2 in
if arr.(mid) >= x && ( mid = 0 || arr.(mid - 1) < x) then mid
else if arr.(mid) < x then aux (mid + 1) high
else aux low (mid - 1)
in
aux 0 (Array.length arr - 1)
let get_line_and_col (line_pos: int array) (pos: int) =
let line_index = binary_search_range line_pos pos in
(* let _ = Printf.printf "line_index: %d\n" line_index in *)
let line_start_pos = if line_index > 0 then
(line_pos.(line_index - 1) + 1) else 0 in
2025-01-29 17:56:00 +09:00
(line_index + 1, pos - (line_start_pos) + 1)
2025-01-29 17:17:22 +09:00
let%test "test: get_line_and_col 1" =
let actual = get_line_and_col [|1; 2; 3; 4; 6|] 3 in
let expected = (3, 1) in
(* let _ = Printf.printf "(%d,%d)\n" (fst actual) (snd actual) in *)
2025-01-29 17:56:00 +09:00
actual = expected
2025-01-29 17:17:22 +09:00
let%test "test: get_line_and_col 2" =
let actual = get_line_and_col [|1; 2; 3; 4; 6|] 10 in
let expected = (6, 4) in
(* let _ = Printf.printf "(%d,%d)\n" (fst actual) (snd actual) in *)
2025-01-29 17:56:00 +09:00
actual = expected
2025-01-29 17:17:22 +09:00
let input_first (ctx: lexer_context) =
if ctx.pos < String.length ctx.total then
ctx.total.[ctx.pos]
else
2025-01-29 17:56:00 +09:00
epsilon
2025-01-29 17:17:22 +09:00
let%test "test first" =
let ctx = {total = "abc"; pos = 0; line_pos = [||]} in
2025-01-29 17:56:00 +09:00
input_first ctx = 'a'
2025-01-29 17:17:22 +09:00
let input_rest (ctx: lexer_context) = let ch = input_first ctx in
if ch = '\n' then
{ctx with pos = ctx.pos + 1; line_pos = Array.append ctx.line_pos [|ctx.pos|]}
else
2025-01-29 17:56:00 +09:00
{ctx with pos = ctx.pos + 1}
2025-01-29 17:17:22 +09:00
let%test "test rest" =
let ctx = {total = "abc"; pos = 0; line_pos = [||]} in
let ctx' = input_rest ctx in
2025-01-29 17:56:00 +09:00
ctx'.pos = 1 && ctx'.line_pos = [||]
2025-01-29 17:17:22 +09:00
let%test "test rest with new line" =
let ctx = {total = "a\nbc"; pos = 1; line_pos = [||]} in
let ctx' = input_rest ctx in
2025-01-29 17:56:00 +09:00
ctx'.pos = 2 && ctx'.line_pos = [|1|]
2025-01-29 17:17:22 +09:00
let%test "test rest with new line 2" =
let ctx = {total = "a\nb\nc"; pos = 3; line_pos = [|1|]} in
let ctx' = input_rest ctx in
(* let _ = Printf.printf "pos: %d, line_pos: %s" ctx'.pos (String.concat ","
(List.map string_of_int (Array.to_list ctx'.line_pos))) in *)
2025-01-29 17:56:00 +09:00
ctx'.pos = 4 && ctx'.line_pos = [|1; 3|]
2025-01-29 17:17:22 +09:00
let rec skip_spaces (ctx: lexer_context) =
let ch = input_first ctx in
if ch = ' ' || ch = '\t' || ch = '\n' then
skip_spaces (input_rest ctx)
else
2025-01-29 17:56:00 +09:00
ctx
2025-01-29 17:17:22 +09:00
let%test "test skip_spaces" =
let ctx = {total = " \nabc"; pos = 0; line_pos = [||]} in
let ctx' = skip_spaces ctx in
(* let _ = Printf.printf "pos: %d, line_pos: %s" ctx'.pos (String.concat ","
(List.map string_of_int (Array.to_list ctx'.line_pos))) in *)
2025-01-29 17:56:00 +09:00
ctx'.pos = 3 && ctx'.line_pos = [|2|]
2025-01-29 17:17:22 +09:00
(*
1. identifier: [a-zA-Z][a-zA-Z0-9]*
2. digit: [0-9]+
3. operator: +, -, *, /, %, ^
4. keyword: let, in, if, then, else, fun
5. (, ), =
6. comment: //.*
*)
let get_identifier (ctx: lexer_context) =
let rec aux ctx =
let ch = input_first ctx in
if is_alnum ch then
aux (input_rest ctx)
else
ctx in
let ctx' = aux ctx in
let len = ctx'.pos - ctx.pos in
let id = String.sub ctx'.total ctx.pos len in
2025-01-29 17:56:00 +09:00
id, ctx'
2025-01-29 17:17:22 +09:00
let%test "test get_identifier" =
let ctx = {total = "abc"; pos = 0; line_pos = [||]} in
let id, ctx' = get_identifier ctx in
2025-01-29 17:56:00 +09:00
id = "abc" && ctx'.pos = 3
2025-01-29 17:17:22 +09:00
let get_digits (ctx: lexer_context) =
let rec aux ctx =
let ch = input_first ctx in
if is_digit ch then
aux (input_rest ctx)
else
ctx in
let ctx' = aux ctx in
let len = ctx'.pos - ctx.pos in
let id = String.sub ctx'.total ctx.pos len in
2025-01-29 17:56:00 +09:00
id, ctx'
2025-01-29 17:17:22 +09:00
let%test "test get_digit" =
let ctx = {total = "123"; pos = 0; line_pos = [||]} in
let id, ctx' = get_digits ctx in
2025-01-29 17:56:00 +09:00
id = "123" && ctx'.pos = 3
2025-01-29 17:17:22 +09:00
let id_to_token_type id =
match id with
| "let" -> Keyword Let
| "in" -> Keyword In
| "if" -> Keyword If
| "then" -> Keyword Then
| "else" -> Keyword Else
| "fun" -> Keyword Fun
2025-01-29 17:56:00 +09:00
| _ -> Identifier id
2025-01-29 17:17:22 +09:00
let lex_token (ctx: lexer_context) =
let make_token token_type pos = {token_type = token_type; pos = pos} in
let ctx = skip_spaces ctx in
let first_ch = input_first ctx in
let pos = ctx.pos in
match first_ch with
| '\000' -> {token_type = Eof; pos = pos}, ctx
| '(' -> make_token LParen pos, input_rest ctx
| ')' -> make_token RParen pos, input_rest ctx
| '=' -> make_token Equal pos, input_rest ctx
| '+' -> make_token (Op Add) pos, input_rest ctx
| '-' ->
let second_ch = input_first (input_rest ctx) in
if second_ch = '>' then
make_token Arrow pos, input_rest (input_rest ctx)
else
make_token (Op Sub) pos, input_rest ctx
| '*' -> make_token (Op Mul) pos, input_rest ctx
| '/' ->
(* check comment *)
let second_ch = input_first (input_rest ctx) in
if second_ch = '/' then
let rec aux ctx =
let ch = input_first ctx in
if ch = '\n' then
ctx
else
aux (input_rest ctx) in
let ctx = aux ctx in
let len = ctx.pos - pos in
let comment = String.sub ctx.total pos len in
make_token (Comment comment) pos, ctx
else
make_token (Op Div) pos, input_rest ctx
| '%' -> make_token (Op Mod) pos, input_rest ctx
| '^' -> make_token (Op Pow) pos, input_rest ctx
| c when is_alpha c ->
let id, ctx = get_identifier ctx in
make_token (id_to_token_type id) pos, ctx
| c when is_digit c ->
let id, ctx = get_digits ctx in
make_token (Digit id) pos, ctx
2025-01-29 17:56:00 +09:00
| _ -> make_token (Fail "invalid token") pos, input_rest ctx
2025-01-29 17:17:22 +09:00
let%test "test lex_token 1" =
let ctx = {total = "let"; pos = 0; line_pos = [||]} in
let token, ctx' = lex_token ctx in
2025-01-29 17:56:00 +09:00
token.token_type = Keyword Let && token.pos = 0 && ctx'.pos = 3
2025-01-29 17:17:22 +09:00
let%test "test lex_token 2" =
let ctx = {total = "let in"; pos = 0; line_pos = [||]} in
let token, ctx' = lex_token ctx in
let token', ctx'' = lex_token ctx' in
token.token_type = Keyword Let && token.pos = 0 && ctx'.pos = 3 &&
2025-01-29 17:56:00 +09:00
token'.token_type = Keyword In && token'.pos = 4 && ctx''.pos = 6
2025-01-29 17:17:22 +09:00
let lex_tokens_seq (total: string): (token * lexer_context) Seq.t =
let rec aux ctx =
let token, next_ctx = lex_token ctx in
2025-01-29 22:36:57 +09:00
if token.token_type = Eof then
2025-01-29 17:17:22 +09:00
Seq.Cons ((token, next_ctx), fun () -> Seq.Nil)
else
Seq.Cons ((token, next_ctx), fun () -> aux next_ctx) in
2025-01-29 17:56:00 +09:00
fun () -> aux {total = total; pos = 0; line_pos = [||]}
2025-01-29 17:17:22 +09:00
let%test "test lex_tokens_seq" =
let total = "let in" in
let seq = lex_tokens_seq total in
let seq = seq |> Seq.map (fun (token, _) -> token) in
let tokens = List.of_seq seq in
let expected = [
{token_type = Keyword Let; pos = 0};
{token_type = Keyword In; pos = 4};
{token_type = Eof; pos = 6}
] in
2025-01-29 17:56:00 +09:00
tokens = expected