small-set-of-ml/lib/lexer.ml
2025-01-30 01:40:22 +09:00

223 lines
6.8 KiB
OCaml

let epsilon = '\000'
(* Lexer is just state machine *)
let is_digit c = c >= '0' && c <= '9'
let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
let is_alnum c = is_alpha c || is_digit c
type lexer_context = {
total: string;
pos: int;
(* \n position array *)
line_pos: int array;
}
let binary_search_range arr x =
if Array.length arr = 0 then 0
else
let rec aux low high =
match compare low high with
| 0 -> if arr.(low) >= x then low else low + 1
(* unreachable *)
| c when c > 0 -> invalid_arg "binary_search_range"
| _ ->
let mid = (low + high) / 2 in
if arr.(mid) >= x && ( mid = 0 || arr.(mid - 1) < x) then mid
else if arr.(mid) < x then aux (mid + 1) high
else aux low (mid - 1)
in
aux 0 (Array.length arr - 1)
let get_line_and_col (line_pos: int array) (pos: int) =
let line_index = binary_search_range line_pos pos in
(* let _ = Printf.printf "line_index: %d\n" line_index in *)
let line_start_pos = if line_index > 0 then
(line_pos.(line_index - 1) + 1) else 0 in
(line_index + 1, pos - (line_start_pos) + 1)
let%test "test: get_line_and_col 1" =
let actual = get_line_and_col [|1; 2; 3; 4; 6|] 3 in
let expected = (3, 1) in
(* let _ = Printf.printf "(%d,%d)\n" (fst actual) (snd actual) in *)
actual = expected
let%test "test: get_line_and_col 2" =
let actual = get_line_and_col [|1; 2; 3; 4; 6|] 10 in
let expected = (6, 4) in
(* let _ = Printf.printf "(%d,%d)\n" (fst actual) (snd actual) in *)
actual = expected
let input_first (ctx: lexer_context) =
if ctx.pos < String.length ctx.total then
ctx.total.[ctx.pos]
else
epsilon
let%test "test first" =
let ctx = {total = "abc"; pos = 0; line_pos = [||]} in
input_first ctx = 'a'
let input_rest (ctx: lexer_context) = let ch = input_first ctx in
if ch = '\n' then
{ctx with pos = ctx.pos + 1; line_pos = Array.append ctx.line_pos [|ctx.pos|]}
else
{ctx with pos = ctx.pos + 1}
let%test "test rest" =
let ctx = {total = "abc"; pos = 0; line_pos = [||]} in
let ctx' = input_rest ctx in
ctx'.pos = 1 && ctx'.line_pos = [||]
let%test "test rest with new line" =
let ctx = {total = "a\nbc"; pos = 1; line_pos = [||]} in
let ctx' = input_rest ctx in
ctx'.pos = 2 && ctx'.line_pos = [|1|]
let%test "test rest with new line 2" =
let ctx = {total = "a\nb\nc"; pos = 3; line_pos = [|1|]} in
let ctx' = input_rest ctx in
(* let _ = Printf.printf "pos: %d, line_pos: %s" ctx'.pos (String.concat ","
(List.map string_of_int (Array.to_list ctx'.line_pos))) in *)
ctx'.pos = 4 && ctx'.line_pos = [|1; 3|]
let rec skip_spaces (ctx: lexer_context) =
let ch = input_first ctx in
if ch = ' ' || ch = '\t' || ch = '\n' then
skip_spaces (input_rest ctx)
else
ctx
let%test "test skip_spaces" =
let ctx = {total = " \nabc"; pos = 0; line_pos = [||]} in
let ctx' = skip_spaces ctx in
(* let _ = Printf.printf "pos: %d, line_pos: %s" ctx'.pos (String.concat ","
(List.map string_of_int (Array.to_list ctx'.line_pos))) in *)
ctx'.pos = 3 && ctx'.line_pos = [|2|]
(*
1. identifier: [a-zA-Z][a-zA-Z0-9]*
2. digit: [0-9]+
3. operator: +, -, *, /, %, ^
4. keyword: let, in, if, then, else, fun
5. (, ), =
6. comment: //.*
*)
let get_identifier (ctx: lexer_context) =
let rec aux ctx =
let ch = input_first ctx in
if is_alnum ch then
aux (input_rest ctx)
else
ctx in
let ctx' = aux ctx in
let len = ctx'.pos - ctx.pos in
let id = String.sub ctx'.total ctx.pos len in
id, ctx'
let%test "test get_identifier" =
let ctx = {total = "abc"; pos = 0; line_pos = [||]} in
let id, ctx' = get_identifier ctx in
id = "abc" && ctx'.pos = 3
let get_digits (ctx: lexer_context) =
let rec aux ctx =
let ch = input_first ctx in
if is_digit ch then
aux (input_rest ctx)
else
ctx in
let ctx' = aux ctx in
let len = ctx'.pos - ctx.pos in
let id = String.sub ctx'.total ctx.pos len in
id, ctx'
let%test "test get_digit" =
let ctx = {total = "123"; pos = 0; line_pos = [||]} in
let id, ctx' = get_digits ctx in
id = "123" && ctx'.pos = 3
let id_to_token_type id =
match (Token.str2keyword id) with
| Some keyword -> keyword
| None -> Token.Identifier id
let lex_token (ctx: lexer_context) =
let make_token token_type pos = {Token.token_type = token_type; pos = pos} in
let ctx = skip_spaces ctx in
let first_ch = input_first ctx in
let pos = ctx.pos in
let rest = input_rest ctx in
match first_ch with
| '\000' -> {Token.token_type = Eof; pos = pos}, ctx
| '(' -> make_token LParen pos, rest
| ')' -> make_token RParen pos, rest
| '=' -> make_token Equal pos, rest
| '+' -> make_token Add pos, rest
| '-' ->
let second_ch = input_first (rest) in
if second_ch = '>' then
make_token Arrow pos, input_rest rest
else
make_token Sub pos, rest
| '*' -> make_token Mul pos, rest
| '/' ->
(* check comment *)
let second_ch = input_first (rest) in
if second_ch = '/' then
let rec aux ctx =
let ch = input_first ctx in
if ch = '\n' then
ctx
else
aux (input_rest ctx) in
let ctx = aux ctx in
let len = ctx.pos - pos in
let comment = String.sub ctx.total pos len in
make_token (Comment comment) pos, ctx
else
make_token (Div) pos, rest
| '%' -> make_token (Mod) pos, rest
| '^' -> make_token (Pow) pos, rest
| c when is_alpha c ->
let id, ctx = get_identifier ctx in
make_token (id_to_token_type id) pos, ctx
| c when is_digit c ->
let id, ctx = get_digits ctx in
make_token (Digit id) pos, ctx
| _ -> make_token (Fail "invalid token") pos, rest
let%test "test lex_token 1" =
let ctx = {total = "let"; pos = 0; line_pos = [||]} in
let token, ctx' = lex_token ctx in
token.token_type = Let && token.pos = 0 && ctx'.pos = 3
let%test "test lex_token 2" =
let ctx = {total = "let in"; pos = 0; line_pos = [||]} in
let token, ctx' = lex_token ctx in
let token', ctx'' = lex_token ctx' in
token.token_type = Let && token.pos = 0 && ctx'.pos = 3 &&
token'.token_type = In && token'.pos = 4 && ctx''.pos = 6
let lex_tokens_seq (total: string): (Token.t * lexer_context) Seq.t =
let rec aux ctx =
let token, next_ctx = lex_token ctx in
if token.token_type = Eof then
Seq.Cons ((token, next_ctx), fun () -> Seq.Nil)
else
Seq.Cons ((token, next_ctx), fun () -> aux next_ctx) in
fun () -> aux {total = total; pos = 0; line_pos = [||]}
let%test "test lex_tokens_seq" =
let total = "let in" in
let seq = lex_tokens_seq total in
let seq = seq |> Seq.map (fun (token, _) -> token) in
let tokens = List.of_seq seq in
let expected = [
{Token.token_type = Let; pos = 0};
{Token.token_type = In; pos = 4};
{Token.token_type = Eof; pos = 6}
] in
tokens = expected