223 lines
6.8 KiB
OCaml
223 lines
6.8 KiB
OCaml
let epsilon = '\000'
|
|
|
|
(* Lexer is just state machine *)
|
|
|
|
let is_digit c = c >= '0' && c <= '9'
|
|
let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
|
|
let is_alnum c = is_alpha c || is_digit c
|
|
|
|
type lexer_context = {
|
|
total: string;
|
|
pos: int;
|
|
(* \n position array *)
|
|
line_pos: int array;
|
|
}
|
|
|
|
let binary_search_range arr x =
|
|
if Array.length arr = 0 then 0
|
|
else
|
|
let rec aux low high =
|
|
match compare low high with
|
|
| 0 -> if arr.(low) >= x then low else low + 1
|
|
(* unreachable *)
|
|
| c when c > 0 -> invalid_arg "binary_search_range"
|
|
| _ ->
|
|
let mid = (low + high) / 2 in
|
|
if arr.(mid) >= x && ( mid = 0 || arr.(mid - 1) < x) then mid
|
|
else if arr.(mid) < x then aux (mid + 1) high
|
|
else aux low (mid - 1)
|
|
in
|
|
aux 0 (Array.length arr - 1)
|
|
let get_line_and_col (line_pos: int array) (pos: int) =
|
|
let line_index = binary_search_range line_pos pos in
|
|
(* let _ = Printf.printf "line_index: %d\n" line_index in *)
|
|
let line_start_pos = if line_index > 0 then
|
|
(line_pos.(line_index - 1) + 1) else 0 in
|
|
(line_index + 1, pos - (line_start_pos) + 1)
|
|
|
|
let%test "test: get_line_and_col 1" =
|
|
let actual = get_line_and_col [|1; 2; 3; 4; 6|] 3 in
|
|
let expected = (3, 1) in
|
|
(* let _ = Printf.printf "(%d,%d)\n" (fst actual) (snd actual) in *)
|
|
actual = expected
|
|
|
|
let%test "test: get_line_and_col 2" =
|
|
let actual = get_line_and_col [|1; 2; 3; 4; 6|] 10 in
|
|
let expected = (6, 4) in
|
|
(* let _ = Printf.printf "(%d,%d)\n" (fst actual) (snd actual) in *)
|
|
actual = expected
|
|
|
|
|
|
let input_first (ctx: lexer_context) =
|
|
if ctx.pos < String.length ctx.total then
|
|
ctx.total.[ctx.pos]
|
|
else
|
|
epsilon
|
|
|
|
let%test "test first" =
|
|
let ctx = {total = "abc"; pos = 0; line_pos = [||]} in
|
|
input_first ctx = 'a'
|
|
|
|
let input_rest (ctx: lexer_context) = let ch = input_first ctx in
|
|
if ch = '\n' then
|
|
{ctx with pos = ctx.pos + 1; line_pos = Array.append ctx.line_pos [|ctx.pos|]}
|
|
else
|
|
{ctx with pos = ctx.pos + 1}
|
|
|
|
let%test "test rest" =
|
|
let ctx = {total = "abc"; pos = 0; line_pos = [||]} in
|
|
let ctx' = input_rest ctx in
|
|
ctx'.pos = 1 && ctx'.line_pos = [||]
|
|
|
|
let%test "test rest with new line" =
|
|
let ctx = {total = "a\nbc"; pos = 1; line_pos = [||]} in
|
|
let ctx' = input_rest ctx in
|
|
ctx'.pos = 2 && ctx'.line_pos = [|1|]
|
|
|
|
let%test "test rest with new line 2" =
|
|
let ctx = {total = "a\nb\nc"; pos = 3; line_pos = [|1|]} in
|
|
let ctx' = input_rest ctx in
|
|
(* let _ = Printf.printf "pos: %d, line_pos: %s" ctx'.pos (String.concat ","
|
|
(List.map string_of_int (Array.to_list ctx'.line_pos))) in *)
|
|
ctx'.pos = 4 && ctx'.line_pos = [|1; 3|]
|
|
|
|
let rec skip_spaces (ctx: lexer_context) =
|
|
let ch = input_first ctx in
|
|
if ch = ' ' || ch = '\t' || ch = '\n' then
|
|
skip_spaces (input_rest ctx)
|
|
else
|
|
ctx
|
|
|
|
let%test "test skip_spaces" =
|
|
let ctx = {total = " \nabc"; pos = 0; line_pos = [||]} in
|
|
let ctx' = skip_spaces ctx in
|
|
(* let _ = Printf.printf "pos: %d, line_pos: %s" ctx'.pos (String.concat ","
|
|
(List.map string_of_int (Array.to_list ctx'.line_pos))) in *)
|
|
ctx'.pos = 3 && ctx'.line_pos = [|2|]
|
|
|
|
(*
|
|
1. identifier: [a-zA-Z][a-zA-Z0-9]*
|
|
2. digit: [0-9]+
|
|
3. operator: +, -, *, /, %, ^
|
|
4. keyword: let, in, if, then, else, fun
|
|
5. (, ), =
|
|
6. comment: //.*
|
|
*)
|
|
|
|
let get_identifier (ctx: lexer_context) =
|
|
let rec aux ctx =
|
|
let ch = input_first ctx in
|
|
if is_alnum ch then
|
|
aux (input_rest ctx)
|
|
else
|
|
ctx in
|
|
let ctx' = aux ctx in
|
|
let len = ctx'.pos - ctx.pos in
|
|
let id = String.sub ctx'.total ctx.pos len in
|
|
id, ctx'
|
|
|
|
let%test "test get_identifier" =
|
|
let ctx = {total = "abc"; pos = 0; line_pos = [||]} in
|
|
let id, ctx' = get_identifier ctx in
|
|
id = "abc" && ctx'.pos = 3
|
|
|
|
let get_digits (ctx: lexer_context) =
|
|
let rec aux ctx =
|
|
let ch = input_first ctx in
|
|
if is_digit ch then
|
|
aux (input_rest ctx)
|
|
else
|
|
ctx in
|
|
let ctx' = aux ctx in
|
|
let len = ctx'.pos - ctx.pos in
|
|
let id = String.sub ctx'.total ctx.pos len in
|
|
id, ctx'
|
|
|
|
let%test "test get_digit" =
|
|
let ctx = {total = "123"; pos = 0; line_pos = [||]} in
|
|
let id, ctx' = get_digits ctx in
|
|
id = "123" && ctx'.pos = 3
|
|
|
|
let id_to_token_type id =
|
|
match (Token.str2keyword id) with
|
|
| Some keyword -> keyword
|
|
| None -> Token.Identifier id
|
|
|
|
|
|
let lex_token (ctx: lexer_context) =
|
|
let make_token token_type pos = {Token.token_type = token_type; pos = pos} in
|
|
let ctx = skip_spaces ctx in
|
|
let first_ch = input_first ctx in
|
|
let pos = ctx.pos in
|
|
let rest = input_rest ctx in
|
|
match first_ch with
|
|
| '\000' -> {Token.token_type = Eof; pos = pos}, ctx
|
|
| '(' -> make_token LParen pos, rest
|
|
| ')' -> make_token RParen pos, rest
|
|
| '=' -> make_token Equal pos, rest
|
|
| '+' -> make_token Add pos, rest
|
|
| '-' ->
|
|
let second_ch = input_first (rest) in
|
|
if second_ch = '>' then
|
|
make_token Arrow pos, input_rest rest
|
|
else
|
|
make_token Sub pos, rest
|
|
| '*' -> make_token Mul pos, rest
|
|
| '/' ->
|
|
(* check comment *)
|
|
let second_ch = input_first (rest) in
|
|
if second_ch = '/' then
|
|
let rec aux ctx =
|
|
let ch = input_first ctx in
|
|
if ch = '\n' then
|
|
ctx
|
|
else
|
|
aux (input_rest ctx) in
|
|
let ctx = aux ctx in
|
|
let len = ctx.pos - pos in
|
|
let comment = String.sub ctx.total pos len in
|
|
make_token (Comment comment) pos, ctx
|
|
else
|
|
make_token (Div) pos, rest
|
|
| '%' -> make_token (Mod) pos, rest
|
|
| '^' -> make_token (Pow) pos, rest
|
|
| c when is_alpha c ->
|
|
let id, ctx = get_identifier ctx in
|
|
make_token (id_to_token_type id) pos, ctx
|
|
| c when is_digit c ->
|
|
let id, ctx = get_digits ctx in
|
|
make_token (Digit id) pos, ctx
|
|
| _ -> make_token (Fail "invalid token") pos, rest
|
|
|
|
let%test "test lex_token 1" =
|
|
let ctx = {total = "let"; pos = 0; line_pos = [||]} in
|
|
let token, ctx' = lex_token ctx in
|
|
token.token_type = Let && token.pos = 0 && ctx'.pos = 3
|
|
|
|
let%test "test lex_token 2" =
|
|
let ctx = {total = "let in"; pos = 0; line_pos = [||]} in
|
|
let token, ctx' = lex_token ctx in
|
|
let token', ctx'' = lex_token ctx' in
|
|
token.token_type = Let && token.pos = 0 && ctx'.pos = 3 &&
|
|
token'.token_type = In && token'.pos = 4 && ctx''.pos = 6
|
|
|
|
let lex_tokens_seq (total: string): (Token.t * lexer_context) Seq.t =
|
|
let rec aux ctx =
|
|
let token, next_ctx = lex_token ctx in
|
|
if token.token_type = Eof then
|
|
Seq.Cons ((token, next_ctx), fun () -> Seq.Nil)
|
|
else
|
|
Seq.Cons ((token, next_ctx), fun () -> aux next_ctx) in
|
|
fun () -> aux {total = total; pos = 0; line_pos = [||]}
|
|
|
|
let%test "test lex_tokens_seq" =
|
|
let total = "let in" in
|
|
let seq = lex_tokens_seq total in
|
|
let seq = seq |> Seq.map (fun (token, _) -> token) in
|
|
let tokens = List.of_seq seq in
|
|
let expected = [
|
|
{Token.token_type = Let; pos = 0};
|
|
{Token.token_type = In; pos = 4};
|
|
{Token.token_type = Eof; pos = 6}
|
|
] in
|
|
tokens = expected
|