leibniz/lib/lexer.ml
2026-01-19 03:37:26 +00:00

112 lines
4 KiB
OCaml

type position = { line : int; col : int; offset : int }
type token =
| Num of float
| Var of string
| Ident of string
| Plus | Minus | Star | Slash | Caret
| LParen | RParen
| Comma
| EOF
type token_with_pos = { token : token; start_pos : position; end_pos : position }
let is_digit c = c >= '0' && c <= '9'
let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
let is_alphanum c = is_alpha c || is_digit c || c = '_'
let needs_implicit_mult tok1 tok2 =
match (tok1, tok2) with
| (Num _ | Var _ | RParen), (Num _ | Var _ | LParen | Ident _) -> true
| _ -> false
let tokenize str =
let len = String.length str in
let line = ref 1 in
let col = ref 1 in
let rec aux i acc =
if i >= len then
let pos = { line = !line; col = !col; offset = i } in
List.rev ({ token = EOF; start_pos = pos; end_pos = pos } :: acc)
else
let start_pos = { line = !line; col = !col; offset = i } in
match str.[i] with
| ' ' | '\t' ->
col := !col + 1;
aux (i + 1) acc
| '\n' ->
line := !line + 1;
col := 1;
aux (i + 1) acc
| '+' ->
col := !col + 1;
let end_pos = { line = !line; col = !col; offset = i + 1 } in
aux (i + 1) ({ token = Plus; start_pos; end_pos } :: acc)
| '-' ->
col := !col + 1;
let end_pos = { line = !line; col = !col; offset = i + 1 } in
aux (i + 1) ({ token = Minus; start_pos; end_pos } :: acc)
| '*' ->
col := !col + 1;
let end_pos = { line = !line; col = !col; offset = i + 1 } in
aux (i + 1) ({ token = Star; start_pos; end_pos } :: acc)
| '/' ->
col := !col + 1;
let end_pos = { line = !line; col = !col; offset = i + 1 } in
aux (i + 1) ({ token = Slash; start_pos; end_pos } :: acc)
| '^' ->
col := !col + 1;
let end_pos = { line = !line; col = !col; offset = i + 1 } in
aux (i + 1) ({ token = Caret; start_pos; end_pos } :: acc)
| '(' ->
col := !col + 1;
let end_pos = { line = !line; col = !col; offset = i + 1 } in
aux (i + 1) ({ token = LParen; start_pos; end_pos } :: acc)
| ')' ->
col := !col + 1;
let end_pos = { line = !line; col = !col; offset = i + 1 } in
aux (i + 1) ({ token = RParen; start_pos; end_pos } :: acc)
| ',' ->
col := !col + 1;
let end_pos = { line = !line; col = !col; offset = i + 1 } in
aux (i + 1) ({ token = Comma; start_pos; end_pos } :: acc)
| c when is_digit c || c = '.' ->
let j = ref (i + 1) in
while !j < len && (is_digit str.[!j] || str.[!j] = '.') do
incr j
done;
let num_str = String.sub str i (!j - i) in
col := !col + (!j - i);
let end_pos = { line = !line; col = !col; offset = !j } in
aux !j ({ token = Num (float_of_string num_str); start_pos; end_pos } :: acc)
| c when is_alpha c ->
let j = ref (i + 1) in
while !j < len && is_alphanum str.[!j] do
incr j
done;
let id = String.sub str i (!j - i) in
col := !col + (!j - i);
let end_pos = { line = !line; col = !col; offset = !j } in
let tok = if !j < len && str.[!j] = '(' then Ident id else Var id in
aux !j ({ token = tok; start_pos; end_pos } :: acc)
| c ->
failwith (Printf.sprintf "unexpected character: %c at line %d, column %d"
c !line !col)
in
let tokens = aux 0 [] in
let rec insert_implicit_mult = function
| [] -> []
| [t] -> [t]
| t1 :: t2 :: rest ->
if needs_implicit_mult t1.token t2.token then
let mult_pos = t2.start_pos in
t1 :: { token = Star; start_pos = mult_pos; end_pos = mult_pos } ::
insert_implicit_mult (t2 :: rest)
else
t1 :: insert_implicit_mult (t2 :: rest)
in
insert_implicit_mult tokens