Recipes for OCamlLex

Christian Lindig
Sep 10, 2015 · 9 min read

What Scanning Does

alpha:=45 alpha+="hello"
ID(alpha) ASSIGN INT(45) ID(alpha) PLUSEQ STR(hello) EOF
<<type token>>=
(* Tokens returned by the scanner. Each constructor corresponds to one
   clause of the [token] rule. *)
type token =
| STR of string (* string literal; carries the text collected by the [string] rule *)
| INT of int (* run of decimal digits, converted with [int_of_string] *)
| ID of string (* identifier: a letter followed by letters and digits *)
| PLUSEQ (* += *)
| MINUSEQ (* -= *)
| STAREQ (* *= *)
| SLASHEQ (* /= *)
| PLUS (* + *)
| MINUS (* - *)
| STAR (* the multiplication sign *)
| SLASH (* / *)
| ASSIGN (* := *)
| EOF (* end of input *)

Organisation of a Scanner Description

<<tokenizer.mll>>=
{
(* Header section: plain OCaml that ocamllex copies to the beginning of
   the generated file, so everything defined here is visible in the
   actions of the rules. *)
(* short names for important modules *)
module L = Lexing
module B = Buffer
<<type token>>
<<prolog>>
}
(* Named regular expressions first, then the scanner rules that use them. *)
<<definitions>>
<<rules>>
{
(* Trailer section: plain OCaml placed after the generated scanner
   functions, so it may call them. *)
<<main function>>
let () = main () (* call main function on startup *)
}

Prolog

<<prolog>>=
(* [get lexbuf] is the text matched by the rule clause whose action is
   currently running. *)
let get lexbuf = L.lexeme lexbuf

(* Local shorthand for [Printf.sprintf]. *)
let sprintf fmt = Printf.sprintf fmt
<<prolog>>=
(* [position lexbuf] renders the current position of [lexbuf] as
   file:line:column. The column is the distance of the current position
   from the beginning of its line ([pos_cnum - pos_bol]). *)
let position lexbuf =
  let { L.pos_fname = file; pos_lnum = line; pos_cnum = offset; pos_bol = bol } =
    lexbuf.L.lex_curr_p
  in
  let column = offset - bol in
  sprintf "%s:%d:%d" file line column
<<prolog>>=
(* [set_filename fname lexbuf] records [fname] as the file name of the
   current position of [lexbuf]. The buffer is updated in place and
   returned, so the call can be chained with the creation of the buffer. *)
let set_filename (fname:string) (lexbuf:L.lexbuf) =
  let here = lexbuf.L.lex_curr_p in
  lexbuf.L.lex_curr_p <- { here with L.pos_fname = fname };
  lexbuf
<<prolog>>=
(* Raised for every scanning failure. The payload is a message that
   starts with the position computed by [position]. *)
exception Error of string

(* [error lexbuf fmt arg ...] formats a message printf-style, prefixes it
   with the current position of [lexbuf] and a space, and raises [Error].
   It never returns normally, so it fits an action of any result type.

   Uses [Printf.ksprintf]; [Printf.kprintf] is a deprecated synonym for
   it. The position is computed inside the continuation, that is only
   once all format arguments have been supplied. *)
let error lexbuf fmt =
  Printf.ksprintf
    (fun msg -> raise (Error (position lexbuf ^ " " ^ msg)))
    fmt

Definitions and Rules

<<definitions>>=
(* Horizontal white space: blanks and tabs. *)
let ws = [' ' '\t']

(* A line terminator: a line feed, optionally preceded by a carriage
   return. Accepting the optional carriage return lets input with CRLF
   line endings through; otherwise the carriage return matches no token
   and ends up in the catch-all error clause. *)
let nl = '\r'? '\n'
<<definitions>>=
(* Character classes used by integer literals and identifiers. *)
let digit = ['0'-'9']
let alpha = ['A'-'Z' 'a'-'z']

(* An identifier is a letter followed by any number of letters and digits. *)
let id = alpha (alpha | digit)*
<<rules>>=
(* [token lexbuf] returns the next token of the input.

   White space and line terminators are skipped; a line terminator also
   advances the line counter kept in [lexbuf]. ocamllex picks the longest
   match, so the two-character operators win over their one-character
   prefixes regardless of clause order.

   Raises [Error] (through [error]) for an integer literal that does not
   fit into [int] and for any character that starts no token. *)
rule token = parse
  | ws+     { token lexbuf }
  | nl      { L.new_line lexbuf; token lexbuf }
  | digit+  { let digits = get lexbuf in
              (* int_of_string raises Failure for a literal beyond the
                 range of int; turn that into a positioned Error like
                 every other scanning failure *)
              try INT (int_of_string digits)
              with Failure _ ->
                error lexbuf "integer literal '%s' is out of range" digits }
  | id      { ID (get lexbuf) }
  | '+'     { PLUS }
  | '-'     { MINUS }
  | '*'     { STAR }
  | '/'     { SLASH }
  | "+="    { PLUSEQ }
  | "-="    { MINUSEQ }
  | "*="    { STAREQ }
  | "/="    { SLASHEQ }
  | ":="    { ASSIGN }
<<special case for string>>
  | eof     { EOF }
  | _       { error lexbuf
                "found '%s' - don't know how to handle" (get lexbuf) }
<<rules>>=
(* [escape b lexbuf] copies the input of [lexbuf] into buffer [b],
   writing the ampersand, both quote characters and both angle brackets
   as the entity references &amp; &quot; &apos; &gt; and &lt;. All other
   text is copied unchanged. At the end of the input the contents of [b]
   are returned and [b] is cleared. *)
and escape b = parse
  | [^'&' '"' '\'' '>' '<']+
            { B.add_string b (get lexbuf); escape b lexbuf }
  | '&'     { B.add_string b "&amp;"; escape b lexbuf }
  | '<'     { B.add_string b "&lt;"; escape b lexbuf }
  | '>'     { B.add_string b "&gt;"; escape b lexbuf }
  | '"'     { B.add_string b "&quot;"; escape b lexbuf }
  | '\''    { B.add_string b "&apos;"; escape b lexbuf }
  | eof     { let result = B.contents b in
              B.clear b;
              result }
  | _       { error lexbuf
                "don't know how to quote: %s" (get lexbuf) }

A Simple Main Function

<<main function>>=
(* [escape str] runs the [escape] scanner rule over [str] with a fresh
   buffer and returns the escaped text. This definition shadows the rule
   of the same name for the code that follows. *)
let escape str =
  let input = L.from_string str in
  let out = B.create 100 in
  escape out input
(* [to_string tok] is the printable form of [tok]: the constructor name,
   followed by the value in parentheses for tokens that carry one. The
   text of a [STR] token is passed through [escape] first. *)
let to_string = function
  | STR text -> "STR(" ^ escape text ^ ")"
  | INT n -> "INT(" ^ string_of_int n ^ ")"
  | ID name -> "ID(" ^ name ^ ")"
  | PLUS -> "PLUS"
  | MINUS -> "MINUS"
  | STAR -> "STAR"
  | SLASH -> "SLASH"
  | PLUSEQ -> "PLUSEQ"
  | MINUSEQ -> "MINUSEQ"
  | STAREQ -> "STAREQ"
  | SLASHEQ -> "SLASHEQ"
  | ASSIGN -> "ASSIGN"
  | EOF -> "EOF"
(* [main ()] scans standard input to the end and prints the printable
   form of every token, [EOF] included, on one line separated by single
   spaces.

   A scanning failure ([Error]) is reported on standard error and the
   program exits with status 1, instead of the exception escaping as an
   uncaught exception. Nothing is printed on standard output in that
   case, because the tokens are only printed once the whole input has
   been scanned. *)
let main () =
  let lexbuf = set_filename "stdin" (L.from_channel stdin) in
  (* collect the printable tokens in reverse order, up to and including EOF *)
  let rec collect acc =
    match token lexbuf with
    | EOF -> List.rev (to_string EOF :: acc)
    | tok -> collect (to_string tok :: acc)
  in
  try
    collect []
    |> String.concat " "
    |> print_endline
  with Error msg ->
    prerr_endline msg;
    exit 1

Running the Code

$ ocamllex tokenizer.mll
$ ocamlbuild tokenizer.native
$ echo "1+2" | ./tokenizer.native
INT(1) PLUS INT(2) EOF

Scanning Complex Tokens Like Strings

<<special case for string>>=
| '"' { let buf = B.create 100 in
        STR (string buf lexbuf) } (* the string rule collects the text in buf *)
<<rules>>=
(* [string buf lexbuf] scans the remainder of a string literal whose
   opening double quote has already been consumed, collecting its text
   in [buf]. A backslash followed by a double quote stands for a double
   quote; any other backslash is kept as it is. Line feeds are part of
   the text and advance the line counter of [lexbuf]. The collected text
   is returned at the closing double quote; [Error] is raised when the
   input ends first. *)
and string buf = parse (* use buf to build up result *)
  | '"'     { B.contents buf } (* return *)
  | '\\' '"'
            { B.add_char buf '"'; string buf lexbuf }
  | '\\'    { B.add_char buf '\\'; string buf lexbuf }
  | '\n'    { B.add_string buf (get lexbuf);
              L.new_line lexbuf;
              string buf lexbuf }
  | [^'"' '\n' '\\']+
            { B.add_string buf (get lexbuf); string buf lexbuf }
  | eof     { error lexbuf "end of input inside of a string" }
  | _       { error lexbuf
                "found '%s' - don't know how to handle" (get lexbuf) }

Limitations

Resources

Welcome to a place where words matter. On Medium, smart voices and original ideas take center stage - with no ads in sight. Watch
Follow all the topics you care about, and we’ll deliver the best stories for you to your homepage and inbox. Explore
Get unlimited access to the best stories on Medium — and support writers while you’re at it. Just $5/month. Upgrade