-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexer.mll
134 lines (117 loc) · 4.66 KB
/
lexer.mll
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
(* Analyse lexicale *)
{
open Lexing
open Parser
open Ast
(* Erreurs lexicales *)
exception Lexical_error of string
let newline lexbuf =
let pos = lexbuf.lex_curr_p in
lexbuf.lex_curr_p <-
{ pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; pos_cnum = 0 }
let char_error s = raise (Lexical_error ("illegal character sequence: " ^ s))
let keyword_or_ident s =
try
List.assoc s [
"int", INT;
"void", VOID;
"char", CHAR;
"short", SHORT;
"long", LONG;
"double", DOUBLE;
"unsigned", UNSIGNED;
"struct", STRUCT;
"extern", EXTERN;
"sizeof", SIZEOF;
"if", IF;
"else", ELSE;
"while", WHILE;
"for", FOR;
"return", RETURN;
];
with Not_found -> IDENT s
let int32_of_number_string num ign =
Int32.of_string (String.sub num 0 ((String.length num) - ign))
let int64_of_number_string num ign =
Int64.of_string (String.sub num 0 ((String.length num) - ign))
let process_char c =
match c with
| "\\n" -> '\n'
| "\\t" -> '\t'
| "\\r" -> '\r'
| "\\\\" -> '\\'
| _ as s -> print_string ("["^s^"]"); ' '
let string_buffer = Buffer.create 80
}
let digit = ['0'-'9']
let alpha = ['a'-'z' 'A'-'Z']
let identifier = (alpha | ['_']) (alpha | digit | ['_'])*
let const_float = (digit+ ['.'] digit* | ['.']digit+)((['e'] | ['E'])['-']?digit+)?
let simple_char = [' '-'~']#['\'' '\\' '\"'] | digit
let complexe_char = ['\\'] (['\\'] | ['n'] | ['t'] | ['r'])
let const_string = ['"'] (simple_char | complexe_char)* ['"']
rule token = parse
| '\n' { newline lexbuf; token lexbuf }
| [' ' '\t' '\r']+ { token lexbuf }
| "/*" { comment lexbuf }
| "#" { macro lexbuf }
| "*/" { raise (Lexical_error "Error: unmatched comment ending") }
| eof { EOF }
| ";" { SEMI }
| "=" { ASSIGN }
| "*" { STAR }
| "{" { LBRACKET }
| "}" { RBRACKET }
| "(" { LPAR }
| ")" { RPAR }
| "," { COMMA }
| "-" { MINUS }
| "+" { PLUS }
| "*" { STAR }
| "/" { DIV }
| "==" { EQ }
| "!=" { NEQ }
| ">" { GT }
| ">=" { GTE }
| "<" { LT }
| "<=" { LTE }
| "%" { MOD }
| "&&" { AND }
| "||" { OR }
| "++" { PLUSPLUS }
| "--" { MINUSMINUS }
| "!" { NOT }
| "&" { BAND }
| "->" { ARROW }
| "." { DOT }
| "[" { L_SQ_BRACKET }
| "]" { R_SQ_BRACKET }
| "//" { comment_line lexbuf }
| "\"" { Buffer.reset string_buffer; CONST_STRING(string lexbuf) }
| "'" simple_char "'" { CONST_CHAR (lexeme_char lexbuf 1) }
| "'" complexe_char "'" { CONST_CHAR (process_char (lexeme lexbuf)) }
| (['-'])? digit+ (['u'] | ['U']) (['l'] | ['L']) { UNSIGNED_LONG_NUM (int64_of_number_string (lexeme lexbuf) 2) }
| ['-']? digit+ (['l'] | ['L']) { LONG_NUM (int64_of_number_string (lexeme lexbuf) 1) }
| digit+ (['u'] | ['U']) { UNSIGNED_NUM (int64_of_number_string (lexeme lexbuf) 1) }
| ['-']? digit+ { NUM (int64_of_number_string (lexeme lexbuf) 0) }
| ['-']? const_float { NUM_FLOAT (float_of_string (lexeme lexbuf)) }
| identifier { keyword_or_ident (lexeme lexbuf) }
| _ { char_error (lexeme lexbuf) }
and string = parse
(* TODO: changer les char en leur valeur pour que ce soit correctement interprété *)
| simple_char { Buffer.add_string string_buffer (lexeme lexbuf); string lexbuf }
| complexe_char { Buffer.add_char string_buffer (process_char (lexeme lexbuf)); string lexbuf }
| "\"" { Buffer.contents string_buffer }
| _ { raise (Lexical_error "Invalid character") }
and comment_line = parse
| ['\n' '\r']+ { token lexbuf }
| eof { token lexbuf }
| _ { comment_line lexbuf }
and comment = parse
| "*/" { token lexbuf }
| "/*" { raise (Lexical_error "Imbricated comments are forbiden") }
| eof { raise (Lexical_error "Unterminated comment") }
| _ { comment lexbuf }
and macro = parse
| _ { macro lexbuf }
| ['\r' '\n']+ { token lexbuf }