tokenizer.c

changeset 0
86b7f6f9c5c0
child 1
6e704fc09528
equal deleted inserted replaced
-1:000000000000 0:86b7f6f9c5c0
1 /*
2 * libtu/tokenizer.c
3 *
4 * Copyright (c) Tuomo Valkonen 1999-2000.
5 *
6 * This file is distributed under the terms of the "Artistic License".
7 * See the included file LICENSE for details.
8 */
9
10 #include <errno.h>
11 #include <stdio.h>
12 #include <ctype.h>
13 #include <malloc.h>
14 #include <limits.h>
15 #include <assert.h>
16 #include <math.h>
17 #include <string.h>
18
19 #include "include/tokenizer.h"
20 #include "include/misc.h"
21 #include "include/output.h"
22
23
24 static const char *errors[]={
25 DUMMY_TR("(no error)"),
26 DUMMY_TR("Unexpected end of file"), /* E_TOKZ_UNEXPECTED_EOF */
27 DUMMY_TR("Unexpected end of line"), /* E_TOKZ_UNEXPECTED_EOL */
28 DUMMY_TR("End of line expected"), /* E_TOKZ_EOL_EXPECTED */
29 DUMMY_TR("Invalid character"), /* E_TOKZ_INVALID_CHAR*/
30 DUMMY_TR("Numeric constant too big"), /* E_TOKZ_TOOBIG */
31 DUMMY_TR("Invalid numberic format"), /* E_TOKZ_NUMFMT */
32 DUMMY_TR("Junk after numeric constant"), /* E_TOKZ_NUM_JUNK */
33 DUMMY_TR("Not an integer"), /* E_TOKZ_NOTINT */
34 DUMMY_TR("Numeric constant out of range"), /* E_TOKZ_RANGE */
35 DUMMY_TR("Multi-character character constant"), /* E_TOKZ_MULTICHAR */
36 DUMMY_TR("Token/statement limit reached"), /* E_TOKZ_TOKEN_LIMIT */
37 DUMMY_TR("Unknown option"), /* E_TOKZ_UNKONWN_OPTION */
38 DUMMY_TR("Syntax error"), /* E_TOKZ_SYNTAX */
39 DUMMY_TR("Invalid argument"), /* E_TOKZ_INVALID_ARGUMENT */
40 DUMMY_TR("End of statement expected"), /* E_TOKZ_EOS_EXPECTED */
41 DUMMY_TR("Too few arguments"), /* E_TOKZ_TOO_FEW_ARGS */
42 DUMMY_TR("Too many arguments"), /* E_TOKZ_TOO_MANY_ARGS */
43 DUMMY_TR("Maximum section nestin level exceeded"), /* E_TOK_Z_MAX_NEST */
44 DUMMY_TR("Unexpected end of statement"), /* E_TOKZ_UNEXPECTED_EOS */
45 DUMMY_TR("Identifier expected"), /* E_TOKZ_IDENTIFIER_EXPECTED */
46 };
47
48
49 /* */
50
/* Helpers for building a string one character at a time.
 *
 * STRING_DECL(X) declares three locals: the heap string X (initially
 * NULL), a staging buffer X_tmp of STRBLEN bytes, and X_tmpl, the number
 * of characters currently held in the staging buffer.
 * STRING_DECL_P(X, P) does the same but pre-loads the staging buffer
 * with the string literal P.
 *
 * STRING_APPEND(X, C) and STRING_FINISH(X) make the *enclosing function*
 * return -ENOMEM when the underlying helper reports failure, so they may
 * only be used inside functions returning int.  They expand to a braced
 * block; invoke them as a complete statement.
 *
 * STRING_FREE(X) releases the heap string; free(NULL) is a no-op, so no
 * NULL test is needed and the macro is safe inside if/else.
 */
#define STRBLEN 32

#define STRING_DECL(X) char* X=NULL; char X##_tmp[STRBLEN]; int X##_tmpl=0
#define STRING_DECL_P(X, P) char* X=NULL; char X##_tmp[STRBLEN]=P; int X##_tmpl=sizeof(P)-1
/* Uses the macro argument C; previously the expansion referred to a
 * variable literally named `c` in the caller's scope. */
#define STRING_APPEND(X, C) {if(!_string_append(&X, X##_tmp, &X##_tmpl, (C))) return -ENOMEM;}
#define STRING_FREE(X) free(X)
#define STRING_FINISH(X) {if(!_string_finish(&X, X##_tmp, X##_tmpl)) return -ENOMEM;}
58
59
60 static bool _string_append(char **p, char *tmp, int *tmplen, char c)
61 {
62 char *tmp2;
63
64 if(*tmplen==STRBLEN-1){
65 tmp[STRBLEN-1]='\0';
66 if(*p!=NULL){
67 tmp2=scat(*p, tmp);
68 free(*p);
69 *p=tmp2;
70 }else{
71 *p=scopy(tmp);
72 }
73 *tmplen=1;
74 tmp[0]=c;
75 return *p!=NULL;
76 }else{
77 tmp[(*tmplen)++]=c;
78 return TRUE;
79 }
80 }
81
82
83 static bool _string_finish(char **p, char *tmp, int tmplen)
84 {
85 char *tmp2;
86
87 if(tmplen==0){
88 if(*p==NULL)
89 *p=scopy("");
90 }else{
91 tmp[tmplen]='\0';
92 if(*p!=NULL){
93 tmp2=scat(*p, tmp);
94 free(*p);
95 *p=tmp2;
96 }else{
97 *p=scopy(tmp);
98 }
99 }
100 return *p!=NULL;
101 }
102
103
104 /* */
105
106
/* Shorthands for the scanners below.  All three rely on a variable named
 * `tokz` (the current Tokenizer pointer) being in scope at the point of
 * use.
 */
#define INC_LINE() (tokz->line++)
#define GETCH() (_getch(tokz))
#define UNGETCH(C) (_ungetch(tokz, (C)))
110
111 static int _getch(Tokenizer *tokz)
112 {
113 int c;
114
115 if(tokz->ungetc!=-1){
116 c=tokz->ungetc;
117 tokz->ungetc=-1;
118 }else{
119 c=getc(tokz->file);
120 }
121 /* if(c=='\n')
122 tokz->line++;*/
123
124 return c;
125 }
126
127
128 static void _ungetch(Tokenizer *tokz, int c)
129 {
130 /* if(c=='\n')
131 tokz->line--;*/
132 tokz->ungetc=c;
133 /*ungetc(c, tokz->file);*/
134 }
135
136
137 /* */
138
139
140 static int scan_line_comment(Token *tok, Tokenizer *tokz)
141 {
142 STRING_DECL_P(s, "#");
143 int c;
144
145 c=GETCH();
146
147 while(c!='\n' && c!=EOF){
148 STRING_APPEND(s, c);
149 c=GETCH();
150 }
151
152 UNGETCH(c);
153
154 STRING_FINISH(s);
155
156 TOK_SET_COMMENT(tok, s);
157
158 return 0;
159 }
160
161
162 static int skip_line_comment(Tokenizer *tokz)
163 {
164 int c;
165
166 do{
167 c=GETCH();
168 }while(c!='\n' && c!=EOF);
169
170 UNGETCH(c);
171
172 return 0;
173 }
174
175
176 /* */
177
178
179 static int scan_c_comment(Token *tok, Tokenizer *tokz)
180 {
181 STRING_DECL_P(s, "/*");
182 int c;
183 int st=0;
184
185 while(1){
186 c=GETCH();
187
188 if(c==EOF){
189 STRING_FREE(s);
190 return E_TOKZ_UNEXPECTED_EOF;
191 }
192
193 STRING_APPEND(s, c);
194
195 if(c=='\n'){
196 INC_LINE();
197 }else if(st==0 && c=='*'){
198 st=1;
199 }else if(st==1){
200 if(c=='/')
201 break;
202 st=0;
203 }
204 }
205
206 STRING_FINISH(s);
207
208 TOK_SET_COMMENT(tok, s);
209
210 return 0;
211 }
212
213
214 static int skip_c_comment(Tokenizer *tokz)
215 {
216 int c;
217 int st=0;
218
219 while(1){
220 c=GETCH();
221
222 if(c==EOF)
223 return E_TOKZ_UNEXPECTED_EOF;
224
225 if(c=='\n')
226 INC_LINE();
227 else if(st==0 && c=='*')
228 st=1;
229 else if(st==1){
230 if(c=='/')
231 break;
232 st=0;
233 }
234 }
235
236 return 0;
237 }
238
239
240 /* */
241
242
243 static int scan_char_escape(Tokenizer *tokz)
244 {
245 static char* special_chars="nrtbae";
246 static char* specials="\n\r\t\b\a\033";
247 int base, max;
248 int i ,c;
249
250 c=GETCH();
251
252 for(i=0;special_chars[i];i++){
253 if(special_chars[i]==c)
254 return specials[c];
255 }
256
257 if(c=='x' || c=='X'){
258 base=16;max=2;i=0;
259 }else if(c=='d' || c=='D'){
260 base=10;max=3;i=0;
261 }else if(c=='8' || c=='9'){
262 base=10;max=2;i=c-'0';
263 }else if('0'<=c && c<='7'){
264 base=8;max=2;i=c-'0';
265 }else if(c=='\n'){
266 UNGETCH(c);
267 return -2;
268 }else{
269 return c;
270 }
271
272
273 while(--max>=0){
274 c=GETCH();
275
276 if(c==EOF)
277 return EOF;
278
279 if(c=='\n'){
280 UNGETCH(c);
281 return -2;
282 }
283
284 if(base==16){
285 if(!isxdigit(c))
286 break;
287
288 i<<=4;
289
290 if(isdigit(c))
291 i+=c-'0';
292 else if(i>='a')
293 i+=0xa+c-'a';
294 else
295 i+=0xa+c-'a';
296
297 }else if(base==10){
298 if(!isdigit(c))
299 break;
300 i*=10;
301 i+=c-'0';
302 }else{
303 if(c<'0' || c>'7')
304 break;
305 i<<=3;
306 i+=c-'0';
307 }
308 }
309
310 if(max>=0)
311 UNGETCH(c);
312
313 return i;
314 }
315
316
317 /* */
318
319
320 static int scan_string(Token *tok, Tokenizer *tokz, bool escapes)
321 {
322 STRING_DECL(s);
323 int c;
324
325 while(1){
326 c=GETCH();
327
328 if(c=='"')
329 break;
330
331 if(c=='\n'){
332 UNGETCH(c);
333 STRING_FREE(s);
334 return E_TOKZ_UNEXPECTED_EOL;
335 }
336
337 if(c=='\\' && escapes){
338 c=scan_char_escape(tokz);
339 if(c==-2){
340 STRING_FREE(s);
341 return E_TOKZ_UNEXPECTED_EOL;
342 }
343 }
344
345 if(c==EOF){
346 STRING_FREE(s);
347 return E_TOKZ_UNEXPECTED_EOF;
348 }
349
350 STRING_APPEND(s, c);
351 }
352
353 STRING_FINISH(s);
354
355 TOK_SET_STRING(tok, s);
356
357 return 0;
358 }
359
360
361 /* */
362
363
364 static int scan_char(Token *tok, Tokenizer *tokz)
365 {
366 int c, c2;
367
368 c=GETCH();
369
370 if(c==EOF)
371 return E_TOKZ_UNEXPECTED_EOF;
372
373 if(c=='\n')
374 return E_TOKZ_UNEXPECTED_EOL;
375
376 if(c=='\\'){
377 c=scan_char_escape(tokz);
378
379 if(c==EOF)
380 return E_TOKZ_UNEXPECTED_EOF;
381
382 if(c==-2)
383 return E_TOKZ_UNEXPECTED_EOL;
384 }
385
386 c2=GETCH();
387
388 if(c2!='\'')
389 return E_TOKZ_MULTICHAR;
390
391 TOK_SET_CHAR(tok, c);
392
393 return 0;
394 }
395
396
397 /* */
398
399
/* True when X may start an identifier: a letter, '_' or '$'.
 * X is evaluated more than once, so it must not have side effects.
 */
#define START_IDENT(X) (isalpha(X) || (X)=='_' || (X)=='$')
401
402
403 static int scan_identifier(Token *tok, Tokenizer *tokz, int c)
404 {
405 STRING_DECL(s);
406
407 do{
408 STRING_APPEND(s, c);
409 c=GETCH();
410 }while(isalnum(c) || c=='_' || c=='$');
411
412 UNGETCH(c);
413
414 STRING_FINISH(s);
415
416 TOK_SET_IDENT(tok, s);
417
418 return 0;
419 }
420
421
422 #include "numparser2.h"
423 #include "np-conv.h"
424
425
426 static int scan_number(Token *tok, Tokenizer *tokz, int c)
427 {
428 NPNum num=NUM_INIT;
429 int e;
430
431 if((e=parse_number(&num, tokz, c)))
432 return e;
433
434 if(num.type==NPNUM_INT){
435 long l;
436 if((e=num_to_long(&l, &num, TRUE)))
437 return e;
438
439 TOK_SET_LONG(tok, l);
440 }else if(num.type==NPNUM_FLOAT){
441 double d;
442 if((e=num_to_double(&d, &num)))
443 return e;
444
445 TOK_SET_DOUBLE(tok, d);
446 }else{
447 return E_TOKZ_NUMFMT;
448 }
449
450 return 0;
451 }
452
453
454 /* */
455
456
457 static uchar op_map[]={
458 0x00, /* ________ 0-7 */
459 0x00, /* ________ 8-15 */
460 0x00, /* ________ 16-23 */
461 0x00, /* ________ 24-31 */
462 0x62, /* _!___%&_ 32-39 */
463 0xff, /* ()*+,-./ 40-47 */
464 0x00, /* ________ 48-55 */
465 0xfc, /* __:;<=>? 56-63 */
466 0x01, /* @_______ 64-71 */
467 0x00, /* ________ 72-79 */
468 0x00, /* ________ 80-87 */
469 0x78, /* ___[_]^_ 88-95 */
470 0x00, /* ________ 96-103 */
471 0x00, /* ________ 104-111 */
472 0x00, /* ________ 112-119 */
473 0x38 /* ___{|}__ 120-127 */
474 };
475
476
477 static bool map_isset(uchar *map, uint ch)
478 {
479 if(ch>127)
480 return FALSE;
481
482 return map[ch>>3]&(1<<(ch&7));
483 }
484
485
486 static bool is_opch(uint ch)
487 {
488 return map_isset(op_map, ch);
489 }
490
491
492 static int scan_op(Token *tok, Tokenizer *tokz, int c)
493 {
494 int c2;
495 int op=-1;
496
497 /* Quickly check it is an operator character */
498 if(!is_opch(c))
499 return E_TOKZ_INVALID_CHAR;
500
501 switch(c){
502 case '+':
503 case '-':
504 case '*':
505 /* case '/': Checked elsewhere */
506 case '%':
507 case '^':
508 case '!':
509 case '=':
510 case '<':
511 case '>':
512 c2=GETCH();
513 if(c2=='='){
514 op=c|(c2<<8);
515 }else if(c2==c && (c2!='%' && c2!='!' && c2!='*')){
516 if(c=='<' || c=='>'){
517 int c3=GETCH();
518 if(c3=='='){
519 op=c|(c2<<8)|(c3<<16);
520 }else{
521 UNGETCH(c3);
522 op=c|(c2<<8);
523 }
524 }else{
525 op=c|(c2<<8);
526 }
527 }else{
528 UNGETCH(c2);
529 op=c;
530 }
531 break;
532
533 /* It is already known that it is a operator so these are not needed
534 case ':':
535 case '~':
536 case '?':
537 case '.':
538 case ';';
539 case '{':
540 case '}':
541 case '@':
542 case '|':
543 case '&':
544 */
545 default:
546 op=c;
547 }
548
549 TOK_SET_OP(tok, op);
550
551 return 0;
552 }
553
554
555 /* */
556
557
558 void tokz_warn_error(const Tokenizer *tokz, int line, int e)
559 {
560 if(e==E_TOKZ_UNEXPECTED_EOF)
561 line=0;
562
563 if(e<0)
564 warn_obj_line(tokz->name, line, "%s", strerror(-e));
565 else
566 warn_obj_line(tokz->name, line, "%s", TR(errors[e]));
567 }
568
569
570 bool tokz_get_token(Tokenizer *tokz, Token *tok)
571 {
572 int c, c2, e;
573
574 assert(tokz->file);
575
576 tok_free(tok);
577
578 while(1){
579
580 e=0;
581
582 do{
583 c=GETCH();
584 }while(c!='\n' && c!=EOF && isspace(c));
585
586 tok->line=tokz->line;
587
588 switch(c){
589 case EOF:
590 TOK_SET_OP(tok, OP_EOF);
591 return TRUE;
592
593 case '\n':
594 INC_LINE();
595
596 if(tokz->flags&TOKZ_IGNORE_NEXTLINE)
597 continue;
598
599 TOK_SET_OP(tok, OP_NEXTLINE);
600
601 return TRUE;
602
603 case '\\':
604 do{
605 c=GETCH();
606 if(c==EOF){
607 TOK_SET_OP(tok, OP_EOF);
608 return FALSE;
609 }
610 if(!isspace(c)){
611 tokz_warn_error(tokz, tokz->line, E_TOKZ_EOL_EXPECTED);
612 return FALSE;
613 }
614 }while(c!='\n');
615
616 INC_LINE();
617 continue;
618
619 case '#':
620 if(tokz->flags&TOKZ_READ_COMMENTS){
621 e=scan_line_comment(tok, tokz);
622 break;
623 }else if((e=skip_line_comment(tokz))){
624 break;
625 }
626
627 continue;
628
629 case '/':
630 {
631 c2=GETCH();
632
633 if(c2=='='){
634 TOK_SET_OP(tok, OP_AS_DIV);
635 return TRUE;
636 }
637
638 if(c2!='*'){
639 UNGETCH(c2);
640 TOK_SET_OP(tok, OP_DIV);
641 return TRUE;
642 }
643
644 if(tokz->flags&TOKZ_READ_COMMENTS){
645 e=scan_c_comment(tok, tokz);
646 break;
647 }else if((e=skip_c_comment(tokz))){
648 break;
649 }
650
651 continue;
652 }
653
654 case '\"':
655 e=scan_string(tok, tokz, TRUE);
656 break;
657
658 case '\'':
659 e=scan_char(tok, tokz);
660 break;
661
662 default:
663 if(('0'<=c && c<='9') || c=='-' || c=='+'){
664 e=scan_number(tok, tokz, c);
665 break;
666 }
667
668 if(START_IDENT(c))
669 e=scan_identifier(tok, tokz, c);
670 else
671 e=scan_op(tok, tokz, c);
672 }
673
674 if(!e)
675 return TRUE;
676
677 tokz_warn_error(tokz, tokz->line, e);
678 return FALSE;
679 }
680 }
681
682
683 Tokenizer *tokz_open(const char *fname)
684 {
685 Tokenizer*tokz;
686 FILE*file;
687
688 file=fopen(fname, "r");
689
690 if(file==NULL){
691 warn_err_obj(fname);
692 return NULL;
693 }
694
695 tokz=tokz_open_file(file);
696
697 if(tokz==NULL)
698 fclose(file);
699 else
700 tokz->name=fname;
701
702 return tokz;
703 }
704
705
706 Tokenizer *tokz_open_file(FILE *file)
707 {
708 Tokenizer*tokz;
709
710 tokz=ALLOC(Tokenizer);
711
712 if(tokz==NULL){
713 warn_err();
714 return NULL;
715 }
716
717 tokz->file=file;
718 tokz->name=NULL;
719 tokz->line=1;
720 tokz->ungetc=-1;
721 tokz->flags=0;
722 tokz->optstack=NULL;
723 tokz->nest_lvl=0;
724
725 return tokz;
726 }
727
728
729 void tokz_close(Tokenizer *tokz)
730 {
731 if(tokz->file!=NULL)
732 fclose(tokz->file);
733
734 free(tokz);
735 }
736
737
738 /* */
739
740
741 void tok_free(Token *tok)
742 {
743 if(TOK_IS_STRING(tok))
744 free(TOK_STRING_VAL(tok));
745
746 tok->type=TOK_INVALID;
747 }
748
749
750 void tok_init(Token *tok)
751 {
752 static Token dummy=TOK_INIT;
753
754 memcpy(tok, &dummy, sizeof(*tok));
755 }
756

mercurial