tokenizer.c

changeset 0
86b7f6f9c5c0
child 1
6e704fc09528
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tokenizer.c	Tue Feb 15 18:57:52 2005 +0100
@@ -0,0 +1,756 @@
+/*
+ * libtu/tokenizer.c
+ *
+ * Copyright (c) Tuomo Valkonen 1999-2000.
+ * 
+ * This file is distributed under the terms of the "Artistic License".
+ * See the included file LICENSE for details.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <malloc.h>
+#include <limits.h>
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "include/tokenizer.h"
+#include "include/misc.h"
+#include "include/output.h"
+
+
+/*
+ * Human-readable messages for the tokenizer error codes.
+ *
+ * tokz_warn_error() indexes this table directly with a non-negative error
+ * code, so the entries must stay in the same order as the E_TOKZ_*
+ * constants. Those constants are declared outside this file; the names in
+ * the right-hand comments are carried over as found -- TODO confirm them
+ * against the header.
+ *
+ * The strings are wrapped in DUMMY_TR() here and passed through TR() when
+ * printed. Two misspelled messages ("numberic", "nestin") are corrected
+ * below; any message catalog keyed on the old spellings needs the same
+ * correction.
+ */
+static const char *errors[]={
+	DUMMY_TR("(no error)"),
+	DUMMY_TR("Unexpected end of file"),				/* E_TOKZ_UNEXPECTED_EOF */
+	DUMMY_TR("Unexpected end of line"),				/* E_TOKZ_UNEXPECTED_EOL */
+	DUMMY_TR("End of line expected"),				/* E_TOKZ_EOL_EXPECTED */
+	DUMMY_TR("Invalid character"),					/* E_TOKZ_INVALID_CHAR*/
+	DUMMY_TR("Numeric constant too big"),			/* E_TOKZ_TOOBIG */
+	DUMMY_TR("Invalid numeric format"),				/* E_TOKZ_NUMFMT */
+	DUMMY_TR("Junk after numeric constant"),		/* E_TOKZ_NUM_JUNK */
+	DUMMY_TR("Not an integer"),						/* E_TOKZ_NOTINT */
+	DUMMY_TR("Numeric constant out of range"),		/* E_TOKZ_RANGE */
+	DUMMY_TR("Multi-character character constant"),	/* E_TOKZ_MULTICHAR */
+	DUMMY_TR("Token/statement limit reached"),		/* E_TOKZ_TOKEN_LIMIT */
+	DUMMY_TR("Unknown option"),						/* E_TOKZ_UNKONWN_OPTION */
+	DUMMY_TR("Syntax error"),						/* E_TOKZ_SYNTAX */
+	DUMMY_TR("Invalid argument"),					/* E_TOKZ_INVALID_ARGUMENT */
+	DUMMY_TR("End of statement expected"),			/* E_TOKZ_EOS_EXPECTED */
+	DUMMY_TR("Too few arguments"),					/* E_TOKZ_TOO_FEW_ARGS */
+	DUMMY_TR("Too many arguments"),					/* E_TOKZ_TOO_MANY_ARGS */
+	DUMMY_TR("Maximum section nesting level exceeded"), /* E_TOK_Z_MAX_NEST */
+	DUMMY_TR("Unexpected end of statement"),		/* E_TOKZ_UNEXPECTED_EOS */
+	DUMMY_TR("Identifier expected"),				/* E_TOKZ_IDENTIFIER_EXPECTED */
+};
+
+
+/* */
+
+#define STRBLEN 32
+
+#define STRING_DECL(X) char* X=NULL; char X##_tmp[STRBLEN]; int X##_tmpl=0
+#define STRING_DECL_P(X, P) char* X=NULL; char X##_tmp[STRBLEN]=P; int X##_tmpl=sizeof(P)-1
+#define STRING_APPEND(X, C) {if(!_string_append(&X, X##_tmp, &X##_tmpl, c)) return -ENOMEM;}
+#define STRING_FREE(X) if(X!=NULL) free(X)
+#define STRING_FINISH(X) {if(!_string_finish(&X, X##_tmp, X##_tmpl)) return -ENOMEM;}
+
+
+/*
+ * Add one character to a string under construction.
+ *
+ * p      - address of the heap part of the string (NULL if nothing has been
+ *          spilled yet)
+ * tmp    - staging buffer of STRBLEN bytes
+ * tmplen - number of bytes currently staged in tmp
+ * c      - character to add
+ *
+ * While the staging buffer has room the character is simply stored there.
+ * Once STRBLEN-1 bytes are staged the buffer is terminated and moved to the
+ * heap string (via scopy() for the first spill, scat() afterwards), and c
+ * becomes the first staged byte of the emptied buffer.
+ *
+ * Returns TRUE on success. Returns FALSE when the spill produced a NULL
+ * string; in that case the previous heap string has already been freed and
+ * *p is NULL.
+ */
+static bool _string_append(char **p, char *tmp, int *tmplen, char c)
+{
+	char *joined;
+	
+	/* Common case: still room in the staging buffer. */
+	if(*tmplen!=STRBLEN-1){
+		tmp[*tmplen]=c;
+		(*tmplen)++;
+		return TRUE;
+	}
+	
+	/* Staging buffer full: spill it to the heap string. */
+	tmp[STRBLEN-1]='\0';
+	
+	if(*p==NULL){
+		joined=scopy(tmp);
+	}else{
+		joined=scat(*p, tmp);
+		free(*p);
+	}
+	
+	*p=joined;
+	
+	/* Restart the staging buffer with the new character. */
+	tmp[0]=c;
+	*tmplen=1;
+	
+	return joined!=NULL;
+}
+
+
+/*
+ * Complete a string under construction by moving whatever is still staged
+ * in tmp (tmplen bytes) onto the heap string *p.
+ *
+ * If nothing is staged and nothing was ever spilled, *p becomes a copy of
+ * the empty string, so a successful call always leaves *p non-NULL.
+ *
+ * Returns TRUE when *p is non-NULL afterwards, FALSE otherwise (the old
+ * heap string, if any, has then already been freed).
+ */
+static bool _string_finish(char **p, char *tmp, int tmplen)
+{
+	char *result;
+	
+	if(tmplen!=0){
+		tmp[tmplen]='\0';
+		
+		if(*p==NULL){
+			result=scopy(tmp);
+		}else{
+			result=scat(*p, tmp);
+			free(*p);
+		}
+		
+		*p=result;
+	}else if(*p==NULL){
+		*p=scopy("");
+	}
+	
+	return *p!=NULL;
+}
+
+
+/* */
+
+
+/*
+ * Shorthands for the scanners below. All three expect a variable named
+ * `tokz` (the current Tokenizer) to be in scope at the point of use.
+ */
+#define INC_LINE() (tokz->line++)
+#define GETCH() (_getch(tokz))
+#define UNGETCH(C) (_ungetch(tokz, (C)))
+
+/*
+ * Return the next input character for tokz.
+ *
+ * A character previously pushed back with _ungetch() is delivered first
+ * (and the push-back slot is cleared); otherwise the character comes from
+ * getc() on the tokenizer's file. The value -1 marks an empty slot.
+ *
+ * Line counting is not done here; callers bump the line number themselves.
+ */
+static int _getch(Tokenizer *tokz)
+{
+	int pending=tokz->ungetc;
+	
+	if(pending==-1)
+		return getc(tokz->file);
+	
+	tokz->ungetc=-1;
+	
+	return pending;
+}
+
+
+/*
+ * Push c back so that the next _getch() on tokz returns it.
+ *
+ * There is a single push-back slot held in the tokenizer itself (the stdio
+ * stream is not touched), so a second push-back before the next read
+ * overwrites the first.
+ */
+static void _ungetch(Tokenizer *tokz, int c)
+{
+	tokz->ungetc=c;
+}
+
+
+/* */
+
+
+/*
+ * Collect a '#' line comment into tok.
+ *
+ * The leading '#' has already been consumed by the caller; the text built
+ * here starts with "#" and runs up to, but not including, the end of the
+ * line. The terminating newline (or EOF) is pushed back for the caller.
+ *
+ * Returns 0 on success, or -ENOMEM if the string could not be built.
+ */
+static int scan_line_comment(Token *tok, Tokenizer *tokz)
+{
+	STRING_DECL_P(s, "#");
+	int c;
+
+	for(c=GETCH(); c!=EOF && c!='\n'; c=GETCH()){
+		STRING_APPEND(s, c);
+	}
+
+	/* Leave the line terminator for the main loop to handle. */
+	UNGETCH(c);
+
+	STRING_FINISH(s);
+	TOK_SET_COMMENT(tok, s);
+
+	return 0;
+}
+
+
+/*
+ * Discard the rest of the current line (a '#' comment whose text is not
+ * wanted). The newline or EOF that ends it is pushed back for the caller.
+ * Always returns 0.
+ */
+static int skip_line_comment(Tokenizer *tokz)
+{
+	int ch;
+	
+	for(;;){
+		ch=GETCH();
+		if(ch=='\n' || ch==EOF)
+			break;
+	}
+
+	UNGETCH(ch);
+		
+	return 0;
+}
+
+
+/* */
+
+
+/*
+ * Collect a C-style comment into tok.
+ *
+ * The opening slash-star has already been consumed by the caller; the text
+ * built here starts with it and runs through the closing star-slash.
+ * Newlines inside the comment advance the tokenizer's line counter.
+ *
+ * The comment ends at the first '/' that immediately follows a '*'. The
+ * '*' of the opener does not count, so at least one further '*' is needed.
+ * Runs of stars before the slash close the comment as well, and a '*'
+ * separated from a '/' by a newline does not.
+ *
+ * Returns 0 on success, E_TOKZ_UNEXPECTED_EOF if the input ends inside the
+ * comment, or -ENOMEM if the string could not be built.
+ */
+static int scan_c_comment(Token *tok, Tokenizer *tokz)
+{
+	STRING_DECL_P(s, "/*");
+	int c;
+	int prev_star=0;	/* non-zero iff the previous character was '*' */
+	
+	while(1){
+		c=GETCH();
+		
+		if(c==EOF){
+			STRING_FREE(s);
+			return E_TOKZ_UNEXPECTED_EOF;
+		}
+		
+		STRING_APPEND(s, c);
+		
+		if(c=='/' && prev_star)
+			break;
+		
+		/* Recomputed for every character, so "**" keeps the flag set
+		 * and anything else (newline included) clears it.
+		 */
+		prev_star=(c=='*');
+		
+		if(c=='\n'){
+			INC_LINE();
+		}
+	}
+
+	STRING_FINISH(s);
+
+	TOK_SET_COMMENT(tok, s);
+
+	return 0;
+}
+
+
+/*
+ * Discard a C-style comment whose opening slash-star has already been
+ * consumed. Newlines inside the comment advance the tokenizer's line
+ * counter.
+ *
+ * The comment ends at the first '/' that immediately follows a '*'. The
+ * '*' of the opener does not count. Runs of stars before the slash close
+ * the comment as well, and a '*' separated from a '/' by a newline does
+ * not.
+ *
+ * Returns 0 on success or E_TOKZ_UNEXPECTED_EOF if the input ends inside
+ * the comment.
+ */
+static int skip_c_comment(Tokenizer *tokz)
+{
+	int c;
+	int prev_star=0;	/* non-zero iff the previous character was '*' */
+	
+	while(1){
+		c=GETCH();
+		
+		if(c==EOF)
+			return E_TOKZ_UNEXPECTED_EOF;
+		
+		if(c=='/' && prev_star)
+			break;
+		
+		/* Recomputed for every character, so "**" keeps the flag set
+		 * and anything else (newline included) clears it.
+		 */
+		prev_star=(c=='*');
+		
+		if(c=='\n'){
+			INC_LINE();
+		}
+	}
+	
+	return 0;
+}
+
+
+/* */
+
+
+/*
+ * Decode the escape sequence that follows a backslash (the backslash
+ * itself has already been consumed).
+ *
+ * Recognised forms:
+ *   n r t b a e        - newline, CR, tab, backspace, bell, ESC (033)
+ *   x or X + hex       - up to two hexadecimal digits
+ *   d or D + digits    - up to three decimal digits
+ *   8 or 9 + digits    - decimal, up to three digits in total
+ *   0..7   + digits    - octal, up to three digits in total
+ *   anything else      - the character itself
+ *
+ * A numeric escape stops at the first character that is not a digit of its
+ * base; that character is pushed back.
+ *
+ * Returns the decoded value (non-negative), EOF if the input ends inside
+ * the sequence, or -2 if a newline is met (the newline is pushed back).
+ */
+static int scan_char_escape(Tokenizer *tokz)
+{
+	static const char special_chars[]="nrtbae";
+	static const char specials[]="\n\r\t\b\a\033";
+	int base, max;
+	int i, c;
+	
+	c=GETCH();
+	
+	for(i=0;special_chars[i];i++){
+		/* Both tables are parallel: index by position, not by the
+		 * character code.
+		 */
+		if(special_chars[i]==c)
+			return specials[i];
+	}
+	
+	if(c=='x' || c=='X'){
+		base=16;max=2;i=0;
+	}else if(c=='d' || c=='D'){
+		base=10;max=3;i=0;
+	}else if(c=='8' || c=='9'){
+		base=10;max=2;i=c-'0';
+	}else if('0'<=c && c<='7'){
+		base=8;max=2;i=c-'0';
+	}else if(c=='\n'){
+		UNGETCH(c);
+		return -2;
+	}else{
+		/* Includes EOF, which is passed straight through. */
+		return c;
+	}
+	
+	/* i holds the value so far, max the number of digits still allowed. */
+	while(--max>=0){
+		c=GETCH();
+		
+		if(c==EOF)
+			return EOF;
+		
+		if(c=='\n'){
+			UNGETCH(c);
+			return -2;
+		}
+		
+		if(base==16){
+			if(!isxdigit(c))
+				break;
+			
+			i<<=4;
+			
+			if(isdigit(c))
+				i+=c-'0';
+			else if(c>='a')
+				i+=0xa+c-'a';
+			else
+				i+=0xa+c-'A';
+			
+		}else if(base==10){
+			if(!isdigit(c))
+				break;
+			i*=10;
+			i+=c-'0';
+		}else{
+			if(c<'0' || c>'7')
+				break;
+			i<<=3;
+			i+=c-'0';
+		}
+	}
+	
+	/* max is still >=0 only when the loop was left early on a character
+	 * that does not belong to the escape.
+	 */
+	if(max>=0)
+		UNGETCH(c);
+
+	return i;
+}
+
+
+/* */
+
+
+/*
+ * Collect a double-quoted string into tok. The opening quote has already
+ * been consumed; scanning stops at the closing quote, which is consumed
+ * but not stored.
+ *
+ * When `escapes` is true a backslash introduces an escape sequence that is
+ * decoded by scan_char_escape(); otherwise backslashes are ordinary
+ * characters.
+ *
+ * Returns 0 on success, E_TOKZ_UNEXPECTED_EOL if the line ends before the
+ * closing quote (the newline is left in the input), E_TOKZ_UNEXPECTED_EOF
+ * if the input ends first, or -ENOMEM if the string could not be built.
+ */
+static int scan_string(Token *tok, Tokenizer *tokz, bool escapes)
+{
+	STRING_DECL(s);
+	int c;
+	int err=0;
+
+	while((c=GETCH())!='"'){
+		if(c=='\n'){
+			UNGETCH(c);
+			err=E_TOKZ_UNEXPECTED_EOL;
+			break;
+		}
+		
+		if(escapes && c=='\\'){
+			c=scan_char_escape(tokz);
+			/* -2: the escape ran into a newline (already pushed back) */
+			if(c==-2){
+				err=E_TOKZ_UNEXPECTED_EOL;
+				break;
+			}
+		}
+		
+		/* Covers EOF read directly as well as EOF inside an escape. */
+		if(c==EOF){
+			err=E_TOKZ_UNEXPECTED_EOF;
+			break;
+		}
+		
+		STRING_APPEND(s, c);
+	}
+	
+	if(err!=0){
+		STRING_FREE(s);
+		return err;
+	}
+	
+	STRING_FINISH(s);
+	TOK_SET_STRING(tok, s);
+
+	return 0;
+}
+
+
+/* */
+
+
+/*
+ * Scan a character constant into tok. The opening single quote has already
+ * been consumed; a backslash introduces an escape decoded by
+ * scan_char_escape().
+ *
+ * Returns 0 on success, E_TOKZ_UNEXPECTED_EOF if the input ends inside the
+ * constant, E_TOKZ_UNEXPECTED_EOL if the line ends inside it, or
+ * E_TOKZ_MULTICHAR if the character is followed by something other than
+ * the closing quote.
+ *
+ * A newline met here is always pushed back, as in scan_string(), so that
+ * the main loop sees it and keeps the line count right.
+ */
+static int scan_char(Token *tok, Tokenizer *tokz)
+{
+	int c, c2;
+	
+	c=GETCH();
+	
+	if(c==EOF)
+		return E_TOKZ_UNEXPECTED_EOF;
+	
+	if(c=='\n'){
+		UNGETCH(c);
+		return E_TOKZ_UNEXPECTED_EOL;
+	}
+
+	if(c=='\\'){
+		c=scan_char_escape(tokz);	
+		
+		if(c==EOF)
+			return E_TOKZ_UNEXPECTED_EOF;
+		
+		/* -2: newline inside the escape, already pushed back */
+		if(c==-2)
+			return E_TOKZ_UNEXPECTED_EOL;
+	}
+	
+	c2=GETCH();
+	
+	/* An unterminated constant is reported as such rather than as a
+	 * multi-character one.
+	 */
+	if(c2==EOF)
+		return E_TOKZ_UNEXPECTED_EOF;
+	
+	if(c2=='\n'){
+		UNGETCH(c2);
+		return E_TOKZ_UNEXPECTED_EOL;
+	}
+	
+	if(c2!='\'')
+		return E_TOKZ_MULTICHAR;
+	
+	TOK_SET_CHAR(tok, c);
+	
+	return 0;
+}
+
+
+/* */
+
+
+/*
+ * True if X can begin an identifier: a letter, '_' or '$'.
+ * X is evaluated more than once, so pass a plain variable.
+ */
+#define START_IDENT(X) (isalpha(X) || (X)=='_' || (X)=='$')
+
+
+/*
+ * Collect an identifier into tok.
+ *
+ * c is the first character, already read (and accepted) by the caller. It
+ * is stored unconditionally; further characters are taken while they are
+ * alphanumeric, '_' or '$'. The first character that does not belong to
+ * the identifier is pushed back.
+ *
+ * Returns 0 on success, or -ENOMEM if the string could not be built.
+ */
+static int scan_identifier(Token *tok, Tokenizer *tokz, int c)
+{
+	STRING_DECL(s);
+	
+	for(;;){
+		STRING_APPEND(s, c);
+		c=GETCH();
+		
+		if(!isalnum(c) && c!='_' && c!='$')
+			break;
+	}
+	
+	UNGETCH(c);
+	
+	STRING_FINISH(s);
+	TOK_SET_IDENT(tok, s);
+
+	return 0;
+}
+
+
+#include "numparser2.h"
+#include "np-conv.h"
+
+
+/*
+ * Scan a numeric constant into tok.
+ *
+ * c is the first character, already read by the caller. parse_number()
+ * does the lexical work; the result is then converted to a long or a
+ * double depending on the type it reports.
+ *
+ * Returns 0 on success, the error from parse_number() or from the
+ * conversion if one fails, or E_TOKZ_NUMFMT if the parsed number is
+ * neither an integer nor a float.
+ */
+static int scan_number(Token *tok, Tokenizer *tokz, int c)
+{
+	NPNum num=NUM_INIT;
+	int err;
+	
+	err=parse_number(&num, tokz, c);
+	if(err!=0)
+		return err;
+	
+	if(num.type==NPNUM_INT){
+		long ival;
+		
+		err=num_to_long(&ival, &num, TRUE);
+		if(err!=0)
+			return err;
+		
+		TOK_SET_LONG(tok, ival);
+		return 0;
+	}
+	
+	if(num.type==NPNUM_FLOAT){
+		double fval;
+		
+		err=num_to_double(&fval, &num);
+		if(err!=0)
+			return err;
+		
+		TOK_SET_DOUBLE(tok, fval);
+		return 0;
+	}
+	
+	return E_TOKZ_NUMFMT;
+}
+
+
+/* */
+
+
+/*
+ * Bitmap of the 7-bit characters that scan_op() accepts as operator
+ * characters: bit (ch&7) of byte (ch>>3) is set when ch qualifies, which is
+ * how map_isset() reads it. Each row comment shows the eight characters of
+ * that byte in ascending order, with '_' standing for an unset bit.
+ *
+ * NOTE(review): '~' (126) is not set here although the commented-out case
+ * list in scan_op() mentions it -- confirm whether that is intended.
+ */
+static uchar op_map[]={
+	0x00,		/* ________ 0-7     */
+	0x00,		/* ________ 8-15    */
+	0x00,		/* ________ 16-23   */
+	0x00,		/* ________ 24-31   */
+	0x62,		/* _!___%&_ 32-39   */
+	0xff,		/* ()*+,-./ 40-47   */
+	0x00,		/* ________ 48-55   */
+	0xfc,		/* __:;<=>? 56-63   */
+	0x01,		/* @_______ 64-71   */
+	0x00,		/* ________ 72-79   */
+	0x00,		/* ________ 80-87   */
+	0x78,		/* ___[\]^_ 88-95   (0x78 sets the backslash bit too) */
+	0x00,		/* ________ 96-103  */
+	0x00,		/* ________ 104-111 */
+	0x00,		/* ________ 112-119 */
+	0x38		/* ___{|}__ 120-127 */
+};
+
+
+/*
+ * Test character ch against a 128-bit character map (16 bytes, lowest bit
+ * of byte 0 = character 0).
+ *
+ * Returns FALSE for ch above 127; otherwise the masked map byte, i.e. zero
+ * when the bit is clear and a non-zero value (not necessarily 1) when set.
+ */
+static bool map_isset(uchar *map, uint ch)
+{
+	uint byte_ix, bit_ix;
+	
+	if(ch>127)
+		return FALSE;
+
+	byte_ix=ch>>3;
+	bit_ix=ch&7;
+
+	return map[byte_ix]&(1<<bit_ix);
+}
+
+
+/*
+ * Non-zero if ch is marked as an operator character in op_map.
+ */
+static bool is_opch(uint ch)
+{
+	bool marked=map_isset(op_map, ch);
+	
+	return marked;
+}
+
+
+/*
+ * Scan an operator into tok. c is its first character, already read.
+ *
+ * The operator value packs the characters of the operator into one int:
+ * first character in bits 0-7, second in bits 8-15, third in bits 16-23.
+ *
+ * For + - * % ^ ! = < > the following forms are recognised:
+ *   X=      any of them followed by '='
+ *   XX      the character doubled, except for % ! and *
+ *   <<= >>= a doubled '<' or '>' followed by '='
+ * Every other operator character (and any of the above not followed by a
+ * matching continuation) is a one-character operator. '/' never reaches
+ * this function through tokz_get_token(), which handles it itself.
+ *
+ * A look-ahead character that does not belong to the operator is pushed
+ * back.
+ *
+ * Returns 0 on success or E_TOKZ_INVALID_CHAR if c is not an operator
+ * character according to is_opch().
+ */
+static int scan_op(Token *tok, Tokenizer *tokz,  int c)
+{
+	int next, third;
+	int op;
+	
+	if(!is_opch(c))
+		return E_TOKZ_INVALID_CHAR;
+
+	/* Start from the single-character operator and widen as needed. */
+	op=c;
+
+	switch(c){
+	case '+':
+	case '-':
+	case '*':
+	case '%':
+	case '^':
+	case '!':
+	case '=':
+	case '<':
+	case '>':
+		next=GETCH();
+		
+		if(next=='='){
+			op|=next<<8;
+			break;
+		}
+		
+		/* Not a doubled character, or one that may not be doubled. */
+		if(next!=c || c=='%' || c=='!' || c=='*'){
+			UNGETCH(next);
+			break;
+		}
+		
+		op|=next<<8;
+		
+		/* Only the shifts have a three-character assignment form. */
+		if(c!='<' && c!='>')
+			break;
+		
+		third=GETCH();
+		
+		if(third=='=')
+			op|=third<<16;
+		else
+			UNGETCH(third);
+		
+		break;
+		
+	default:
+		/* Any other operator character stands for itself. */
+		break;
+	}
+	
+	TOK_SET_OP(tok, op);
+
+	return 0;
+}
+
+
+/* */
+
+
+/*
+ * Report tokenizer error e for the input named by tokz->name.
+ *
+ * Negative codes are negated errno values and are described with
+ * strerror(); non-negative codes index the errors[] table above. For
+ * E_TOKZ_UNEXPECTED_EOF the line number passed on is forced to 0.
+ *
+ * NOTE(review): e is not range-checked against the size of errors[].
+ */
+void tokz_warn_error(const Tokenizer *tokz, int line, int e)
+{
+	const char *msg;
+	
+	if(e==E_TOKZ_UNEXPECTED_EOF)
+		line=0;
+	
+	msg=(e<0 ? strerror(-e) : TR(errors[e]));
+	
+	warn_obj_line(tokz->name, line, "%s", msg);
+}
+
+
+/*
+ * Read the next token from tokz into tok.
+ *
+ * Whatever tok held before is released with tok_free() first. Blanks other
+ * than newline are skipped, tok->line is set to the tokenizer's current
+ * line, and the first significant character selects the scanner to run.
+ *
+ * Returns TRUE when tok has been filled in (at end of input it holds
+ * OP_EOF). Returns FALSE on a lexical error; except where noted below the
+ * error has been reported through tokz_warn_error() before returning.
+ */
+bool tokz_get_token(Tokenizer *tokz, Token *tok)
+{
+	int c, c2, e;
+	
+	assert(tokz->file);
+	
+	tok_free(tok);
+	
+	/* Each pass either returns a token/error or `continue`s after
+	 * swallowing something that produces no token (ignored newline,
+	 * line continuation, skipped comment).
+	 */
+	while(1){
+	
+		e=0;
+		
+		/* Skip blanks; newline is significant and handled below. */
+		do{
+			c=GETCH();
+		}while(c!='\n' && c!=EOF && isspace(c));
+	
+		tok->line=tokz->line;
+	
+		switch(c){
+		case EOF:
+			TOK_SET_OP(tok, OP_EOF);
+			return TRUE;
+			
+		case '\n':
+			INC_LINE();
+			
+			if(tokz->flags&TOKZ_IGNORE_NEXTLINE)
+				continue;
+			
+			TOK_SET_OP(tok, OP_NEXTLINE);
+			
+			return TRUE;
+			
+		case '\\':
+			/* Line continuation: only blanks may follow the backslash
+			 * up to the end of the line.
+			 */
+			do{
+				c=GETCH();
+				if(c==EOF){
+					/* NOTE(review): this is the one FALSE return
+					 * that is not preceded by a warning.
+					 */
+					TOK_SET_OP(tok, OP_EOF);
+					return FALSE;
+				}
+				if(!isspace(c)){
+					tokz_warn_error(tokz, tokz->line, E_TOKZ_EOL_EXPECTED);
+					return FALSE;
+				}
+			}while(c!='\n');
+			
+			INC_LINE();
+			continue;
+
+		case '#':
+			/* Line comment: returned as a token only on request. */
+			if(tokz->flags&TOKZ_READ_COMMENTS){
+				e=scan_line_comment(tok, tokz);
+				break;
+			}else if((e=skip_line_comment(tokz))){
+				break;
+			}
+			
+			continue;
+			
+		case '/':
+			{
+				/* '/' is either "/=", plain division, or the start of
+				 * a C-style comment; decide on the next character.
+				 */
+				c2=GETCH();
+				
+				if(c2=='='){
+					TOK_SET_OP(tok, OP_AS_DIV);
+					return TRUE;
+				}
+				
+				if(c2!='*'){
+					UNGETCH(c2);
+					TOK_SET_OP(tok, OP_DIV);
+					return TRUE;
+				}
+
+				if(tokz->flags&TOKZ_READ_COMMENTS){
+					e=scan_c_comment(tok, tokz);
+					break;
+				}else if((e=skip_c_comment(tokz))){
+					break;
+				}
+				
+				continue;
+			}
+			
+		case '\"':
+			e=scan_string(tok, tokz, TRUE);
+			break;
+
+		case '\'':
+			e=scan_char(tok, tokz);
+			break;
+
+		default: 
+			/* A leading sign is handed to the number scanner as well. */
+			if(('0'<=c && c<='9') || c=='-' || c=='+'){
+				e=scan_number(tok, tokz, c);
+				break;
+			}
+
+		 	if(START_IDENT(c))
+				e=scan_identifier(tok, tokz, c);
+			else
+				e=scan_op(tok, tokz, c);
+		}
+		
+		/* Reached only via `break` out of the switch: e is the result
+		 * of whichever scanner ran.
+		 */
+		if(!e)
+			return TRUE;
+		
+		tokz_warn_error(tokz, tokz->line, e);
+		return FALSE;
+	}
+}
+
+
+/*
+ * Open the file fname for reading and create a tokenizer on it.
+ *
+ * Returns the new tokenizer, or NULL if the file could not be opened (the
+ * failure is reported through warn_err_obj()) or the tokenizer could not
+ * be created (the file is closed again).
+ *
+ * The tokenizer stores the fname pointer itself, not a copy, so the string
+ * has to stay valid for as long as the tokenizer is in use.
+ */
+Tokenizer *tokz_open(const char *fname)
+{ 
+	Tokenizer *tokz;
+	FILE *fp=fopen(fname, "r");
+	
+	if(fp==NULL){
+		warn_err_obj(fname);
+		return NULL;
+	}
+	
+	tokz=tokz_open_file(fp);
+	
+	if(tokz==NULL){
+		fclose(fp);
+		return NULL;
+	}
+	
+	tokz->name=fname;
+	
+	return tokz;
+}
+
+
+/*
+ * Create a tokenizer that reads from the already open stream `file`.
+ *
+ * The new tokenizer starts at line 1 with no name, no flags, an empty
+ * push-back slot (-1), no option stack and nesting level 0.
+ *
+ * Returns the tokenizer, or NULL if it could not be allocated (reported
+ * through warn_err()); `file` is left untouched in that case.
+ */
+Tokenizer *tokz_open_file(FILE *file)
+{
+	Tokenizer *fresh=ALLOC(Tokenizer);
+	
+	if(fresh==NULL){
+		warn_err();
+		return NULL;
+	}
+	
+	/* Input source */
+	fresh->file=file;
+	fresh->name=NULL;
+	
+	/* Scanner state */
+	fresh->line=1;
+	fresh->ungetc=-1;
+	
+	/* Options and nesting */
+	fresh->flags=0;
+	fresh->optstack=NULL;
+	fresh->nest_lvl=0;
+	
+	return fresh;
+}
+
+
+/*
+ * Dispose of a tokenizer: close its stream, if it has one, and free the
+ * tokenizer structure. tokz itself must not be NULL.
+ */
+void tokz_close(Tokenizer *tokz)
+{
+	FILE *fp=tokz->file;
+	
+	if(fp!=NULL)
+		fclose(fp);
+
+	free(tokz);
+}
+
+
+/* */
+
+
+/*
+ * Release what a token owns and mark it invalid.
+ *
+ * The string value is freed when TOK_IS_STRING() holds for the token.
+ * NOTE(review): whether that test also covers identifier and comment
+ * tokens, which carry allocated strings too, depends on the header --
+ * confirm.
+ */
+void tok_free(Token *tok)
+{
+	if(TOK_IS_STRING(tok)){
+		free(TOK_STRING_VAL(tok));
+	}
+	
+	tok->type=TOK_INVALID;
+}
+
+
+/*
+ * Put a token into its initial state, as given by the TOK_INIT
+ * initializer. The previous contents are overwritten without being freed.
+ */
+void tok_init(Token *tok)
+{
+	static const Token blank=TOK_INIT;
+	
+	*tok=blank;
+}
+

mercurial