tokenizer.c

Tue, 15 Feb 2005 18:57:52 +0100

author
tailor@f281.ttorni.ton.tut.fi
date
Tue, 15 Feb 2005 18:57:52 +0100
changeset 0
86b7f6f9c5c0
child 1
6e704fc09528
permissions
-rw-r--r--

Tailorization of trunk
Import of the upstream sources from the repository

http://tao.uab.es/ion/svn/libtu/trunk

as of revision 2

/*
 * libtu/tokenizer.c
 *
 * Copyright (c) Tuomo Valkonen 1999-2000.
 * 
 * This file is distributed under the terms of the "Artistic License".
 * See the included file LICENSE for details.
 */

#include <errno.h>
#include <stdio.h>
#include <ctype.h>
#include <malloc.h>
#include <limits.h>
#include <assert.h>
#include <math.h>
#include <string.h>

#include "include/tokenizer.h"
#include "include/misc.h"
#include "include/output.h"


/* Error message table indexed by the E_TOKZ_* codes; looked up (and
 * translated) by tokz_warn_error(). Two message typos fixed:
 * "numberic" -> "numeric", "nestin" -> "nesting". */
static const char *errors[]={
	DUMMY_TR("(no error)"),
	DUMMY_TR("Unexpected end of file"),				/* E_TOKZ_UNEXPECTED_EOF */
	DUMMY_TR("Unexpected end of line"),				/* E_TOKZ_UNEXPECTED_EOL */
	DUMMY_TR("End of line expected"),				/* E_TOKZ_EOL_EXPECTED */
	DUMMY_TR("Invalid character"),					/* E_TOKZ_INVALID_CHAR*/
	DUMMY_TR("Numeric constant too big"),			/* E_TOKZ_TOOBIG */
	DUMMY_TR("Invalid numeric format"),				/* E_TOKZ_NUMFMT */
	DUMMY_TR("Junk after numeric constant"),		/* E_TOKZ_NUM_JUNK */
	DUMMY_TR("Not an integer"),						/* E_TOKZ_NOTINT */
	DUMMY_TR("Numeric constant out of range"),		/* E_TOKZ_RANGE */
	DUMMY_TR("Multi-character character constant"),	/* E_TOKZ_MULTICHAR */
	DUMMY_TR("Token/statement limit reached"),		/* E_TOKZ_TOKEN_LIMIT */
	DUMMY_TR("Unknown option"),						/* E_TOKZ_UNKONWN_OPTION */
	DUMMY_TR("Syntax error"),						/* E_TOKZ_SYNTAX */
	DUMMY_TR("Invalid argument"),					/* E_TOKZ_INVALID_ARGUMENT */
	DUMMY_TR("End of statement expected"),			/* E_TOKZ_EOS_EXPECTED */
	DUMMY_TR("Too few arguments"),					/* E_TOKZ_TOO_FEW_ARGS */
	DUMMY_TR("Too many arguments"),					/* E_TOKZ_TOO_MANY_ARGS */
	DUMMY_TR("Maximum section nesting level exceeded"), /* E_TOK_Z_MAX_NEST */
	DUMMY_TR("Unexpected end of statement"),		/* E_TOKZ_UNEXPECTED_EOS */
	DUMMY_TR("Identifier expected"),				/* E_TOKZ_IDENTIFIER_EXPECTED */
};


/* */

/* Growing-string helpers: characters are staged in a small stack buffer
 * (X##_tmp) and flushed into the heap string X as the buffer fills. */

#define STRBLEN 32

/* Declare string X: heap part (NULL until needed), stage buffer, fill count. */
#define STRING_DECL(X) char* X=NULL; char X##_tmp[STRBLEN]; int X##_tmpl=0
/* As STRING_DECL, but pre-load the buffer with the string literal P. */
#define STRING_DECL_P(X, P) char* X=NULL; char X##_tmp[STRBLEN]=P; int X##_tmpl=sizeof(P)-1
/* Append character C to X; on allocation failure return -ENOMEM from the
 * enclosing function. Fixed: the expansion previously used the hard-coded
 * name 'c' instead of the macro argument C, silently capturing whatever
 * local 'c' the caller happened to have in scope. */
#define STRING_APPEND(X, C) {if(!_string_append(&X, X##_tmp, &X##_tmpl, C)) return -ENOMEM;}
#define STRING_FREE(X) if(X!=NULL) free(X)
#define STRING_FINISH(X) {if(!_string_finish(&X, X##_tmp, X##_tmpl)) return -ENOMEM;}


/* Append c to the staged buffer tmp (fill count *tmplen); when the buffer
 * reaches STRBLEN-1 characters it is NUL-terminated and flushed onto the
 * heap string *p before c is staged. Returns FALSE on allocation failure. */
static bool _string_append(char **p, char *tmp, int *tmplen, char c)
{
	if(*tmplen!=STRBLEN-1){
		/* Room left: just stage the character. */
		tmp[(*tmplen)++]=c;
		return TRUE;
	}
	
	/* Buffer full: terminate and move its contents to the heap string. */
	tmp[STRBLEN-1]='\0';
	if(*p==NULL){
		*p=scopy(tmp);
	}else{
		char *joined=scat(*p, tmp);
		free(*p);
		*p=joined;
	}
	
	tmp[0]=c;
	*tmplen=1;
	
	return *p!=NULL;
}


/* Flush any remaining staged characters into *p and guarantee that *p is
 * a valid heap string (possibly empty). Returns FALSE on allocation failure. */
static bool _string_finish(char **p, char *tmp, int tmplen)
{
	if(tmplen>0){
		char *joined;
		
		tmp[tmplen]='\0';
		if(*p==NULL){
			*p=scopy(tmp);
		}else{
			joined=scat(*p, tmp);
			free(*p);
			*p=joined;
		}
	}else if(*p==NULL){
		/* Nothing was ever appended: produce an empty string. */
		*p=scopy("");
	}
	
	return *p!=NULL;
}


/* */


/* Shorthands for the scanner functions below. NOTE: these expand the
 * variable name 'tokz', so they may only be used where a Tokenizer*
 * named tokz is in scope. */
#define INC_LINE() tokz->line++
#define GETCH() _getch(tokz)
#define UNGETCH(C) _ungetch(tokz, C)

/* Read one character: return the pushed-back character if there is one,
 * otherwise the next character from the stream (EOF at end of input). */
static int _getch(Tokenizer *tokz)
{
	int c=tokz->ungetc;
	
	if(c==-1)
		return getc(tokz->file);
	
	/* Consume the one-character pushback slot. */
	tokz->ungetc=-1;
	return c;
}


/* Push back one character; the tokenizer keeps its own single-slot
 * pushback instead of using ungetc() on the stream. */
static void _ungetch(Tokenizer *tokz, int c)
{
	tokz->ungetc=c;
}


/* */


/* Read a '#' line comment (the '#' itself is already consumed and is
 * re-added as a prefix) into *tok as a comment token. The terminating
 * newline/EOF is pushed back for the main loop.
 * NOTE(review): as written, STRING_APPEND's expansion hard-codes the
 * variable name 'c', so the character variable here must be named 'c'. */
static int scan_line_comment(Token *tok, Tokenizer *tokz)
{
	STRING_DECL_P(s, "#");
	int c;

	c=GETCH();
				
	/* Collect everything up to, but not including, end of line/file. */
	while(c!='\n' && c!=EOF){
		STRING_APPEND(s, c);
		c=GETCH();
	}

	/* Leave the terminator for the caller (it handles line counting). */
	UNGETCH(c);

	STRING_FINISH(s);
	
	TOK_SET_COMMENT(tok, s);
	
	return 0;
}


/* Discard the rest of a '#' comment line; the terminating newline/EOF
 * is pushed back so the main loop can count lines. Always returns 0. */
static int skip_line_comment(Tokenizer *tokz)
{
	int c=GETCH();
	
	while(c!='\n' && c!=EOF)
		c=GETCH();

	UNGETCH(c);
		
	return 0;
}


/* */


/* Read a C-style comment (the opening slash-star already consumed, re-added
 * as a prefix) into *tok, counting newlines. Returns 0 or an E_TOKZ_* code.
 * NOTE(review): as written, STRING_APPEND's expansion hard-codes the
 * variable name 'c', so the character variable here must be named 'c'. */
static int scan_c_comment(Token *tok, Tokenizer *tokz)
{
	STRING_DECL_P(s, "/*");
	int c;
	int st=0;	/* st==1: previous character was '*' */
	
	while(1){
		c=GETCH();
		
		if(c==EOF){
			STRING_FREE(s);
			return E_TOKZ_UNEXPECTED_EOF;
		}
		
		STRING_APPEND(s, c);
		
		if(c=='\n'){
			INC_LINE();
		}else if(st==0 && c=='*'){
			st=1;
		}else if(st==1){
			if(c=='/')
				break;
			/* Stay armed on a run of stars: the original reset st to 0
			 * here, so a comment ending in two or more '*' before the
			 * closing '/' was never terminated. */
			st=(c=='*');
		}
	}

	STRING_FINISH(s);

	TOK_SET_COMMENT(tok, s);

	return 0;
}


/* Skip over a C-style comment (opening slash-star already consumed),
 * counting newlines. Returns 0 or E_TOKZ_UNEXPECTED_EOF. */
static int skip_c_comment(Tokenizer *tokz)
{
	int c;
	int st=0;	/* st==1: previous character was '*' */
	
	while(1){
		c=GETCH();
		
		if(c==EOF)
			return E_TOKZ_UNEXPECTED_EOF;
		
		if(c=='\n')
			INC_LINE();
		else if(st==0 && c=='*')
			st=1;
		else if(st==1){
			if(c=='/')
				break;
			/* Stay armed on a run of stars: the original reset st to 0
			 * here, missing terminators preceded by multiple '*'. */
			st=(c=='*');
		}
	}
	
	return 0;
}


/* */


/* Parse the remainder of a character escape after the backslash.
 * Supports the named escapes \n \r \t \b \a \e, hex (\xHH), decimal
 * (\dDDD, or a leading 8/9), and octal (\OOO) forms. Returns the
 * character value, EOF at end of file, or -2 if the escape was cut
 * short by a newline (which is pushed back for the caller). */
static int scan_char_escape(Tokenizer *tokz)
{
	static char* special_chars="nrtbae";
	static char* specials="\n\r\t\b\a\033";
	int base, max;
	int i ,c;
	
	c=GETCH();
	
	for(i=0;special_chars[i];i++){
		if(special_chars[i]==c)
			return specials[i];	/* was specials[c]: indexed the table
						 * with the escape letter itself,
						 * reading far out of bounds */
	}
	
	if(c=='x' || c=='X'){
		base=16;max=2;i=0;
	}else if(c=='d' || c=='D'){
		base=10;max=3;i=0;
	}else if(c=='8' || c=='9'){
		base=10;max=2;i=c-'0';
	}else if('0'<=c && c<='7'){
		base=8;max=2;i=c-'0';
	}else if(c=='\n'){
		UNGETCH(c);
		return -2;
	}else{
		/* Unknown escape: the character stands for itself. */
		return c;
	}
	
		
	while(--max>=0){
		c=GETCH();
		
		if(c==EOF)
			return EOF;
		
		if(c=='\n'){
			UNGETCH(c);
			return -2;
		}
		
		if(base==16){
			if(!isxdigit(c))
				break;
			
			i<<=4;
			
			if(isdigit(c))
				i+=c-'0';
			else if(c>='a')		/* was i>='a': tested the accumulator */
				i+=0xa+c-'a';
			else
				i+=0xa+c-'A';	/* uppercase hex digits A-F */
			
		}else if(base==10){
			if(!isdigit(c))
				break;
			i*=10;
			i+=c-'0';
		}else{
			if(c<'0' || c>'7')
				break;
			i<<=3;
			i+=c-'0';
		}
	}
	
	/* Loop ended on a non-digit: give it back. */
	if(max>=0)
		UNGETCH(c);

	return i;
}


/* */


/* Read a double-quoted string (opening quote already consumed) into *tok.
 * If escapes is TRUE, backslash escapes are expanded via scan_char_escape().
 * Strings may not span lines. Returns 0 or an E_TOKZ_* error code.
 * NOTE(review): as written, STRING_APPEND's expansion hard-codes the
 * variable name 'c', so the character variable here must be named 'c'. */
static int scan_string(Token *tok, Tokenizer *tokz, bool escapes)
{
	STRING_DECL(s);
	int c;

	while(1){	
		c=GETCH();
		
		if(c=='"')
			break;
		
		/* Unterminated string: push the newline back so the main
		 * loop still sees it for line counting. */
		if(c=='\n'){
			UNGETCH(c);
			STRING_FREE(s);
			return E_TOKZ_UNEXPECTED_EOL;
		}
		
		if(c=='\\' && escapes){
			c=scan_char_escape(tokz);
			if(c==-2){	/* escape interrupted by end of line */
				STRING_FREE(s);
				return E_TOKZ_UNEXPECTED_EOL;
			}
		}
		
		if(c==EOF){
			STRING_FREE(s);
			return E_TOKZ_UNEXPECTED_EOF;
		}
		
		STRING_APPEND(s, c);
	}
	
	STRING_FINISH(s);
	
	TOK_SET_STRING(tok, s);

	return 0;
}


/* */


/* Read a character constant (opening quote already consumed) into *tok.
 * Exactly one character (possibly an escape) followed by the closing
 * quote is accepted. Returns 0 or an E_TOKZ_* error code. */
static int scan_char(Token *tok, Tokenizer *tokz)
{
	int ch, closer;
	
	ch=GETCH();
	
	switch(ch){
	case EOF:
		return E_TOKZ_UNEXPECTED_EOF;
	case '\n':
		return E_TOKZ_UNEXPECTED_EOL;
	case '\\':
		ch=scan_char_escape(tokz);
		if(ch==EOF)
			return E_TOKZ_UNEXPECTED_EOF;
		if(ch==-2)
			return E_TOKZ_UNEXPECTED_EOL;
		break;
	}
	
	closer=GETCH();
	
	if(closer!='\'')
		return E_TOKZ_MULTICHAR;
	
	TOK_SET_CHAR(tok, ch);
	
	return 0;
}


/* */


/* True if X may start an identifier: a letter, '_' or '$'. */
#define START_IDENT(X) (isalpha(X) || X=='_' || X=='$')


/* Read an identifier whose first character c has already been consumed;
 * subsequent characters may be alphanumerics, '_' or '$'. The first
 * non-identifier character is pushed back. Returns 0 (or -ENOMEM via
 * the string macros).
 * NOTE(review): as written, STRING_APPEND's expansion hard-codes the
 * variable name 'c', so the parameter here must be named 'c'. */
static int scan_identifier(Token *tok, Tokenizer *tokz, int c)
{
	STRING_DECL(s);
	
	do{
		STRING_APPEND(s, c);
		c=GETCH();
	}while(isalnum(c) || c=='_' || c=='$');
	
	UNGETCH(c);
	
	STRING_FINISH(s);
	
	TOK_SET_IDENT(tok, s);

	return 0;
}


#include "numparser2.h"
#include "np-conv.h"


/* Parse a numeric literal whose first character is c, storing either a
 * long (integer literals) or a double (float literals) in *tok.
 * Returns 0 or an error code from the number parser / conversion. */
static int scan_number(Token *tok, Tokenizer *tokz, int c)
{
	NPNum num=NUM_INIT;
	int err;
	
	err=parse_number(&num, tokz, c);
	if(err!=0)
		return err;
	
	if(num.type==NPNUM_INT){
		long lval;
		
		err=num_to_long(&lval, &num, TRUE);
		if(err!=0)
			return err;
		TOK_SET_LONG(tok, lval);
	}else if(num.type==NPNUM_FLOAT){
		double dval;
		
		err=num_to_double(&dval, &num);
		if(err!=0)
			return err;
		TOK_SET_DOUBLE(tok, dval);
	}else{
		return E_TOKZ_NUMFMT;
	}

	return 0;
}


/* */


/* 128-bit bitmap of operator characters: bit (ch & 7) of op_map[ch >> 3]
 * is set iff ASCII code ch may appear in an operator (tested by
 * map_isset()). Each row below covers eight codes; the comment shows
 * which of the eight are set. */
static uchar op_map[]={
	0x00,		/* ________ 0-7     */
	0x00,		/* ________ 8-15    */
	0x00,		/* ________ 16-23   */
	0x00,		/* ________ 24-31   */
	0x62,		/* _!___%&_ 32-39   */
	0xff,		/* ()*+,-./ 40-47   */
	0x00,		/* ________ 48-55   */
	0xfc,		/* __:;<=>? 56-63   */
	0x01,		/* @_______ 64-71   */
	0x00,		/* ________ 72-79   */
	0x00,		/* ________ 80-87   */
	0x78,		/* ___[_]^_ 88-95   */
	0x00,		/* ________ 96-103  */
	0x00,		/* ________ 104-111 */
	0x00,		/* ________ 112-119 */
	0x38		/* ___{|}__ 120-127 */
};


/* Test bit ch of a 128-entry character bitmap (one bit per ASCII code);
 * codes above 127 are never set. */
static bool map_isset(uchar *map, uint ch)
{
	if(ch>127)
		return FALSE;
	
	return (map[ch/8]>>(ch%8))&1;
}


/* True if ch may appear in an operator token (see op_map above). */
static bool is_opch(uint ch)
{
	return map_isset(op_map, ch);
}


/* Scan an operator token beginning with c. Multi-character operators are
 * packed one character per byte into op: c | (c2<<8) | (c3<<16), so e.g.
 * "<=" and "<<=" each get a distinct code. Returns 0, or
 * E_TOKZ_INVALID_CHAR if c is not an operator character at all. */
static int scan_op(Token *tok, Tokenizer *tokz,  int c)
{
	int c2;
	int op=-1;
	
	/* Quickly check it is an operator character */
	if(!is_opch(c))
		return E_TOKZ_INVALID_CHAR;

	switch(c){
	case '+':
	case '-':
	case '*':
/*	case '/':	 Checked elsewhere */
	case '%':
	case '^':
	case '!':
	case '=':
	case '<':
	case '>':
		c2=GETCH();
		if(c2=='='){
			/* assignment/comparison forms: +=, -=, ==, <=, ... */
			op=c|(c2<<8);
		}else if(c2==c && (c2!='%' && c2!='!' && c2!='*')){
			/* doubled forms (++, --, <<, >>, ...); %%, !! and the
			 * doubled star are deliberately not operators */
			if(c=='<' || c=='>'){
				/* possibly a shift-assign: <<= or >>= */
				int c3=GETCH();
				if(c3=='='){
					op=c|(c2<<8)|(c3<<16);
				}else{
					UNGETCH(c3);
					op=c|(c2<<8);
				}
			}else{
				op=c|(c2<<8);
			}
		}else{
			/* single-character operator; give back the lookahead */
			UNGETCH(c2);
			op=c;
		}
		break;
		
	/* It is already known that it is a operator so these are not needed
	case ':':
	case '~':
	case '?':
	case '.':
	case ';';
	case '{':
	case '}':
	case '@':		
	case '|':
	case '&':
	*/
	default:
		op=c;
	}
	
	TOK_SET_OP(tok, op);

	return 0;
}


/* */


/* Print a diagnostic for error e on the given line of tokz's input.
 * Negative codes are negated errno values; non-negative codes index the
 * errors[] table. Unexpected-EOF errors are not tied to any one line. */
void tokz_warn_error(const Tokenizer *tokz, int line, int e)
{
	const char *msg;
	
	if(e==E_TOKZ_UNEXPECTED_EOF)
		line=0;
	
	msg = (e<0) ? strerror(-e) : TR(errors[e]);
	warn_obj_line(tokz->name, line, "%s", msg);
}


/* Read the next token from tokz into *tok (any previous value of *tok is
 * freed first). Returns TRUE on success — including an OP_EOF token at end
 * of input — and FALSE after printing a diagnostic on error. Newlines are
 * themselves tokens (OP_NEXTLINE) unless TOKZ_IGNORE_NEXTLINE is set;
 * comments become tokens only with TOKZ_READ_COMMENTS. */
bool tokz_get_token(Tokenizer *tokz, Token *tok)
{
	int c, c2, e;
	
	assert(tokz->file);
	
	tok_free(tok);
	
	while(1){
	
		e=0;
		
		/* Skip whitespace, but not newlines — those are tokens. */
		do{
			c=GETCH();
		}while(c!='\n' && c!=EOF && isspace(c));
	
		tok->line=tokz->line;
	
		switch(c){
		case EOF:
			TOK_SET_OP(tok, OP_EOF);
			return TRUE;
			
		case '\n':
			INC_LINE();
			
			if(tokz->flags&TOKZ_IGNORE_NEXTLINE)
				continue;
			
			TOK_SET_OP(tok, OP_NEXTLINE);
			
			return TRUE;
			
		case '\\':
			/* Line continuation: only whitespace may follow the
			 * backslash up to the newline. */
			do{
				c=GETCH();
				if(c==EOF){
					TOK_SET_OP(tok, OP_EOF);
					return FALSE;
				}
				if(!isspace(c)){
					tokz_warn_error(tokz, tokz->line, E_TOKZ_EOL_EXPECTED);
					return FALSE;
				}
			}while(c!='\n');
			
			INC_LINE();
			continue;

		case '#':
			/* Line comment: tokenized or skipped depending on flags. */
			if(tokz->flags&TOKZ_READ_COMMENTS){
				e=scan_line_comment(tok, tokz);
				break;
			}else if((e=skip_line_comment(tokz))){
				break;
			}
			
			continue;
			
		case '/':
			/* Slash may begin /=, plain division, or a C comment. */
			{
				c2=GETCH();
				
				if(c2=='='){
					TOK_SET_OP(tok, OP_AS_DIV);
					return TRUE;
				}
				
				if(c2!='*'){
					UNGETCH(c2);
					TOK_SET_OP(tok, OP_DIV);
					return TRUE;
				}

				if(tokz->flags&TOKZ_READ_COMMENTS){
					e=scan_c_comment(tok, tokz);
					break;
				}else if((e=skip_c_comment(tokz))){
					break;
				}
				
				continue;
			}
			
		case '\"':
			e=scan_string(tok, tokz, TRUE);
			break;

		case '\'':
			e=scan_char(tok, tokz);
			break;

		default: 
			/* Numbers (including a sign), then identifiers, then
			 * operator characters. */
			if(('0'<=c && c<='9') || c=='-' || c=='+'){
				e=scan_number(tok, tokz, c);
				break;
			}

		 	if(START_IDENT(c))
				e=scan_identifier(tok, tokz, c);
			else
				e=scan_op(tok, tokz, c);
		}
		
		if(!e)
			return TRUE;
		
		/* A scanner reported an error: diagnose and give up. */
		tokz_warn_error(tokz, tokz->line, e);
		return FALSE;
	}
}


/* Open the file fname for tokenization. Returns NULL (after printing a
 * warning) if the file cannot be opened or allocation fails.
 * NOTE(review): tokz->name keeps the caller's fname pointer — the caller
 * must keep that string alive for the tokenizer's lifetime. */
Tokenizer *tokz_open(const char *fname)
{ 
	Tokenizer *tokz;
	FILE *file=fopen(fname, "r");
	
	if(file==NULL){
		warn_err_obj(fname);
		return NULL;
	}
	
	tokz=tokz_open_file(file);
	
	if(tokz==NULL){
		fclose(file);
		return NULL;
	}
	
	tokz->name=fname;
	return tokz;
}


/* Wrap an already-open stream in a new Tokenizer. The tokenizer takes
 * ownership of the stream: tokz_close() will fclose() it. Returns NULL
 * (after a warning) on allocation failure. */
Tokenizer *tokz_open_file(FILE *file)
{
	Tokenizer *tokz=ALLOC(Tokenizer);
	
	if(tokz==NULL){
		warn_err();
		return NULL;
	}
	
	/* Initial state: line 1, empty pushback slot, no options/nesting. */
	tokz->file=file;
	tokz->name=NULL;
	tokz->line=1;
	tokz->ungetc=-1;
	tokz->flags=0;
	tokz->optstack=NULL;
	tokz->nest_lvl=0;
	
	return tokz;
}


/* Close the tokenizer's stream (if any) and free the tokenizer itself. */
void tokz_close(Tokenizer *tokz)
{
	FILE *f=tokz->file;
	
	if(f!=NULL)
		fclose(f);
	
	free(tokz);
}


/* */


/* Release the heap string owned by string-valued tokens, then mark the
 * token invalid so a later tok_free() will not free it again. */
void tok_free(Token *tok)
{
	if(TOK_IS_STRING(tok))
		free(TOK_STRING_VAL(tok));
	
	tok->type=TOK_INVALID;
}


/* Reset *tok to the pristine TOK_INIT state. */
void tok_init(Token *tok)
{
	Token blank=TOK_INIT;
	
	*tok=blank;
}

mercurial