/* (C) Copyright International Business Machines Corporation 23 January */
/* 1990.  All Rights Reserved. */
/*  */
/* See the file USERAGREEMENT distributed with this software for full */
/* terms and conditions of use. */
%{	/* Lexical analyzer for a "mini C" parser */
	/** Andy Lowry, Mar 1989 */

	/* This lex source describes the tokenizer for a "miniature" C */
	/* parser used to parse lint library sources and make them */
	/* available for a rule-based program that generates Hermes */
	/* definition and process modules to make the library routines */
	/* accessible through Hermes */
%}

%{
/* SCCS version identification string for this source file */
static char sccsid[] = "@(#)minic.l	1.2 3/13/90";
%}

%{
#include <ctype.h>
#include "minic.h"		/* standard declarations */
#include "mctokens.h"		/* token definitions */

/* local declarations */
static char *strval = NULL;	/* allocated string returned via */
				/* yylval.strval, caller must copy it */
				/* to keep it because we deallocate it */
				/* automatically */


static void lexinit();		/* initializes some global vars */
static int bare_yylex();	/* this is the tokenizer generated by lex */
static int nonpp_yylex();	/* filters out and processes cpp line marks */
static int resword_id();	/* get token number for a reserved word */
static long my_atol();		/* convert any syntax of C integer */
				/* constant to long binary */
static char *get_qstring();	/* function to accumulate a quoted string */
static int get_qschar();	/* get a single char for a quoted */
				/* string, following std escape conventions */

/* We have our own yylex that sits on top of the lex-generated scanner. */
/* The macro below effectively renames the function generated by lex to */
/* bare_yylex and makes it local to this file.  The macro is removed */
/* again (#undef) at the start of the user code section, so that the */
/* public yylex can be defined there under its real name. */
#define yylex static int bare_yylex

%}

%{	/* Normal C Code rules are active in start condition CCODE. */
	/* The scanner enters CCODE on the very first input character */
	/* and comes back to it at the end of each comment and each */
	/* preprocessor line. */
%}
%start		CCODE

%{	/* Comments are handled by rules in start condition COMMENT, */
	/* which is entered from CCODE when a comment opener is seen */
%}
%start		COMMENT

%{	/* C Preprocessor lines begin with '#' and are handled */
	/* specially by scanning them with start condition PPLINE */
%}
%start		PPLINE

%{	/* basic character classes */
%}
letter		[a-zA-Z]
digit		[0-9]
sign		[+-]
whitespace	[ \t]
newline		"\n"

%{	/* some basic patterns */
	/* Integers may carry an optional sign and come in octal (leading */
	/* 0), decimal, or hexadecimal (leading 0x/0X) flavors; a trailing */
	/* l or L tags the constant as long. */
	/* A double is an optionally signed digit string, a literal */
	/* decimal point (quoted so it does not act as the lex "any */
	/* character" operator), optional fraction digits, and an */
	/* optional exponent whose own sign is optional as well. */
%}
octinteger	({sign})?0({digit})+
decinteger	({sign})?({digit})+
hexinteger	({sign})?0[xX](({digit})|[a-fA-F])+
integer		({octinteger})|({decinteger})|({hexinteger})
longinteger	({integer})[lL]
double		({sign})?({digit})+"."({digit})*([eE]({sign})?({digit})+)?
symbol		(({letter})|_)(({letter})|({digit})|_)*


%%

%{	/* Bootstrap: the very first character of input (whatever it is) */
	/* triggers one-time initialization, moves the scanner from the */
	/* INITIAL start condition to CCODE, and is then pushed back so */
	/* that the CCODE rules get to scan it properly. */
%}
<INITIAL>(.|\n) {
	char first;

	first = yytext[0];	/* the character that got us here */
	lexinit();
	BEGIN CCODE;		/* from now on use the C rules */
	unput(first);		/* rescan the character under them */
}


%{	/* Blanks and tabs carry no meaning in any start condition */
%}
{whitespace}	;

%{	/* Both normal C and preprocessor lines can contain quoted */
	/* strings, so here is the rule for them.  The opening quote */
	/* selects the delimiter; get_qstring() then consumes input up to */
	/* the matching close quote.  The text is handed to the parser */
	/* through yylval.strval and remains valid only until the next */
	/* quoted string is scanned, at which point it is freed. */
	/* If the string is malformed an error is reported and no token */
	/* is returned for it; scanning simply continues. */
%}

<CCODE,PPLINE>["'] {		/* beginning of a quoted string */
	char qchar;

	qchar = yytext[0];	/* remember delimiter char */
	if (strval != NULL)
	  free(strval);		/* free up stg for previous string if any */
	if ((strval = get_qstring(qchar)) == NULL) /* accumulate string */
	  yyerror("Badly formed string or character constant");
	else {
	  yylval.strval = strval; /* make the text visible to our caller */
	  switch(qchar) {	/* return token type based on delimiter */
	  case '\'': 
	    return(T_SQSTRING);
	  case '\"':
	    return(T_DQSTRING);
	  }
	}
}

%{	/* Rules that operate only in the CCODE start condition -- */
	/* these are the rules that handle the normal C syntax. */
	/* A newline just bumps the source line counter, a '#' at the */
	/* beginning of a line opens a preprocessor line (and emits a */
	/* begin marker), and a comment opener switches to the COMMENT */
	/* rules. */
%}
<CCODE>{newline}	{ linecount++; }

<CCODE>^#		{ BEGIN PPLINE; return(T_PPBEGIN); }

<CCODE>"/*"		{ BEGIN COMMENT; }

%{	/* A symbol is first tried as a reserved word; failing that it */
	/* is looked up in the symbol table and reported either as a */
	/* typedef name or as a plain identifier. */
%}
<CCODE>{symbol} {
	int resword;

	resword = resword_id(yytext);
	if (resword != 0)
	  return(resword);	/* reserved word: its own token */
	yylval.symval = lookup(yytext,-1);
	return(yylval.symval->type == S_TYPEDEF ? T_TYPEDEFNAME : T_IDENT);
}

%{	/* Numeric constants.  An untagged integer is converted as a */
	/* long and then stored as an int; if the stored int no longer */
	/* equals the long, the value is reported as a long instead. */
%}
<CCODE>{integer} {
	long int parsed = my_atol(yytext);

	yylval.intval = parsed;	/* try it as a normal-sized int */
	if (yylval.intval != parsed) {
	  yylval.longval = parsed; /* did not survive: use long */
	  return(T_LONG);
	}
	return(T_INT);
}

<CCODE>{longinteger} {
	yylval.longval = my_atol(yytext); /* explicitly tagged long */
	return(T_LONG);
}

<CCODE>{double} {
	/* NOTE(review): atod() is neither declared in this file nor a */
	/* standard C library routine; presumably it is supplied */
	/* elsewhere in the project -- confirm (the standard */
	/* equivalent would be atof()) */
	yylval.dblval = atod(yytext);
	return(T_DOUBLE);
}

%{	/* Multi-character operators: each one maps directly onto its */
	/* own token.  Lex picks the longest match, so the three */
	/* character shift-assignments win over the plain shifts. */
%}
<CCODE>"++"		{ return(T_INCREMENT); }
<CCODE>"--"		{ return(T_DECREMENT); }
<CCODE>"->"		{ return(T_RARROW); }
<CCODE>"<<"		{ return(T_LSHIFT); }
<CCODE>">>"		{ return(T_RSHIFT); }
<CCODE>"<="		{ return(T_LEQ); }
<CCODE>">="		{ return(T_GEQ); }
<CCODE>"=="		{ return(T_EQ); }
<CCODE>"!="		{ return(T_NEQ); }
<CCODE>"&&"		{ return(T_AND); }
<CCODE>"||"		{ return(T_IOR); }
<CCODE>"+="		{ return(T_ADD_ASSIGN); }
<CCODE>"-="		{ return(T_SUB_ASSIGN); }
<CCODE>"*="		{ return(T_MULT_ASSIGN); }
<CCODE>"/="		{ return(T_DIV_ASSIGN); }
<CCODE>"%="		{ return(T_MOD_ASSIGN); }
<CCODE>">>="		{ return(T_RSHIFT_ASSIGN); }
<CCODE>"<<="		{ return(T_LSHIFT_ASSIGN); }
<CCODE>"&="		{ return(T_AND_ASSIGN); }
<CCODE>"^="		{ return(T_XOR_ASSIGN); }
<CCODE>"|="		{ return(T_IOR_ASSIGN); }

%{	/* Whatever is left over is a single-character token whose */
	/* token number is the character itself */
%}
<CCODE>.		{ return(yytext[0]); }

%{	/* Comment rules... the body of a comment is thrown away up to */
	/* the closing asterisk-slash pair, which returns us to the C */
	/* rules.  Newlines inside the comment still have to be counted */
	/* so the line count stays right.  A lone asterisk is matched */
	/* with trailing context, so the character after it is scanned */
	/* again: that way a run of asterisks ending in a slash still */
	/* closes the comment, and an asterisk directly before a newline */
	/* does not swallow the newline. */
%}
<COMMENT>"*/"		{ BEGIN CCODE; }

<COMMENT>"*"/[^/]	;

<COMMENT>"\n"		{ linecount++; }

<COMMENT>[^*\n]		;


%{	/* Preprocessor line rules... quoted strings are covered by the */
	/* shared string rule.  Beyond those, an unsigned decimal number */
	/* is returned as a long, a backslash-newline pair continues the */
	/* preprocessor line (only the line count changes), a bare */
	/* newline ends the line and hands control back to the C rules, */
	/* and any other character is reported as garbage. */
%}
<PPLINE>({digit})+ {
	yylval.longval = my_atol(yytext);
	return(T_LONG);
}

<PPLINE>"\\\n"		{ linecount++; }

<PPLINE>"\n" {
	linecount++;		/* the line is over... */
	BEGIN CCODE;		/* ...and so is the preprocessor mode */
	return(T_PPEND);
}

<PPLINE>.		{ return(T_PPGARBAGE); }

%%
#undef yylex

/* LEXINIT

** One-time scanner startup.  Until a cpp line marker says otherwise
** the current input file is reported under the placeholder name
** "???" and the line count begins at 1.
**/

static void
lexinit()
{
  static char placeholder[] = "???";

  inputfile = copystring(placeholder); /* private copy of the name */
  linecount = 1;
}

/* NONPP_YYLEX

** This function acts as a filter to BARE_YYLEX (the one generated by
** lex, renamed as a result of our macro definition above).  We
** intercept line markers left by cpp and update the INPUTFILE and
** LINECOUNT variables as indicated.  A well-formed marker consists
** of a line number, a double-quoted file name, an optional second
** number, and nothing else before the end of the line.  Any
** preprocessor line that does not look like that causes an error
** message and is otherwise ignored (the rest of the line is
** flushed).  None of the preprocessor related tokens are passed
** through.
**
** Returns the next token that is not part of a preprocessor line, or
** 0 if end of input is hit while flushing a bad line.
**/

static int
nonpp_yylex()
{
  int token;
  long newlinecount;
  char *newfile;

  while ((token = bare_yylex()) == T_PPBEGIN) {
    newfile = NULL;		/* no file name copied yet for this line */
    if ((token = bare_yylex()) == T_LONG) { /* 1st comes line # */
      newlinecount = yylval.longval;
      if ((token = bare_yylex()) == T_DQSTRING) { /* 2nd comes filename */
	newfile = copystring(yylval.strval);
	if ((token = bare_yylex()) == T_LONG) /* ignore 2nd number if any */
	  token = bare_yylex();
	if (token == T_PPEND) {	/* make sure no junk on PPLINE */
	  if (inputfile != NULL)
	    free(inputfile);	/* release prior file name */
	  inputfile = newfile;	/* install new info */
	  linecount = newlinecount;
	  continue;		/* and go get next token */
	}
      }
    }
    /* here when there was something wrong with the line marker */
    if (newfile != NULL)
      free(newfile);		/* the copied name was never installed */
    yyerror("Improper format for cpp line marker");
    while (token != T_PPEND)	/* flush remaining input to end of line */
      if ((token = bare_yylex()) == 0)
	return(0);		/* break prematurely on end of input */
  }
  return(token);		/* something other than a preprocessor line */
}

/* YYLEX

** The scanner entry point seen by the yacc parser.  It post-processes
** the token stream coming out of NONPP_YYLEX with one token of
** lookahead (see comments at top of minic.y for details):
**
**   - an open paren immediately followed by a close paren is folded
**     into the single token T_PARENS;
**   - a T_PARENS is then refined according to the token after it:
**     T_PARENS1 if that token can follow a function definition's
**     parameter list, T_PARENS2 if it can follow a declarator, and
**     left as T_PARENS otherwise.
**
** A token read as lookahead but not consumed is kept, together with
** its yylval, and handed out on the next call.
**/

int
yylex()
{
  static int savedtok = -1;	/* pending lookahead token, -1 if none */
  static YYSTYPE savedval;	/* value that goes with savedtok */

  int token;

  if (savedtok < 0)
    token = nonpp_yylex();	/* nothing pending... scan a fresh token */
  else {
    token = savedtok;		/* hand out the pending token */
    yylval = savedval;
    savedtok = -1;
  }

  if (token == '(') {		/* peek to see whether ')' comes next */
    savedtok = nonpp_yylex();
    if (savedtok == ')') {
      token = T_PARENS;		/* empty parens become one token */
      savedtok = -1;		/* the ')' has been used up */
    }
    else {
      savedval = yylval;	/* keep the peeked token's value */
    }
  }

  if (token != T_PARENS)
    return(token);		/* no further context needed */

  /* empty parens... classify by what comes after them */
  savedtok = nonpp_yylex();
  savedval = yylval;
  switch (savedtok) {
  case '{':			/* things that may follow the parameter */
  case T_VOID:			/* list of a function definition */
  case T_CHAR:
  case T_SHORT:
  case T_INT:
  case T_LONG:
  case T_UNSIGNED:
  case T_FLOAT:
  case T_DOUBLE:
  case T_STRUCT:
  case T_UNION:
  case T_ENUM:
  case T_TYPEDEFNAME:
  case T_AUTO:
  case T_STATIC:
  case T_EXTERN:
  case T_REGISTER:
  case T_TYPEDEF:
    token = T_PARENS1;
    break;

  case '[':			/* things that may follow a declarator */
  case '(':
  case T_PARENS:
  case T_PARENS1:
  case T_PARENS2:
  case ',':
  case '=':
  case ':':
  case ';':
    token = T_PARENS2;
    break;
  }
  return(token);
}

/* RESWORD_ID

** Checks to see whether the just-parsed symbol whose value is
** currently in yytext is a reserved word, and returns its specific
** TOKEN number if so.  Otherwise, 0 is returned.
**/

/* Each table entry pairs the spelling of a reserved word with its */
/* token number; the entries themselves are pulled in from reswords.h. */
/* resword_id() binary-searches this table using strcmp(), so the */
/* entries presumably appear in ascending strcmp() order -- confirm */
/* against reswords.h. */
static struct RESWORD {		/* define table of reserved words */
  char *word;			/* spelling of the reserved word */
  int token_id;			/* token number returned for it */
} reswords[] = {
#include "reswords.h"
};

/* Binary search of the reswords[] table for SYM.  Returns the token */
/* number of the matching reserved word, or 0 when SYM is not in the */
/* table.  The search bounds are inclusive indexes, so HI starts at */
/* the last valid entry (count - 1); starting it at the count itself */
/* would let MID run one past the end of the table for symbols that */
/* sort after every reserved word. */
static int
resword_id(sym)
char *sym;
{
  int lo,hi,mid,test;

  lo = 0;			/* initialize for binary search */
  hi = (int)(sizeof(reswords)/sizeof(reswords[0])) - 1;
  while (lo <= hi) {		/* binarily we go along... */
    mid = lo + (hi-lo)/2;
    if ((test = strcmp(sym,reswords[mid].word)) < 0)
      hi = mid-1;		/* it's in the lower half */
    else if (test > 0)
      lo = mid+1;		/* it's in the upper half */
    else
      return(reswords[mid].token_id); /* got it! */
  }
  return(0);			/* search failed */
}

/* MY_ATOL

** Converts a C integer string to a long binary.  An optional leading
** '+' or '-' is honored.  If the first digit is 0, the number is
** treated as octal, unless the following char is X or x, in which
** case a hexadecimal conversion is performed (digits 0-9, a-f and
** A-F).  In an octal conversion, the digits 8 and 9 are allowed, and
** represent the octal values 10 and 11 respectively (so 08 == 8 and
** 09 == 9 are both true).  Conversion stops at the first character
** that is not a digit of the selected base, so a trailing L or l is
** ignored.  Note that this routine does no checking for ill-formed
** numbers or overflow... the input must be correct or nonsense may
** be returned.
**/

static long
my_atol(digits)
char *digits;
{
  int sign = 1, base = 10, digit;
  long value = 0;

  switch (*digits) {		/* handle initial sign char if any */
  case '-':
    sign = -1;			/* negative number... change sign & */
				/* drop thru */
  case '+':
    digits++;			/* bump past sign in both cases */
    break;
  default:
    break;
  }

  if (*digits == '0') {		/* octal or hex */
    digits++;			/* the leading zero adds nothing */
    if (*digits == 'x' || *digits == 'X') {
      base = 16;		/* hex... remember the base */
      digits++;			/* and bump past the x */
    }
    else
      base = 8;			/* anything else means octal; the */
				/* char itself is scanned below */
  }

  for (;;) {			/* scan all the remaining digits */
    digit = (unsigned char) *digits++;
    if (isdigit(digit))
      digit -= '0';
    else if (base == 16 && isxdigit(digit))
      digit = tolower(digit) - 'a' + 10; /* hex letters a-f / A-F */
    else
      break;			/* not a digit in this base: done */
    value = (value*base) + digit; /* shift the new digit in */
  }

  return(sign*value);
}


/* GET_QSTRING

** Call this routine to accumulate following characters as a character
** string.  DELIM is the delimiter that will mark the end of the
** string; the opening delimiter must already have been consumed.
** Characters are fetched through GET_QSCHAR, so escape sequences have
** already been decoded and an escaped delimiter or newline does not
** end the string.
**
** Returns a pointer to allocated, null-terminated storage holding the
** string on success; the caller owns it and releases it with free().
** Returns NULL on failure, after reporting the problem via yyerror:
** an unescaped newline inside the string (the newline is pushed back
** onto the input), end of input before the closing delimiter, or
** memory exhaustion.  No storage is left allocated on failure.
**/

static char *
get_qstring(delim)
char delim;
{
  int c;
  int len,maxlen;
  char *str,*bigger,*malloc(),*realloc();

  len = 0;			/* nothing deposited yet */
  maxlen = 80;			/* big enough for most strings */
  if ((str = malloc(maxlen)) == NULL) {
    yyerror("Unable to parse quoted string: malloc failed");
    return(NULL);		/* can't get storage for string */
  }

  while ((c = get_qschar()) != 0) { /* scan incoming chars */
    if (c == delim)
      c = '\0';			/* switch to null char to tie off string */
    else if (c == '\n') {	/* embedded unquoted newlines not allowed */
      yyerror("Embedded newline in quoted string");
      unput(c);			/* make the newline available again */
      free(str);		/* stg won't be used */
      return(NULL);
    }
    if (len >= maxlen) {
      maxlen += 25;		/* try to grow the buffer if needed */
      if ((bigger = realloc(str,maxlen)) == NULL) {
	yyerror("Unable to parse quoted string: realloc failed");
	free(str);		/* old buffer is still ours to release */
	return(NULL);
      }
      str = bigger;
    }
    str[len++] = c & 0xFF;	/* deposit this char, sans quoted flag */
    if (c == '\0') {		/* was this the closing quote? */
      /* shrink to minimum needed; if that fails the original */
      /* buffer is untouched and perfectly usable, so keep it */
      if ((bigger = realloc(str,len)) != NULL)
	str = bigger;
      return(str);
    }
  }
  /* here if we reach premature end of input */
  yyerror("Unterminated quoted string at end of file");
  free(str);			/* won't need this storage */
  return(NULL);
}

/* GET_QSCHAR

** Returns the next logical character for a quoted string.  If the
** next input char is anything but '\', it is returned directly (this
** includes 0 at end of input).  Otherwise the escape sequence is
** decoded and the resulting char is returned with bit 0x100 set to
** indicate the quoting:
**
**   \n \t \b \r \f \\ \' \"   the usual C meanings
**   \d, \dd, \ddd             one to three octal digits giving the
**                             character code (masked to 8 bits)
**   \<newline>                both characters are ignored and
**                             scanning continues with the next char
**   \<anything else>          the char following the '\'
**
** A '\' that is the very last character of the input is returned as
** a plain, unquoted backslash.
**/

static int
get_qschar()
{
  int c,next,value,ndigits;

  for (;;) {			/* loop until we can return something */
    if ((c = input()) != '\\')	/* is this an escape sequence? */
      return(c);		/* no, just return directly */
    switch(c = input()) {	/* dispatch according to next char */
    case 0:			/* end of input... return backslash */
      return('\\');		/* (unquoted) */
    case 'n':
      return(0x100+'\n');	/* quoted newline */
    case 't':
      return(0x100+'\t');	/* quoted tab */
    case 'b':
      return(0x100+'\b');	/* quoted backspace */
    case 'r':
      return(0x100+'\r');	/* quoted return */
    case 'f':
      return(0x100+'\f');	/* quoted formfeed */
    case '\\':
      return(0x100+'\\');	/* quoted backslash */
    case '\'':
      return(0x100+'\'');	/* quoted single quote */
    case '\"':
      return(0x100+'\"');	/* quoted double quote */
    case '\n':
      continue;			/* quoted newline is ignored */
    default:
      if (c >= '0' && c <= '7') { /* octal escape: up to 3 digits */
	value = c-'0';
	for (ndigits = 1; ndigits < 3; ndigits++) {
	  next = input();
	  if (next < '0' || next > '7') { /* escape ends here */
	    if (next != 0)	/* give back what isn't ours */
	      unput(next);
	    break;
	  }
	  value = (value<<3)+(next-'0');
	}
	c = value & 0xFF;	/* keep the result within a byte */
      }
      return(0x100+c);		/* quote it and return it */
    }
  }
}

