iverilog/vhdlpp/lexor.lex

%option prefix="yy"
%option never-interactive
%option nounput

%{
/*
 * Copyright (c) 2011 Stephen Williams (steve@icarus.com)
 *
 *    This source code is free software; you can redistribute it
 *    and/or modify it in source code form under the terms of the GNU
 *    General Public License as published by the Free Software
 *    Foundation; either version 2 of the License, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 */

# include  "parse_api.h"
# include  "lexor_keyword.h"
# include  "vhdlint.h"
# include  "vhdlreal.h"
# include  "parse_wrap.h"

# include  <cmath>
# include  <cassert>
# include  <iostream>
# include  <set>

//class vhdlnum;
//class vhdlreal;

extern int lexor_keyword_code (const char*str, unsigned len);

/*
 * Lexical location information is passed in the yylloc variable to the
 * parser. The file names, strings, are kept in a list so that I can
 * re-use them. The set_file_name function will return a pointer to
 * the name as it exists in the list (and delete the passed string.)
 * If the name is new, it will be added to the list.
 */
extern YYLTYPE yylloc;

static bool are_underscores_correct(char* text);
static bool is_based_correct(char* text);
static char* escape_quot_and_dup(char* text);
static char* escape_apostrophe_and_dup(char* text);

static double make_double_from_based(char* text);
static int64_t make_long_from_based(char* text);

static int64_t lpow(int64_t left, int64_t right);
static unsigned short short_from_hex_char(char ch);

/*
 * Duplicate a C string into storage obtained with new[]. A null
 * pointer is passed through unchanged. The caller owns the copy and
 * must release it with delete[].
 */
static char* strdupnew(char const *str)
{
       if (str == 0)
	     return 0;

       char*copy = new char [strlen(str)+1];
       strcpy(copy, str);
       return copy;
}

static int comment_enter;

%}

%x CCOMMENT
%x LCOMMENT

W [ \t\b\f\r]+
decimal_literal		{integer}(\.{integer})?({exponent})?
integer				[0-9](_?[0-9])*
exponent			[eE][-+]?{integer}

based_literal		{integer}#{based_integer}(\.{based_integer})?#{exponent}?
based_integer		[0-9a-fA-F](_?[0-9a-fA-F])*
%%

[ \t\b\f\r] { ; }
  /* The rule above drops blanks. A newline is dropped as well, but it
     also advances the line counter kept in yylloc. */
\n { yylloc.first_line += 1; }

  /* Single-line comments start with - - and run to the end of the
     current line. These are very easy to handle. */

  /* The start condition that was active before the comment is saved
     in comment_enter and restored when the newline is seen.
     NOTE(review): the first pattern below appears to consume the rest
     of the line already, so the <LCOMMENT>. rule looks unreachable in
     practice - confirm before relying on it. */
"--".* { comment_enter = YY_START; BEGIN(LCOMMENT); }
<LCOMMENT>.    { yymore(); }
<LCOMMENT>\n   { yylloc.first_line += 1; BEGIN(comment_enter); }


  /* The contents of C-style comments are ignored, like white space. */

  /* As for line comments, comment_enter remembers the start condition
     to return to. Newlines inside the comment still advance the line
     counter so that later locations stay correct. */
"/*" { comment_enter = YY_START; BEGIN(CCOMMENT); }
<CCOMMENT>.    { ; }
<CCOMMENT>\n   { yylloc.first_line += 1; }
<CCOMMENT>"*/" { BEGIN(comment_enter); }

\'.\' {
    /* Character literal: exactly one character between apostrophes.
       The parser receives the character alone, as a newly allocated
       one-character string. */
    yylval.text = escape_apostrophe_and_dup(yytext);
    return CHARACTER_LITERAL;
}

(\"([^\"]|(\"\"))*?\")|(\"[^\"]*\")  {
/* first pattern: string literals with doubled quotation mark */
/* second pattern: string literals without doubled quotation */
    /* The helper strips the delimiting quotation marks and turns each
       doubled quotation mark into a single one. It returns a null
       pointer for text that does not end in a quotation mark, which
       the assert below guards against. */
    yylval.text = escape_quot_and_dup(yytext);
    assert(yylval.text);
    return STRING_LITERAL;
}

[a-zA-Z_][a-zA-Z0-9_]* {
	/* Basic identifiers and keywords. The matched text is folded
	   to lower case in place before the keyword lookup. */
      for (char*cp = yytext ; *cp ; cp += 1)
	    *cp = tolower(*cp);
      int rc = lexor_keyword_code(yytext, yyleng);
      switch (rc) {
	  case IDENTIFIER:
		/* Only a real identifier carries a text value. A badly
		   placed underscore is reported, but the identifier is
		   still passed on to the parser. */
		if(!are_underscores_correct(yytext))
			std::cerr << "An invalid underscore in the identifier:"
                    << yytext << std::endl;
                //yywarn(yylloc, "An invalid underscore in the identifier");
	    yylval.text = strdupnew(yytext);
	    break;
	  default:
		/* Any other code is returned without a text value. */
	    break;
      }
      return rc;
  }

\\([^\\]|\\\\)*\\ { /* extended identifiers */
    /* The text is duplicated exactly as matched: the delimiting
       backslashes are kept and no case folding is applied. */
    yylval.text = strdupnew(yytext);
    return IDENTIFIER;
}

{decimal_literal} {
	/* Decimal literal: an integer (12, 1_000, 1E6) or a real
	   (3.14, 6.02e23). Underscores are only separators, so the
	   conversion works on a copy without them. A '.' makes the
	   literal a real. */
      char*tmp = new char[strlen(yytext)+1];
      char*dst, *src;
      int rc = INT_LITERAL;
      for (dst = tmp, src = yytext ; *src ; ++src) {
	    if (*src == '_')
		  continue;
	    if (*src == '.')
		  rc = REAL_LITERAL;
	    *dst++ = *src;
      }
      *dst = 0;

      if (rc == REAL_LITERAL) {
	    yylval.uni_real = strtod(tmp, 0);
      } else {
	      /* strtoimax() stops at the 'e'/'E' of an exponent, so an
		 integer literal such as 1E6 has to be scaled here. */
	    char*exp_text = 0;
	    intmax_t val = strtoimax(tmp, &exp_text, 10);
	    if (*exp_text == 'e' || *exp_text == 'E') {
		  intmax_t exponent = strtoimax(exp_text+1, 0, 10);
		  if (exponent < 0) {
			  /* Not legal for an integer literal: report
			     it and keep the unscaled value. */
			std::cerr << "A negative exponent in the integer literal:"
				  << yytext << std::endl;
		  } else {
			  /* Unsigned arithmetic keeps an overflowing
			     literal from being undefined behaviour. The
			     product becomes 0 after at most 64 steps,
			     which also bounds the loop. */
			uint64_t scaled = val;
			for ( ; exponent > 0 && scaled != 0 ; exponent -= 1)
			      scaled *= 10;
			val = (intmax_t)scaled;
		  }
	    }
	    yylval.uni_integer = val;
      }
      delete[]tmp;
      return rc;
}

{based_literal} {
    /* Based literals such as 16#ff# or 2#1.01#e3. A malformed literal
       is only reported on the error stream; it is converted and
       returned to the parser all the same. */
    if(!are_underscores_correct(yytext) || !is_based_correct(yytext))
        std::cerr << "An invalid form of based literal:"
            << yytext << std::endl;

    /* A point anywhere in the text makes the literal a real one. */
    if(strchr(yytext, '.'))
    {
        double val = make_double_from_based(yytext);
        yylval.uni_real = val;
        return REAL_LITERAL;
    }
    else
    {
        int64_t val = make_long_from_based(yytext);
        yylval.uni_integer = val;
        return INT_LITERAL;
    }
}


  /* Compound symbols */
  /* Each two-character delimiter is returned as its own token code. */
"<=" { return LEQ; }
">=" { return GEQ; }
":=" { return VASSIGN; }
"/=" { return NE; }
"<>" { return BOX; }
"**" { return EXP; }
"=>" { return ARROW; }
"<<" { return DLT; }
">>" { return DGT; }
    /*
    Here comes a list of symbols that are more than strange,
    at least for the time being.

"??" { return K_CC; }
"?=" {}
"?/=" {}
"?<" {}
"?<=" {}
"?>" {}
"?>=" {}
*/

  /* Any other single character is returned to the parser as its own
     character code. */
. { return yytext[0]; }

%%

extern void yyparse_set_filepath(const char*path);

/**
* This function checks the placement of underscores in an
* identifier or in a number. An underscore is accepted only
* when it follows a character that is not an underscore and
* is itself followed by at least one more character.
*
* \return true is returned if no underscore starts the text,
* ends the text or directly follows another underscore
*/
static bool are_underscores_correct(char* text)
{
	// true while the previous character was something other than '_'
	bool after_plain_char = false;

	for (const char* scan = text; *scan != '\0'; ++scan)
	{
		if (*scan != '_')
		{
			after_plain_char = true;
			continue;
		}
		// leading underscore, or two underscores in a row
		if (!after_plain_char)
			return false;
		// trailing underscore
		if (scan[1] == '\0')
			return false;
		after_plain_char = false;
	}
	return true;
}

/**
* This function checks if the format of a based number
* (base#digits[.digits]#[exponent]) is correct according
* to the VHDL standard:
*  - the base is written with one or two decimal digits and
*    lies in the range 2..16,
*  - every extended digit between the hashes is smaller than
*    the base; upper and lower case letters are equivalent,
*  - at most one point appears between the hashes,
*  - an exponent, if present, is 'e'/'E', an optional '+' and
*    at least one decimal digit.
* Underscores are accepted as separators everywhere; their
* placement is not checked here.
*
* NOTE(review): a negative exponent is rejected for every
* literal, as before. The standard permits it for real
* literals; relax this once the conversion of real based
* literals is known to handle the sign.
*
* \return true is returned if a based number
* is formed well according to specification
*/
static bool is_based_correct(char* text)
{
    const char* first_hash = strchr(text, '#');
    const char* last_hash = strrchr(text, '#');
    //a based literal has two distinct hashes
    if(first_hash == 0 || first_hash == last_hash)
        return false;

    const char* ptr;
    //BASE examination
    unsigned base = 0;
    unsigned length = 0;
    for(ptr = text; ptr != first_hash; ++ptr)
    {
        if(*ptr == '_')
            continue;
        if(!(*ptr >= '0' && *ptr <= '9')) //the base uses chars other than digits
            return false;
        if(++length > 2) //the base has too many digits
            return false;
        base = 10*base + (*ptr - '0');
    }
    //the base is missing or lies outside 2..16
    if(length == 0 || base < 2 || base > 16)
        return false;

    //MANTISSA examination
    bool point = false;
    length = 0;
    for(ptr = first_hash + 1; ptr != last_hash; ++ptr)
    {
        const char ch = *ptr;
        if(ch == '_')
            continue;
        if(ch == '.')
        {
            //we found a dot and another one was already found
            if(point)
                return false;
            //notice the point and continue, without increasing the length
            point = true;
            continue;
        }
        unsigned value;
        if(ch >= '0' && ch <= '9')
            value = ch - '0';
        else if(ch >= 'a' && ch <= 'f')
            value = ch - 'a' + 10;
        else if(ch >= 'A' && ch <= 'F')
            value = ch - 'A' + 10;
        else
            return false; //not an extended digit at all
        //the digit is not available in this base
        if(value >= base)
            return false;
        ++length;
    }
    if(length == 0)
        return false;

    //EXPONENT examination
    if(last_hash[1] != '\0') //the number contains an exponent
    {
        ptr = last_hash + 1;
        if(*ptr != 'e' && *ptr != 'E')
            return false;
        ++ptr;
        if(*ptr == '-')
            return false;
        if(*ptr == '+')
            ++ptr;
        length = 0;
        for( ; *ptr != '\0'; ++ptr)
        {
            if(*ptr == '_')
                continue;
            //the exponent is always written in decimal
            if(!(*ptr >= '0' && *ptr <= '9'))
                return false;
            ++length;
        }
        if(length == 0)
            return false;
    }
    return true;
}

/**
* This function takes a string literal, gets rid of the
* delimiting quotation marks, replaces every doubled
* quotation mark inside the literal by a single one and
* copies the result to a new persistent C-string.
*
* \param text the literal as matched, including both
* delimiting quotation marks
*
* \return pointer to the new string, allocated with new[]
* and owned by the caller. A null pointer is returned when
* the text does not end in a quotation mark; nothing is
* allocated in that case.
*/
static char* escape_quot_and_dup(char* text)
{
    //the length never changes, so it is computed only once
    const size_t length = strlen(text);
    //the result is never longer than the input
    char* newstr = new char[length+1];

    size_t new_idx = 0;
    //leave the opening quotation mark, if there is one
    size_t old_idx = (text[0] == '"') ? 1 : 0;
    while(old_idx < length)
    {
        if(text[old_idx] == '"')
        {
            if(text[old_idx+1] == '\0')
            { //the closing quotation mark
                newstr[new_idx] = '\0';
                return newstr;
            }
            if(text[old_idx+1] == '"')
            { //a doubled quotation mark stands for a single one
                newstr[new_idx++] = '"';
                old_idx += 2; //jump across two chars
                continue;
            }
        }
        newstr[new_idx++] = text[old_idx++];
    }
    //no closing quotation mark: release the buffer instead of leaking it
    delete[] newstr;
    return 0;
}

/**
* This function takes a character literal of the form 'c'
* and builds a new one-character C-string from the character
* found between the apostrophes.
*
* \return pointer to the new string is returned; it is
* allocated with new[] and owned by the caller
*/
static char* escape_apostrophe_and_dup(char* text)
{
    //the character sits right after the opening apostrophe
    const char enclosed = text[1];

    char* literal = new char[2];
    literal[1] = '\0';
    literal[0] = enclosed;
    return literal;
}

/**
* This function takes an extended digit in form of a char
* ('0'..'9', 'a'..'f' or 'A'..'F') and returns its literal
* value as short. Upper and lower case letters give the
* same value. Other characters are not checked for.
*/
static unsigned short short_from_hex_char(char ch)
{
    if(ch >= '0' && ch <= '9')
        return ch - '0';
    if(ch >= 'A' && ch <= 'F')
        return ch - 'A' + 10;
    return ch - 'a' + 10;
}

/**
* Power function for int64_t, computed by repeated squaring
* so that a large exponent costs neither time nor stack.
* The arithmetic is done on unsigned values, so a result
* that does not fit wraps around instead of being undefined.
*
* \return left raised to the power right; 1 when right is
* zero; 0 when right is negative, because such a power has
* no integer value in general
*/
static int64_t lpow(int64_t left, int64_t right)
{
    if(right < 0)
        return 0;

    uint64_t factor = (uint64_t)left;
    uint64_t result = 1;
    for( ; right > 0; right >>= 1)
    {
        if(right & 1)
            result *= factor;
        factor *= factor;
    }
    return (int64_t)result;
}

/**
* Reads the base of a based literal: the decimal digits
* found between text and first_hash, ignoring underscores.
*/
static unsigned based_literal_base(const char* text, const char* first_hash)
{
    unsigned base = 0;
    for(const char* ptr = text; ptr != first_hash; ++ptr)
    {
        if(*ptr != '_')
            base = base*10 + short_from_hex_char(*ptr);
    }
    return base;
}

/**
* Reads the exponent of a based literal. exp_text points at
* the 'e'/'E' that follows the second hash. The exponent is
* always written in decimal, whatever the base of the
* literal is. The magnitude is returned and *negative tells
* if a '-' sign was present. The magnitude is capped at
* 1000000000, which is far beyond anything that yields a
* finite or non-wrapping result, so that absurdly long
* exponents cannot overflow.
*/
static int64_t based_literal_exponent(const char* exp_text, bool* negative)
{
    const int64_t limit = 1000000000;
    const char* ptr = exp_text + 1; //leave 'e'/'E'

    *negative = (*ptr == '-');
    if(*ptr == '+' || *ptr == '-')
        ++ptr;

    int64_t exponent = 0;
    for( ; *ptr != '\0'; ++ptr)
    {
        if(*ptr == '_')
            continue;
        exponent = exponent*10 + short_from_hex_char(*ptr);
        if(exponent > limit)
            exponent = limit;
    }
    return exponent;
}

/**
* This function takes a floating point based number
* in form of a C-string (for example 16#f.f#e+1) and
* converts it to a double. The value is the number written
* between the hashes, read in the given base, multiplied by
* the base raised to the exponent. A negative exponent is
* supported. The text is expected to contain both hashes;
* it is not modified.
*
* \return new double is returned
*/
static double make_double_from_based(char* text)
{
    const char* first_hash_ptr = strchr(text, '#');
    const char* second_hash_ptr = strrchr(text, '#');

    //now lets deduce the base
    const unsigned base = based_literal_base(text, first_hash_ptr);

    //the digits in front of the point
    double mantissa = 0.0;
    const char* ptr = first_hash_ptr + 1;
    for( ; ptr != second_hash_ptr && *ptr != '.'; ++ptr)
    {
        if(*ptr != '_')
            mantissa = mantissa*base + short_from_hex_char(*ptr);
    }

    //the digits behind the point, if there is a point at all
    double fraction = 0.0;
    if(ptr != second_hash_ptr)
    {
        double factor = 1.0/base;
        for(++ptr; ptr != second_hash_ptr; ++ptr)
        {
            if(*ptr != '_')
            {
                fraction = fraction + short_from_hex_char(*ptr)*factor;
                factor = factor / base;
            }
        }
    }

    const double value = mantissa + fraction;
    if(second_hash_ptr[1] == '\0') //there is no exponent
        return value;

    //scale the value by the base raised to the exponent
    bool negative = false;
    const int64_t exponent = based_literal_exponent(second_hash_ptr + 1, &negative);
    const double power = negative ? -(double)exponent : (double)exponent;
    return value * pow((double)base, power);
}

/**
* This function takes a based number in form of a C-string
* (for example 16#ff# or 2#101#e3) and converts it to an
* int64_t. The value is the number written between the
* hashes, read in the given base, multiplied by the base
* raised to the exponent. The text is expected to contain
* both hashes; it is not modified.
*
* A negative exponent is not legal in an integer literal.
* If one is found anyway, the value is divided by the base
* the given number of times, truncating toward zero.
*
* \return new int64_t is returned; a value that does not
* fit wraps around
*/
static int64_t make_long_from_based(char* text)
{
    const char* first_hash_ptr = strchr(text, '#');
    const char* second_hash_ptr = strrchr(text, '#');

    //now lets deduce the base
    const unsigned base = based_literal_base(text, first_hash_ptr);

    uint64_t mantissa = 0;
    for(const char* ptr = first_hash_ptr + 1; ptr != second_hash_ptr; ++ptr)
    {
        if(*ptr != '_')
            mantissa = mantissa*base + short_from_hex_char(*ptr);
    }

    if(second_hash_ptr[1] == '\0') //there is no exponent
        return (int64_t)mantissa;

    bool negative = false;
    int64_t exponent = based_literal_exponent(second_hash_ptr + 1, &negative);
    if(negative)
    {
        //the value reaches zero after a few steps, which bounds the loop
        while(exponent-- > 0 && mantissa != 0 && base > 1)
            mantissa /= base;
        return (int64_t)mantissa;
    }
    return (int64_t)(mantissa * (uint64_t)lpow(base, exponent));
}

/*
 * Start scanning a new input. The location record is rewound to the
 * first line of the named file, the scanner is restarted on the given
 * stream and the path is handed on through yyparse_set_filepath().
 * The path pointer itself is stored in yylloc, not a copy of the text.
 */
void reset_lexor(FILE*fd, const char*path)
{
	/* Location tracking starts over for the new file. */
      yylloc.first_line = 1;
      yylloc.text = path;

	/* Switch the scanner to the new stream, then pass the path on. */
      yyrestart(fd);
      yyparse_set_filepath(path);
}

/*
 * End-of-input hook of the generated scanner. The result is always
 * non-zero, which tells the scanner that no further input follows.
 */
int yywrap()
{
      const int no_more_input = 1;
      return no_more_input;
}