iverilog/vhdlpp/lexor.lex

%option prefix="yy"
%option never-interactive
%option nounput
%option reentrant
%option noyywrap

%{
/*
 * Copyright (c) 2011-2025 Stephen Williams (steve@icarus.com)
 *
 *    This source code is free software; you can redistribute it
 *    and/or modify it in source code form under the terms of the GNU
 *    General Public License as published by the Free Software
 *    Foundation; either version 2 of the License, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

# include  "parse_api.h"
# include  "lexor_keyword.h"
# include  "vhdlint.h"
# include  "vhdlreal.h"
# include  "parse_wrap.h"

# include  <cmath>
# include  <cassert>
# include  <iostream>
# include  <set>

# define YY_NO_INPUT

# define YY_DECL int yylex(YYSTYPE*yylvalp, YYLTYPE*yyllocp, yyscan_t yyscanner)
//class vhdlnum;
//class vhdlreal;

extern int lexor_keyword_code (const char*str, unsigned len);

/*
 * Lexical location information is passed in the yylloc variable to the
 * parser. The file names, strings, are kept in a list so that I can
 * re-use them. The set_file_name function will return a pointer to
 * the name as it exists in the list (and delete the passed string).
 * If the name is new, it will be added to the list.
 */
#define yylloc (*yyllocp)
#define yylval (*yylvalp)

/* Forward declarations of the helpers that are defined after the rules
   section. The rule actions call them before their definitions appear. */

/* Validation and text-copy helpers for identifiers and literals. */
static bool are_underscores_correct(const char* text);
static bool is_based_correct(const char* text);
static char* escape_quot_and_dup(const char* text);
static char* escape_apostrophe_and_dup(const char* text);

/* Converters from the text of a literal to its value or bit string. */
static double make_double_from_based(char* text);
static int64_t make_long_from_based(char* text);
static char* make_bitstring_literal(const char*text);

/* Small arithmetic helpers used by the converters. */
static int64_t lpow(int64_t left, int64_t right);
static unsigned short short_from_hex_char(char ch);

/*
 * Duplicate a C string into storage obtained with new[]. A null
 * input gives a null result. The copy belongs to the caller, who
 * releases it with delete[].
 */
static char* strdupnew(char const *str)
{
      if (!str)
	    return 0;

      char*copy = new char [strlen(str)+1];
      strcpy(copy, str);
      return copy;
}

/* Start condition that was active when a comment was entered. The
   comment rules save YY_START here and restore it with BEGIN() when
   the comment ends.
   NOTE(review): this is a single file-scope variable even though the
   scanner is built with %option reentrant, so it is shared by every
   scanner instance. */
static int comment_enter;

%}

/* Exclusive start conditions used while inside a C-style comment
   (CCOMMENT) or a single-line comment (LCOMMENT). */
%x CCOMMENT
%x LCOMMENT

/* Named patterns. Digits may be separated by single underscores. An
   exponent is the letter e followed by an optionally signed integer. */
W [ \t\b\f\r]+
decimal_literal		{integer}(\.{integer})?({exponent})?
integer				[0-9](_?[0-9])*
exponent			[eE][-+]?{integer}

/* Based literals have the form base#digits#, with an optional
   fractional part between the hashes and an optional exponent after
   the second hash. The "time" pattern is not referenced by any rule
   in the rules section below. */
based_literal		{integer}#{based_integer}(\.{based_integer})?#{exponent}?
based_integer		[0-9a-fA-F](_?[0-9a-fA-F])*
time			{integer}{W}*([fFpPnNuUmM]?[sS])
%%

[ \t\b\f\r] { /* White space between tokens is discarded. */ ; }
\n { /* Count the line for the location passed to the parser. */ yylloc.first_line += 1; }

  /* Single-line comments start with -- and run to the end of the
     current line. The start condition that was active is saved in
     comment_enter and restored when the newline is reached. The
     first pattern already takes the rest of the line, so the
     LCOMMENT rule for ordinary characters appears never to be
     needed in practice. */

"--".* { comment_enter = YY_START; BEGIN(LCOMMENT); }
<LCOMMENT>.    { yymore(); }
<LCOMMENT>\n   { yylloc.first_line += 1; BEGIN(comment_enter); }


  /* The contents of C-style comments are ignored, like white space.
     Newlines inside them are still counted, and the saved start
     condition is restored at the closing delimiter. */

"/*" { comment_enter = YY_START; BEGIN(CCOMMENT); }
<CCOMMENT>.    { ; }
<CCOMMENT>\n   { yylloc.first_line += 1; }
<CCOMMENT>"*/" { BEGIN(comment_enter); }

\'.\' {
      /* Character literal: exactly one character between two
         apostrophes. Only that character is handed to the parser. */
    yylval.text = escape_apostrophe_and_dup(yytext);
    return CHARACTER_LITERAL;
}

(\"([^\"]|(\"\"))*?\") {
      /* String literal. The surrounding quotation marks are removed
         and each doubled quotation mark inside becomes a single one. */
    yylval.text = escape_quot_and_dup(yytext);
    assert(yylval.text);
    return STRING_LITERAL;
}

[a-zA-Z_][a-zA-Z0-9_]* {
      /* Basic identifiers and keywords. The text is folded to lower
         case in place before the keyword lookup. */
      for (char*cp = yytext ; *cp ; cp += 1)
	    *cp = tolower(*cp);
      int rc = lexor_keyword_code(yytext, yyleng);
      switch (rc) {
	  case IDENTIFIER:
		/* Misplaced underscores are only reported; the
		   identifier is still passed on to the parser. */
		if(!are_underscores_correct(yytext))
			std::cerr << "An invalid underscore in the identifier:"
                    << yytext << std::endl;
                //yywarn(yylloc, "An invalid underscore in the identifier");
	    yylval.text = strdupnew(yytext);
	    break;
	  default:
	    /* Any other code carries no text. */
	    break;
      }
      return rc;
  }

\\([^\\]|\\\\)*\\ { /* extended identifiers */
      /* The text is copied as written: the enclosing backslashes
         and the letter case are kept. */
    yylval.text = strdupnew(yytext);
    return IDENTIFIER;
}

{decimal_literal} {
	/* Decimal literal. The underscores are stripped into a
	   scratch copy; a decimal point makes the literal real,
	   otherwise it is an integer. */
      char*tmp = new char[strlen(yytext)+1];
      char*dst, *src;
      int rc = INT_LITERAL;
      for (dst = tmp, src = yytext ; *src ; ++src) {
	    if (*src == '_')
		  continue;
	    if (*src == '.')
		  rc = REAL_LITERAL;
	    *dst++ = *src;
      }
      *dst = 0;

      if (rc == REAL_LITERAL) {
	      /* strtod understands the exponent by itself. */
	    yylval.uni_real = strtod(tmp, 0);
      } else {
	      /* strtoimax stops at the exponent marker, so the power
		 of ten of an integer literal such as 1e3 has to be
		 applied separately. */
	    char*endp = 0;
	    yylval.uni_integer = strtoimax(tmp, &endp, 10);
	    if (*endp == 'e' || *endp == 'E') {
		  long exp_val = strtol(endp+1, 0, 10);
		  if (exp_val < 0)
			  /* Only reported; the literal keeps the
			     value of its digits. */
			std::cerr << "An integer literal cannot have a negative exponent:"
			    << yytext << std::endl;
		  else
			yylval.uni_integer *= lpow(10, exp_val);
	    }
      }
      delete[]tmp;
      return rc;
}

{based_literal} {
      /* Fold the text to lower case in place, so that the helpers
         only see the digits a..f and the exponent marker e. */
    for(char*cp = yytext ; *cp ; ++cp)
        *cp = tolower(*cp);

      /* A malformed literal is only reported; a value is still
         computed and a token is still returned. */
    if(!are_underscores_correct(yytext) || !is_based_correct(yytext))
        std::cerr << "An invalid form of based literal:"
            << yytext << std::endl;

      /* A point anywhere in the text makes the literal real. */
    if(strchr(yytext, '.'))
    {
        double val = make_double_from_based(yytext);
        yylval.uni_real = val;
        return REAL_LITERAL;
    }
    else
    {
        int64_t val = make_long_from_based(yytext);
        yylval.uni_integer = val;
        return INT_LITERAL;
    }
}

{integer}?[sSuU]?[xXbBoOdD]\"[^\"]+\" {
      /* Bit string literal: an optional width, an optional s/u flag,
         a base letter and the quoted digits. The token text is
         produced by make_bitstring_literal. */
      yylval.text = make_bitstring_literal(yytext);
      return BITSTRING_LITERAL;
}

  /* Compound symbols. Each multi-character delimiter or operator is
     returned as its own token code. The scanner prefers the longest
     match, so the three-character forms win over their two-character
     prefixes. */
"<=" { return LEQ; }
">=" { return GEQ; }
":=" { return VASSIGN; }
"/=" { return NE; }
"<>" { return BOX; }
"**" { return EXP; }
"=>" { return ARROW; }
"<<" { return DLT; }
">>" { return DGT; }
"??" { return CC; }
"?=" { return M_EQ;}
"?/=" { return M_NE;}
"?<" { return M_LT; }
"?<=" { return M_LEQ;}
"?>" { return M_GT; }
"?>=" {return M_GEQ; }

. { /* Any other single character is its own token code. */ return yytext[0]; }

%%

extern void yyparse_set_filepath(const char*path);

/**
* Validate the placement of underscores in an identifier
* or in a number.
*
* An underscore is accepted only when it directly follows a
* character that is not an underscore and is not the last
* character of the text.
*
* \return true if every underscore is placed correctly
* (a text without underscores, or an empty text, is correct)
*/
static bool are_underscores_correct(const char* text)
{
	// true once the previous character was something other than '_'
	bool after_plain_char = false;

	for (size_t idx = 0; text[idx] != '\0'; idx += 1)
	{
		if (text[idx] != '_')
		{
			after_plain_char = true;
			continue;
		}

		// leading, doubled or trailing underscore
		if (!after_plain_char || text[idx+1] == '\0')
			return false;

		after_plain_char = false;
	}
	return true;
}

/**
* Tell whether a character is a valid digit in the given base.
* Letter digits are expected in lower case.
*/
static bool is_char_ok(char c, int base)
{
    if(base <= 10)
        return '0' <= c && c - '0' < base;

    if(c >= '0' && c <= '9')
        return true;
    return c >= 'a' && c < 'a' + base - 10;
}

/**
* This function checks if the format of a based number
* (base#digits[.digits]#[exponent]) is correct according to
* the VHDL standard. The text is expected in lower case.
*
* The checks are:
*  - the base is a decimal number in the range 2..16,
*  - every digit between the hashes is a digit of that base,
*    there is at least one digit and at most one point,
*  - the exponent, if any, is a decimal number; a minus sign
*    is accepted only when the literal has a point.
*
* Underscores between digits are skipped here; whether they
* are placed correctly is a separate check
* (are_underscores_correct).
*
* \return true is returned if a based number
* is formed well according to specification
*/
static bool is_based_correct(const char* text)
{
    // Locate the two delimiters once, not on every iteration.
    const char* first_hash = strchr(text, '#');
    const char* last_hash = strrchr(text, '#');
    if(first_hash == 0 || first_hash == last_hash)
        return false;

    const char* ptr;

    //BASE examination
    unsigned base = 0;
    unsigned length = 0;
    for(ptr = text; ptr != first_hash; ++ptr)
    {
        if(*ptr == '_')
            continue;
        if(!(*ptr >= '0' && *ptr <= '9')) //the base uses chars other than digits
            return false;
        base = 10*base + (*ptr - '0');
        ++length;
        if(base > 16) //also keeps the accumulator from overflowing
            return false;
    }
    if(length == 0 || base < 2)
        return false;

    //MANTISSA examination
    bool point = false;
    length = 0;
    for(ptr = first_hash + 1; ptr != last_hash; ++ptr)
    {
        if(*ptr == '_')
            continue;
        if(*ptr == '.')
        {
            //we found a dot and another one was already found
            if(point)
                return false;
            //the point itself is not counted as a digit
            point = true;
            continue;
        }
        //check if the number consists of other chars than allowed
        if(!is_char_ok(*ptr, base))
            return false;

        ++length;
    }
    if(length == 0)
        return false;

    //EXPONENT examination
    ptr = last_hash + 1;
    if(*ptr == '\0') //no exponent
        return true;

    if(*ptr != 'e' && *ptr != 'E')
        return false;
    ++ptr;

    if(*ptr == '+')
        ++ptr;
    else if(*ptr == '-')
    {
        //an integer literal cannot have a negative exponent
        if(!point)
            return false;
        ++ptr;
    }

    length = 0;
    for( ; *ptr != '\0'; ++ptr)
    {
        if(*ptr == '_')
            continue;
        //the exponent is written in decimal, whatever the base is
        if(!(*ptr >= '0' && *ptr <= '9'))
            return false;
        ++length;
    }
    return length != 0;
}

/**
* This function takes a string literal, gets rid of the
* enclosing quotation marks, turns every doubled quotation
* mark into a single one and copies the result to a new
* persistent C-string (allocated with new[]).
*
* \return pointer to the new string is returned, or 0 if the
* text has no closing quotation mark
*/
static char* escape_quot_and_dup(const char* text)
{
    // Measure once; the length does not change while copying.
    const size_t len = strlen(text);
    char* newstr = new char[len+1];

    size_t new_idx = 0;
    // the beginning of the literal
    size_t old_idx = (len > 0 && text[0] == '"') ? 1 : 0;

    while(old_idx < len)
    {
        if(text[old_idx] != '"')
        { //ordinary character
            newstr[new_idx++] = text[old_idx++];
        }
        else if(old_idx + 1 == len)
        { //the end
            newstr[new_idx] = '\0';
            return newstr;
        }
        else if(text[old_idx+1] == '"')
        { //a doubled quotation mark stands for one
            newstr[new_idx++] = '"';
            old_idx += 2; //jump across two chars
        }
        else
        { //a lone quotation mark in the middle is copied as it is
            newstr[new_idx++] = text[old_idx++];
        }
    }

    //only reached for a text without the closing quotation mark;
    //do not leak the buffer
    delete[] newstr;
    return 0;
}

/**
* This function takes a character literal of the form 'c'
* and returns the character between the apostrophes as a
* new one-character C-string (allocated with new[]).
*
* \return pointer to the new string is returned
*/
static char* escape_apostrophe_and_dup(const char* text)
{
    // text[0] is the opening apostrophe, text[1] the character itself
    const char payload = text[1];

    char* result = new char[2];
    result[1] = '\0';
    result[0] = payload;
    return result;
}

/*
 * Build the bit string for a binary bit string literal. A negative
 * width_prefix means that no width was given, and the result is as
 * wide as the digits. A wider result is padded on the left, with the
 * first digit when sflag is set and with '0' otherwise. A narrower
 * result keeps the rightmost digits. The result is allocated with
 * new[].
 */
static char*make_bitstring_bin(int width_prefix, bool sflag, bool,
				  const char*src)
{
      const int digits = strlen(src);
      const int width = (width_prefix < 0) ? digits : width_prefix;

      char*res = new char[width+1];
      int filled = 0;

      if (width > digits) {
	      // Left padding in front of the digits.
	    const char pad = sflag ? src[0] : '0';
	    const int npad = width - digits;
	    for ( ; filled < npad ; filled += 1)
		  res[filled] = pad;

      } else if (digits > width) {
	      // Drop the leftmost digits that do not fit.
	    src += digits - width;
      }

	// Copies the remaining digits together with the terminator.
      strcpy(res + filled, src);

      return res;
}

/*
 * Build the bit string for an octal bit string literal. Each octal
 * digit expands to three bits; any other character is copied
 * unchanged into all three positions.
 *
 * A negative width_prefix means that no width was given, and the
 * result is three bits per digit. An explicit width of 0 gives an
 * empty string. A narrower result keeps the rightmost bits. A wider
 * result is padded on the left: with the leftmost expanded bit when
 * sflag is set (sign extension), with '0' otherwise.
 *
 * The result is allocated with new[].
 */
static char*make_bitstring_oct(int width_prefix, bool sflag, bool,
			       const char*src)
{
      int src_len = strlen(src);
      if (width_prefix < 0)
	    width_prefix = 3*src_len;

      char*res = new char[width_prefix+1];
      res[width_prefix] = 0;

	// Number of positions still to be filled. The result is
	// written from the right, so res[remaining-1] is the next
	// position. Working with an index keeps every access inside
	// the buffer, also for a width of 0.
      int remaining = width_prefix;

      for (int idx = src_len - 1 ; idx >= 0 && remaining > 0 ; idx -= 1) {
	    const char ch = src[idx];
	    const bool is_digit = (ch >= '0' && ch <= '7');
	    const int val = is_digit ? ch - '0' : 0;

	    for (int bit = 0 ; bit < 3 && remaining > 0 ; bit += 1) {
		  remaining -= 1;
		  if (is_digit)
			res[remaining] = (val & (1 << bit)) ? '1' : '0';
		  else
			res[remaining] = ch;
	    }
      }

      if (remaining > 0) {
	      // Sign extension replicates the leftmost bit that was
	      // written, not the octal digit itself.
	    char pad = '0';
	    if (sflag && remaining < width_prefix)
		  pad = res[remaining];
	    while (remaining > 0)
		  res[--remaining] = pad;
      }

      return res;
}

/*
 * Build the bit string for a hexadecimal bit string literal. Each
 * hex digit (either letter case) expands to four bits; any other
 * character is copied unchanged into all four positions.
 *
 * A width_prefix of zero or less means that no width applies, and
 * the result is four bits per digit. A narrower result keeps the
 * rightmost bits. A wider result is padded on the left: with the
 * leftmost expanded bit when sflag is set (sign extension), with
 * '0' otherwise.
 *
 * The result is allocated with new[].
 */
static char*make_bitstring_hex(int width_prefix, bool sflag, bool,
			       const char*src)
{
      int src_len = strlen(src);
      if (width_prefix <= 0)
	    width_prefix = 4*src_len;

      char*res = new char[width_prefix+1];
      res[width_prefix] = 0;

	// Number of positions still to be filled. The result is
	// written from the right, so res[remaining-1] is the next
	// position.
      int remaining = width_prefix;

      for (int idx = src_len - 1 ; idx >= 0 && remaining > 0 ; idx -= 1) {
	    const char ch = src[idx];

	      // Value of the digit, or -1 if it is not a hex digit.
	    int val = -1;
	    if (ch >= '0' && ch <= '9')
		  val = ch - '0';
	    else if (ch >= 'a' && ch <= 'f')
		  val = 10 + ch - 'a';
	    else if (ch >= 'A' && ch <= 'F')
		  val = 10 + ch - 'A';

	    for (int bit = 0 ; bit < 4 && remaining > 0 ; bit += 1) {
		  remaining -= 1;
		  if (val < 0)
			res[remaining] = ch;
		  else
			res[remaining] = (val & (1 << bit)) ? '1' : '0';
	    }
      }

      if (remaining > 0) {
	      // Sign extension replicates the leftmost bit that was
	      // written, not the hex digit itself.
	    char pad = '0';
	    if (sflag && remaining < width_prefix)
		  pad = res[remaining];
	    while (remaining > 0)
		  res[--remaining] = pad;
      }

      return res;
}

/*
 * Build the bit string for a decimal bit string literal: the binary
 * representation of the decimal number in src. The number may have
 * any length, so the conversion works on the digit string itself by
 * halving it repeatedly.
 *
 * A negative width_prefix means that no width was given, and the
 * result has no leading zeros (the value zero gives "0"). With an
 * explicit width the result is padded on the left with '0', or keeps
 * only the rightmost bits when it is too narrow. The two flags are
 * not used. Characters of src that are not decimal digits are
 * ignored.
 *
 * The result is allocated with new[].
 */
static char*make_bitstring_dec(int width_prefix, bool, bool, const char*src)
{
      const size_t src_len = strlen(src);

	// Working copy of the number: one value 0..9 per element.
      char*digits = new char[src_len+1];
      size_t ndigits = 0;
      for (const char*sp = src ; *sp ; sp += 1) {
	    if (*sp >= '0' && *sp <= '9')
		  digits[ndigits++] = *sp - '0';
      }

	// The bits are collected least significant first. A decimal
	// digit carries less than four bits.
      char*bits = new char[4*ndigits + 1];
      size_t nbits = 0;

	// Index of the first digit that is not zero.
      size_t first = 0;
      while (first < ndigits && digits[first] == 0)
	    first += 1;

      while (first < ndigits) {
	      // Divide the whole number by two; the remainder is the
	      // next bit.
	    int rem = 0;
	    for (size_t idx = first ; idx < ndigits ; idx += 1) {
		  int cur = 10*rem + digits[idx];
		  digits[idx] = cur / 2;
		  rem = cur % 2;
	    }
	    bits[nbits++] = rem ? '1' : '0';

	    while (first < ndigits && digits[first] == 0)
		  first += 1;
      }
      if (nbits == 0)
	    bits[nbits++] = '0';

      const int width = (width_prefix < 0) ? (int)nbits : width_prefix;

      char*res = new char[width+1];
      res[width] = 0;
      for (int pos = 0 ; pos < width ; pos += 1) {
	      // pos counts bit positions from the right
	    res[width-1-pos] = ((size_t)pos < nbits) ? bits[pos] : '0';
      }

      delete[]digits;
      delete[]bits;
      return res;
}

/*
 * Convert the text of a bit string literal to the string that is
 * handed to the parser. The text consists of an optional decimal
 * width, an optional s/u flag, a base letter (b, o, x or d, either
 * case) and the digits in quotation marks. Underscores between the
 * digits are dropped, and the digits are passed to the converter
 * for the base. The result is the string returned by that converter.
 */
static char* make_bitstring_literal(const char*text)
{
      const char*cursor = text;

	// Explicit width, if present; -1 stands for "no width".
      int width = -1;
      const size_t width_digits = strspn(cursor, "0123456789");
      if (width_digits > 0) {
	    width = 0;
	    for (size_t idx = 0 ; idx < width_digits ; idx += 1) {
		  width *= 10;
		  width += cursor[idx] - '0';
	    }
	    cursor += width_digits;
      }

	// Optional s/u flag; at most one of them can be present.
      const bool is_signed = (*cursor == 's' || *cursor == 'S');
      const bool is_unsigned = !is_signed && (*cursor == 'u' || *cursor == 'U');
      if (is_signed || is_unsigned)
	    cursor += 1;

	// Base marker.
      unsigned radix = 0;
      const char marker = *cursor;
      if (marker == 'b' || marker == 'B')
	    radix = 2;
      else if (marker == 'o' || marker == 'O')
	    radix = 8;
      else if (marker == 'x' || marker == 'X')
	    radix = 16;
      else if (marker == 'd' || marker == 'D')
	    radix = 10;
      else
	    assert(0);
      cursor += 1;

	// Copy the digits between the quotation marks, leaving out
	// the underscores. The copy cannot be longer than what is
	// left of the text.
      char*digits = new char [strlen(cursor) + 1];
      assert(*cursor == '"');
      cursor += 1;

      char*out = digits;
      for ( ; *cursor && *cursor != '"' ; cursor += 1) {
	    if (*cursor != '_')
		  *out++ = *cursor;
      }
      *out = 0;

      char*result = 0;
      if (radix == 2)
	    result = make_bitstring_bin(width, is_signed, is_unsigned, digits);
      else if (radix == 8)
	    result = make_bitstring_oct(width, is_signed, is_unsigned, digits);
      else if (radix == 10)
	    result = make_bitstring_dec(width, is_signed, is_unsigned, digits);
      else if (radix == 16)
	    result = make_bitstring_hex(width, is_signed, is_unsigned, digits);
      else
	    assert(0);

      delete[]digits;
      return result;
}

/**
* This function takes a floating point based number in form
* of a C-string (base#digits.digits#[exponent], letter digits
* and the exponent marker in lower case) and converts it to
* a double.
*
* The value is the number between the hashes, read in the
* given base, multiplied by the base raised to the exponent.
* The base and the exponent are written in decimal. The
* exponent may carry a sign. Underscores are skipped.
*
* The text must contain both hashes. It is not modified.
*
* \return new double is returned
*/
static double make_double_from_based(char* text)
{
    const char* first_hash_ptr = strchr(text, '#');
    const char* second_hash_ptr = strrchr(text, '#');
    const char* ptr;

    //now let's deduce the base; it may contain underscores,
    //which strtol would stop at
    unsigned base = 0;
    for(ptr = text; ptr != first_hash_ptr; ++ptr)
    {
        if(*ptr != '_')
            base = 10*base + short_from_hex_char(*ptr);
    }

    //the digits in front of the point
    double mantissa = 0.0;
    for(ptr = first_hash_ptr + 1; ptr != second_hash_ptr; ++ptr)
    {
        if(*ptr == '.')
            break;
        if(*ptr != '_')
            mantissa = mantissa*base + short_from_hex_char(*ptr);
    }

    //the digits behind the point, if there is a point
    double fraction = 0.0;
    double factor = 1.0/base;
    if(ptr != second_hash_ptr)
        ++ptr; //step over the point
    for( ; ptr != second_hash_ptr; ++ptr)
    {
        if(*ptr != '_')
        {
            fraction = fraction + short_from_hex_char(*ptr)*factor;
            factor = factor / base;
        }
    }

    const double value = mantissa + fraction;
    if(*(second_hash_ptr + 1) == '\0') //there is no exponent
        return value;

    //now calculate the value of the exponent;
    //leave the exponent marker and the sign
    ptr = second_hash_ptr + 2;
    bool negative = false;
    if(*ptr == '+')
        ++ptr;
    else if(*ptr == '-')
    {
        negative = true;
        ++ptr;
    }

    double exponent = 0.0;
    for( ; *ptr != '\0'; ++ptr)
    {
        if(*ptr != '_')
            exponent = exponent*10 + short_from_hex_char(*ptr);
    }
    if(negative)
        exponent = -exponent;

    //the exponent scales the value by a power of the base
    return value * pow((double)base, exponent);
}

/**
* This function takes a hexadecimal digit in form of
* a char (either letter case) and returns its literal
* value as short.
*
* \return the value 0..15 of the digit, or 0 if the char
* is not a hexadecimal digit
*/
static unsigned short short_from_hex_char(char ch)
{
    if(ch >= '0' && ch <= '9')
        return ch - '0';
    if(ch >= 'a' && ch <= 'f')
        return ch - 'a' + 10;
    if(ch >= 'A' && ch <= 'F')
        return ch - 'A' + 10;

    //not a digit; do not let the subtraction wrap around
    return 0;
}

/**
* This function takes a based number in form of a C-string
* (base#digits#[exponent], letter digits and the exponent
* marker in lower case) and converts it to an int64_t.
*
* The value is the number between the hashes, read in the
* given base, multiplied by the base raised to the exponent.
* The base and the exponent are written in decimal.
* Underscores are skipped.
*
* The text must contain both hashes. It is not modified.
*
* \return the value as int64_t is returned; 0 is returned
* for a negative exponent, which an integer literal cannot
* have
*/
static int64_t make_long_from_based(char* text) {
    const char* first_hash_ptr = strchr(text, '#');
    const char* second_hash_ptr = strrchr(text, '#');
    const char* ptr;

    //now lets deduce the base; it may contain underscores,
    //which strtol would stop at
    unsigned base = 0;
    for(ptr = text; ptr != first_hash_ptr; ++ptr)
    {
        if(*ptr != '_')
            base = 10*base + short_from_hex_char(*ptr);
    }

    int64_t mantissa = 0;
    for(ptr = first_hash_ptr + 1; ptr != second_hash_ptr; ++ptr)
    {
        if(*ptr != '_')
        {
            mantissa = mantissa * base + short_from_hex_char(*ptr);
        }
    }

    //if there is no exponent
    if(*(second_hash_ptr + 1) == '\0')
        return mantissa;

    //leave the exponent marker and the sign
    ptr = second_hash_ptr + 2;
    bool negative = false;
    if(*ptr == '+')
        ++ptr;
    else if(*ptr == '-')
    {
        negative = true;
        ++ptr;
    }

    int64_t exponent = 0;
    for( ; *ptr != '\0'; ++ptr)
    {
        if(*ptr != '_')
            exponent = 10*exponent + short_from_hex_char(*ptr);
    }

    //there is no integer result for a negative exponent
    if(negative)
        return 0;

    //the exponent scales the value by a power of the base
    return mantissa * lpow(base, exponent);
}

/**
* Power function for int64_t: left raised to the power right.
*
* The result is computed by repeated squaring, so the number
* of steps grows with the number of bits of the exponent and
* not with its value. The result is not checked for overflow.
*
* \return left to the power of right; 1 for an exponent of 0;
* 0 for a negative exponent, for which there is no integer
* result in general
*/
static int64_t lpow(int64_t left, int64_t right) {
    if(right < 0)
        return 0;

    int64_t result = 1;
    int64_t factor = left;
    while(right > 0)
    {
        if(right & 1)
            result *= factor;
        right >>= 1;
        //do not square once more than needed
        if(right > 0)
            factor *= factor;
    }
    return result;
}

/*
 * Create a scanner instance that reads from fd. The returned handle
 * is passed to yylex() and finally to destroy_lexor(). If the scanner
 * state cannot be set up, 0 is returned instead of a handle that was
 * never initialized.
 */
yyscan_t prepare_lexor(FILE*fd)
{
      yyscan_t scanner = 0;
	// yylex_init reports a failure with a non-zero result.
      if (yylex_init(&scanner) != 0)
	    return 0;
      yyrestart(fd, scanner);
      return scanner;
}

/*
 * Release the data of a scanner instance. Modern version of flex
 * (>=2.5.9) can clean up the scanner data with yylex_destroy(); with
 * an older version nothing is done.
 *
 * The version test compares major, minor and subminor in order, so
 * that any version after 2.5.9 qualifies, whatever its minor number.
 */
void destroy_lexor(yyscan_t scanner)
{
	// Keeps the parameter used when the call below is not compiled.
      (void)scanner;
# ifdef FLEX_SCANNER
#   if YY_FLEX_MAJOR_VERSION > 2 \
    || (YY_FLEX_MAJOR_VERSION == 2 && YY_FLEX_MINOR_VERSION > 5) \
    || (YY_FLEX_MAJOR_VERSION == 2 && YY_FLEX_MINOR_VERSION == 5 \
        && defined(YY_FLEX_SUBMINOR_VERSION) && YY_FLEX_SUBMINOR_VERSION >= 9)
      yylex_destroy(scanner);
#   endif
# endif
}