From f8eabbc100bc982089a6412c69d494c8cc52ee8b Mon Sep 17 00:00:00 2001
From: Wilson Snyder <wsnyder@wsnyder.org>
Date: Tue, 6 Apr 2010 20:20:44 -0400
Subject: [PATCH] From Verilog-Perl: Fix parsing single files > 2GB.

---
 Changes            |  2 ++
 src/V3File.cpp     | 69 ++++++++++++++++++++++++++++++----------------
 src/V3File.h       |  6 +++-
 src/V3ParseImp.cpp |  6 ++--
 src/V3ParseImp.h   |  6 ++--
 src/V3PreLex.h     | 13 +++++++--
 src/V3PreLex.l     | 68 ++++++++++++++++++++++++++++++++++++++++++---
 src/V3PreProc.cpp  | 51 +++++++++++++++++++++++-----------
 src/V3PreProc.h    |  1 +
 src/verilog.l      |  3 ++
 10 files changed, 172 insertions(+), 53 deletions(-)

diff --git a/Changes b/Changes
index 1bb28b361..43ba01fc3 100644
--- a/Changes
+++ b/Changes
@@ -17,6 +17,8 @@ indicates the contributor was also the author of the fix; Thanks!
 
 **** Fix trace files with empty modules crashing some viewers.
 
+**** Fix parsing single files > 2GB.  [Jeffrey Short]
+
 * Verilator 3.801 2010/03/17
 
 ***  Support "break", "continue", "return".
diff --git a/src/V3File.cpp b/src/V3File.cpp
index 8cd6de575..1bd41944a 100644
--- a/src/V3File.cpp
+++ b/src/V3File.cpp
@@ -44,6 +44,8 @@
 #include "V3PreShell.h"
 #include "V3Ast.h"
 
+// If you change this code, run a test with the size below set very small
+//#define INFILTER_IPC_BUFSIZ 16
 #define INFILTER_IPC_BUFSIZ 64*1024  // For debug, try this as a small number
 #define INFILTER_CACHE_MAX  64*1024  // Maximum bytes to cache if same file read twice
 
@@ -271,6 +273,7 @@ void V3File::createMakeDir() {
 
 class V3InFilterImp {
     typedef map<string,string> FileContentsMap;
+    typedef V3InFilter::StrList StrList;
 
     FileContentsMap	m_contentsMap;	// Cache of file contents
     bool		m_readEof;	// Received EOF on read
@@ -292,27 +295,27 @@ private:
 	return level;
     }
 
-    bool readContents(const string& filename, string& out) {
-	if (m_pid) return readContentsFilter(filename,out);
-	else return readContentsFile(filename,out);
+    bool readContents(const string& filename, StrList& outl) {
+	if (m_pid) return readContentsFilter(filename,outl);
+	else return readContentsFile(filename,outl);
     }
-    bool readContentsFile(const string& filename, string& out) {
+    bool readContentsFile(const string& filename, StrList& outl) {
 	int fd = open (filename.c_str(), O_RDONLY);
 	if (!fd) return false;
 	m_readEof = false;
-	out = readBlocks(fd, -1);
+	readBlocks(fd, -1, outl);
 	close(fd);
 	return true;
     }
-    bool readContentsFilter(const string& filename, string& out) {
-	if (filename!="" || out!="") {}  // Prevent unused
+    bool readContentsFilter(const string& filename, StrList& outl) {
+	if (filename!="" || outl.empty()) {}  // Prevent unused
 #ifdef INFILTER_PIPE
 	writeFilter("read \""+filename+"\"\n");
 	string line = readFilterLine();
 	if (line.find("Content-Length") != string::npos) {
 	    int len = 0;
 	    sscanf(line.c_str(), "Content-Length: %d\n", &len);
-	    out = readBlocks(m_readFd, len);
+	    readBlocks(m_readFd, len, outl);
 	    return true;
 	} else {
 	    if (line!="") v3error("--pipe-filter protocol error, unexpected: "<<line);
@@ -334,15 +337,19 @@ private:
 #endif
     }
 
-    string readBlocks(int fd, int size=-1) {
+    string readBlocks(int fd, int size, StrList& outl) {
 	string out;
 	char buf[INFILTER_IPC_BUFSIZ];
-	while (!m_readEof && (size<0 || size>(int)out.length())) {
-	    int todo = INFILTER_IPC_BUFSIZ;
+	ssize_t sizegot = 0;
+	while (!m_readEof && (size<0 || size>sizegot)) {
+	    ssize_t todo = INFILTER_IPC_BUFSIZ;
 	    if (size>0 && size<INFILTER_IPC_BUFSIZ) todo = size;
-	    int got = read (fd, buf, todo);
+	    ssize_t got = read (fd, buf, todo);
 	    //UINFO(9,"RD GOT g "<< got<<" e "<<errno<<" "<<strerror(errno)<<endl);  usleep(50*1000);
-	    if (got>0) out.append(buf, got);
+	    if (got>0) {
+		outl.push_back(string(buf, got));
+		sizegot += got;
+	    }
 	    else if (errno == EINTR || errno == EAGAIN
 #ifdef EWOULDBLOCK
 		     || errno == EWOULDBLOCK
@@ -358,9 +365,11 @@ private:
 	UINFO(9,"readFilterLine\n");
 	string line;
 	while (!m_readEof) {
-	    string c = readBlocks(m_readFd, 1);
-	    line += c;
-	    if (c == "\n") {
+	    StrList outl;
+	    readBlocks(m_readFd, 1, outl);
+	    string onechar = listString(outl);
+	    line += onechar;
+	    if (onechar == "\n") {
 		if (line == "\n") { line=""; continue; }
 		else break;
 	    }
@@ -477,21 +486,35 @@ private:
 protected:
     friend class V3InFilter;
     // Read file contents and return it
-    bool readWholefile(const string& filename, string& out) {
+    bool readWholefile(const string& filename, StrList& outl) {
 	FileContentsMap::iterator it = m_contentsMap.find(filename);
 	if (it != m_contentsMap.end()) {
-	    out = it->second;
+	    outl.push_back(it->second);
 	    return true;
 	}
-	if (!readContents(filename, out)) return false;
-	if (out.length() < INFILTER_CACHE_MAX) {
+	if (!readContents(filename, outl)) return false;
+	if (listSize(outl) < INFILTER_CACHE_MAX) {
 	    // Cache small files (only to save space)
 	    // It's quite common to `include "timescale" thousands of times
 	    // This isn't so important if it's just a open(), but filtering can be slow
-	    m_contentsMap.insert(make_pair(filename,out));
+	    m_contentsMap.insert(make_pair(filename,listString(outl)));
 	}
 	return true;
     }
+    size_t listSize(StrList& sl) {
+	size_t out = 0;
+	for (StrList::iterator it=sl.begin(); it!=sl.end(); ++it) {
+	    out += it->length();
+	}
+	return out;
+    }
+    string listString(StrList& sl) {
+	string out;
+	for (StrList::iterator it=sl.begin(); it!=sl.end(); ++it) {
+	    out += *it;
+	}
+	return out;
+    }
     // CONSTRUCTORS
     V3InFilterImp(const string& command) {
 	m_readEof = false;
@@ -512,9 +535,9 @@ protected:
 V3InFilter::V3InFilter(const string& command) { m_impp = new V3InFilterImp(command); }
 V3InFilter::~V3InFilter() { if (m_impp) delete m_impp; m_impp=NULL; }
 
-bool V3InFilter::readWholefile(const string& filename, string& out) {
+bool V3InFilter::readWholefile(const string& filename, V3InFilter::StrList& outl) {
     if (!m_impp) v3fatalSrc("readWholefile on invalid filter");
-    return m_impp->readWholefile(filename, out);
+    return m_impp->readWholefile(filename, outl);
 }
 
 //######################################################################
diff --git a/src/V3File.h b/src/V3File.h
index 169ecf020..ab3af576d 100644
--- a/src/V3File.h
+++ b/src/V3File.h
@@ -28,6 +28,7 @@
 #include <cstdio>
 #include <stack>
 #include <set>
+#include <list>
 #include <fstream>
 
 //============================================================================
@@ -79,9 +80,12 @@ class V3InFilterImp;
 class V3InFilter {
     V3InFilterImp* m_impp;
 public:
+    // TYPES
+    typedef list<string> StrList;
+
     // METHODS
     // Read file contents and return it.  Return true on success.   
-    bool readWholefile(const string& filename, string& out);
+    bool readWholefile(const string& filename, StrList& outl);
 
     // CONSTRUCTORS
     V3InFilter(const string& command);
diff --git a/src/V3ParseImp.cpp b/src/V3ParseImp.cpp
index 574bf888a..967b76c2d 100644
--- a/src/V3ParseImp.cpp
+++ b/src/V3ParseImp.cpp
@@ -66,12 +66,12 @@ V3ParseImp::~V3ParseImp() {
     if (debug()>=9) { UINFO(0,"~V3ParseImp\n"); symp()->dump(cout, "-vpi: "); }
 }
 
-int V3ParseImp::ppInputToLex(char* buf, int max_size) {
-    int got = 0;
+size_t V3ParseImp::ppInputToLex(char* buf, size_t max_size) {
+    size_t got = 0;
     while (got < max_size	// Haven't got enough
 	   && !m_ppBuffers.empty()) {	// And something buffered
 	string front = m_ppBuffers.front(); m_ppBuffers.pop_front();
-	int len = front.length();
+	size_t len = front.length();
 	if (len > (max_size-got)) {  // Front string too big
 	    string remainder = front.substr(max_size-got);
 	    front = front.substr(0, max_size-got);
diff --git a/src/V3ParseImp.h b/src/V3ParseImp.h
index 8b7ddd8de..c78f3cd7b 100644
--- a/src/V3ParseImp.h
+++ b/src/V3ParseImp.h
@@ -260,7 +260,7 @@ public:
     static const char* tokenName(int tok);
 
     void ppPushText(const string& text) { m_ppBuffers.push_back(text); }
-    int ppInputToLex(char* buf, int max_size);
+    size_t ppInputToLex(char* buf, size_t max_size);
 
     static V3ParseImp* parsep() { return s_parsep; }
 
@@ -278,7 +278,7 @@ public:
 	m_stringps.push_back(strp);
 	return strp;
     }
-    string* newString(const char* text, int length) {
+    string* newString(const char* text, size_t length) {
 	string* strp = new string (text, length);
 	m_stringps.push_back(strp);
 	return strp;
@@ -307,7 +307,7 @@ public:
     void statePushVlg();	// Parser -> lexer communication
     void statePop();	// Parser -> lexer communication
     int stateVerilogRecent();	// Parser -> lexer communication
-    int flexPpInputToLex(char* buf, int max_size) { return ppInputToLex(buf,max_size); }
+    size_t flexPpInputToLex(char* buf, size_t max_size) { return ppInputToLex(buf,max_size); }
 
     //==== Symbol tables
     V3ParseSym* symp() { return &m_sym; }
diff --git a/src/V3PreLex.h b/src/V3PreLex.h
index 7cfd21b98..be60293ee 100644
--- a/src/V3PreLex.h
+++ b/src/V3PreLex.h
@@ -27,6 +27,7 @@
 #ifndef _VPREPROCLEX_H_		// Guard
 #define _VPREPROCLEX_H_ 1
 
+#include <deque>
 #include <stack>
 
 #include "V3Error.h"
@@ -112,7 +113,7 @@ void yy_delete_buffer( YY_BUFFER_STATE b );
 #define KEEPCMT_SUB 2
 
 //======================================================================
-// Class entry for each per-lexter state
+// Class entry for each per-lexer state
 
 class V3PreLex {
   public:	// Used only by V3PreLex.cpp and V3PreProc.cpp
@@ -120,6 +121,7 @@ class V3PreLex {
 
     // Parse state
     stack<YY_BUFFER_STATE> m_bufferStack; // Stack of inserted text above current point
+    deque<string>	m_buffers;	///< Buffer of characters to process
 
     // State to lexer
     static V3PreLex* s_currentLexp;	// Current lexing point
@@ -143,13 +145,15 @@ class V3PreLex {
 	m_defCmtSlash = false;
 	m_pslParenLevel = 0;
 	m_pslMoreNeeded = false;
+	initFirstBuffer();
     }
     ~V3PreLex() {
 	while (!m_bufferStack.empty()) { yy_delete_buffer(m_bufferStack.top()); m_bufferStack.pop(); }
     }
+    void initFirstBuffer();
 
     // Called by V3PreLex.l from lexer
-    void appendDefValue(const char* text, int len);
+    void appendDefValue(const char* text, size_t len);
     void lineDirective(const char* text);
     void incLineno() { m_curFilelinep->incLineno(); }
     // Called by V3PreProc.cpp to inform lexer
@@ -157,10 +161,13 @@ class V3PreLex {
     void pushStateDefForm();
     void pushStateDefValue();
     void pushStateIncFilename();
-    void scanBytes(const string& strg);
+    void scanBytes(const char* strp, size_t len);
+    void scanBytesBack(const string& str);
+    size_t inputToLex(char* buf, size_t max_size);
     /// Called by VPreproc.cpp to get data from lexer
     YY_BUFFER_STATE currentBuffer();
     int	 currentStartState();
+    void dumpSummary();
     void dumpStack();
 };
 
diff --git a/src/V3PreLex.l b/src/V3PreLex.l
index 6793a72db..ef3fbaba8 100644
--- a/src/V3PreLex.l
+++ b/src/V3PreLex.l
@@ -33,6 +33,9 @@ V3PreLex* V3PreLex::s_currentLexp = NULL;	// Current lexing point
 
 #define LEXP V3PreLex::s_currentLexp
 
+#define YY_INPUT(buf,result,max_size) \
+    result = LEXP->inputToLex(buf,max_size);
+
 // Accessors, because flex keeps changing the type of yyleng
 char* yyourtext() { return yytext; }
 size_t yyourleng() { return yyleng; }
@@ -44,7 +47,7 @@ static bool optPsl() { return V3PreProc::optPsl(); }
 static bool pedantic() { return LEXP->m_pedantic; }
 static void yyerror(char* msg) { LEXP->m_curFilelinep->v3error(msg); }
 static void yyerrorf(const char* msg) { LEXP->m_curFilelinep->v3error(msg); }
-static void appendDefValue(const char* t,int l) { LEXP->appendDefValue(t,l); }
+static void appendDefValue(const char* t, size_t l) { LEXP->appendDefValue(t,l); }
 static int  pslParenLevel()    { return LEXP->m_pslParenLevel; }
 static void pslParenLevelInc() { LEXP->m_pslParenLevel++; }
 static void pslParenLevelDec() { if (pslParenLevel()) LEXP->m_pslParenLevel--; }
@@ -78,6 +81,7 @@ crnl		[\r]*[\n]
 quote		[\"]
 backslash	[\\]
 symb		([a-zA-Z_][a-zA-Z0-9_$]*|\\[^ \t\f\r\n]+)
+word		[a-zA-Z0-9_]+
 drop		[\032]
 psl		[p]sl
 
@@ -113,6 +117,7 @@ psl		[p]sl
 <INITIAL,PSLMULM,PSLONEM>{quote}	{ yy_push_state(STRMODE); yymore(); }
 <STRMODE><<EOF>>	{ linenoInc(); yyerrorf("EOF in unterminated string"); yyleng=0; yyterminate(); }
 <STRMODE>{crnl}		{ linenoInc(); yyerrorf("Unterminated string"); BEGIN(INITIAL); }
+<STRMODE>{word}		{ yymore(); }
 <STRMODE>[^\"\\]	{ yymore(); }
 <STRMODE>{backslash}{crnl}	{ linenoInc(); yymore(); }
 <STRMODE>{backslash}.	{ yymore(); }
@@ -167,6 +172,7 @@ psl		[p]sl
 <DEFVAL>[\\]{crnl}	{ linenoInc(); appendDefValue((char*)"\n",1); } /* Return, but not \ is part of define value */
 <DEFVAL>[^\/\*\n\r\\]+	|
 <DEFVAL>[\\][^\n\r]	|
+<DEFVAL>{word}		{ appendDefValue(yytext,yyleng); }
 <DEFVAL>.		{ appendDefValue(yytext,yyleng); }
 
 	/* Comments inside define values - if embedded get added to define value per spec */
@@ -177,6 +183,7 @@ psl		[p]sl
 	 		  appendDefValue(yytext,yyleng-2); appendDefValue((char*)"\n",1); }  /* Return but not \ */
 <DEFCMT>{crnl}		{ linenoInc(); yymore(); if (LEXP->m_defCmtSlash) yyerrorf("One line of /* ... */ is missing \\ before newline");
 			  BEGIN(CMTMODE); }
+<DEFCMT>{word}		{ yymore(); }
 <DEFCMT>.		{ yymore(); }
 <DEFCMT><<EOF>>		{ yyerrorf("EOF in '/* ... */' block comment\n"); yyleng=0; yyterminate(); }
 
@@ -241,6 +248,7 @@ psl		[p]sl
 <CMTBEGM,CMTMODE>"*/"		{ yy_pop_state(); return(VP_COMMENT); }
 <CMTBEGM,CMTMODE>{crnl}		{ linenoInc(); yymore(); }
 <CMTBEGM,CMTMODE><<EOF>>	{ yyerrorf("EOF in '/* ... */' block comment\n"); yyleng=0; yyterminate(); }
+<CMTMODE>{word}			{ yymore(); }
 <CMTBEGM>.			{ BEGIN CMTMODE; yymore(); }	/* Non 'psl' beginning in comment */
 <CMTMODE>.			{ yymore(); }
 
@@ -290,12 +298,57 @@ void V3PreLex::pushStateIncFilename() {
     yymore();
 }
 
-void V3PreLex::scanBytes(const string& strg) {
-    yy_scan_bytes(strg.c_str(), strg.length());
+void V3PreLex::initFirstBuffer() {
+    // Called from constructor to make first buffer
+    // yy_create_buffer also sets yy_fill_buffer=1 so reads from YY_INPUT
+    yy_switch_to_buffer(yy_create_buffer(NULL, YY_BUF_SIZE));
+    m_bufferStack.push(currentBuffer());
+    yyrestart(NULL);
+}
+
+size_t V3PreLex::inputToLex(char* buf, size_t max_size) {
+    // We need a custom YY_INPUT because we can't use flex buffers.
+    // Flex buffers are limited to 2GB, and we can't chop into 2G pieces
+    // because buffers can't end in the middle of tokens.
+    // m_buffers only applies to the "base" buffer when there's no scanBytes outstanding
+    // It won't be called on scan_buffers as they don't have yy_fill_buffer set.
+    //
+    //if (debug()) { cout<<"-  pp:inputToLex ITL s="<<max_size<<" bs="<<m_bufferStack.size()<<endl;  dumpSummary(); }
+    // For testing, use really small chunks
+    //if (max_size > 13) max_size=13;
+    size_t got = 0;
+    while (got < max_size	// Haven't got enough
+	   && !m_buffers.empty()) {	// And something buffered
+	string front = m_buffers.front(); m_buffers.pop_front();
+	size_t len = front.length();
+	if (len > (max_size-got)) {  // Front string too big
+	    string remainder = front.substr(max_size-got);
+	    front = front.substr(0, max_size-got);
+	    m_buffers.push_front(remainder);  // Put back remainder for next time
+	    len = (max_size-got);
+	}
+	strncpy(buf+got, front.c_str(), len);
+	got += len;
+    }
+    //if (debug()) { cout<<"-  pp::inputToLex  got="<<got<<" '"<<string(buf,got)<<"'"<<endl; }
+    return got;
+}
+
+void V3PreLex::scanBytes(const char* strp, size_t len) {
+    // Note buffers also appended in ::scanBytesBack
+    // Not "m_buffers.push_front(string(strp,len))" as we need a `define
+    // to take effect immediately, in the middle of the current buffer
+    yy_scan_bytes(strp, len);
     m_bufferStack.push(currentBuffer());  // yy_scan_bytes makes new buffer
 }
 
-void V3PreLex::appendDefValue(const char* textp, int len) {
+void V3PreLex::scanBytesBack(const string& str) {
+    // Initial creation, that will pull from YY_INPUT==inputToLex
+    // Note buffers also appended in ::scanBytes
+    m_buffers.push_back(str);
+}
+
+void V3PreLex::appendDefValue(const char* textp, size_t len) {
     // Append given text to current definition value being formed
     m_defValue.append(textp,len);
 }
@@ -314,8 +367,15 @@ void V3PreLex::lineDirective(const char* textp) {
     V3File::addSrcDepend(m_curFilelinep->filename());
 }
 
+void V3PreLex::dumpSummary() {
+    cout<<"-  pp::dumpSummary  curBuf="<<(void*)(currentBuffer())
+	<<" nBuf="<<m_bufferStack.size()
+	<<" yyfill="<<currentBuffer()->yy_fill_buffer<<endl;
+}
+
 void V3PreLex::dumpStack() {
     // For debug use
+    dumpSummary();
     stack<YY_BUFFER_STATE> tmpstack = m_bufferStack;
     printf("  bufferStack[%p]:",this);
     while (!tmpstack.empty()) {
diff --git a/src/V3PreProc.cpp b/src/V3PreProc.cpp
index 81a739b90..926c1cc9b 100644
--- a/src/V3PreProc.cpp
+++ b/src/V3PreProc.cpp
@@ -23,12 +23,14 @@
 #include "config_build.h"
 #include "verilatedos.h"
 #include <cstdio>
+#include <cstdlib>
 #include <cstdarg>
 #include <unistd.h>
 #include <fstream>
 #include <stack>
 #include <vector>
 #include <map>
+#include <list>
 
 #include "V3Error.h"
 #include "V3Global.h"
@@ -108,6 +110,7 @@ public:
 struct V3PreProcImp : public V3PreProc {
     // TYPES
     typedef std::map<string,V3Define> DefinesMap;
+    typedef V3InFilter::StrList StrList;
 
     // debug() -> see V3PreShellImp::debug
 
@@ -165,7 +168,7 @@ private:
 
     bool commentTokenMatch(string& cmdr, const char* strg);
     string trimWhitespace(const string& strg, bool trailing);
-    void unputString(const string& strg, bool first=false);
+    void unputString(const string& strg);
 
     void parsingOn() {
 	m_off--;
@@ -414,18 +417,19 @@ const char* V3PreProcImp::tokenName(int tok) {
     }
 }
 
-void V3PreProcImp::unputString(const string& strg, bool first) {
+void V3PreProcImp::unputString(const string& strg) {
+    // Note: The preliminary call in ::openFile bypasses this function
     // We used to just m_lexp->unputString(strg.c_str());
     // However this can lead to "flex scanner push-back overflow"
     // so instead we scan from a temporary buffer, then on EOF return.
     // This is also faster than the old scheme, amazingly.
-    if (!first) {  // Else the initial creation
+    if (1) {
 	if (m_lexp->m_bufferStack.empty() || m_lexp->m_bufferStack.top()!=m_lexp->currentBuffer()) {
 	    fileline()->v3fatalSrc("bufferStack missing current buffer; will return incorrectly");
 	    // Hard to debug lost text as won't know till much later
 	}
     }
-    m_lexp->scanBytes(strg);
+    m_lexp->scanBytes(strg.c_str(), strg.length());
 }
 
 string V3PreProcImp::trimWhitespace(const string& strg, bool trailing) {
@@ -600,22 +604,14 @@ void V3PreProcImp::openFile(FileLine* fl, V3InFilter* filterp, const string& fil
 
     V3File::addSrcDepend(filename);
 
-    string wholefile;
+    // Read a list<string> with the whole file.
+    StrList wholefile;
     bool ok = filterp->readWholefile(filename, wholefile/*ref*/);
     if (!ok) {
 	fileline()->v3error("File not found: "+filename+"\n");
 	return;
     }
 
-    // Filter all DOS CR's en-mass.  This avoids bugs with lexing CRs in the wrong places.
-    // This will also strip them from strings, but strings aren't supposed to be multi-line without a "\"
-    string wholefilecr;
-    size_t wholesize = wholefile.length();
-    for (size_t i=0; i<wholesize; i++) {  // Not a c_str(), as we keep '\0's for now.
-	if (wholefile[i] != '\r' && wholefile[i] != '\0') wholefilecr += wholefile[i];
-    }
-    wholefile.resize(0); // free memory
-
     if (m_lexp) {
 	// We allow the same include file twice, because occasionally it pops
 	// up, with guards preventing a real recursion.
@@ -636,7 +632,30 @@ void V3PreProcImp::openFile(FileLine* fl, V3InFilter* filterp, const string& fil
     addLineComment(1); // Enter
 
     yy_flex_debug = (debug()>4)?1:0;
-    unputString(wholefilecr,true);
+
+    // Filter all DOS CR's en masse.  This avoids bugs with lexing CRs in the wrong places.
+    // This will also strip them from strings, but strings aren't supposed to be multi-line without a "\"
+    for (StrList::iterator it=wholefile.begin(); it!=wholefile.end(); ++it) {
+	// We don't stop at the first \0, as we allow and strip mid-string '\0's (for now).
+	// We also edit in place.  This is nasty to other users of the string, but
+	// there aren't any, and it avoids needing 2x the memory on very large files.
+	const char* sp = it->data();
+	const char* ep = sp + it->length();
+	char* cp = (char*) sp;
+	for (; sp<ep; sp++) {
+	    if (*sp != '\r' && *sp != '\0') {
+		*cp++ = *sp;
+	    }
+	}
+	size_t len = cp - it->data();
+	// Truncate old string
+	it->erase(len);
+
+	// Push the data to an internal buffer.
+	m_lexp->scanBytesBack(*it);
+	// Reclaim memory; the push saved the string contents for us
+	*it = "";
+    }
 }
 
 void V3PreProcImp::insertUnreadbackAtBol(const string& text) {
@@ -771,7 +790,7 @@ int V3PreProcImp::getToken() {
 	    // We're off or processed the comment specially.  If there are newlines
 	    // in it, we also return the newlines as TEXT so that the linenumber
 	    // count is maintained for downstream tools
-	    for (int len=0; len<yyourleng(); len++) { if (yyourtext()[len]=='\n') m_lineAdd++; }
+	    for (size_t len=0; len<yyourleng(); len++) { if (yyourtext()[len]=='\n') m_lineAdd++; }
 	    goto next_tok;
 	}
 	if (tok==VP_LINE) {
diff --git a/src/V3PreProc.h b/src/V3PreProc.h
index 0b5f42317..2cb8c17c2 100644
--- a/src/V3PreProc.h
+++ b/src/V3PreProc.h
@@ -29,6 +29,7 @@
 
 #include <string>
 #include <map>
+#include <list>
 #include <iostream>
 
 class V3InFilter;
diff --git a/src/verilog.l b/src/verilog.l
index e0f8558ba..26aa50531 100644
--- a/src/verilog.l
+++ b/src/verilog.l
@@ -144,6 +144,7 @@ crnl	[\r]*[\n]
 id	[a-zA-Z_][a-zA-Z0-9_$]*
 	/* escaped identifier */
 escid	\\[^ \t\f\r\n]+
+word	[a-zA-Z0-9_]+
 
 %%
 
@@ -832,12 +833,14 @@ escid	\\[^ \t\f\r\n]+
 <STRING>\" 		{ yy_pop_state();
 			  FL; yylval.strp = PARSEP->newString(yytext+1,yyleng-2);
 			  return yaSTRING; }
+<STRING>{word}		{ yymore(); }
 <STRING>.		{ yymore(); }
 
   /************************************************************************/
   /* Attributes */
 <ATTRMODE>{crnl}	{ yymore(); NEXTLINE(); }
 <ATTRMODE>"*)"		{ yy_pop_state(); }
+<ATTRMODE>{word}	{ yymore(); }
 <ATTRMODE>. 		{ yymore(); }
 <ATTRMODE><<EOF>>	{ yyerrorf("EOF in (*");
 			  yyleng = 0; yy_pop_state(); }