From f8eabbc100bc982089a6412c69d494c8cc52ee8b Mon Sep 17 00:00:00 2001 From: Wilson Snyder Date: Tue, 6 Apr 2010 20:20:44 -0400 Subject: [PATCH] From Verilog-Perl: Fix parsing single files > 2GB. --- Changes | 2 ++ src/V3File.cpp | 69 ++++++++++++++++++++++++++++++---------------- src/V3File.h | 6 +++- src/V3ParseImp.cpp | 6 ++-- src/V3ParseImp.h | 6 ++-- src/V3PreLex.h | 13 +++++++-- src/V3PreLex.l | 68 ++++++++++++++++++++++++++++++++++++++++++--- src/V3PreProc.cpp | 51 +++++++++++++++++++++++----------- src/V3PreProc.h | 1 + src/verilog.l | 3 ++ 10 files changed, 172 insertions(+), 53 deletions(-) diff --git a/Changes b/Changes index 1bb28b361..43ba01fc3 100644 --- a/Changes +++ b/Changes @@ -17,6 +17,8 @@ indicates the contributor was also the author of the fix; Thanks! **** Fix trace files with empty modules crashing some viewers. +**** Fix parsing single files > 2GB. [Jeffrey Short] + * Verilator 3.801 2010/03/17 *** Support "break", "continue", "return". diff --git a/src/V3File.cpp b/src/V3File.cpp index 8cd6de575..1bd41944a 100644 --- a/src/V3File.cpp +++ b/src/V3File.cpp @@ -44,6 +44,8 @@ #include "V3PreShell.h" #include "V3Ast.h" +// If change this code, run a test with the below size set very small +//#define INFILTER_IPC_BUFSIZ 16 #define INFILTER_IPC_BUFSIZ 64*1024 // For debug, try this as a small number #define INFILTER_CACHE_MAX 64*1024 // Maximum bytes to cache if same file read twice @@ -271,6 +273,7 @@ void V3File::createMakeDir() { class V3InFilterImp { typedef map FileContentsMap; + typedef V3InFilter::StrList StrList; FileContentsMap m_contentsMap; // Cache of file contents bool m_readEof; // Received EOF on read @@ -292,27 +295,27 @@ private: return level; } - bool readContents(const string& filename, string& out) { - if (m_pid) return readContentsFilter(filename,out); - else return readContentsFile(filename,out); + bool readContents(const string& filename, StrList& outl) { + if (m_pid) return readContentsFilter(filename,outl); + else return readContentsFile(filename,outl); } - bool readContentsFile(const string& filename, string& out) { + bool readContentsFile(const string& filename, StrList& outl) { int fd = open (filename.c_str(), O_RDONLY); if (!fd) return false; m_readEof = false; - out = readBlocks(fd, -1); + readBlocks(fd, -1, outl); close(fd); return true; } - bool readContentsFilter(const string& filename, string& out) { - if (filename!="" || out!="") {} // Prevent unused + bool readContentsFilter(const string& filename, StrList& outl) { + if (filename!="" || outl.empty()) {} // Prevent unused #ifdef INFILTER_PIPE writeFilter("read \""+filename+"\"\n"); string line = readFilterLine(); if (line.find("Content-Length") != string::npos) { int len = 0; sscanf(line.c_str(), "Content-Length: %d\n", &len); - out = readBlocks(m_readFd, len); + readBlocks(m_readFd, len, outl); return true; } else { if (line!="") v3error("--pipe-filter protocol error, unexpected: "<(int)out.length())) { - int todo = INFILTER_IPC_BUFSIZ; + ssize_t sizegot = 0; + while (!m_readEof && (size<0 || size>sizegot)) { + ssize_t todo = INFILTER_IPC_BUFSIZ; if (size>0 && size0) out.append(buf, got); + if (got>0) { + outl.push_back(string(buf, got)); + sizegot += got; + } else if (errno == EINTR || errno == EAGAIN #ifdef EWOULDBLOCK || errno == EWOULDBLOCK @@ -358,9 +365,11 @@ private: UINFO(9,"readFilterLine\n"); string line; while (!m_readEof) { - string c = readBlocks(m_readFd, 1); - line += c; - if (c == "\n") { + StrList outl; + readBlocks(m_readFd, 1, outl); + string onechar = listString(outl); + line += onechar; + if (onechar == "\n") { if (line == "\n") { line=""; continue; } else break; } @@ -477,21 +486,35 @@ private: protected: friend class V3InFilter; // Read file contents and return it - bool readWholefile(const string& filename, string& out) { + bool readWholefile(const string& filename, StrList& outl) { FileContentsMap::iterator it = m_contentsMap.find(filename); if (it != m_contentsMap.end()) { - out = it->second; + outl.push_back(it->second); return true; } - if (!readContents(filename, out)) return false; - if (out.length() < INFILTER_CACHE_MAX) { + if (!readContents(filename, outl)) return false; + if (listSize(outl) < INFILTER_CACHE_MAX) { // Cache small files (only to save space) // It's quite common to `include "timescale" thousands of times // This isn't so important if it's just a open(), but filtering can be slow - m_contentsMap.insert(make_pair(filename,out)); + m_contentsMap.insert(make_pair(filename,listString(outl))); } return true; } + size_t listSize(StrList& sl) { + size_t out = 0; + for (StrList::iterator it=sl.begin(); it!=sl.end(); ++it) { + out += it->length(); + } + return out; + } + string listString(StrList& sl) { + string out; + for (StrList::iterator it=sl.begin(); it!=sl.end(); ++it) { + out += *it; + } + return out; + } // CONSTRUCTORS V3InFilterImp(const string& command) { m_readEof = false; @@ -512,9 +535,9 @@ protected: V3InFilter::V3InFilter(const string& command) { m_impp = new V3InFilterImp(command); } V3InFilter::~V3InFilter() { if (m_impp) delete m_impp; m_impp=NULL; } -bool V3InFilter::readWholefile(const string& filename, string& out) { +bool V3InFilter::readWholefile(const string& filename, V3InFilter::StrList& outl) { if (!m_impp) v3fatalSrc("readWholefile on invalid filter"); - return m_impp->readWholefile(filename, out); + return m_impp->readWholefile(filename, outl); } //###################################################################### diff --git a/src/V3File.h b/src/V3File.h index 169ecf020..ab3af576d 100644 --- a/src/V3File.h +++ b/src/V3File.h @@ -28,6 +28,7 @@ #include #include #include +#include #include //============================================================================ @@ -79,9 +80,12 @@ class V3InFilterImp; class V3InFilter { V3InFilterImp* m_impp; public: + // TYPES + typedef list StrList; + // METHODS // Read file contents and return it. Return true on success. - bool readWholefile(const string& filename, string& out); + bool readWholefile(const string& filename, StrList& outl); // CONSTRUCTORS V3InFilter(const string& command); diff --git a/src/V3ParseImp.cpp b/src/V3ParseImp.cpp index 574bf888a..967b76c2d 100644 --- a/src/V3ParseImp.cpp +++ b/src/V3ParseImp.cpp @@ -66,12 +66,12 @@ V3ParseImp::~V3ParseImp() { if (debug()>=9) { UINFO(0,"~V3ParseImp\n"); symp()->dump(cout, "-vpi: "); } } -int V3ParseImp::ppInputToLex(char* buf, int max_size) { - int got = 0; +size_t V3ParseImp::ppInputToLex(char* buf, size_t max_size) { + size_t got = 0; while (got < max_size // Haven't got enough && !m_ppBuffers.empty()) { // And something buffered string front = m_ppBuffers.front(); m_ppBuffers.pop_front(); - int len = front.length(); + size_t len = front.length(); if (len > (max_size-got)) { // Front string too big string remainder = front.substr(max_size-got); front = front.substr(0, max_size-got); diff --git a/src/V3ParseImp.h b/src/V3ParseImp.h index 8b7ddd8de..c78f3cd7b 100644 --- a/src/V3ParseImp.h +++ b/src/V3ParseImp.h @@ -260,7 +260,7 @@ public: static const char* tokenName(int tok); void ppPushText(const string& text) { m_ppBuffers.push_back(text); } - int ppInputToLex(char* buf, int max_size); + size_t ppInputToLex(char* buf, size_t max_size); static V3ParseImp* parsep() { return s_parsep; } @@ -278,7 +278,7 @@ public: m_stringps.push_back(strp); return strp; } - string* newString(const char* text, int length) { + string* newString(const char* text, size_t length) { string* strp = new string (text, length); m_stringps.push_back(strp); return strp; @@ -307,7 +307,7 @@ public: void statePushVlg(); // Parser -> lexer communication void statePop(); // Parser -> lexer communication int stateVerilogRecent(); // Parser -> lexer communication - int flexPpInputToLex(char* buf, int max_size) { return ppInputToLex(buf,max_size); } + size_t flexPpInputToLex(char* buf, size_t max_size) { return ppInputToLex(buf,max_size); } //==== Symbol tables V3ParseSym* symp() { return &m_sym; } diff --git a/src/V3PreLex.h b/src/V3PreLex.h index 7cfd21b98..be60293ee 100644 --- a/src/V3PreLex.h +++ b/src/V3PreLex.h @@ -27,6 +27,7 @@ #ifndef _VPREPROCLEX_H_ // Guard #define _VPREPROCLEX_H_ 1 +#include #include #include "V3Error.h" @@ -112,7 +113,7 @@ void yy_delete_buffer( YY_BUFFER_STATE b ); #define KEEPCMT_SUB 2 //====================================================================== -// Class entry for each per-lexter state +// Class entry for each per-lexer state class V3PreLex { public: // Used only by V3PreLex.cpp and V3PreProc.cpp @@ -120,6 +121,7 @@ class V3PreLex { // Parse state stack m_bufferStack; // Stack of inserted text above current point + deque m_buffers; ///< Buffer of characters to process // State to lexer static V3PreLex* s_currentLexp; // Current lexing point @@ -143,13 +145,15 @@ class V3PreLex { m_defCmtSlash = false; m_pslParenLevel = 0; m_pslMoreNeeded = false; + initFirstBuffer(); } ~V3PreLex() { while (!m_bufferStack.empty()) { yy_delete_buffer(m_bufferStack.top()); m_bufferStack.pop(); } } + void initFirstBuffer(); // Called by V3PreLex.l from lexer - void appendDefValue(const char* text, int len); + void appendDefValue(const char* text, size_t len); void lineDirective(const char* text); void incLineno() { m_curFilelinep->incLineno(); } // Called by V3PreProc.cpp to inform lexer @@ -157,10 +161,13 @@ class V3PreLex { void pushStateDefForm(); void pushStateDefValue(); void pushStateIncFilename(); - void scanBytes(const string& strg); + void scanBytes(const char* strp, size_t len); + void scanBytesBack(const string& str); + size_t inputToLex(char* buf, size_t max_size); /// Called by VPreproc.cpp to get data from lexer YY_BUFFER_STATE currentBuffer(); int currentStartState(); + void dumpSummary(); void dumpStack(); }; diff --git a/src/V3PreLex.l b/src/V3PreLex.l index 6793a72db..ef3fbaba8 100644 --- a/src/V3PreLex.l +++ b/src/V3PreLex.l @@ -33,6 +33,9 @@ V3PreLex* V3PreLex::s_currentLexp = NULL; // Current lexing point #define LEXP V3PreLex::s_currentLexp +#define YY_INPUT(buf,result,max_size) \ + result = LEXP->inputToLex(buf,max_size); + // Accessors, because flex keeps changing the type of yyleng char* yyourtext() { return yytext; } size_t yyourleng() { return yyleng; } @@ -44,7 +47,7 @@ static bool optPsl() { return V3PreProc::optPsl(); } static bool pedantic() { return LEXP->m_pedantic; } static void yyerror(char* msg) { LEXP->m_curFilelinep->v3error(msg); } static void yyerrorf(const char* msg) { LEXP->m_curFilelinep->v3error(msg); } -static void appendDefValue(const char* t,int l) { LEXP->appendDefValue(t,l); } +static void appendDefValue(const char* t, size_t l) { LEXP->appendDefValue(t,l); } static int pslParenLevel() { return LEXP->m_pslParenLevel; } static void pslParenLevelInc() { LEXP->m_pslParenLevel++; } static void pslParenLevelDec() { if (pslParenLevel()) LEXP->m_pslParenLevel--; } @@ -78,6 +81,7 @@ crnl [\r]*[\n] quote [\"] backslash [\\] symb ([a-zA-Z_][a-zA-Z0-9_$]*|\\[^ \t\f\r\n]+) +word [a-zA-Z0-9_]+ drop [\032] psl [p]sl @@ -113,6 +117,7 @@ psl [p]sl {quote} { yy_push_state(STRMODE); yymore(); } <> { linenoInc(); yyerrorf("EOF in unterminated string"); yyleng=0; yyterminate(); } {crnl} { linenoInc(); yyerrorf("Unterminated string"); BEGIN(INITIAL); } +{word} { yymore(); } [^\"\\] { yymore(); } {backslash}{crnl} { linenoInc(); yymore(); } {backslash}. { yymore(); } @@ -167,6 +172,7 @@ psl [p]sl [\\]{crnl} { linenoInc(); appendDefValue((char*)"\n",1); } /* Return, but not \ is part of define value */ [^\/\*\n\r\\]+ | [\\][^\n\r] | +{word} { appendDefValue(yytext,yyleng); } . { appendDefValue(yytext,yyleng); } /* Comments inside define values - if embedded get added to define value per spec */ @@ -177,6 +183,7 @@ psl [p]sl appendDefValue(yytext,yyleng-2); appendDefValue((char*)"\n",1); } /* Return but not \ */ {crnl} { linenoInc(); yymore(); if (LEXP->m_defCmtSlash) yyerrorf("One line of /* ... */ is missing \\ before newline"); BEGIN(CMTMODE); } +{word} { yymore(); } . { yymore(); } <> { yyerrorf("EOF in '/* ... */' block comment\n"); yyleng=0; yyterminate(); } @@ -241,6 +248,7 @@ psl [p]sl "*/" { yy_pop_state(); return(VP_COMMENT); } {crnl} { linenoInc(); yymore(); } <> { yyerrorf("EOF in '/* ... */' block comment\n"); yyleng=0; yyterminate(); } +{word} { yymore(); } . { BEGIN CMTMODE; yymore(); } /* Non 'psl' beginning in comment */ . { yymore(); } @@ -290,12 +298,57 @@ void V3PreLex::pushStateIncFilename() { yymore(); } -void V3PreLex::scanBytes(const string& strg) { - yy_scan_bytes(strg.c_str(), strg.length()); +void V3PreLex::initFirstBuffer() { + // Called from constructor to make first buffer + // yy_create_buffer also sets yy_fill_buffer=1 so reads from YY_INPUT + yy_switch_to_buffer(yy_create_buffer(NULL, YY_BUF_SIZE)); + m_bufferStack.push(currentBuffer()); + yyrestart(NULL); +} + +size_t V3PreLex::inputToLex(char* buf, size_t max_size) { + // We need a custom YY_INPUT because we can't use flex buffers. + // Flex buffers are limited to 2GB, and we can't chop into 2G pieces + // because buffers can't end in the middle of tokens. + // m_buffers only applies to the "base" buffer when there's no scanBytes outstanding + // It won't be called on scan_buffers as they don't have yy_fill_buffer set. + // + //if (debug()) { cout<<"- pp:inputToLex ITL s="<filename()); } +void V3PreLex::dumpSummary() { + cout<<"- pp::dumpSummary curBuf="<<(void*)(currentBuffer()) + <<" nBuf="< +#include #include #include #include #include #include #include +#include #include "V3Error.h" #include "V3Global.h" @@ -108,6 +110,7 @@ public: struct V3PreProcImp : public V3PreProc { // TYPES typedef std::map DefinesMap; + typedef V3InFilter::StrList StrList; // debug() -> see V3PreShellImp::debug @@ -165,7 +168,7 @@ private: bool commentTokenMatch(string& cmdr, const char* strg); string trimWhitespace(const string& strg, bool trailing); - void unputString(const string& strg, bool first=false); + void unputString(const string& strg); void parsingOn() { m_off--; @@ -414,18 +417,19 @@ const char* V3PreProcImp::tokenName(int tok) { } } -void V3PreProcImp::unputString(const string& strg, bool first) { +void V3PreProcImp::unputString(const string& strg) { + // Note: The preliminary call in ::openFile bypasses this function // We used to just m_lexp->unputString(strg.c_str()); // However this can lead to "flex scanner push-back overflow" // so instead we scan from a temporary buffer, then on EOF return. // This is also faster than the old scheme, amazingly. - if (!first) { // Else the initial creation + if (1) { if (m_lexp->m_bufferStack.empty() || m_lexp->m_bufferStack.top()!=m_lexp->currentBuffer()) { fileline()->v3fatalSrc("bufferStack missing current buffer; will return incorrectly"); // Hard to debug lost text as won't know till much later } } - m_lexp->scanBytes(strg); + m_lexp->scanBytes(strg.c_str(), strg.length()); } string V3PreProcImp::trimWhitespace(const string& strg, bool trailing) { @@ -600,22 +604,14 @@ void V3PreProcImp::openFile(FileLine* fl, V3InFilter* filterp, const string& fil V3File::addSrcDepend(filename); - string wholefile; + // Read a list with the whole file. + StrList wholefile; bool ok = filterp->readWholefile(filename, wholefile/*ref*/); if (!ok) { fileline()->v3error("File not found: "+filename+"\n"); return; } - // Filter all DOS CR's en-mass. This avoids bugs with lexing CRs in the wrong places. - // This will also strip them from strings, but strings aren't supposed to be multi-line without a "\" - string wholefilecr; - size_t wholesize = wholefile.length(); - for (size_t i=0; i4)?1:0; - unputString(wholefilecr,true); + + // Filter all DOS CR's en-mass. This avoids bugs with lexing CRs in the wrong places. + // This will also strip them from strings, but strings aren't supposed to be multi-line without a "\" + for (StrList::iterator it=wholefile.begin(); it!=wholefile.end(); ++it) { + // We don't test for \0 as we allow and strip mid-string '\0's (for now). + // We also edit in place. This is nasty to other users of the string, but + // there aren't any, and it avoids needing 2x the memory on very large files. + const char* sp = it->data(); + const char* ep = sp + it->length(); + char* cp = (char*) sp; + for (; spdata(); + // Truncate old string + it->erase(len); + + // Push the data to an internal buffer. + m_lexp->scanBytesBack(*it); + // Reclaim memory; the push saved the string contents for us + *it = ""; + } } void V3PreProcImp::insertUnreadbackAtBol(const string& text) { @@ -771,7 +790,7 @@ int V3PreProcImp::getToken() { // We're off or processed the comment specially. If there are newlines // in it, we also return the newlines as TEXT so that the linenumber // count is maintained for downstream tools - for (int len=0; len #include +#include #include class V3InFilter; diff --git a/src/verilog.l b/src/verilog.l index e0f8558ba..26aa50531 100644 --- a/src/verilog.l +++ b/src/verilog.l @@ -144,6 +144,7 @@ crnl [\r]*[\n] id [a-zA-Z_][a-zA-Z0-9_$]* /* escaped identifier */ escid \\[^ \t\f\r\n]+ +word [a-zA-Z0-9_]+ %% @@ -832,12 +833,14 @@ escid \\[^ \t\f\r\n]+ \" { yy_pop_state(); FL; yylval.strp = PARSEP->newString(yytext+1,yyleng-2); return yaSTRING; } +{word} { yymore(); } . { yymore(); } /************************************************************************/ /* Attributes */ {crnl} { yymore(); NEXTLINE(); } "*)" { yy_pop_state(); } +{word} { yymore(); } . { yymore(); } <> { yyerrorf("EOF in (*"); yyleng = 0; yy_pop_state(); }