[ELF] ScriptLexer: generate tokens lazily

The current tokenize-whole-file approach has a few limitations.

* Lack of state information: `maybeSplitExpr` is needed to parse
  expressions. It's infeasible to add new states to behave more like GNU
  ld.
* `readInclude` may insert tokens in the middle of the token list, leading
  to quadratic time complexity with N nested `INCLUDE` commands.
* Line/column information for diagnostics is inaccurate, especially
  after an `INCLUDE`.
* `getLineNumber` cannot be made more efficient without significant code
  complexity and memory consumption. https://reviews.llvm.org/D104137

The patch switches to a traditional lexer that generates tokens lazily.

* `atEOF` behavior is modified: we need to call `peek` to determine EOF.
* `peek` and `next` cannot call `setError` upon `atEOF`.
* Since `consume` no longer reports an error upon `atEOF`, the idiom `while (!errorCount() && !consume(")"))`
  would cause an infinite loop. Use `while (peek() != ")" && !atEOF()) { ... } expect(")")` instead
  (see the sketch after this list).
* An include stack is introduced to handle `readInclude`. This can be
  utilized to address llvm#93947 properly.
* `tokens` and `pos` are removed.
* `commandString` is reimplemented. Since it is used in -Map output,
  `\n` needs to be replaced with a space.
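
The following is a minimal, self-contained sketch of the idiom change. `ToyLexer` and `readParenList` are hypothetical stand-ins used only for illustration, not the lld classes; only the `peek`/`next`/`atEOF`/`consume`/`expect` surface mirrors the API described above.

```cpp
#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Toy stand-in for the lexer, just enough to contrast the two idioms.
class ToyLexer {
  std::vector<std::string> toks;
  size_t i = 0;
  bool err = false;

public:
  explicit ToyLexer(std::vector<std::string> t) : toks(std::move(t)) {}
  bool atEOF() const { return err || i == toks.size(); }
  std::string peek() const { return atEOF() ? "" : toks[i]; }
  std::string next() { return atEOF() ? "" : toks[i++]; }
  bool consume(const std::string &t) {
    if (peek() != t)
      return false;
    next();
    return true;
  }
  void expect(const std::string &t) {
    if (!consume(t))
      err = true; // the real lexer reports a located diagnostic here
  }
  bool hadError() const { return err; }
};

// New idiom: stop on the closing token or on EOF, then let expect() diagnose
// a missing ")". The old `while (!errorCount() && !consume(")"))` relied on
// consume() erroring at EOF; with a silent-at-EOF consume() it would loop
// forever on unterminated input.
std::vector<std::string> readParenList(ToyLexer &lex) {
  std::vector<std::string> list;
  lex.expect("(");
  while (lex.peek() != ")" && !lex.atEOF())
    list.push_back(lex.next());
  lex.expect(")");
  return list;
}

int main() {
  ToyLexer ok({"(", "a", "b", ")"});
  assert(readParenList(ok).size() == 2 && !ok.hadError());

  ToyLexer bad({"(", "a"}); // unterminated list: loop stops at EOF, then error
  readParenList(bad);
  assert(bad.hadError());
}
```

The point is that loop termination no longer depends on `consume` raising an error at EOF; the loop stops on EOF itself and `expect` produces the diagnostic.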

Pull Request: llvm#100493
MaskRay committed Jul 26, 2024
1 parent b33ef5b commit 1978c21
Showing 7 changed files with 163 additions and 170 deletions.
196 changes: 72 additions & 124 deletions lld/ELF/ScriptLexer.cpp
@@ -36,24 +36,24 @@ using namespace llvm;
 using namespace lld;
 using namespace lld::elf;
 
+ScriptLexer::ScriptLexer(MemoryBufferRef mb) : curBuf(mb), mbs(1, mb) {}
+
 // Returns a whole line containing the current token.
 StringRef ScriptLexer::getLine() {
   StringRef s = getCurrentMB().getBuffer();
-  StringRef tok = tokens[pos - 1];
 
-  size_t pos = s.rfind('\n', tok.data() - s.data());
+  size_t pos = s.rfind('\n', prevTok.data() - s.data());
   if (pos != StringRef::npos)
     s = s.substr(pos + 1);
   return s.substr(0, s.find_first_of("\r\n"));
 }
 
 // Returns 1-based line number of the current token.
 size_t ScriptLexer::getLineNumber() {
-  if (pos == 0)
+  if (prevTok.empty())
     return 1;
   StringRef s = getCurrentMB().getBuffer();
-  StringRef tok = tokens[pos - 1];
-  const size_t tokOffset = tok.data() - s.data();
+  const size_t tokOffset = prevTok.data() - s.data();
 
   // For the first token, or when going backwards, start from the beginning of
   // the buffer. If this token is after the previous token, start from the
@@ -76,40 +76,41 @@ size_t ScriptLexer::getLineNumber() {
 
 // Returns 0-based column number of the current token.
 size_t ScriptLexer::getColumnNumber() {
-  StringRef tok = tokens[pos - 1];
-  return tok.data() - getLine().data();
+  return prevTok.data() - getLine().data();
 }
 
 std::string ScriptLexer::getCurrentLocation() {
   std::string filename = std::string(getCurrentMB().getBufferIdentifier());
   return (filename + ":" + Twine(getLineNumber())).str();
 }
 
-ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }
-
 // We don't want to record cascading errors. Keep only the first one.
 void ScriptLexer::setError(const Twine &msg) {
   if (errorCount())
     return;
 
   std::string s = (getCurrentLocation() + ": " + msg).str();
-  if (pos)
+  if (prevTok.size())
     s += "\n>>> " + getLine().str() + "\n>>> " +
          std::string(getColumnNumber(), ' ') + "^";
   error(s);
 }
 
-// Split S into linker script tokens.
-void ScriptLexer::tokenize(MemoryBufferRef mb) {
-  std::vector<StringRef> vec;
-  mbs.push_back(mb);
-  StringRef s = mb.getBuffer();
-  StringRef begin = s;
-
+void ScriptLexer::lex() {
   for (;;) {
+    StringRef &s = curBuf.s;
     s = skipSpace(s);
-    if (s.empty())
-      break;
+    if (s.empty()) {
+      // If this buffer is from an INCLUDE command, switch to the "return
+      // value"; otherwise, mark EOF.
+      if (buffers.empty()) {
+        eof = true;
+        return;
+      }
+      curBuf = buffers.pop_back_val();
+      continue;
+    }
+    curTokState = inExpr;
 
     // Quoted token. Note that double-quote characters are parts of a token
     // because, in a glob match context, only unquoted tokens are interpreted
@@ -118,45 +119,53 @@ void ScriptLexer::tokenize(MemoryBufferRef mb) {
     if (s.starts_with("\"")) {
       size_t e = s.find("\"", 1);
       if (e == StringRef::npos) {
-        StringRef filename = mb.getBufferIdentifier();
-        size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
-        error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
+        size_t lineno =
+            StringRef(curBuf.begin, s.data() - curBuf.begin).count('\n');
+        error(curBuf.filename + ":" + Twine(lineno + 1) + ": unclosed quote");
         return;
       }
-
-      vec.push_back(s.take_front(e + 1));
+      curTok = s.take_front(e + 1);
       s = s.substr(e + 1);
-      continue;
+      return;
     }
 
     // Some operators form separate tokens.
     if (s.starts_with("<<=") || s.starts_with(">>=")) {
-      vec.push_back(s.substr(0, 3));
+      curTok = s.substr(0, 3);
       s = s.substr(3);
-      continue;
+      return;
     }
-    if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) ||
-                         (s[0] == s[1] && strchr("<>&|", s[0])))) {
-      vec.push_back(s.substr(0, 2));
+    if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) {
+      curTok = s.substr(0, 2);
       s = s.substr(2);
-      continue;
+      return;
     }
 
-    // Unquoted token. This is more relaxed than tokens in C-like language,
-    // so that you can write "file-name.cpp" as one bare token, for example.
-    size_t pos = s.find_first_not_of(
-        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
-        "0123456789_.$/\\~=+[]*?-!^:");
+    // Unquoted token. The non-expression token is more relaxed than tokens in
+    // C-like languages, so that you can write "file-name.cpp" as one bare
+    // token.
+    size_t pos;
+    if (inExpr) {
+      pos = s.find_first_not_of(
+          "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+          "0123456789_.$");
+      if (pos == 0 && s.size() >= 2 &&
+          ((s[0] == s[1] && strchr("<>&|", s[0])) ||
+           is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2))))
+        pos = 2;
+    } else {
+      pos = s.find_first_not_of(
+          "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+          "0123456789_.$/\\~=+[]*?-!^:");
+    }
-
     // A character that cannot start a word (which is usually a
     // punctuation) forms a single character token.
     if (pos == 0)
       pos = 1;
-    vec.push_back(s.substr(0, pos));
+    curTok = s.substr(0, pos);
     s = s.substr(pos);
+    break;
   }
-
-  tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
 }
 
 // Skip leading whitespace characters or comments.
@@ -185,93 +194,30 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
   }
 }
 
-// An erroneous token is handled as if it were the last token before EOF.
-bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
-
-// Split a given string as an expression.
-// This function returns "3", "*" and "5" for "3*5" for example.
-static std::vector<StringRef> tokenizeExpr(StringRef s) {
-  StringRef ops = "!~*/+-<>?^:="; // List of operators
-
-  // Quoted strings are literal strings, so we don't want to split it.
-  if (s.starts_with("\""))
-    return {s};
-
-  // Split S with operators as separators.
-  std::vector<StringRef> ret;
-  while (!s.empty()) {
-    size_t e = s.find_first_of(ops);
-
-    // No need to split if there is no operator.
-    if (e == StringRef::npos) {
-      ret.push_back(s);
-      break;
-    }
-
-    // Get a token before the operator.
-    if (e != 0)
-      ret.push_back(s.substr(0, e));
-
-    // Get the operator as a token.
-    // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
-    if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
-        s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
-        s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
-      ret.push_back(s.substr(e, 2));
-      s = s.substr(e + 2);
-    } else {
-      ret.push_back(s.substr(e, 1));
-      s = s.substr(e + 1);
-    }
-  }
-  return ret;
-}
-
-// In contexts where expressions are expected, the lexer should apply
-// different tokenization rules than the default one. By default,
-// arithmetic operator characters are regular characters, but in the
-// expression context, they should be independent tokens.
-//
-// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
-// in the expression context.
-//
-// This function may split the current token into multiple tokens.
-void ScriptLexer::maybeSplitExpr() {
-  if (!inExpr || errorCount() || atEOF())
-    return;
-
-  std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
-  if (v.size() == 1)
-    return;
-  tokens.erase(tokens.begin() + pos);
-  tokens.insert(tokens.begin() + pos, v.begin(), v.end());
-}
+// Used to determine whether to stop parsing. Treat errors like EOF.
+bool ScriptLexer::atEOF() { return eof || errorCount(); }
 
 StringRef ScriptLexer::next() {
-  maybeSplitExpr();
-
-  if (errorCount())
-    return "";
-  if (atEOF()) {
-    setError("unexpected EOF");
-    return "";
-  }
-  return tokens[pos++];
+  prevTok = peek();
+  return std::exchange(curTok, StringRef(curBuf.s.data(), 0));
 }
 
 StringRef ScriptLexer::peek() {
-  StringRef tok = next();
-  if (errorCount())
-    return "";
-  pos = pos - 1;
-  return tok;
+  // curTok is invalid if curTokState and inExpr mismatch.
+  if (curTok.size() && curTokState != inExpr) {
+    curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data());
+    curTok = {};
+  }
+  if (curTok.empty())
+    lex();
+  return curTok;
 }
 
 bool ScriptLexer::consume(StringRef tok) {
-  if (next() == tok)
-    return true;
-  --pos;
-  return false;
+  if (peek() != tok)
+    return false;
+  next();
+  return true;
 }
 
 void ScriptLexer::skip() { (void)next(); }
@@ -280,8 +226,12 @@ void ScriptLexer::expect(StringRef expect) {
   if (errorCount())
     return;
   StringRef tok = next();
-  if (tok != expect)
-    setError(expect + " expected, but got " + tok);
+  if (tok != expect) {
+    if (atEOF())
+      setError("unexpected EOF");
+    else
+      setError(expect + " expected, but got " + tok);
+  }
 }
 
 // Returns true if S encloses T.
@@ -292,10 +242,8 @@ static bool encloses(StringRef s, StringRef t) {
 MemoryBufferRef ScriptLexer::getCurrentMB() {
   // Find input buffer containing the current token.
   assert(!mbs.empty());
-  if (pos == 0)
-    return mbs.back();
   for (MemoryBufferRef mb : mbs)
-    if (encloses(mb.getBuffer(), tokens[pos - 1]))
+    if (encloses(mb.getBuffer(), curBuf.s))
       return mb;
   llvm_unreachable("getCurrentMB: failed to find a token");
 }
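
To make the `buffers`/`curBuf` handling in `lex()` above concrete, here is a toy model of the include stack. `ToyIncludeLexer` and `pushInclude` are hypothetical names standing in for what `readInclude` would do; this is a simplification for exposition, not the lld code.

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string_view>
#include <vector>

// Toy model of the include stack: when the current buffer runs out, pop the
// enclosing buffer (the "return value" of an INCLUDE) and keep lexing; only
// when the stack is empty do we flag EOF.
struct ToyIncludeLexer {
  struct Buffer { std::string_view s; };
  Buffer curBuf;
  std::vector<Buffer> buffers; // enclosing buffers, innermost last
  bool eof = false;

  // Called where readInclude would be: switch to the included text and
  // remember where to resume in the including file.
  void pushInclude(std::string_view included) {
    buffers.push_back(curBuf);
    curBuf = Buffer{included};
  }

  // One whitespace-separated token, or "" at EOF.
  std::string_view lexToken() {
    for (;;) {
      std::string_view &s = curBuf.s;
      while (!s.empty() && s.front() == ' ')
        s.remove_prefix(1);
      if (s.empty()) {
        if (buffers.empty()) {
          eof = true;
          return {};
        }
        curBuf = buffers.back(); // resume the including file
        buffers.pop_back();
        continue;
      }
      std::size_t n = std::min(s.find(' '), s.size());
      std::string_view tok = s.substr(0, n);
      s.remove_prefix(n);
      return tok;
    }
  }
};

int main() {
  ToyIncludeLexer lex{{"outer1 outer2"}};
  assert(lex.lexToken() == "outer1");
  lex.pushInclude("inner1 inner2"); // as if an INCLUDE was just parsed
  assert(lex.lexToken() == "inner1");
  assert(lex.lexToken() == "inner2");
  assert(lex.lexToken() == "outer2"); // resumes after the INCLUDE
  assert(lex.lexToken().empty() && lex.eof);
}
```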
30 changes: 26 additions & 4 deletions lld/ELF/ScriptLexer.h
@@ -10,18 +10,43 @@
 #define LLD_ELF_SCRIPT_LEXER_H
 
 #include "lld/Common/LLVM.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/MemoryBufferRef.h"
 #include <vector>
 
 namespace lld::elf {
 
 class ScriptLexer {
+protected:
+  struct Buffer {
+    // The remaining content to parse and the filename.
+    StringRef s, filename;
+    const char *begin = nullptr;
+    Buffer() = default;
+    Buffer(MemoryBufferRef mb)
+        : s(mb.getBuffer()), filename(mb.getBufferIdentifier()),
+          begin(mb.getBufferStart()) {}
+  };
+  // The current buffer and parent buffers due to INCLUDE.
+  Buffer curBuf;
+  SmallVector<Buffer, 0> buffers;
+
+  // The token before the last next().
+  StringRef prevTok;
+  // Rules for what is a token are different when we are in an expression.
+  // curTok holds the cached return value of peek() and is invalid when the
+  // expression state changes.
+  StringRef curTok;
+  // The inExpr state when curTok is cached.
+  bool curTokState = false;
+  bool eof = false;
+
 public:
   explicit ScriptLexer(MemoryBufferRef mb);
 
   void setError(const Twine &msg);
-  void tokenize(MemoryBufferRef mb);
+  void lex();
   StringRef skipSpace(StringRef s);
   bool atEOF();
   StringRef next();
@@ -33,15 +58,12 @@ class ScriptLexer {
   MemoryBufferRef getCurrentMB();
 
   std::vector<MemoryBufferRef> mbs;
-  std::vector<StringRef> tokens;
   bool inExpr = false;
-  size_t pos = 0;
 
   size_t lastLineNumber = 0;
   size_t lastLineNumberOffset = 0;
 
 private:
-  void maybeSplitExpr();
   StringRef getLine();
   size_t getLineNumber();
   size_t getColumnNumber();
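
The `curTok`/`curTokState` members above implement a one-token lookahead cache that is invalidated when `inExpr` changes, which is what makes `maybeSplitExpr` unnecessary. Below is a toy model of that contract, reusing the classic "foo*3" example from the old comments. `LazyLexer` and its simplified tokenization rules are hypothetical, for exposition only, not the lld implementation.

```cpp
#include <cassert>
#include <cctype>
#include <cstddef>
#include <string_view>

// Toy model of the lazy-lexing contract: peek() produces and caches at most
// one token; next() consumes it; changing `inExpr` invalidates the cache so
// the same input is re-lexed under the new rules.
class LazyLexer {
  std::string_view buf;     // remaining input (curBuf.s in the real lexer)
  std::string_view curTok;  // cached token, empty if none
  bool curTokState = false; // inExpr value when curTok was lexed

public:
  bool inExpr = false;
  explicit LazyLexer(std::string_view s) : buf(s) {}

  bool atEOF() { return peek().empty(); }

  std::string_view peek() {
    if (!curTok.empty() && curTokState != inExpr) {
      // Push the cached token back and re-lex it under the new rules.
      buf = std::string_view(curTok.data(),
                             buf.data() + buf.size() - curTok.data());
      curTok = {};
    }
    if (curTok.empty())
      lex();
    return curTok;
  }

  std::string_view next() {
    std::string_view tok = peek();
    curTok = {};
    return tok;
  }

private:
  void lex() {
    while (!buf.empty() && std::isspace((unsigned char)buf.front()))
      buf.remove_prefix(1);
    if (buf.empty())
      return; // EOF: curTok stays empty
    std::size_t n = 1;
    if (inExpr) {
      // Expression mode: group alphanumerics; operators are single tokens.
      if (std::isalnum((unsigned char)buf.front()))
        while (n < buf.size() && std::isalnum((unsigned char)buf[n]))
          ++n;
    } else {
      // File-name mode: '*' and friends are part of a bare token.
      while (n < buf.size() && !std::isspace((unsigned char)buf[n]))
        ++n;
    }
    curTokState = inExpr;
    curTok = buf.substr(0, n);
    buf.remove_prefix(n);
  }
};

int main() {
  LazyLexer lex("foo*3");
  assert(lex.peek() == "foo*3"); // lexed as one bare token
  lex.inExpr = true;             // state change invalidates the cache
  assert(lex.next() == "foo");   // re-lexed under expression rules
  assert(lex.next() == "*");
  assert(lex.next() == "3");
  assert(lex.atEOF());
}
```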