Speed up BCPL comment lexing by looking aggressively for newlines and then scanning backwards to see if the newline is escaped.

3% speedup in preprocessing all of clang with -Eonly. Also includes a small testcase for coverage.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@139116 91177308-0d34-0410-b5e6-96231b3b80d8
author: Benjamin Kramer <benny.kra@googlemail.com> 2011-09-05 07:19:39 +0000
committer: Benjamin Kramer <benny.kra@googlemail.com> 2011-09-05 07:19:39 +0000
commit: 1daa58ea8973854b62e79f0939fa37cb6fed00e6 (patch)
tree: 61ad3d6044f6fe6a13e4431903f9ede0f7a440f4 /lib/Lex/Lexer.cpp
parent: 5d6ae288bc661572e52ea71fc14b249eb5e2d9e9 (diff)
1 files changed, 24 insertions, 9 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index a635338494..26996027d8 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -1635,20 +1635,28 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
   char C;
   do {
     C = *CurPtr;
-    // FIXME: Speedup BCPL comment lexing.  Just scan for a \n or \r character.
-    // If we find a \n character, scan backwards, checking to see if it's an
-    // escaped newline, like we do for block comments.
-
     // Skip over characters in the fast loop.
     while (C != 0 &&                // Potentially EOF.
-           C != '\\' &&             // Potentially escaped newline.
-           C != '?' &&              // Potentially trigraph.
            C != '\n' && C != '\r')  // Newline or DOS-style newline.
       C = *++CurPtr;
 
-    // If this is a newline, we're done.
-    if (C == '\n' || C == '\r')
-      break;  // Found the newline? Break out!
+    const char *NextLine = CurPtr;
+    if (C != 0) {
+      // We found a newline, see if it's escaped.
+      const char *EscapePtr = CurPtr-1;
+      while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace.
+        --EscapePtr;
+
+      if (*EscapePtr == '\\') // Escaped newline.
+        CurPtr = EscapePtr;
+      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
+               EscapePtr[-2] == '?') // Trigraph-escaped newline.
+        CurPtr = EscapePtr-2;
+      else
+        break; // This is a newline, we're done.
+
+      C = *CurPtr;
+    }
 
     // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
     // properly decode the character.  Read it in raw mode to avoid emitting
@@ -1660,6 +1668,13 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
     C = getAndAdvanceChar(CurPtr, Result);
     LexingRawMode = OldRawMode;
 
+    // If we read only one character, then no special handling is needed.
+    // We're done and can skip forward to the newline.
+    if (C != 0 && CurPtr == OldPtr+1) {
+      CurPtr = NextLine;
+      break;
+    }
+
     // If the char that we finally got was a \n, then we must have had something
     // like \<newline><newline>.  We don't want to have consumed the second
     // newline, we want CurPtr, to end up pointing to it down below.
author	Benjamin Kramer <benny.kra@googlemail.com>	2011-09-05 07:19:39 +0000
committer	Benjamin Kramer <benny.kra@googlemail.com>	2011-09-05 07:19:39 +0000
commit	1daa58ea8973854b62e79f0939fa37cb6fed00e6 (patch)
tree	61ad3d6044f6fe6a13e4431903f9ede0f7a440f4 /lib/Lex/Lexer.cpp
parent	5d6ae288bc661572e52ea71fc14b249eb5e2d9e9 (diff)