Add support for C++0x raw string literals.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@137298 91177308-0d34-0410-b5e6-96231b3b80d8
author: Craig Topper <craig.topper@gmail.com> 2011-08-11 04:06:15 +0000
committer: Craig Topper <craig.topper@gmail.com> 2011-08-11 04:06:15 +0000
commit: 2fa4e86b4fdada3b9ecbbbd99965b83ed879f69b (patch)
tree: a57887c9c34ac5314596b5a98d0850a58c3aa295 /lib/Lex/Lexer.cpp
parent: 777d6e56ad9b1fed9866daf3ee6486d85c5b7d32 (diff)
1 files changed, 156 insertions, 19 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 0664cbc21b..0c32c8d9ba 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include <cctype>
+#include <cstring>
 using namespace clang;
 
 static void InitCharacterInfo();
@@ -760,7 +761,8 @@ enum {
   CHAR_LETTER   = 0x04,  // a-z,A-Z
   CHAR_NUMBER   = 0x08,  // 0-9
   CHAR_UNDER    = 0x10,  // _
-  CHAR_PERIOD   = 0x20   // .
+  CHAR_PERIOD   = 0x20,  // .
+  CHAR_RAWDEL   = 0x40   // {}[]#<>%:;?*+-/^&|~!=,"'
 };
 
 // Statically initialize CharInfo table based on ASCII character set
@@ -785,20 +787,20 @@ static const unsigned char CharInfo[256] =
    0           , 0           , 0           , 0           ,
 //32 SP         33  !         34  "         35  #
 //36  $         37  %         38  &         39  '
-   CHAR_HORZ_WS, 0           , 0           , 0           ,
-   0           , 0           , 0           , 0           ,
+   CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
 //40  (         41  )         42  *         43  +
 //44  ,         45  -         46  .         47  /
-   0           , 0           , 0           , 0           ,
-   0           , 0           , CHAR_PERIOD , 0           ,
+   0           , 0           , CHAR_RAWDEL , CHAR_RAWDEL ,
+   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
 //48  0         49  1         50  2         51  3
 //52  4         53  5         54  6         55  7
    CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
    CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
 //56  8         57  9         58  :         59  ;
 //60  <         61  =         62  >         63  ?
-   CHAR_NUMBER , CHAR_NUMBER , 0           , 0           ,
-   0           , 0           , 0           , 0           ,
+   CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
+   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
 //64  @         65  A         66  B         67  C
 //68  D         69  E         70  F         71  G
    0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
@@ -813,8 +815,8 @@ static const unsigned char CharInfo[256] =
    CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
 //88  X         89  Y         90  Z         91  [
 //92  \         93  ]         94  ^         95  _
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0           ,
-   0           , 0           , 0           , CHAR_UNDER  ,
+   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
+   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER  ,
 //96  `         97  a         98  b         99  c
 //100  d       101  e        102  f        103  g
    0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
@@ -828,9 +830,9 @@ static const unsigned char CharInfo[256] =
    CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
    CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
 //120  x       121  y        122  z        123  {
-//124  |        125  }        126  ~        127 DEL
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0           ,
-   0           , 0           , 0           , 0
+//124  |       125  }        126  ~        127 DEL
+   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
+   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
 };
 
 static void InitCharacterInfo() {
@@ -888,6 +890,14 @@ static inline bool isNumberBody(unsigned char c) {
     true : false;
 }
 
+/// isRawStringDelimBody - Return true if this is the body character of a
+/// raw string delimiter.
+static inline bool isRawStringDelimBody(unsigned char c) {
+  return (CharInfo[c] &
+          (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
+    true : false;
+}
+
 
 //===----------------------------------------------------------------------===//
 // Diagnostics forwarding code.
@@ -1363,6 +1373,78 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
   Result.setLiteralData(TokStart);
 }
 
+/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
+/// having lexed R", LR", u8R", uR", or UR".
+void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
+                                tok::TokenKind Kind) {
+  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
+  //  Between the initial and final double quote characters of the raw string,
+  //  any transformations performed in phases 1 and 2 (trigraphs,
+  //  universal-character-names, and line splicing) are reverted.
+
+  unsigned PrefixLen = 0;
+
+  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
+    ++PrefixLen;
+
+  // If the last character was not a '(', then we didn't lex a valid delimiter.
+  if (CurPtr[PrefixLen] != '(') {
+    if (!isLexingRawMode()) {
+      const char *PrefixEnd = &CurPtr[PrefixLen];
+      if (PrefixLen == 16) {
+        Diag(PrefixEnd, diag::err_raw_delim_too_long);
+      } else {
+        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
+          << StringRef(PrefixEnd, 1);
+      }
+    }
+
+    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
+    // it's possible the '"' was intended to be part of the raw string, but
+    // there's not much we can do about that.
+    while (1) {
+      char C = *CurPtr++;
+
+      if (C == '"')
+        break;
+      if (C == 0 && CurPtr-1 == BufferEnd) {
+        --CurPtr;
+        break;
+      }
+    }
+
+    FormTokenWithChars(Result, CurPtr, tok::unknown);
+    return;
+  }
+
+  // Save prefix and move CurPtr past it
+  const char *Prefix = CurPtr;
+  CurPtr += PrefixLen + 1; // skip over prefix and '('
+
+  while (1) {
+    char C = *CurPtr++;
+
+    if (C == ')') {
+      // Check for prefix match and closing quote.
+      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
+        CurPtr += PrefixLen + 1; // skip over prefix and '"'
+        break;
+      }
+    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
+      if (!isLexingRawMode())
+        Diag(BufferPtr, diag::err_unterminated_raw_string)
+          << StringRef(Prefix, PrefixLen);
+      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
+      return;
+    }
+  }
+
+  // Update the location of token as well as BufferPtr.
+  const char *TokStart = BufferPtr;
+  FormTokenWithChars(Result, CurPtr, Kind);
+  Result.setLiteralData(TokStart);
+}
+
 /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
 /// after having lexed the '<' character.  This is used for #include filenames.
 void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
@@ -2262,12 +2344,36 @@ LexNextToken:
         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_char_constant);
 
-      // UTF-8 string literal
-      if (Char == '8' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
-        return LexStringLiteral(Result,
-                              ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
-                                          SizeTmp2, Result),
-                              tok::utf8_string_literal);
+      // UTF-16 raw string literal
+      if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+        return LexRawStringLiteral(Result,
+                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                           SizeTmp2, Result),
+                               tok::utf16_string_literal);
+
+      if (Char == '8') {
+        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
+
+        // UTF-8 string literal
+        if (Char2 == '"')
+          return LexStringLiteral(Result,
+                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                           SizeTmp2, Result),
+                               tok::utf8_string_literal);
+
+        if (Char2 == 'R') {
+          unsigned SizeTmp3;
+          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
+          // UTF-8 raw string literal
+          if (Char3 == '"') {
+            return LexRawStringLiteral(Result,
+                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                           SizeTmp2, Result),
+                               SizeTmp3, Result),
+                   tok::utf8_string_literal);
+          }
+        }
+      }
     }
 
     // treat u like the start of an identifier.
@@ -2289,11 +2395,34 @@ LexNextToken:
       if (Char == '\'')
         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_char_constant);
+
+      // UTF-32 raw string literal
+      if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+        return LexRawStringLiteral(Result,
+                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                           SizeTmp2, Result),
+                               tok::utf32_string_literal);
     }
 
     // treat U like the start of an identifier.
     return LexIdentifier(Result, CurPtr);
 
+  case 'R': // Identifier or C++0x raw string literal
+    // Notify MIOpt that we read a non-whitespace/non-comment token.
+    MIOpt.ReadToken();
+
+    if (Features.CPlusPlus0x) {
+      Char = getCharAndSize(CurPtr, SizeTmp);
+
+      if (Char == '"')
+        return LexRawStringLiteral(Result,
+                                   ConsumeChar(CurPtr, SizeTmp, Result),
+                                   tok::string_literal);
+    }
+
+    // treat R like the start of an identifier.
+    return LexIdentifier(Result, CurPtr);
+
   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
     // Notify MIOpt that we read a non-whitespace/non-comment token.
     MIOpt.ReadToken();
@@ -2304,6 +2433,14 @@ LexNextToken:
       return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::wide_string_literal);
 
+    // Wide raw string literal.
+    if (Features.CPlusPlus0x && Char == 'R' &&
+        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+      return LexRawStringLiteral(Result,
+                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                           SizeTmp2, Result),
+                               tok::wide_string_literal);
+
     // Wide character constant.
     if (Char == '\'')
       return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
@@ -2313,7 +2450,7 @@ LexNextToken:
   // C99 6.4.2: Identifiers.
   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
   case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
-  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T':    /*'U'*/
+  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
   case 'V': case 'W': case 'X': case 'Y': case 'Z':
   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
author	Craig Topper <craig.topper@gmail.com>	2011-08-11 04:06:15 +0000
committer	Craig Topper <craig.topper@gmail.com>	2011-08-11 04:06:15 +0000
commit	2fa4e86b4fdada3b9ecbbbd99965b83ed879f69b (patch)
tree	a57887c9c34ac5314596b5a98d0850a58c3aa295 /lib/Lex/Lexer.cpp
parent	777d6e56ad9b1fed9866daf3ee6486d85c5b7d32 (diff)