Handle universal character names and Unicode characters outside of literals.

This is a missing piece for C99 conformance. This patch handles UCNs by adding a '\\' case to LexTokenInternal and LexIdentifier -- if we see a backslash, we tentatively try to read in a UCN. If the UCN is not syntactically well-formed, we fall back to the old treatment: a backslash followed by an identifier beginning with 'u' (or 'U'). Because the spelling of an identifier with UCNs still has the UCN in it, we need to convert that to UTF-8 in Preprocessor::LookUpIdentifierInfo. Of course, valid code that does *not* use UCNs will see only a very minimal performance hit (checks after each identifier for non-ASCII characters, checks when converting raw_identifiers to identifiers that they do not contain UCNs, and checks when getting the spelling of an identifier that it does not contain a UCN). This patch also adds basic support for actual UTF-8 in the source. This is treated almost exactly the same as UCNs except that we consider stray Unicode characters to be mistakes and offer a fixit to remove them. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@173369 91177308-0d34-0410-b5e6-96231b3b80d8
author: Jordan Rose <jordan_rose@apple.com> 2013-01-24 20:50:46 +0000
committer: Jordan Rose <jordan_rose@apple.com> 2013-01-24 20:50:46 +0000
commit: c7629d941557f7179eb8fa8a2e2a74d749cbaf7c (patch)
tree: 5d1833c5ca29ada22f679ebee8488273fcaf9777 /lib/Lex/Preprocessor.cpp
parent: 5209e2bc4d18e679dcacfd6f6a0120aa1d4a757f (diff)
1 files changed, 56 insertions, 4 deletions
diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp
index c01019cf43..b933a5fd75 100644
--- a/lib/Lex/Preprocessor.cpp
+++ b/lib/Lex/Preprocessor.cpp
@@ -27,6 +27,7 @@
 
 #include "clang/Lex/Preprocessor.h"
 #include "MacroArgs.h"
+#include "clang/Basic/ConvertUTF.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
@@ -43,6 +44,8 @@
 #include "clang/Lex/ScratchBuffer.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Capacity.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
@@ -396,7 +399,7 @@ StringRef Preprocessor::getSpelling(const Token &Tok,
                                           SmallVectorImpl<char> &Buffer,
                                           bool *Invalid) const {
   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
-  if (Tok.isNot(tok::raw_identifier)) {
+  if (Tok.isNot(tok::raw_identifier) && !Tok.hasUCN()) {
     // Try the fast path.
     if (const IdentifierInfo *II = Tok.getIdentifierInfo())
       return II->getName();
@@ -494,6 +497,48 @@ void Preprocessor::EndSourceFile() {
 // Lexer Event Handling.
 //===----------------------------------------------------------------------===//
 
+static void appendCodePoint(unsigned Codepoint,
+                            llvm::SmallVectorImpl<char> &Str) {
+  char ResultBuf[4];
+  char *ResultPtr = ResultBuf;
+  bool Res = ConvertCodePointToUTF8(Codepoint, ResultPtr);
+  (void)Res;
+  assert(Res && "Unexpected conversion failure");
+  Str.append(ResultBuf, ResultPtr);
+}
+
+static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
+  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
+    if (*I != '\\') {
+      Buf.push_back(*I);
+      continue;
+    }
+
+    ++I;
+    assert(*I == 'u' || *I == 'U');
+
+    unsigned NumHexDigits;
+    if (*I == 'u')
+      NumHexDigits = 4;
+    else
+      NumHexDigits = 8;
+
+    assert(I + NumHexDigits <= E);
+
+    uint32_t CodePoint = 0;
+    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
+      unsigned Value = llvm::hexDigitValue(*I);
+      assert(Value != -1U);
+
+      CodePoint <<= 4;
+      CodePoint += Value;
+    }
+
+    appendCodePoint(CodePoint, Buf);
+    --I;
+  }
+}
+
 /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
 /// identifier information for the token and install it into the token,
 /// updating the token kind accordingly.
@@ -502,15 +547,22 @@ IdentifierInfo *Preprocessor::LookUpIdentifierInfo(Token &Identifier) const {
 
   // Look up this token, see if it is a macro, or if it is a language keyword.
   IdentifierInfo *II;
-  if (!Identifier.needsCleaning()) {
+  if (!Identifier.needsCleaning() && !Identifier.hasUCN()) {
     // No cleaning needed, just use the characters from the lexed buffer.
     II = getIdentifierInfo(StringRef(Identifier.getRawIdentifierData(),
-                                           Identifier.getLength()));
+                                     Identifier.getLength()));
   } else {
     // Cleaning needed, alloca a buffer, clean into it, then use the buffer.
     SmallString<64> IdentifierBuffer;
     StringRef CleanedStr = getSpelling(Identifier, IdentifierBuffer);
-    II = getIdentifierInfo(CleanedStr);
+
+    if (Identifier.hasUCN()) {
+      SmallString<64> UCNIdentifierBuffer;
+      expandUCNs(UCNIdentifierBuffer, CleanedStr);
+      II = getIdentifierInfo(UCNIdentifierBuffer);
+    } else {
+      II = getIdentifierInfo(CleanedStr);
+    }
   }
 
   // Update the token info (identifier info and appropriate token kind).
author	Jordan Rose <jordan_rose@apple.com>	2013-01-24 20:50:46 +0000
committer	Jordan Rose <jordan_rose@apple.com>	2013-01-24 20:50:46 +0000
commit	c7629d941557f7179eb8fa8a2e2a74d749cbaf7c (patch)
tree	5d1833c5ca29ada22f679ebee8488273fcaf9777 /lib/Lex/Preprocessor.cpp
parent	5209e2bc4d18e679dcacfd6f6a0120aa1d4a757f (diff)