diff options
author | Jordan Rose <jordan_rose@apple.com> | 2013-01-24 20:50:46 +0000 |
---|---|---|
committer | Jordan Rose <jordan_rose@apple.com> | 2013-01-24 20:50:46 +0000 |
commit | c7629d941557f7179eb8fa8a2e2a74d749cbaf7c (patch) | |
tree | 5d1833c5ca29ada22f679ebee8488273fcaf9777 /include/clang/Lex | |
parent | 5209e2bc4d18e679dcacfd6f6a0120aa1d4a757f (diff) |
Handle universal character names and Unicode characters outside of literals.
This is a missing piece for C99 conformance.
This patch handles UCNs by adding a '\\' case to LexTokenInternal and
LexIdentifier -- if we see a backslash, we tentatively try to read in a UCN.
If the UCN is not syntactically well-formed, we fall back to the old
treatment: a backslash followed by an identifier beginning with 'u' (or 'U').
Because the spelling of an identifier with UCNs still has the UCN in it, we
need to convert that to UTF-8 in Preprocessor::LookUpIdentifierInfo.
Of course, valid code that does *not* use UCNs will see only a very minimal
performance hit (checks after each identifier for non-ASCII characters,
checks when converting raw_identifiers to identifiers that they do not
contain UCNs, and checks when getting the spelling of an identifier that it
does not contain a UCN).
This patch also adds basic support for actual UTF-8 in the source. This is
treated almost exactly the same as UCNs except that we consider stray
Unicode characters to be mistakes and offer a fixit to remove them.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@173369 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'include/clang/Lex')
-rw-r--r-- | include/clang/Lex/Lexer.h | 20 | ||||
-rw-r--r-- | include/clang/Lex/Token.h | 8 |
2 files changed, 26 insertions, 2 deletions
diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h index d36189fccd..535baf588f 100644 --- a/include/clang/Lex/Lexer.h +++ b/include/clang/Lex/Lexer.h @@ -437,6 +437,11 @@ private: /// void LexTokenInternal(Token &Result); + /// Given that a token begins with the Unicode character \p C, figure out + /// what kind of token it is and dispatch to the appropriate lexing helper + /// function. + void LexUnicode(Token &Result, uint32_t C, const char *CurPtr); + /// FormTokenWithChars - When we lex a token, we have identified a span /// starting at BufferPtr, going to TokEnd that forms the token. This method /// takes that range and assigns it to the token as its location and size. In @@ -579,6 +584,21 @@ private: void cutOffLexing() { BufferPtr = BufferEnd; } bool isHexaLiteral(const char *Start, const LangOptions &LangOpts); + + + /// Read a universal character name. + /// + /// \param CurPtr The position in the source buffer after the initial '\'. + /// If the UCN is syntactically well-formed (but not necessarily + /// valid), this parameter will be updated to point to the + /// character after the UCN. + /// \param SlashLoc The position in the source buffer of the '\'. + /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics + /// and handle token formation in the caller. + /// + /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is + /// invalid. + uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok); }; diff --git a/include/clang/Lex/Token.h b/include/clang/Lex/Token.h index 06ff56ea9c..bcbe9c913b 100644 --- a/include/clang/Lex/Token.h +++ b/include/clang/Lex/Token.h @@ -74,9 +74,10 @@ public: StartOfLine = 0x01, // At start of line or only after whitespace. LeadingSpace = 0x02, // Whitespace exists before this token. DisableExpand = 0x04, // This identifier may never be macro expanded. - NeedsCleaning = 0x08, // Contained an escaped newline or trigraph. + NeedsCleaning = 0x08, // Contained an escaped newline or trigraph. LeadingEmptyMacro = 0x10, // Empty macro exists before this token. - HasUDSuffix = 0x20 // This string or character literal has a ud-suffix. + HasUDSuffix = 0x20, // This string or character literal has a ud-suffix. + HasUCN = 0x40 // This identifier contains a UCN. }; tok::TokenKind getKind() const { return (tok::TokenKind)Kind; } @@ -257,6 +258,9 @@ public: /// \brief Return true if this token is a string or character literal which /// has a ud-suffix. bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? true : false; } + + /// Returns true if this token contains a universal character name. + bool hasUCN() const { return (Flags & HasUCN) ? true : false; } }; /// \brief Information about the conditional stack (\#if directives) |