diff options
Diffstat (limited to 'include/clang')
-rw-r--r-- | include/clang/Basic/ConvertUTF.h | 10 | ||||
-rw-r--r-- | include/clang/Basic/DiagnosticLexKinds.td | 34 | ||||
-rw-r--r-- | include/clang/Lex/Lexer.h | 20 | ||||
-rw-r--r-- | include/clang/Lex/Token.h | 8 |
4 files changed, 63 insertions, 9 deletions
diff --git a/include/clang/Basic/ConvertUTF.h b/include/clang/Basic/ConvertUTF.h index fb05afdae7..38956ee340 100644 --- a/include/clang/Basic/ConvertUTF.h +++ b/include/clang/Basic/ConvertUTF.h @@ -161,6 +161,16 @@ Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd); unsigned getNumBytesForUTF8(UTF8 firstByte); +static inline ConversionResult convertUTF8Sequence(const UTF8 **source, + const UTF8 *sourceEnd, + UTF32 *target, + ConversionFlags flags) { + unsigned size = getNumBytesForUTF8(**source); + if (size > sourceEnd - *source) + return sourceExhausted; + return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags); +} + #ifdef __cplusplus } diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index c8b44230c9..00b385ef12 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -93,15 +93,29 @@ def ext_multichar_character_literal : ExtWarn< "multi-character character constant">, InGroup<MultiChar>; def ext_four_char_character_literal : Extension< "multi-character character constant">, InGroup<FourByteMultiChar>; - -// Literal -def ext_nonstandard_escape : Extension< - "use of non-standard escape character '\\%0'">; -def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">; -def err_hex_escape_no_digits : Error<"\\%0 used with no following hex digits">; + +// Unicode and UCNs +def err_invalid_utf8 : Error< + "source file is not valid UTF-8">; +def err_non_ascii : Error< + "non-ASCII characters are not allowed outside of literals and identifiers">; +def ext_unicode_whitespace : ExtWarn< + "treating Unicode character as whitespace">, + InGroup<DiagGroup<"unicode-whitespace">>; + +def err_hex_escape_no_digits : Error< + "\\%0 used with no following hex digits">; +def warn_ucn_escape_no_digits : Warning< + "\\%0 used with no following hex digits; " + "treating as '\\' followed by identifier">, InGroup<Unicode>; +def err_ucn_escape_incomplete : Error< + "incomplete universal character name">; +def warn_ucn_escape_incomplete : Warning< + "incomplete universal character name; " + "treating as '\\' followed by identifier">, InGroup<Unicode>; def err_ucn_escape_invalid : Error<"invalid universal character">; -def err_ucn_escape_incomplete : Error<"incomplete universal character name">; + def err_ucn_escape_basic_scs : Error< "character '%0' cannot be specified by a universal character name">; def err_ucn_control_character : Error< @@ -112,6 +126,12 @@ def warn_cxx98_compat_literal_ucn_escape_basic_scs : Warning< def warn_cxx98_compat_literal_ucn_control_character : Warning< "universal character name referring to a control character " "is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore; + + +// Literal +def ext_nonstandard_escape : Extension< + "use of non-standard escape character '\\%0'">; +def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">; def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">; def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">; def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">; diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h index d36189fccd..535baf588f 100644 --- a/include/clang/Lex/Lexer.h +++ b/include/clang/Lex/Lexer.h @@ -437,6 +437,11 @@ private: /// void LexTokenInternal(Token &Result); + /// Given that a token begins with the Unicode character \p C, figure out + /// what kind of token it is and dispatch to the appropriate lexing helper + /// function. + void LexUnicode(Token &Result, uint32_t C, const char *CurPtr); + /// FormTokenWithChars - When we lex a token, we have identified a span /// starting at BufferPtr, going to TokEnd that forms the token. This method /// takes that range and assigns it to the token as its location and size. In @@ -579,6 +584,21 @@ private: void cutOffLexing() { BufferPtr = BufferEnd; } bool isHexaLiteral(const char *Start, const LangOptions &LangOpts); + + + /// Read a universal character name. + /// + /// \param CurPtr The position in the source buffer after the initial '\'. + /// If the UCN is syntactically well-formed (but not necessarily + /// valid), this parameter will be updated to point to the + /// character after the UCN. + /// \param SlashLoc The position in the source buffer of the '\'. + /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics + /// and handle token formation in the caller. + /// + /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is + /// invalid. + uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok); }; diff --git a/include/clang/Lex/Token.h b/include/clang/Lex/Token.h index 06ff56ea9c..bcbe9c913b 100644 --- a/include/clang/Lex/Token.h +++ b/include/clang/Lex/Token.h @@ -74,9 +74,10 @@ public: StartOfLine = 0x01, // At start of line or only after whitespace. LeadingSpace = 0x02, // Whitespace exists before this token. DisableExpand = 0x04, // This identifier may never be macro expanded. - NeedsCleaning = 0x08, // Contained an escaped newline or trigraph. + NeedsCleaning = 0x08, // Contained an escaped newline or trigraph. LeadingEmptyMacro = 0x10, // Empty macro exists before this token. - HasUDSuffix = 0x20 // This string or character literal has a ud-suffix. + HasUDSuffix = 0x20, // This string or character literal has a ud-suffix. + HasUCN = 0x40 // This identifier contains a UCN. }; tok::TokenKind getKind() const { return (tok::TokenKind)Kind; } @@ -257,6 +258,9 @@ public: /// \brief Return true if this token is a string or character literal which /// has a ud-suffix. bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? true : false; } + + /// Returns true if this token contains a universal character name. + bool hasUCN() const { return (Flags & HasUCN) ? true : false; } }; /// \brief Information about the conditional stack (\#if directives) |