aboutsummaryrefslogtreecommitdiff
path: root/include/clang
diff options
context:
space:
mode:
Diffstat (limited to 'include/clang')
-rw-r--r--include/clang/Basic/ConvertUTF.h10
-rw-r--r--include/clang/Basic/DiagnosticLexKinds.td34
-rw-r--r--include/clang/Lex/Lexer.h20
-rw-r--r--include/clang/Lex/Token.h8
4 files changed, 63 insertions, 9 deletions
diff --git a/include/clang/Basic/ConvertUTF.h b/include/clang/Basic/ConvertUTF.h
index fb05afdae7..38956ee340 100644
--- a/include/clang/Basic/ConvertUTF.h
+++ b/include/clang/Basic/ConvertUTF.h
@@ -161,6 +161,16 @@ Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
unsigned getNumBytesForUTF8(UTF8 firstByte);
+static inline ConversionResult convertUTF8Sequence(const UTF8 **source,
+ const UTF8 *sourceEnd,
+ UTF32 *target,
+ ConversionFlags flags) {
+ unsigned size = getNumBytesForUTF8(**source);
+ if (size > sourceEnd - *source)
+ return sourceExhausted;
+ return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
+}
+
#ifdef __cplusplus
}
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td
index c8b44230c9..00b385ef12 100644
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -93,15 +93,29 @@ def ext_multichar_character_literal : ExtWarn<
"multi-character character constant">, InGroup<MultiChar>;
def ext_four_char_character_literal : Extension<
"multi-character character constant">, InGroup<FourByteMultiChar>;
-
-// Literal
-def ext_nonstandard_escape : Extension<
- "use of non-standard escape character '\\%0'">;
-def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
-def err_hex_escape_no_digits : Error<"\\%0 used with no following hex digits">;
+
+// Unicode and UCNs
+def err_invalid_utf8 : Error<
+ "source file is not valid UTF-8">;
+def err_non_ascii : Error<
+ "non-ASCII characters are not allowed outside of literals and identifiers">;
+def ext_unicode_whitespace : ExtWarn<
+ "treating Unicode character as whitespace">,
+ InGroup<DiagGroup<"unicode-whitespace">>;
+
+def err_hex_escape_no_digits : Error<
+ "\\%0 used with no following hex digits">;
+def warn_ucn_escape_no_digits : Warning<
+ "\\%0 used with no following hex digits; "
+ "treating as '\\' followed by identifier">, InGroup<Unicode>;
+def err_ucn_escape_incomplete : Error<
+ "incomplete universal character name">;
+def warn_ucn_escape_incomplete : Warning<
+ "incomplete universal character name; "
+ "treating as '\\' followed by identifier">, InGroup<Unicode>;
def err_ucn_escape_invalid : Error<"invalid universal character">;
-def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
+
def err_ucn_escape_basic_scs : Error<
"character '%0' cannot be specified by a universal character name">;
def err_ucn_control_character : Error<
@@ -112,6 +126,12 @@ def warn_cxx98_compat_literal_ucn_escape_basic_scs : Warning<
def warn_cxx98_compat_literal_ucn_control_character : Warning<
"universal character name referring to a control character "
"is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
+
+
+// Literal
+def ext_nonstandard_escape : Extension<
+ "use of non-standard escape character '\\%0'">;
+def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;
diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h
index d36189fccd..535baf588f 100644
--- a/include/clang/Lex/Lexer.h
+++ b/include/clang/Lex/Lexer.h
@@ -437,6 +437,11 @@ private:
///
void LexTokenInternal(Token &Result);
+ /// Given that a token begins with the Unicode character \p C, figure out
+ /// what kind of token it is and dispatch to the appropriate lexing helper
+ /// function.
+ void LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
+
/// FormTokenWithChars - When we lex a token, we have identified a span
/// starting at BufferPtr, going to TokEnd that forms the token. This method
/// takes that range and assigns it to the token as its location and size. In
@@ -579,6 +584,21 @@ private:
void cutOffLexing() { BufferPtr = BufferEnd; }
bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
+
+
+ /// Read a universal character name.
+ ///
+ /// \param CurPtr The position in the source buffer after the initial '\'.
+ /// If the UCN is syntactically well-formed (but not necessarily
+ /// valid), this parameter will be updated to point to the
+ /// character after the UCN.
+ /// \param SlashLoc The position in the source buffer of the '\'.
+ /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics
+ /// and handle token formation in the caller.
+ ///
+ /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
+ /// invalid.
+ uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
};
diff --git a/include/clang/Lex/Token.h b/include/clang/Lex/Token.h
index 06ff56ea9c..bcbe9c913b 100644
--- a/include/clang/Lex/Token.h
+++ b/include/clang/Lex/Token.h
@@ -74,9 +74,10 @@ public:
StartOfLine = 0x01, // At start of line or only after whitespace.
LeadingSpace = 0x02, // Whitespace exists before this token.
DisableExpand = 0x04, // This identifier may never be macro expanded.
- NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
+ NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
- HasUDSuffix = 0x20 // This string or character literal has a ud-suffix.
+ HasUDSuffix = 0x20, // This string or character literal has a ud-suffix.
+ HasUCN = 0x40 // This identifier contains a UCN.
};
tok::TokenKind getKind() const { return (tok::TokenKind)Kind; }
@@ -257,6 +258,9 @@ public:
/// \brief Return true if this token is a string or character literal which
/// has a ud-suffix.
bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? true : false; }
+
+ /// Returns true if this token contains a universal character name.
+ bool hasUCN() const { return (Flags & HasUCN) ? true : false; }
};
/// \brief Information about the conditional stack (\#if directives)