aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJordan Rose <jordan_rose@apple.com>2013-02-08 22:30:22 +0000
committerJordan Rose <jordan_rose@apple.com>2013-02-08 22:30:22 +0000
commit9893902eceba7f01dd1521349d33866f77254d78 (patch)
tree663def75b1c867719f47927560893f869a761668
parente22cef5cb2e460bae88563cfc5fcf98d742d6215 (diff)
Pull Lexer's CharInfo table out for general use throughout Clang.
Rewriting the same predicates over and over again is bad for code size and code maintenance. Using the functions in <ctype.h> is generally unsafe unless they are specified to be locale-independent (i.e. only isdigit and isxdigit). The next commit will try to clean up uses of <ctype.h> functions within Clang. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@174765 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/clang/Basic/CharInfo.h162
-rw-r--r--lib/Basic/CMakeLists.txt1
-rw-r--r--lib/Basic/CharInfo.cpp80
-rw-r--r--lib/Lex/Lexer.cpp175
-rw-r--r--unittests/Basic/CMakeLists.txt1
-rw-r--r--unittests/Basic/CharInfoTest.cpp377
6 files changed, 626 insertions, 170 deletions
diff --git a/include/clang/Basic/CharInfo.h b/include/clang/Basic/CharInfo.h
new file mode 100644
index 0000000000..f9b7b7311d
--- /dev/null
+++ b/include/clang/Basic/CharInfo.h
@@ -0,0 +1,162 @@
+//===--- clang/Basic/CharInfo.h - Classifying ASCII Characters ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_BASIC_CHARINFO_H
+#define CLANG_BASIC_CHARINFO_H
+
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace clang {
+namespace charinfo {
+  // Primitive character classes. Each one occupies its own bit so that the
+  // predicates in this header can test several classes with a single mask.
+  enum {
+    CHAR_HORZ_WS = 1 << 0,  // '\t', '\f', '\v' (but not ' ' and not '\0')
+    CHAR_VERT_WS = 1 << 1,  // '\r', '\n'
+    CHAR_SPACE   = 1 << 2,  // ' '
+    CHAR_DIGIT   = 1 << 3,  // 0-9
+    CHAR_XLETTER = 1 << 4,  // a-f, A-F
+    CHAR_UPPER   = 1 << 5,  // A-Z
+    CHAR_LOWER   = 1 << 6,  // a-z
+    CHAR_UNDER   = 1 << 7,  // _
+    CHAR_PERIOD  = 1 << 8,  // .
+    CHAR_RAWDEL  = 1 << 9,  // {}[]#<>%:;?*+-/^&|~!=,"'
+    CHAR_PUNCT   = 1 << 10  // `$@() and backslash
+  };
+
+  // Combined values stored in the table for letters that are also hex digits.
+  enum {
+    CHAR_XUPPER = CHAR_UPPER | CHAR_XLETTER,  // A-F
+    CHAR_XLOWER = CHAR_LOWER | CHAR_XLETTER   // a-f
+  };
+
+  // Classification flags for every unsigned char value; the definition lives
+  // in lib/Basic/CharInfo.cpp.
+  extern const uint16_t InfoTable[256];
+} // end namespace charinfo
+
+/// Tells whether \p c lies in the ASCII range, i.e. has a value of at most
+/// 127 once reinterpreted as an unsigned char.
+LLVM_READNONE static inline bool isASCII(char c) {
+  // Compare the unsigned value so the result does not depend on whether plain
+  // char is signed on this platform.
+  const unsigned char Byte = static_cast<unsigned char>(c);
+  return Byte < 128;
+}
+
+/// Checks whether \p c may start a C identifier, that is, whether it matches
+/// [a-zA-Z_].
+///
+/// When \p AllowDollar is set, '$' is accepted as well.
+LLVM_READONLY static inline bool isIdentifierHead(unsigned char c,
+                                                  bool AllowDollar = false) {
+  const unsigned HeadMask =
+    charinfo::CHAR_UPPER | charinfo::CHAR_LOWER | charinfo::CHAR_UNDER;
+  return (charinfo::InfoTable[c] & HeadMask) != 0 ||
+         (AllowDollar && c == '$');
+}
+
+/// Checks whether \p c may appear after the first character of a C
+/// identifier, that is, whether it matches [a-zA-Z0-9_].
+///
+/// When \p AllowDollar is set, '$' is accepted as well.
+LLVM_READONLY static inline bool isIdentifierBody(unsigned char c,
+                                                  bool AllowDollar = false) {
+  const unsigned BodyMask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER |
+                            charinfo::CHAR_DIGIT | charinfo::CHAR_UNDER;
+  return (charinfo::InfoTable[c] & BodyMask) != 0 ||
+         (AllowDollar && c == '$');
+}
+
+/// Checks whether \p c is ASCII whitespace that stays on the current line:
+/// ' ', '\\t', '\\f' or '\\v'.
+///
+/// '\\0' is not considered whitespace.
+LLVM_READONLY static inline bool isHorizontalWhitespace(unsigned char c) {
+  // The space character has its own flag, so both bits must be tested.
+  const unsigned Mask = charinfo::CHAR_HORZ_WS | charinfo::CHAR_SPACE;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Checks whether \p c is an ASCII line-ending character: '\\n' or '\\r'.
+///
+/// '\\0' is not considered whitespace.
+LLVM_READONLY static inline bool isVerticalWhitespace(unsigned char c) {
+  const unsigned Mask = charinfo::CHAR_VERT_WS;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Checks whether \p c is any ASCII whitespace character, horizontal or
+/// vertical: ' ', '\\t', '\\f', '\\v', '\\n' or '\\r'.
+///
+/// '\\0' is not considered whitespace.
+LLVM_READONLY static inline bool isWhitespace(unsigned char c) {
+  const unsigned Mask = charinfo::CHAR_HORZ_WS | charinfo::CHAR_VERT_WS |
+                        charinfo::CHAR_SPACE;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Checks whether \p c is one of the ASCII decimal digits: [0-9]
+LLVM_READONLY static inline bool isDigit(unsigned char c) {
+  const unsigned Mask = charinfo::CHAR_DIGIT;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Checks whether \p c is a lowercase ASCII letter: [a-z]
+LLVM_READONLY static inline bool isLowercase(unsigned char c) {
+  const unsigned Mask = charinfo::CHAR_LOWER;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Checks whether \p c is an uppercase ASCII letter: [A-Z]
+LLVM_READONLY static inline bool isUppercase(unsigned char c) {
+  const unsigned Mask = charinfo::CHAR_UPPER;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Checks whether \p c is an ASCII letter of either case: [a-zA-Z]
+LLVM_READONLY static inline bool isLetter(unsigned char c) {
+  const unsigned Mask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Checks whether \p c is an ASCII letter or decimal digit: [a-zA-Z0-9]
+LLVM_READONLY static inline bool isAlphanumeric(unsigned char c) {
+  const unsigned Mask = charinfo::CHAR_DIGIT | charinfo::CHAR_UPPER |
+                        charinfo::CHAR_LOWER;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Return true if this character is an ASCII hex digit: [0-9a-fA-F]
+LLVM_READONLY static inline bool isHexDigit(unsigned char c) {
+  using namespace charinfo;
+  // A hex digit is either a decimal digit or one of the letters that carry
+  // the CHAR_XLETTER flag (a-f, A-F) in InfoTable.
+  return (InfoTable[c] & (CHAR_DIGIT|CHAR_XLETTER)) != 0;
+}
+
+/// Checks whether \p c is an ASCII punctuation character.
+///
+/// Be aware that '_' counts as punctuation here even though it is also an
+/// identifier character.
+LLVM_READONLY static inline bool isPunctuation(unsigned char c) {
+  const unsigned Mask = charinfo::CHAR_UNDER | charinfo::CHAR_PERIOD |
+                        charinfo::CHAR_RAWDEL | charinfo::CHAR_PUNCT;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Checks whether \p c is a printable ASCII character, meaning one that
+/// should occupy exactly one column when shown in a fixed-width terminal.
+LLVM_READONLY static inline bool isPrintable(unsigned char c) {
+  // Letters, digits, every punctuation class and the plain space qualify;
+  // the other whitespace flags are deliberately left out of the mask.
+  const unsigned Mask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER |
+                        charinfo::CHAR_PERIOD | charinfo::CHAR_PUNCT |
+                        charinfo::CHAR_DIGIT | charinfo::CHAR_UNDER |
+                        charinfo::CHAR_RAWDEL | charinfo::CHAR_SPACE;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Checks whether \p c may appear in the body of a C preprocessing number,
+/// that is, whether it matches [a-zA-Z0-9_.].
+LLVM_READONLY static inline bool isPreprocessingNumberBody(unsigned char c) {
+  const unsigned Mask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER |
+                        charinfo::CHAR_DIGIT | charinfo::CHAR_UNDER |
+                        charinfo::CHAR_PERIOD;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+/// Checks whether \p c may appear inside the delimiter of a C++ raw string
+/// literal.
+LLVM_READONLY static inline bool isRawStringDelimBody(unsigned char c) {
+  const unsigned Mask = charinfo::CHAR_UPPER | charinfo::CHAR_LOWER |
+                        charinfo::CHAR_PERIOD | charinfo::CHAR_DIGIT |
+                        charinfo::CHAR_UNDER | charinfo::CHAR_RAWDEL;
+  return (charinfo::InfoTable[c] & Mask) != 0;
+}
+
+} // end namespace clang
+
+#endif
diff --git a/lib/Basic/CMakeLists.txt b/lib/Basic/CMakeLists.txt
index 7c5e42c984..37efcb1220 100644
--- a/lib/Basic/CMakeLists.txt
+++ b/lib/Basic/CMakeLists.txt
@@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS mc)
add_clang_library(clangBasic
Builtins.cpp
+ CharInfo.cpp
Diagnostic.cpp
DiagnosticIDs.cpp
FileManager.cpp
diff --git a/lib/Basic/CharInfo.cpp b/lib/Basic/CharInfo.cpp
new file mode 100644
index 0000000000..a1a4b390bf
--- /dev/null
+++ b/lib/Basic/CharInfo.cpp
@@ -0,0 +1,80 @@
+//===--- CharInfo.cpp - Static Data for Classifying ASCII Characters ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/CharInfo.h"
+
+// Statically initialize CharInfo table based on ASCII character set
+// Reference: FreeBSD 7.2 /usr/share/misc/ascii
+//
+// The table is indexed by the unsigned char value of a character and holds
+// the charinfo::CHAR_* flags that describe it. Each pair of comment lines
+// below names the eight character codes covered by the two rows of
+// initializers that follow it.
+//
+// Only entries 0 through 127 are listed explicitly. The remaining 128
+// elements of the 256-element array are zero-initialized, so bytes with the
+// high bit set carry no classification flags at all.
+const uint16_t clang::charinfo::InfoTable[256] =
+{
+ // 0 NUL 1 SOH 2 STX 3 ETX
+ // 4 EOT 5 ENQ 6 ACK 7 BEL
+ 0 , 0 , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+ // 8 BS 9 HT 10 NL 11 VT
+ //12 NP 13 CR 14 SO 15 SI
+ 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
+ CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 ,
+ //16 DLE 17 DC1 18 DC2 19 DC3
+ //20 DC4 21 NAK 22 SYN 23 ETB
+ 0 , 0 , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+ //24 CAN 25 EM 26 SUB 27 ESC
+ //28 FS 29 GS 30 RS 31 US
+ 0 , 0 , 0 , 0 ,
+ 0 , 0 , 0 , 0 ,
+ //32 SP 33 ! 34 " 35 #
+ //36 $ 37 % 38 & 39 '
+ CHAR_SPACE , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+ CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+ //40 ( 41 ) 42 * 43 +
+ //44 , 45 - 46 . 47 /
+ CHAR_PUNCT , CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL ,
+ CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
+ //48 0 49 1 50 2 51 3
+ //52 4 53 5 54 6 55 7
+ CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT ,
+ CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT ,
+ //56 8 57 9 58 : 59 ;
+ //60 < 61 = 62 > 63 ?
+ CHAR_DIGIT , CHAR_DIGIT , CHAR_RAWDEL , CHAR_RAWDEL ,
+ CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+ //64 @ 65 A 66 B 67 C
+ //68 D 69 E 70 F 71 G
+ // A-F carry CHAR_XUPPER (uppercase plus hex-letter); G onward is CHAR_UPPER.
+ CHAR_PUNCT , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER ,
+ CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , CHAR_UPPER ,
+ //72 H 73 I 74 J 75 K
+ //76 L 77 M 78 N 79 O
+ CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
+ CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
+ //80 P 81 Q 82 R 83 S
+ //84 T 85 U 86 V 87 W
+ CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
+ CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER ,
+ //88 X 89 Y 90 Z 91 [
+ //92 \ 93 ] 94 ^ 95 _
+ CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_RAWDEL ,
+ CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER ,
+ //96 ` 97 a 98 b 99 c
+ //100 d 101 e 102 f 103 g
+ // a-f carry CHAR_XLOWER (lowercase plus hex-letter); g onward is CHAR_LOWER.
+ CHAR_PUNCT , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER ,
+ CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , CHAR_LOWER ,
+ //104 h 105 i 106 j 107 k
+ //108 l 109 m 110 n 111 o
+ CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
+ CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
+ //112 p 113 q 114 r 115 s
+ //116 t 117 u 118 v 119 w
+ CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
+ CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER ,
+ //120 x 121 y 122 z 123 {
+ //124 | 125 } 126 ~ 127 DEL
+ CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_RAWDEL ,
+ CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
+ // Entries 128-255 are intentionally omitted and therefore zero.
+};
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 1b064c88ff..6aae4e17fa 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -25,6 +25,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Lex/Lexer.h"
+#include "clang/Basic/CharInfo.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/CodeCompletionHandler.h"
#include "clang/Lex/LexDiagnostic.h"
@@ -38,8 +39,6 @@
#include <cstring>
using namespace clang;
-static void InitCharacterInfo();
-
//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//
@@ -66,8 +65,6 @@ void Lexer::anchor() { }
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
const char *BufEnd) {
- InitCharacterInfo();
-
BufferStart = BufStart;
BufferPtr = BufPtr;
BufferEnd = BufEnd;
@@ -408,9 +405,6 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
}
-
-static bool isWhitespace(unsigned char c);
-
/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
@@ -1008,163 +1002,8 @@ StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
-//===----------------------------------------------------------------------===//
-// Character information.
-//===----------------------------------------------------------------------===//
-
-enum {
- CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0'
- CHAR_VERT_WS = 0x02, // '\r', '\n'
- CHAR_LETTER = 0x04, // a-z,A-Z
- CHAR_NUMBER = 0x08, // 0-9
- CHAR_UNDER = 0x10, // _
- CHAR_PERIOD = 0x20, // .
- CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"'
-};
-
-// Statically initialize CharInfo table based on ASCII character set
-// Reference: FreeBSD 7.2 /usr/share/misc/ascii
-static const unsigned char CharInfo[256] =
-{
-// 0 NUL 1 SOH 2 STX 3 ETX
-// 4 EOT 5 ENQ 6 ACK 7 BEL
- 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
-// 8 BS 9 HT 10 NL 11 VT
-//12 NP 13 CR 14 SO 15 SI
- 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
- CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 ,
-//16 DLE 17 DC1 18 DC2 19 DC3
-//20 DC4 21 NAK 22 SYN 23 ETB
- 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
-//24 CAN 25 EM 26 SUB 27 ESC
-//28 FS 29 GS 30 RS 31 US
- 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
-//32 SP 33 ! 34 " 35 #
-//36 $ 37 % 38 & 39 '
- CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
- 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-//40 ( 41 ) 42 * 43 +
-//44 , 45 - 46 . 47 /
- 0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL ,
- CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
-//48 0 49 1 50 2 51 3
-//52 4 53 5 54 6 55 7
- CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
- CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
-//56 8 57 9 58 : 59 ;
-//60 < 61 = 62 > 63 ?
- CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
- CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-//64 @ 65 A 66 B 67 C
-//68 D 69 E 70 F 71 G
- 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//72 H 73 I 74 J 75 K
-//76 L 77 M 78 N 79 O
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//80 P 81 Q 82 R 83 S
-//84 T 85 U 86 V 87 W
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//88 X 89 Y 90 Z 91 [
-//92 \ 93 ] 94 ^ 95 _
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
- 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER ,
-//96 ` 97 a 98 b 99 c
-//100 d 101 e 102 f 103 g
- 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//104 h 105 i 106 j 107 k
-//108 l 109 m 110 n 111 o
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//112 p 113 q 114 r 115 s
-//116 t 117 u 118 v 119 w
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//120 x 121 y 122 z 123 {
-//124 | 125 } 126 ~ 127 DEL
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
- CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
-};
-
-static void InitCharacterInfo() {
- static bool isInited = false;
- if (isInited) return;
- // check the statically-initialized CharInfo table
- assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
- assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
- assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
- assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
- assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
- assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
- assert(CHAR_UNDER == CharInfo[(int)'_']);
- assert(CHAR_PERIOD == CharInfo[(int)'.']);
- for (unsigned i = 'a'; i <= 'z'; ++i) {
- assert(CHAR_LETTER == CharInfo[i]);
- assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
- }
- for (unsigned i = '0'; i <= '9'; ++i)
- assert(CHAR_NUMBER == CharInfo[i]);
-
- isInited = true;
-}
-
-
-/// isIdentifierHead - Return true if this is the first character of an
-/// identifier, which is [a-zA-Z_].
-static inline bool isIdentifierHead(unsigned char c) {
- return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false;
-}
-
-/// isIdentifierBody - Return true if this is the body character of an
-/// identifier, which is [a-zA-Z0-9_].
-static inline bool isIdentifierBody(unsigned char c) {
- return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
-}
-
-/// isHorizontalWhitespace - Return true if this character is horizontal
-/// whitespace: ' ', '\\t', '\\f', '\\v'. Note that this returns false for
-/// '\\0'.
-static inline bool isHorizontalWhitespace(unsigned char c) {
- return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
-}
-
-/// isVerticalWhitespace - Return true if this character is vertical
-/// whitespace: '\\n', '\\r'. Note that this returns false for '\\0'.
-static inline bool isVerticalWhitespace(unsigned char c) {
- return (CharInfo[c] & CHAR_VERT_WS) ? true : false;
-}
-
-/// isWhitespace - Return true if this character is horizontal or vertical
-/// whitespace: ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. Note that this returns
-/// false for '\\0'.
-static inline bool isWhitespace(unsigned char c) {
- return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
-}
-
-/// isNumberBody - Return true if this is the body character of an
-/// preprocessing number, which is [a-zA-Z0-9_.].
-static inline bool isNumberBody(unsigned char c) {
- return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
- true : false;
-}
-
-/// isRawStringDelimBody - Return true if this is the body character of a
-/// raw string delimiter.
-static inline bool isRawStringDelimBody(unsigned char c) {
- return (CharInfo[c] &
- (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
- true : false;
-}
-
-// Allow external clients to make use of CharInfo.
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
- return isIdentifierBody(c) || (c == '$' && LangOpts.DollarIdents);
+ return isIdentifierBody(c, LangOpts.DollarIdents);
}
@@ -1578,10 +1417,6 @@ static bool isAllowedInitiallyIDChar(uint32_t c) {
!(0xFE20 <= c && c <= 0xFE2F);
}
-static inline bool isASCII(char C) {
- return static_cast<signed char>(C) >= 0;
-}
-
void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@@ -1595,8 +1430,8 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Fast path, no $,\,? in identifier found. '\' might be an escaped newline
// or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
//
- // TODO: Could merge these checks into a CharInfo flag to make the comparison
- // cheaper
+ // TODO: Could merge these checks into an InfoTable flag to make the
+ // comparison cheaper
if (isASCII(C) && C != '\\' && C != '?' &&
(C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
@@ -1700,7 +1535,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
unsigned Size;
char C = getCharAndSize(CurPtr, Size);
char PrevCh = 0;
- while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+ while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
CurPtr = ConsumeChar(CurPtr, Size, Result);
PrevCh = C;
C = getCharAndSize(CurPtr, Size);
diff --git a/unittests/Basic/CMakeLists.txt b/unittests/Basic/CMakeLists.txt
index 300dcd5cb8..51db6ce9e2 100644
--- a/unittests/Basic/CMakeLists.txt
+++ b/unittests/Basic/CMakeLists.txt
@@ -1,4 +1,5 @@
add_clang_unittest(BasicTests
+ CharInfoTest.cpp
FileManagerTest.cpp
SourceManagerTest.cpp
)
diff --git a/unittests/Basic/CharInfoTest.cpp b/unittests/Basic/CharInfoTest.cpp
new file mode 100644
index 0000000000..9b3d1b3b5f
--- /dev/null
+++ b/unittests/Basic/CharInfoTest.cpp
@@ -0,0 +1,377 @@
+//===- unittests/Basic/CharInfoTest.cpp -- ASCII classification tests -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/CharInfo.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace clang;
+
+// Spot-check raw InfoTable entries: each probed character must carry exactly
+// the flag value listed for it, nothing more and nothing less.
+TEST(CharInfoTest, validateInfoTable) {
+  using namespace charinfo;
+
+  // Characters that are each expected to map to one specific flag.
+  const struct {
+    char Ch;
+    unsigned Flags;
+  } Singles[] = {
+    { ' ',  CHAR_SPACE },
+    { '\t', CHAR_HORZ_WS },
+    { '\f', CHAR_HORZ_WS }, // ??
+    { '\v', CHAR_HORZ_WS }, // ??
+    { '\n', CHAR_VERT_WS },
+    { '\r', CHAR_VERT_WS },
+    { '_',  CHAR_UNDER },
+    { '.',  CHAR_PERIOD }
+  };
+  for (unsigned Idx = 0; Idx != sizeof(Singles) / sizeof(Singles[0]); ++Idx)
+    EXPECT_EQ(Singles[Idx].Flags,
+              InfoTable[static_cast<unsigned char>(Singles[Idx].Ch)]);
+
+  // Letters: the first six of each case double as hex digits.
+  for (unsigned Off = 0; Off != 26; ++Off) {
+    const bool IsHexLetter = Off < 6;
+    const unsigned WantLower = IsHexLetter ? CHAR_XLOWER : CHAR_LOWER;
+    const unsigned WantUpper = IsHexLetter ? CHAR_XUPPER : CHAR_UPPER;
+    EXPECT_EQ(WantLower, InfoTable['a' + Off]);
+    EXPECT_EQ(WantUpper, InfoTable['A' + Off]);
+  }
+
+  for (unsigned Off = 0; Off != 10; ++Off)
+    EXPECT_EQ(static_cast<unsigned>(CHAR_DIGIT), InfoTable['0' + Off]);
+}
+
+// The remaining tests exercise the individual predicates.
+
+// isASCII accepts every 7-bit value and rejects bytes with the high bit set.
+TEST(CharInfoTest, isASCII) {
+  const char Accepted[] = { '\0', '\n', ' ', 'a', '\x7f' };
+  const char Rejected[] = { '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isASCII(Accepted[i])) << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isASCII(Rejected[i])) << "character code " << int(Rejected[i]);
+}
+
+// Identifier heads are letters and '_'; '$' only counts when explicitly
+// allowed.
+TEST(CharInfoTest, isIdentifierHead) {
+  const char Accepted[] = { 'a', 'A', 'z', 'Z', '_' };
+  const char Rejected[] = { '0', '.', '`', '\0', '$',
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isIdentifierHead(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isIdentifierHead(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+
+  EXPECT_TRUE(isIdentifierHead('$', /*AllowDollar=*/true));
+}
+
+// Identifier bodies are letters, digits and '_'; '$' only counts when
+// explicitly allowed.
+TEST(CharInfoTest, isIdentifierBody) {
+  const char Accepted[] = { 'a', 'A', 'z', 'Z', '_', '0' };
+  const char Rejected[] = { '.', '`', '\0', '$', '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isIdentifierBody(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isIdentifierBody(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+
+  EXPECT_TRUE(isIdentifierBody('$', /*AllowDollar=*/true));
+}
+
+// Horizontal whitespace is ' ', '\t', '\f' and '\v'; line endings, NUL, DEL
+// and high-bit bytes are not.
+TEST(CharInfoTest, isHorizontalWhitespace) {
+  const char Accepted[] = { ' ', '\t',
+                            '\f', '\v' }; // ??
+  const char Rejected[] = { 'a', '_', '0', '.', '`', '\0', '\x7f',
+                            '\n', '\r',
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isHorizontalWhitespace(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isHorizontalWhitespace(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+// Vertical whitespace is only '\n' and '\r'; horizontal whitespace, NUL, DEL
+// and high-bit bytes are not.
+TEST(CharInfoTest, isVerticalWhitespace) {
+  const char Accepted[] = { '\n', '\r' };
+  const char Rejected[] = { 'a', '_', '0', '.', '`', '\0', '\x7f',
+                            ' ', '\t',
+                            '\f', '\v', // ??
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isVerticalWhitespace(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isVerticalWhitespace(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+// isWhitespace covers both the horizontal and the vertical whitespace
+// characters, and nothing else.
+TEST(CharInfoTest, isWhitespace) {
+  const char Accepted[] = { ' ', '\t', '\f', '\v', '\n', '\r' };
+  const char Rejected[] = { 'a', '_', '0', '.', '`', '\0', '\x7f',
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isWhitespace(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isWhitespace(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+// Only '0' through '9' are digits; letters, punctuation, NUL and high-bit
+// bytes are not.
+TEST(CharInfoTest, isDigit) {
+  const char Accepted[] = { '0', '9' };
+  const char Rejected[] = { 'a', 'A', 'z', 'Z', '.', '_', '/', '\0',
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isDigit(Accepted[i])) << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isDigit(Rejected[i])) << "character code " << int(Rejected[i]);
+}
+
+// Hex digits are the decimal digits plus a-f and A-F; later letters,
+// punctuation, NUL and high-bit bytes are not.
+TEST(CharInfoTest, isHexDigit) {
+  const char Accepted[] = { '0', '9', 'a', 'A' };
+  const char Rejected[] = { 'z', 'Z', '.', '_', '/', '\0',
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isHexDigit(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isHexDigit(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+// Letters of either case are accepted; digits, punctuation, NUL and high-bit
+// bytes are not.
+TEST(CharInfoTest, isLetter) {
+  const char Accepted[] = { 'a', 'A', 'z', 'Z' };
+  const char Rejected[] = { '0', '9', '.', '_', '/', '(', '\0',
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isLetter(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isLetter(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+// Only a-z are lowercase; uppercase letters, digits, punctuation, NUL and
+// high-bit bytes are not.
+TEST(CharInfoTest, isLowercase) {
+  const char Accepted[] = { 'a', 'z' };
+  const char Rejected[] = { '0', '9', 'A', 'Z', '.', '_', '/', '(', '\0',
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isLowercase(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isLowercase(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+// Only A-Z are uppercase; lowercase letters, digits, punctuation, NUL and
+// high-bit bytes are not.
+TEST(CharInfoTest, isUppercase) {
+  const char Accepted[] = { 'A', 'Z' };
+  const char Rejected[] = { '0', '9', 'a', 'z', '.', '_', '/', '(', '\0',
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isUppercase(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isUppercase(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+// Letters and digits are alphanumeric; '_' and other punctuation, NUL and
+// high-bit bytes are not.
+TEST(CharInfoTest, isAlphanumeric) {
+  const char Accepted[] = { '0', '9', 'a', 'A', 'z', 'Z' };
+  const char Rejected[] = { '.', '_', '/', '(', '\0',
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isAlphanumeric(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isAlphanumeric(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+// Punctuation includes '.', '_', '/' and '('; letters, digits, whitespace,
+// NUL and high-bit bytes are not punctuation.
+TEST(CharInfoTest, isPunctuation) {
+  const char Accepted[] = { '.', '_', '/', '(' };
+  const char Rejected[] = { '0', '9', 'a', 'A', 'z', 'Z',
+                            ' ', '\n', '\0',
+                            '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isPunctuation(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isPunctuation(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+// Letters, digits, punctuation and the plain space are printable; tab,
+// newline, NUL and high-bit bytes are not.
+TEST(CharInfoTest, isPrintable) {
+  const char Accepted[] = { '0', '9', 'a', 'A', 'z', 'Z',
+                            '.', '_', '/', '(', ' ' };
+  const char Rejected[] = { '\t', '\n', '\0', '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isPrintable(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isPrintable(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+// Preprocessing-number bodies are letters, digits, '.' and '_'; other
+// punctuation, NUL and high-bit bytes are not.
+TEST(CharInfoTest, isPreprocessingNumberBody) {
+  const char Accepted[] = { '0', '9', 'a', 'A', 'z', 'Z', '.', '_' };
+  const char Rejected[] = { '/', '(', '\0', '\x80', '\xc2', '\xff' };
+
+  for (unsigned i = 0; i != sizeof(Accepted); ++i)
+    EXPECT_TRUE(isPreprocessingNumberBody(Accepted[i]))
+      << "character code " << int(Accepted[i]);
+  for (unsigned i = 0; i != sizeof(Rejected); ++i)
+    EXPECT_FALSE(isPreprocessingNumberBody(Rejected[i]))
+      << "character code " << int(Rejected[i]);
+}
+
+TEST(CharInfoTest, isRawStringDelimBody) {
+ EXPECT_TRUE(isRawStringDelimBody('0'));
+ EXPECT_TRUE(isRawStringDelimBody('9'));
+
+ EXPECT_TRUE(isRawStringDelimBody('a'));
+ EXPECT_TRUE(isRawStringDelimBody('A'));
+
+ EXPECT_TRUE(isRawStringDelimBody('z'));
+ EXPECT_TRUE(isRawStringDelimBody('Z'));
+ EXPECT_TRUE(isRawStringDelimBody('.'));
+ EXPECT_TRUE(isRawStringDelimBody('_'));
+
+ EXPECT_TRUE(isRawStringDelimBody('/'));
+ EXPECT_FALSE(isRawStringDelimBody('('));
+ EXPECT_FALSE(isRawStringDelimBody('\0'));
+}