diff options
author | Jordan Rose <jordan_rose@apple.com> | 2013-02-08 22:30:22 +0000 |
---|---|---|
committer | Jordan Rose <jordan_rose@apple.com> | 2013-02-08 22:30:22 +0000 |
commit | 9893902eceba7f01dd1521349d33866f77254d78 (patch) | |
tree | 663def75b1c867719f47927560893f869a761668 | |
parent | e22cef5cb2e460bae88563cfc5fcf98d742d6215 (diff) |
Pull Lexer's CharInfo table out for general use throughout Clang.
Rewriting the same predicates over and over again is bad for code size and
code maintenance. Using the functions in <ctype.h> is generally unsafe
unless they are specified to be locale-independent (i.e. only isdigit and
isxdigit).
The next commit will try to clean up uses of <ctype.h> functions within Clang.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@174765 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | include/clang/Basic/CharInfo.h | 162 | ||||
-rw-r--r-- | lib/Basic/CMakeLists.txt | 1 | ||||
-rw-r--r-- | lib/Basic/CharInfo.cpp | 80 | ||||
-rw-r--r-- | lib/Lex/Lexer.cpp | 175 | ||||
-rw-r--r-- | unittests/Basic/CMakeLists.txt | 1 | ||||
-rw-r--r-- | unittests/Basic/CharInfoTest.cpp | 377 |
6 files changed, 626 insertions, 170 deletions
diff --git a/include/clang/Basic/CharInfo.h b/include/clang/Basic/CharInfo.h new file mode 100644 index 0000000000..f9b7b7311d --- /dev/null +++ b/include/clang/Basic/CharInfo.h @@ -0,0 +1,162 @@ +//===--- clang/Basic/CharInfo.h - Classifying ASCII Characters ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_BASIC_CHARINFO_H +#define CLANG_BASIC_CHARINFO_H + +#include "llvm/Support/Compiler.h" +#include "llvm/Support/DataTypes.h" + +namespace clang { +namespace charinfo { + extern const uint16_t InfoTable[256]; + + enum { + CHAR_HORZ_WS = 0x0001, // '\t', '\f', '\v'. Note, no '\0' + CHAR_VERT_WS = 0x0002, // '\r', '\n' + CHAR_SPACE = 0x0004, // ' ' + CHAR_DIGIT = 0x0008, // 0-9 + CHAR_XLETTER = 0x0010, // a-f,A-F + CHAR_UPPER = 0x0020, // A-Z + CHAR_LOWER = 0x0040, // a-z + CHAR_UNDER = 0x0080, // _ + CHAR_PERIOD = 0x0100, // . + CHAR_RAWDEL = 0x0200, // {}[]#<>%:;?*+-/^&|~!=,"' + CHAR_PUNCT = 0x0400 // `$@() + }; + + enum { + CHAR_XUPPER = CHAR_XLETTER | CHAR_UPPER, + CHAR_XLOWER = CHAR_XLETTER | CHAR_LOWER + }; +} // end namespace charinfo + +/// Returns true if this is an ASCII character. +LLVM_READNONE static inline bool isASCII(char c) { + return static_cast<unsigned char>(c) <= 127; +} + +/// Returns true if this is a valid first character of a C identifier, +/// which is [a-zA-Z_]. +LLVM_READONLY static inline bool isIdentifierHead(unsigned char c, + bool AllowDollar = false) { + using namespace charinfo; + if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_UNDER)) + return true; + return AllowDollar && c == '$'; +} + +/// Returns true if this is a body character of a C identifier, +/// which is [a-zA-Z0-9_]. 
+LLVM_READONLY static inline bool isIdentifierBody(unsigned char c, + bool AllowDollar = false) { + using namespace charinfo; + if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER)) + return true; + return AllowDollar && c == '$'; +} + +/// Returns true if this character is horizontal ASCII whitespace: +/// ' ', '\\t', '\\f', '\\v'. +/// +/// Note that this returns false for '\\0'. +LLVM_READONLY static inline bool isHorizontalWhitespace(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_SPACE)) != 0; +} + +/// Returns true if this character is vertical ASCII whitespace: '\\n', '\\r'. +/// +/// Note that this returns false for '\\0'. +LLVM_READONLY static inline bool isVerticalWhitespace(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & CHAR_VERT_WS) != 0; +} + +/// Return true if this character is horizontal or vertical ASCII whitespace: +/// ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. +/// +/// Note that this returns false for '\\0'. 
+LLVM_READONLY static inline bool isWhitespace(unsigned char c) {
+  using namespace charinfo;
+  return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_VERT_WS|CHAR_SPACE)) != 0;
+}
+
+/// Return true if this character is an ASCII digit: [0-9]
+LLVM_READONLY static inline bool isDigit(unsigned char c) {
+  using namespace charinfo;
+  return (InfoTable[c] & CHAR_DIGIT) != 0;
+}
+
+/// Return true if this character is a lowercase ASCII letter: [a-z]
+LLVM_READONLY static inline bool isLowercase(unsigned char c) {
+  using namespace charinfo;
+  return (InfoTable[c] & CHAR_LOWER) != 0;
+}
+
+/// Return true if this character is an uppercase ASCII letter: [A-Z]
+LLVM_READONLY static inline bool isUppercase(unsigned char c) {
+  using namespace charinfo;
+  return (InfoTable[c] & CHAR_UPPER) != 0;
+}
+
+/// Return true if this character is an ASCII letter: [a-zA-Z]
+LLVM_READONLY static inline bool isLetter(unsigned char c) {
+  using namespace charinfo;
+  return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER)) != 0;
+}
+
+/// Return true if this character is an ASCII letter or digit: [a-zA-Z0-9]
+LLVM_READONLY static inline bool isAlphanumeric(unsigned char c) {
+  using namespace charinfo;
+  return (InfoTable[c] & (CHAR_DIGIT|CHAR_UPPER|CHAR_LOWER)) != 0;
+}
+
+/// Return true if this character is an ASCII hex digit: [0-9a-fA-F]
+/// The answer is a constant-table lookup, so it never depends on the locale.
+LLVM_READONLY static inline bool isHexDigit(unsigned char c) {
+  using namespace charinfo;
+  return (InfoTable[c] & (CHAR_DIGIT|CHAR_XLETTER)) != 0;
+}
+
+/// Return true if this character is an ASCII punctuation character.
+///
+/// Note that '_' is both a punctuation character and an identifier character!
+LLVM_READONLY static inline bool isPunctuation(unsigned char c) {
+  using namespace charinfo;
+  return (InfoTable[c] & (CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL|CHAR_PUNCT)) != 0;
+}
+
+/// Return true if this character is an ASCII printable character; that is, a
+/// character that should take exactly one column to print in a fixed-width
+/// terminal.
+LLVM_READONLY static inline bool isPrintable(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD|CHAR_PUNCT| + CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL|CHAR_SPACE)) != 0; +} + +/// Return true if this is the body character of a C preprocessing number, +/// which is [a-zA-Z0-9_.]. +LLVM_READONLY static inline bool isPreprocessingNumberBody(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & + (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER|CHAR_PERIOD)) != 0; +} + +/// Return true if this is the body character of a C++ raw string delimiter. +LLVM_READONLY static inline bool isRawStringDelimBody(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD| + CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL)) != 0; +} + +} // end namespace clang + +#endif diff --git a/lib/Basic/CMakeLists.txt b/lib/Basic/CMakeLists.txt index 7c5e42c984..37efcb1220 100644 --- a/lib/Basic/CMakeLists.txt +++ b/lib/Basic/CMakeLists.txt @@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS mc) add_clang_library(clangBasic Builtins.cpp + CharInfo.cpp Diagnostic.cpp DiagnosticIDs.cpp FileManager.cpp diff --git a/lib/Basic/CharInfo.cpp b/lib/Basic/CharInfo.cpp new file mode 100644 index 0000000000..a1a4b390bf --- /dev/null +++ b/lib/Basic/CharInfo.cpp @@ -0,0 +1,80 @@ +//===--- CharInfo.cpp - Static Data for Classifying ASCII Characters ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/CharInfo.h" + +// Statically initialize CharInfo table based on ASCII character set +// Reference: FreeBSD 7.2 /usr/share/misc/ascii +const uint16_t clang::charinfo::InfoTable[256] = +{ + // 0 NUL 1 SOH 2 STX 3 ETX + // 4 EOT 5 ENQ 6 ACK 7 BEL + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , + // 8 BS 9 HT 10 NL 11 VT + //12 NP 13 CR 14 SO 15 SI + 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, + CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , + //16 DLE 17 DC1 18 DC2 19 DC3 + //20 DC4 21 NAK 22 SYN 23 ETB + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , + //24 CAN 25 EM 26 SUB 27 ESC + //28 FS 29 GS 30 RS 31 US + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , + //32 SP 33 ! 34 " 35 # + //36 $ 37 % 38 & 39 ' + CHAR_SPACE , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + //40 ( 41 ) 42 * 43 + + //44 , 45 - 46 . 47 / + CHAR_PUNCT , CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL , + //48 0 49 1 50 2 51 3 + //52 4 53 5 54 6 55 7 + CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , + CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , + //56 8 57 9 58 : 59 ; + //60 < 61 = 62 > 63 ? 
+ CHAR_DIGIT , CHAR_DIGIT , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + //64 @ 65 A 66 B 67 C + //68 D 69 E 70 F 71 G + CHAR_PUNCT , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , + CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , CHAR_UPPER , + //72 H 73 I 74 J 75 K + //76 L 77 M 78 N 79 O + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + //80 P 81 Q 82 R 83 S + //84 T 85 U 86 V 87 W + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + //88 X 89 Y 90 Z 91 [ + //92 \ 93 ] 94 ^ 95 _ + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_RAWDEL , + CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER , + //96 ` 97 a 98 b 99 c + //100 d 101 e 102 f 103 g + CHAR_PUNCT , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , + CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , CHAR_LOWER , + //104 h 105 i 106 j 107 k + //108 l 109 m 110 n 111 o + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + //112 p 113 q 114 r 115 s + //116 t 117 u 118 v 119 w + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + //120 x 121 y 122 z 123 { + //124 | 125 } 126 ~ 127 DEL + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_RAWDEL , + CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0 +}; diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 1b064c88ff..6aae4e17fa 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -25,6 +25,7 @@ //===----------------------------------------------------------------------===// #include "clang/Lex/Lexer.h" +#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceManager.h" #include "clang/Lex/CodeCompletionHandler.h" #include "clang/Lex/LexDiagnostic.h" @@ -38,8 +39,6 @@ #include <cstring> using namespace clang; -static void InitCharacterInfo(); - //===----------------------------------------------------------------------===// 
// Token Class Implementation //===----------------------------------------------------------------------===// @@ -66,8 +65,6 @@ void Lexer::anchor() { } void Lexer::InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd) { - InitCharacterInfo(); - BufferStart = BufStart; BufferPtr = BufPtr; BufferEnd = BufEnd; @@ -408,9 +405,6 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, } - -static bool isWhitespace(unsigned char c); - /// MeasureTokenLength - Relex the token at the specified location and return /// its length in bytes in the input file. If the token needs cleaning (e.g. /// includes a trigraph or an escaped newline) then this count includes bytes @@ -1008,163 +1002,8 @@ StringRef Lexer::getImmediateMacroName(SourceLocation Loc, return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); } -//===----------------------------------------------------------------------===// -// Character information. -//===----------------------------------------------------------------------===// - -enum { - CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' - CHAR_VERT_WS = 0x02, // '\r', '\n' - CHAR_LETTER = 0x04, // a-z,A-Z - CHAR_NUMBER = 0x08, // 0-9 - CHAR_UNDER = 0x10, // _ - CHAR_PERIOD = 0x20, // . - CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"' -}; - -// Statically initialize CharInfo table based on ASCII character set -// Reference: FreeBSD 7.2 /usr/share/misc/ascii -static const unsigned char CharInfo[256] = -{ -// 0 NUL 1 SOH 2 STX 3 ETX -// 4 EOT 5 ENQ 6 ACK 7 BEL - 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , -// 8 BS 9 HT 10 NL 11 VT -//12 NP 13 CR 14 SO 15 SI - 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, - CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , -//16 DLE 17 DC1 18 DC2 19 DC3 -//20 DC4 21 NAK 22 SYN 23 ETB - 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , -//24 CAN 25 EM 26 SUB 27 ESC -//28 FS 29 GS 30 RS 31 US - 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , -//32 SP 33 ! 
34 " 35 # -//36 $ 37 % 38 & 39 ' - CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , - 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , -//40 ( 41 ) 42 * 43 + -//44 , 45 - 46 . 47 / - 0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL , -//48 0 49 1 50 2 51 3 -//52 4 53 5 54 6 55 7 - CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , - CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , -//56 8 57 9 58 : 59 ; -//60 < 61 = 62 > 63 ? - CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , -//64 @ 65 A 66 B 67 C -//68 D 69 E 70 F 71 G - 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//72 H 73 I 74 J 75 K -//76 L 77 M 78 N 79 O - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//80 P 81 Q 82 R 83 S -//84 T 85 U 86 V 87 W - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//88 X 89 Y 90 Z 91 [ -//92 \ 93 ] 94 ^ 95 _ - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , - 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER , -//96 ` 97 a 98 b 99 c -//100 d 101 e 102 f 103 g - 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//104 h 105 i 106 j 107 k -//108 l 109 m 110 n 111 o - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//112 p 113 q 114 r 115 s -//116 t 117 u 118 v 119 w - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//120 x 121 y 122 z 123 { -//124 | 125 } 126 ~ 127 DEL - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0 -}; - -static void InitCharacterInfo() { - static bool isInited = false; - if (isInited) return; - // check the 
statically-initialized CharInfo table - assert(CHAR_HORZ_WS == CharInfo[(int)' ']); - assert(CHAR_HORZ_WS == CharInfo[(int)'\t']); - assert(CHAR_HORZ_WS == CharInfo[(int)'\f']); - assert(CHAR_HORZ_WS == CharInfo[(int)'\v']); - assert(CHAR_VERT_WS == CharInfo[(int)'\n']); - assert(CHAR_VERT_WS == CharInfo[(int)'\r']); - assert(CHAR_UNDER == CharInfo[(int)'_']); - assert(CHAR_PERIOD == CharInfo[(int)'.']); - for (unsigned i = 'a'; i <= 'z'; ++i) { - assert(CHAR_LETTER == CharInfo[i]); - assert(CHAR_LETTER == CharInfo[i+'A'-'a']); - } - for (unsigned i = '0'; i <= '9'; ++i) - assert(CHAR_NUMBER == CharInfo[i]); - - isInited = true; -} - - -/// isIdentifierHead - Return true if this is the first character of an -/// identifier, which is [a-zA-Z_]. -static inline bool isIdentifierHead(unsigned char c) { - return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false; -} - -/// isIdentifierBody - Return true if this is the body character of an -/// identifier, which is [a-zA-Z0-9_]. -static inline bool isIdentifierBody(unsigned char c) { - return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; -} - -/// isHorizontalWhitespace - Return true if this character is horizontal -/// whitespace: ' ', '\\t', '\\f', '\\v'. Note that this returns false for -/// '\\0'. -static inline bool isHorizontalWhitespace(unsigned char c) { - return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; -} - -/// isVerticalWhitespace - Return true if this character is vertical -/// whitespace: '\\n', '\\r'. Note that this returns false for '\\0'. -static inline bool isVerticalWhitespace(unsigned char c) { - return (CharInfo[c] & CHAR_VERT_WS) ? true : false; -} - -/// isWhitespace - Return true if this character is horizontal or vertical -/// whitespace: ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. Note that this returns -/// false for '\\0'. -static inline bool isWhitespace(unsigned char c) { - return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? 
true : false; -} - -/// isNumberBody - Return true if this is the body character of an -/// preprocessing number, which is [a-zA-Z0-9_.]. -static inline bool isNumberBody(unsigned char c) { - return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? - true : false; -} - -/// isRawStringDelimBody - Return true if this is the body character of a -/// raw string delimiter. -static inline bool isRawStringDelimBody(unsigned char c) { - return (CharInfo[c] & - (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ? - true : false; -} - -// Allow external clients to make use of CharInfo. bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { - return isIdentifierBody(c) || (c == '$' && LangOpts.DollarIdents); + return isIdentifierBody(c, LangOpts.DollarIdents); } @@ -1578,10 +1417,6 @@ static bool isAllowedInitiallyIDChar(uint32_t c) { !(0xFE20 <= c && c <= 0xFE2F); } -static inline bool isASCII(char C) { - return static_cast<signed char>(C) >= 0; -} - void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] @@ -1595,8 +1430,8 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { // Fast path, no $,\,? in identifier found. '\' might be an escaped newline // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. // - // TODO: Could merge these checks into a CharInfo flag to make the comparison - // cheaper + // TODO: Could merge these checks into an InfoTable flag to make the + // comparison cheaper if (isASCII(C) && C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) { FinishIdentifier: @@ -1700,7 +1535,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { unsigned Size; char C = getCharAndSize(CurPtr, Size); char PrevCh = 0; - while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix. + while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix. 
CurPtr = ConsumeChar(CurPtr, Size, Result); PrevCh = C; C = getCharAndSize(CurPtr, Size); diff --git a/unittests/Basic/CMakeLists.txt b/unittests/Basic/CMakeLists.txt index 300dcd5cb8..51db6ce9e2 100644 --- a/unittests/Basic/CMakeLists.txt +++ b/unittests/Basic/CMakeLists.txt @@ -1,4 +1,5 @@ add_clang_unittest(BasicTests + CharInfoTest.cpp FileManagerTest.cpp SourceManagerTest.cpp ) diff --git a/unittests/Basic/CharInfoTest.cpp b/unittests/Basic/CharInfoTest.cpp new file mode 100644 index 0000000000..9b3d1b3b5f --- /dev/null +++ b/unittests/Basic/CharInfoTest.cpp @@ -0,0 +1,377 @@ +//===- unittests/Basic/CharInfoTest.cpp -- ASCII classification tests -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/CharInfo.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace clang; + +// Check that the CharInfo table has been constructed reasonably. +TEST(CharInfoTest, validateInfoTable) { + using namespace charinfo; + EXPECT_EQ((unsigned)CHAR_SPACE, InfoTable[(unsigned)' ']); + EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\t']); + EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\f']); // ?? + EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\v']); // ?? 
+ EXPECT_EQ((unsigned)CHAR_VERT_WS, InfoTable[(unsigned)'\n']); + EXPECT_EQ((unsigned)CHAR_VERT_WS, InfoTable[(unsigned)'\r']); + EXPECT_EQ((unsigned)CHAR_UNDER, InfoTable[(unsigned)'_']); + EXPECT_EQ((unsigned)CHAR_PERIOD, InfoTable[(unsigned)'.']); + + for (unsigned i = 'a'; i <= 'f'; ++i) { + EXPECT_EQ((unsigned)CHAR_XLOWER, InfoTable[i]); + EXPECT_EQ((unsigned)CHAR_XUPPER, InfoTable[i+'A'-'a']); + } + + for (unsigned i = 'g'; i <= 'z'; ++i) { + EXPECT_EQ((unsigned)CHAR_LOWER, InfoTable[i]); + EXPECT_EQ((unsigned)CHAR_UPPER, InfoTable[i+'A'-'a']); + } + + for (unsigned i = '0'; i <= '9'; ++i) + EXPECT_EQ((unsigned)CHAR_DIGIT, InfoTable[i]); +} + +// Check various predicates. +TEST(CharInfoTest, isASCII) { + EXPECT_TRUE(isASCII('\0')); + EXPECT_TRUE(isASCII('\n')); + EXPECT_TRUE(isASCII(' ')); + EXPECT_TRUE(isASCII('a')); + EXPECT_TRUE(isASCII('\x7f')); + EXPECT_FALSE(isASCII('\x80')); + EXPECT_FALSE(isASCII('\xc2')); + EXPECT_FALSE(isASCII('\xff')); +} + +TEST(CharInfoTest, isIdentifierHead) { + EXPECT_TRUE(isIdentifierHead('a')); + EXPECT_TRUE(isIdentifierHead('A')); + EXPECT_TRUE(isIdentifierHead('z')); + EXPECT_TRUE(isIdentifierHead('Z')); + EXPECT_TRUE(isIdentifierHead('_')); + + EXPECT_FALSE(isIdentifierHead('0')); + EXPECT_FALSE(isIdentifierHead('.')); + EXPECT_FALSE(isIdentifierHead('`')); + EXPECT_FALSE(isIdentifierHead('\0')); + + EXPECT_FALSE(isIdentifierHead('$')); + EXPECT_TRUE(isIdentifierHead('$', /*AllowDollar=*/true)); + + EXPECT_FALSE(isIdentifierHead('\x80')); + EXPECT_FALSE(isIdentifierHead('\xc2')); + EXPECT_FALSE(isIdentifierHead('\xff')); +} + +TEST(CharInfoTest, isIdentifierBody) { + EXPECT_TRUE(isIdentifierBody('a')); + EXPECT_TRUE(isIdentifierBody('A')); + EXPECT_TRUE(isIdentifierBody('z')); + EXPECT_TRUE(isIdentifierBody('Z')); + EXPECT_TRUE(isIdentifierBody('_')); + + EXPECT_TRUE(isIdentifierBody('0')); + EXPECT_FALSE(isIdentifierBody('.')); + EXPECT_FALSE(isIdentifierBody('`')); + EXPECT_FALSE(isIdentifierBody('\0')); + + 
EXPECT_FALSE(isIdentifierBody('$')); + EXPECT_TRUE(isIdentifierBody('$', /*AllowDollar=*/true)); + + EXPECT_FALSE(isIdentifierBody('\x80')); + EXPECT_FALSE(isIdentifierBody('\xc2')); + EXPECT_FALSE(isIdentifierBody('\xff')); +} + +TEST(CharInfoTest, isHorizontalWhitespace) { + EXPECT_FALSE(isHorizontalWhitespace('a')); + EXPECT_FALSE(isHorizontalWhitespace('_')); + EXPECT_FALSE(isHorizontalWhitespace('0')); + EXPECT_FALSE(isHorizontalWhitespace('.')); + EXPECT_FALSE(isHorizontalWhitespace('`')); + EXPECT_FALSE(isHorizontalWhitespace('\0')); + EXPECT_FALSE(isHorizontalWhitespace('\x7f')); + + EXPECT_TRUE(isHorizontalWhitespace(' ')); + EXPECT_TRUE(isHorizontalWhitespace('\t')); + EXPECT_TRUE(isHorizontalWhitespace('\f')); // ?? + EXPECT_TRUE(isHorizontalWhitespace('\v')); // ?? + + EXPECT_FALSE(isHorizontalWhitespace('\n')); + EXPECT_FALSE(isHorizontalWhitespace('\r')); + + EXPECT_FALSE(isHorizontalWhitespace('\x80')); + EXPECT_FALSE(isHorizontalWhitespace('\xc2')); + EXPECT_FALSE(isHorizontalWhitespace('\xff')); +} + +TEST(CharInfoTest, isVerticalWhitespace) { + EXPECT_FALSE(isVerticalWhitespace('a')); + EXPECT_FALSE(isVerticalWhitespace('_')); + EXPECT_FALSE(isVerticalWhitespace('0')); + EXPECT_FALSE(isVerticalWhitespace('.')); + EXPECT_FALSE(isVerticalWhitespace('`')); + EXPECT_FALSE(isVerticalWhitespace('\0')); + EXPECT_FALSE(isVerticalWhitespace('\x7f')); + + EXPECT_FALSE(isVerticalWhitespace(' ')); + EXPECT_FALSE(isVerticalWhitespace('\t')); + EXPECT_FALSE(isVerticalWhitespace('\f')); // ?? + EXPECT_FALSE(isVerticalWhitespace('\v')); // ?? 
+ + EXPECT_TRUE(isVerticalWhitespace('\n')); + EXPECT_TRUE(isVerticalWhitespace('\r')); + + EXPECT_FALSE(isVerticalWhitespace('\x80')); + EXPECT_FALSE(isVerticalWhitespace('\xc2')); + EXPECT_FALSE(isVerticalWhitespace('\xff')); +} + +TEST(CharInfoTest, isWhitespace) { + EXPECT_FALSE(isWhitespace('a')); + EXPECT_FALSE(isWhitespace('_')); + EXPECT_FALSE(isWhitespace('0')); + EXPECT_FALSE(isWhitespace('.')); + EXPECT_FALSE(isWhitespace('`')); + EXPECT_FALSE(isWhitespace('\0')); + EXPECT_FALSE(isWhitespace('\x7f')); + + EXPECT_TRUE(isWhitespace(' ')); + EXPECT_TRUE(isWhitespace('\t')); + EXPECT_TRUE(isWhitespace('\f')); + EXPECT_TRUE(isWhitespace('\v')); + + EXPECT_TRUE(isWhitespace('\n')); + EXPECT_TRUE(isWhitespace('\r')); + + EXPECT_FALSE(isWhitespace('\x80')); + EXPECT_FALSE(isWhitespace('\xc2')); + EXPECT_FALSE(isWhitespace('\xff')); +} + +TEST(CharInfoTest, isDigit) { + EXPECT_TRUE(isDigit('0')); + EXPECT_TRUE(isDigit('9')); + + EXPECT_FALSE(isDigit('a')); + EXPECT_FALSE(isDigit('A')); + + EXPECT_FALSE(isDigit('z')); + EXPECT_FALSE(isDigit('Z')); + + EXPECT_FALSE(isDigit('.')); + EXPECT_FALSE(isDigit('_')); + + EXPECT_FALSE(isDigit('/')); + EXPECT_FALSE(isDigit('\0')); + + EXPECT_FALSE(isDigit('\x80')); + EXPECT_FALSE(isDigit('\xc2')); + EXPECT_FALSE(isDigit('\xff')); +} + +TEST(CharInfoTest, isHexDigit) { + EXPECT_TRUE(isHexDigit('0')); + EXPECT_TRUE(isHexDigit('9')); + + EXPECT_TRUE(isHexDigit('a')); + EXPECT_TRUE(isHexDigit('A')); + + EXPECT_FALSE(isHexDigit('z')); + EXPECT_FALSE(isHexDigit('Z')); + + EXPECT_FALSE(isHexDigit('.')); + EXPECT_FALSE(isHexDigit('_')); + + EXPECT_FALSE(isHexDigit('/')); + EXPECT_FALSE(isHexDigit('\0')); + + EXPECT_FALSE(isHexDigit('\x80')); + EXPECT_FALSE(isHexDigit('\xc2')); + EXPECT_FALSE(isHexDigit('\xff')); +} + +TEST(CharInfoTest, isLetter) { + EXPECT_FALSE(isLetter('0')); + EXPECT_FALSE(isLetter('9')); + + EXPECT_TRUE(isLetter('a')); + EXPECT_TRUE(isLetter('A')); + + EXPECT_TRUE(isLetter('z')); + EXPECT_TRUE(isLetter('Z')); + 
+ EXPECT_FALSE(isLetter('.')); + EXPECT_FALSE(isLetter('_')); + + EXPECT_FALSE(isLetter('/')); + EXPECT_FALSE(isLetter('(')); + EXPECT_FALSE(isLetter('\0')); + + EXPECT_FALSE(isLetter('\x80')); + EXPECT_FALSE(isLetter('\xc2')); + EXPECT_FALSE(isLetter('\xff')); +} + +TEST(CharInfoTest, isLowercase) { + EXPECT_FALSE(isLowercase('0')); + EXPECT_FALSE(isLowercase('9')); + + EXPECT_TRUE(isLowercase('a')); + EXPECT_FALSE(isLowercase('A')); + + EXPECT_TRUE(isLowercase('z')); + EXPECT_FALSE(isLowercase('Z')); + + EXPECT_FALSE(isLowercase('.')); + EXPECT_FALSE(isLowercase('_')); + + EXPECT_FALSE(isLowercase('/')); + EXPECT_FALSE(isLowercase('(')); + EXPECT_FALSE(isLowercase('\0')); + + EXPECT_FALSE(isLowercase('\x80')); + EXPECT_FALSE(isLowercase('\xc2')); + EXPECT_FALSE(isLowercase('\xff')); +} + +TEST(CharInfoTest, isUppercase) { + EXPECT_FALSE(isUppercase('0')); + EXPECT_FALSE(isUppercase('9')); + + EXPECT_FALSE(isUppercase('a')); + EXPECT_TRUE(isUppercase('A')); + + EXPECT_FALSE(isUppercase('z')); + EXPECT_TRUE(isUppercase('Z')); + + EXPECT_FALSE(isUppercase('.')); + EXPECT_FALSE(isUppercase('_')); + + EXPECT_FALSE(isUppercase('/')); + EXPECT_FALSE(isUppercase('(')); + EXPECT_FALSE(isUppercase('\0')); + + EXPECT_FALSE(isUppercase('\x80')); + EXPECT_FALSE(isUppercase('\xc2')); + EXPECT_FALSE(isUppercase('\xff')); +} + +TEST(CharInfoTest, isAlphanumeric) { + EXPECT_TRUE(isAlphanumeric('0')); + EXPECT_TRUE(isAlphanumeric('9')); + + EXPECT_TRUE(isAlphanumeric('a')); + EXPECT_TRUE(isAlphanumeric('A')); + + EXPECT_TRUE(isAlphanumeric('z')); + EXPECT_TRUE(isAlphanumeric('Z')); + + EXPECT_FALSE(isAlphanumeric('.')); + EXPECT_FALSE(isAlphanumeric('_')); + + EXPECT_FALSE(isAlphanumeric('/')); + EXPECT_FALSE(isAlphanumeric('(')); + EXPECT_FALSE(isAlphanumeric('\0')); + + EXPECT_FALSE(isAlphanumeric('\x80')); + EXPECT_FALSE(isAlphanumeric('\xc2')); + EXPECT_FALSE(isAlphanumeric('\xff')); +} + +TEST(CharInfoTest, isPunctuation) { + EXPECT_FALSE(isPunctuation('0')); + 
EXPECT_FALSE(isPunctuation('9')); + + EXPECT_FALSE(isPunctuation('a')); + EXPECT_FALSE(isPunctuation('A')); + + EXPECT_FALSE(isPunctuation('z')); + EXPECT_FALSE(isPunctuation('Z')); + + EXPECT_TRUE(isPunctuation('.')); + EXPECT_TRUE(isPunctuation('_')); + + EXPECT_TRUE(isPunctuation('/')); + EXPECT_TRUE(isPunctuation('(')); + + EXPECT_FALSE(isPunctuation(' ')); + EXPECT_FALSE(isPunctuation('\n')); + EXPECT_FALSE(isPunctuation('\0')); + + EXPECT_FALSE(isPunctuation('\x80')); + EXPECT_FALSE(isPunctuation('\xc2')); + EXPECT_FALSE(isPunctuation('\xff')); +} + +TEST(CharInfoTest, isPrintable) { + EXPECT_TRUE(isPrintable('0')); + EXPECT_TRUE(isPrintable('9')); + + EXPECT_TRUE(isPrintable('a')); + EXPECT_TRUE(isPrintable('A')); + + EXPECT_TRUE(isPrintable('z')); + EXPECT_TRUE(isPrintable('Z')); + + EXPECT_TRUE(isPrintable('.')); + EXPECT_TRUE(isPrintable('_')); + + EXPECT_TRUE(isPrintable('/')); + EXPECT_TRUE(isPrintable('(')); + + EXPECT_TRUE(isPrintable(' ')); + EXPECT_FALSE(isPrintable('\t')); + EXPECT_FALSE(isPrintable('\n')); + EXPECT_FALSE(isPrintable('\0')); + + EXPECT_FALSE(isPrintable('\x80')); + EXPECT_FALSE(isPrintable('\xc2')); + EXPECT_FALSE(isPrintable('\xff')); +} + +TEST(CharInfoTest, isPreprocessingNumberBody) { + EXPECT_TRUE(isPreprocessingNumberBody('0')); + EXPECT_TRUE(isPreprocessingNumberBody('9')); + + EXPECT_TRUE(isPreprocessingNumberBody('a')); + EXPECT_TRUE(isPreprocessingNumberBody('A')); + + EXPECT_TRUE(isPreprocessingNumberBody('z')); + EXPECT_TRUE(isPreprocessingNumberBody('Z')); + EXPECT_TRUE(isPreprocessingNumberBody('.')); + EXPECT_TRUE(isPreprocessingNumberBody('_')); + + EXPECT_FALSE(isPreprocessingNumberBody('/')); + EXPECT_FALSE(isPreprocessingNumberBody('(')); + EXPECT_FALSE(isPreprocessingNumberBody('\0')); + + EXPECT_FALSE(isPreprocessingNumberBody('\x80')); + EXPECT_FALSE(isPreprocessingNumberBody('\xc2')); + EXPECT_FALSE(isPreprocessingNumberBody('\xff')); +} + +TEST(CharInfoTest, isRawStringDelimBody) { + 
EXPECT_TRUE(isRawStringDelimBody('0')); + EXPECT_TRUE(isRawStringDelimBody('9')); + + EXPECT_TRUE(isRawStringDelimBody('a')); + EXPECT_TRUE(isRawStringDelimBody('A')); + + EXPECT_TRUE(isRawStringDelimBody('z')); + EXPECT_TRUE(isRawStringDelimBody('Z')); + EXPECT_TRUE(isRawStringDelimBody('.')); + EXPECT_TRUE(isRawStringDelimBody('_')); + + EXPECT_TRUE(isRawStringDelimBody('/')); + EXPECT_FALSE(isRawStringDelimBody('(')); + EXPECT_FALSE(isRawStringDelimBody('\0')); +} |