aboutsummaryrefslogtreecommitdiff
path: root/lib/Lex
diff options
context:
space:
mode:
authorJordan Rose <jordan_rose@apple.com>2013-02-08 22:30:22 +0000
committerJordan Rose <jordan_rose@apple.com>2013-02-08 22:30:22 +0000
commit9893902eceba7f01dd1521349d33866f77254d78 (patch)
tree663def75b1c867719f47927560893f869a761668 /lib/Lex
parente22cef5cb2e460bae88563cfc5fcf98d742d6215 (diff)
Pull Lexer's CharInfo table out for general use throughout Clang.
Rewriting the same predicates over and over again is bad for code size and code maintainence. Using the functions in <ctype.h> is generally unsafe unless they are specified to be locale-independent (i.e. only isdigit and isxdigit). The next commit will try to clean up uses of <ctype.h> functions within Clang. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@174765 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Lex')
-rw-r--r--lib/Lex/Lexer.cpp175
1 files changed, 5 insertions, 170 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 1b064c88ff..6aae4e17fa 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -25,6 +25,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Lex/Lexer.h"
+#include "clang/Basic/CharInfo.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/CodeCompletionHandler.h"
#include "clang/Lex/LexDiagnostic.h"
@@ -38,8 +39,6 @@
#include <cstring>
using namespace clang;
-static void InitCharacterInfo();
-
//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//
@@ -66,8 +65,6 @@ void Lexer::anchor() { }
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
const char *BufEnd) {
- InitCharacterInfo();
-
BufferStart = BufStart;
BufferPtr = BufPtr;
BufferEnd = BufEnd;
@@ -408,9 +405,6 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
}
-
-static bool isWhitespace(unsigned char c);
-
/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
@@ -1008,163 +1002,8 @@ StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
-//===----------------------------------------------------------------------===//
-// Character information.
-//===----------------------------------------------------------------------===//
-
-enum {
- CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0'
- CHAR_VERT_WS = 0x02, // '\r', '\n'
- CHAR_LETTER = 0x04, // a-z,A-Z
- CHAR_NUMBER = 0x08, // 0-9
- CHAR_UNDER = 0x10, // _
- CHAR_PERIOD = 0x20, // .
- CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"'
-};
-
-// Statically initialize CharInfo table based on ASCII character set
-// Reference: FreeBSD 7.2 /usr/share/misc/ascii
-static const unsigned char CharInfo[256] =
-{
-// 0 NUL 1 SOH 2 STX 3 ETX
-// 4 EOT 5 ENQ 6 ACK 7 BEL
- 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
-// 8 BS 9 HT 10 NL 11 VT
-//12 NP 13 CR 14 SO 15 SI
- 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
- CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 ,
-//16 DLE 17 DC1 18 DC2 19 DC3
-//20 DC4 21 NAK 22 SYN 23 ETB
- 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
-//24 CAN 25 EM 26 SUB 27 ESC
-//28 FS 29 GS 30 RS 31 US
- 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 ,
-//32 SP 33 ! 34 " 35 #
-//36 $ 37 % 38 & 39 '
- CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
- 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-//40 ( 41 ) 42 * 43 +
-//44 , 45 - 46 . 47 /
- 0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL ,
- CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
-//48 0 49 1 50 2 51 3
-//52 4 53 5 54 6 55 7
- CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
- CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
-//56 8 57 9 58 : 59 ;
-//60 < 61 = 62 > 63 ?
- CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
- CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-//64 @ 65 A 66 B 67 C
-//68 D 69 E 70 F 71 G
- 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//72 H 73 I 74 J 75 K
-//76 L 77 M 78 N 79 O
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//80 P 81 Q 82 R 83 S
-//84 T 85 U 86 V 87 W
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//88 X 89 Y 90 Z 91 [
-//92 \ 93 ] 94 ^ 95 _
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
- 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER ,
-//96 ` 97 a 98 b 99 c
-//100 d 101 e 102 f 103 g
- 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//104 h 105 i 106 j 107 k
-//108 l 109 m 110 n 111 o
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//112 p 113 q 114 r 115 s
-//116 t 117 u 118 v 119 w
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//120 x 121 y 122 z 123 {
-//124 | 125 } 126 ~ 127 DEL
- CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
- CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
-};
-
-static void InitCharacterInfo() {
- static bool isInited = false;
- if (isInited) return;
- // check the statically-initialized CharInfo table
- assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
- assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
- assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
- assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
- assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
- assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
- assert(CHAR_UNDER == CharInfo[(int)'_']);
- assert(CHAR_PERIOD == CharInfo[(int)'.']);
- for (unsigned i = 'a'; i <= 'z'; ++i) {
- assert(CHAR_LETTER == CharInfo[i]);
- assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
- }
- for (unsigned i = '0'; i <= '9'; ++i)
- assert(CHAR_NUMBER == CharInfo[i]);
-
- isInited = true;
-}
-
-
-/// isIdentifierHead - Return true if this is the first character of an
-/// identifier, which is [a-zA-Z_].
-static inline bool isIdentifierHead(unsigned char c) {
- return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false;
-}
-
-/// isIdentifierBody - Return true if this is the body character of an
-/// identifier, which is [a-zA-Z0-9_].
-static inline bool isIdentifierBody(unsigned char c) {
- return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
-}
-
-/// isHorizontalWhitespace - Return true if this character is horizontal
-/// whitespace: ' ', '\\t', '\\f', '\\v'. Note that this returns false for
-/// '\\0'.
-static inline bool isHorizontalWhitespace(unsigned char c) {
- return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
-}
-
-/// isVerticalWhitespace - Return true if this character is vertical
-/// whitespace: '\\n', '\\r'. Note that this returns false for '\\0'.
-static inline bool isVerticalWhitespace(unsigned char c) {
- return (CharInfo[c] & CHAR_VERT_WS) ? true : false;
-}
-
-/// isWhitespace - Return true if this character is horizontal or vertical
-/// whitespace: ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. Note that this returns
-/// false for '\\0'.
-static inline bool isWhitespace(unsigned char c) {
- return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
-}
-
-/// isNumberBody - Return true if this is the body character of an
-/// preprocessing number, which is [a-zA-Z0-9_.].
-static inline bool isNumberBody(unsigned char c) {
- return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
- true : false;
-}
-
-/// isRawStringDelimBody - Return true if this is the body character of a
-/// raw string delimiter.
-static inline bool isRawStringDelimBody(unsigned char c) {
- return (CharInfo[c] &
- (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
- true : false;
-}
-
-// Allow external clients to make use of CharInfo.
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
- return isIdentifierBody(c) || (c == '$' && LangOpts.DollarIdents);
+ return isIdentifierBody(c, LangOpts.DollarIdents);
}
@@ -1578,10 +1417,6 @@ static bool isAllowedInitiallyIDChar(uint32_t c) {
!(0xFE20 <= c && c <= 0xFE2F);
}
-static inline bool isASCII(char C) {
- return static_cast<signed char>(C) >= 0;
-}
-
void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@@ -1595,8 +1430,8 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Fast path, no $,\,? in identifier found. '\' might be an escaped newline
// or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
//
- // TODO: Could merge these checks into a CharInfo flag to make the comparison
- // cheaper
+ // TODO: Could merge these checks into an InfoTable flag to make the
+ // comparison cheaper
if (isASCII(C) && C != '\\' && C != '?' &&
(C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
@@ -1700,7 +1535,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
unsigned Size;
char C = getCharAndSize(CurPtr, Size);
char PrevCh = 0;
- while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+ while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
CurPtr = ConsumeChar(CurPtr, Size, Result);
PrevCh = C;
C = getCharAndSize(CurPtr, Size);