Pull Lexer's CharInfo table out for general use throughout Clang.

Rewriting the same predicates over and over again is bad for code size and code maintainence. Using the functions in <ctype.h> is generally unsafe unless they are specified to be locale-independent (i.e. only isdigit and isxdigit). The next commit will try to clean up uses of <ctype.h> functions within Clang. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@174765 91177308-0d34-0410-b5e6-96231b3b80d8
author: Jordan Rose <jordan_rose@apple.com> 2013-02-08 22:30:22 +0000
committer: Jordan Rose <jordan_rose@apple.com> 2013-02-08 22:30:22 +0000
commit: 9893902eceba7f01dd1521349d33866f77254d78 (patch)
tree: 663def75b1c867719f47927560893f869a761668 /lib/Lex
parent: e22cef5cb2e460bae88563cfc5fcf98d742d6215 (diff)
1 files changed, 5 insertions, 170 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 1b064c88ff..6aae4e17fa 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -25,6 +25,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Lex/Lexer.h"
+#include "clang/Basic/CharInfo.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/CodeCompletionHandler.h"
 #include "clang/Lex/LexDiagnostic.h"
@@ -38,8 +39,6 @@
 #include <cstring>
 using namespace clang;
 
-static void InitCharacterInfo();
-
 //===----------------------------------------------------------------------===//
 // Token Class Implementation
 //===----------------------------------------------------------------------===//
@@ -66,8 +65,6 @@ void Lexer::anchor() { }
 
 void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                       const char *BufEnd) {
-  InitCharacterInfo();
-
   BufferStart = BufStart;
   BufferPtr = BufPtr;
   BufferEnd = BufEnd;
@@ -408,9 +405,6 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
 }
 
 
-
-static bool isWhitespace(unsigned char c);
-
 /// MeasureTokenLength - Relex the token at the specified location and return
 /// its length in bytes in the input file.  If the token needs cleaning (e.g.
 /// includes a trigraph or an escaped newline) then this count includes bytes
@@ -1008,163 +1002,8 @@ StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
   return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
 }
 
-//===----------------------------------------------------------------------===//
-// Character information.
-//===----------------------------------------------------------------------===//
-
-enum {
-  CHAR_HORZ_WS  = 0x01,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
-  CHAR_VERT_WS  = 0x02,  // '\r', '\n'
-  CHAR_LETTER   = 0x04,  // a-z,A-Z
-  CHAR_NUMBER   = 0x08,  // 0-9
-  CHAR_UNDER    = 0x10,  // _
-  CHAR_PERIOD   = 0x20,  // .
-  CHAR_RAWDEL   = 0x40   // {}[]#<>%:;?*+-/^&|~!=,"'
-};
-
-// Statically initialize CharInfo table based on ASCII character set
-// Reference: FreeBSD 7.2 /usr/share/misc/ascii
-static const unsigned char CharInfo[256] =
-{
-// 0 NUL         1 SOH         2 STX         3 ETX
-// 4 EOT         5 ENQ         6 ACK         7 BEL
-   0           , 0           , 0           , 0           ,
-   0           , 0           , 0           , 0           ,
-// 8 BS          9 HT         10 NL         11 VT
-//12 NP         13 CR         14 SO         15 SI
-   0           , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
-   CHAR_HORZ_WS, CHAR_VERT_WS, 0           , 0           ,
-//16 DLE        17 DC1        18 DC2        19 DC3
-//20 DC4        21 NAK        22 SYN        23 ETB
-   0           , 0           , 0           , 0           ,
-   0           , 0           , 0           , 0           ,
-//24 CAN        25 EM         26 SUB        27 ESC
-//28 FS         29 GS         30 RS         31 US
-   0           , 0           , 0           , 0           ,
-   0           , 0           , 0           , 0           ,
-//32 SP         33  !         34  "         35  #
-//36  $         37  %         38  &         39  '
-   CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-//40  (         41  )         42  *         43  +
-//44  ,         45  -         46  .         47  /
-   0           , 0           , CHAR_RAWDEL , CHAR_RAWDEL ,
-   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
-//48  0         49  1         50  2         51  3
-//52  4         53  5         54  6         55  7
-   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
-   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
-//56  8         57  9         58  :         59  ;
-//60  <         61  =         62  >         63  ?
-   CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
-   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-//64  @         65  A         66  B         67  C
-//68  D         69  E         70  F         71  G
-   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//72  H         73  I         74  J         75  K
-//76  L         77  M         78  N         79  O
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//80  P         81  Q         82  R         83  S
-//84  T         85  U         86  V         87  W
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//88  X         89  Y         90  Z         91  [
-//92  \         93  ]         94  ^         95  _
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
-   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER  ,
-//96  `         97  a         98  b         99  c
-//100  d       101  e        102  f        103  g
-   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//104  h       105  i        106  j        107  k
-//108  l       109  m        110  n        111  o
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//112  p       113  q        114  r        115  s
-//116  t       117  u        118  v        119  w
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
-//120  x       121  y        122  z        123  {
-//124  |       125  }        126  ~        127 DEL
-   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
-   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
-};
-
-static void InitCharacterInfo() {
-  static bool isInited = false;
-  if (isInited) return;
-  // check the statically-initialized CharInfo table
-  assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
-  assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
-  assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
-  assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
-  assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
-  assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
-  assert(CHAR_UNDER   == CharInfo[(int)'_']);
-  assert(CHAR_PERIOD  == CharInfo[(int)'.']);
-  for (unsigned i = 'a'; i <= 'z'; ++i) {
-    assert(CHAR_LETTER == CharInfo[i]);
-    assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
-  }
-  for (unsigned i = '0'; i <= '9'; ++i)
-    assert(CHAR_NUMBER == CharInfo[i]);
-    
-  isInited = true;
-}
-
-
-/// isIdentifierHead - Return true if this is the first character of an
-/// identifier, which is [a-zA-Z_].
-static inline bool isIdentifierHead(unsigned char c) {
-  return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false;
-}
-
-/// isIdentifierBody - Return true if this is the body character of an
-/// identifier, which is [a-zA-Z0-9_].
-static inline bool isIdentifierBody(unsigned char c) {
-  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
-}
-
-/// isHorizontalWhitespace - Return true if this character is horizontal
-/// whitespace: ' ', '\\t', '\\f', '\\v'.  Note that this returns false for
-/// '\\0'.
-static inline bool isHorizontalWhitespace(unsigned char c) {
-  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
-}
-
-/// isVerticalWhitespace - Return true if this character is vertical
-/// whitespace: '\\n', '\\r'.  Note that this returns false for '\\0'.
-static inline bool isVerticalWhitespace(unsigned char c) {
-  return (CharInfo[c] & CHAR_VERT_WS) ? true : false;
-}
-
-/// isWhitespace - Return true if this character is horizontal or vertical
-/// whitespace: ' ', '\\t', '\\f', '\\v', '\\n', '\\r'.  Note that this returns
-/// false for '\\0'.
-static inline bool isWhitespace(unsigned char c) {
-  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
-}
-
-/// isNumberBody - Return true if this is the body character of an
-/// preprocessing number, which is [a-zA-Z0-9_.].
-static inline bool isNumberBody(unsigned char c) {
-  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
-    true : false;
-}
-
-/// isRawStringDelimBody - Return true if this is the body character of a
-/// raw string delimiter.
-static inline bool isRawStringDelimBody(unsigned char c) {
-  return (CharInfo[c] &
-          (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
-    true : false;
-}
-
-// Allow external clients to make use of CharInfo.
 bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
-  return isIdentifierBody(c) || (c == '$' && LangOpts.DollarIdents);
+  return isIdentifierBody(c, LangOpts.DollarIdents);
 }
 
 
@@ -1578,10 +1417,6 @@ static bool isAllowedInitiallyIDChar(uint32_t c) {
          !(0xFE20 <= c && c <= 0xFE2F);
 }
 
-static inline bool isASCII(char C) {
-  return static_cast<signed char>(C) >= 0;
-}
-
 
 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
   // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@@ -1595,8 +1430,8 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
   // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
   // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
   //
-  // TODO: Could merge these checks into a CharInfo flag to make the comparison
-  // cheaper
+  // TODO: Could merge these checks into an InfoTable flag to make the
+  // comparison cheaper
   if (isASCII(C) && C != '\\' && C != '?' &&
       (C != '$' || !LangOpts.DollarIdents)) {
 FinishIdentifier:
@@ -1700,7 +1535,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
   unsigned Size;
   char C = getCharAndSize(CurPtr, Size);
   char PrevCh = 0;
-  while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+  while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
     CurPtr = ConsumeChar(CurPtr, Size, Result);
     PrevCh = C;
     C = getCharAndSize(CurPtr, Size);
author	Jordan Rose <jordan_rose@apple.com>	2013-02-08 22:30:22 +0000
committer	Jordan Rose <jordan_rose@apple.com>	2013-02-08 22:30:22 +0000
commit	9893902eceba7f01dd1521349d33866f77254d78 (patch)
tree	663def75b1c867719f47927560893f869a761668 /lib/Lex
parent	e22cef5cb2e460bae88563cfc5fcf98d742d6215 (diff)