Stage two of getting CFE top correct.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@39734 91177308-0d34-0410-b5e6-96231b3b80d8
author: Reid Spencer <rspencer@reidspencer.com> 2007-07-11 17:01:13 +0000
committer: Reid Spencer <rspencer@reidspencer.com> 2007-07-11 17:01:13 +0000
commit: 5f016e2cb5d11daeb237544de1c5d59f20fe1a6e (patch)
tree: 8b6bfcb8783d16827f896d5facbd4549300e8a1e /Lex
parent: a5f182095bf2065ca94f1c86957ee91f9068964b (diff)
11 files changed, 6574 insertions, 0 deletions
diff --git a/Lex/HeaderSearch.cpp b/Lex/HeaderSearch.cpp
new file mode 100644
index 0000000000..520205e1da
--- /dev/null
+++ b/Lex/HeaderSearch.cpp
@@ -0,0 +1,319 @@
+//===--- HeaderSearch.cpp - Resolve Header File Locations ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements the DirectoryLookup and HeaderSearch interfaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/FileManager.h"
+#include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/IdentifierTable.h"
+#include "llvm/System/Path.h"
+#include "llvm/ADT/SmallString.h"
+using namespace clang;
+
+HeaderSearch::HeaderSearch(FileManager &FM) : FileMgr(FM), FrameworkMap(64) {
+  SystemDirIdx = 0;        // SearchDirs index where <angled> lookups begin.
+  NoCurDirSearch = false;  // By default, try the includer's directory first.
+  // Zero the statistics counters that PrintStats() reports.
+  NumIncluded = 0;
+  NumMultiIncludeFileOptzn = 0;
+  NumFrameworkLookups = NumSubFrameworkLookups = 0;
+}
+
+void HeaderSearch::PrintStats() {  // Dump lookup statistics to stderr.
+  fprintf(stderr, "\n*** HeaderSearch Stats:\n");
+  fprintf(stderr, "%d files tracked.\n", (int)FileInfo.size());
+  unsigned NumOnceOnlyFiles = 0, MaxNumIncludes = 0, NumSingleIncludedFiles = 0;
+  for (unsigned i = 0, e = FileInfo.size(); i != e; ++i) {
+    NumOnceOnlyFiles += FileInfo[i].isImport;
+    if (MaxNumIncludes < FileInfo[i].NumIncludes)
+      MaxNumIncludes = FileInfo[i].NumIncludes;
+    NumSingleIncludedFiles += FileInfo[i].NumIncludes == 1;  // Adds 0 or 1.
+  }  // NOTE(review): the unsigned counters above are printed with %d below.
+  fprintf(stderr, "  %d #import/#pragma once files.\n", NumOnceOnlyFiles);
+  fprintf(stderr, "  %d included exactly once.\n", NumSingleIncludedFiles);
+  fprintf(stderr, "  %d max times a file is included.\n", MaxNumIncludes);
+  // Totals over every directive counted by ShouldEnterIncludeFile().
+  fprintf(stderr, "  %d #include/#include_next/#import.\n", NumIncluded);
+  fprintf(stderr, "    %d #includes skipped due to"
+          " the multi-include optimization.\n", NumMultiIncludeFileOptzn);
+  // These count only lookups that missed the FrameworkMap cache.
+  fprintf(stderr, "%d framework lookups.\n", NumFrameworkLookups);
+  fprintf(stderr, "%d subframework lookups.\n", NumSubFrameworkLookups);
+}
+
+//===----------------------------------------------------------------------===//
+// Header File Location.
+//===----------------------------------------------------------------------===//
+
+const FileEntry *HeaderSearch::DoFrameworkLookup(const DirectoryEntry *Dir,
+                                                 const char *FilenameStart,
+                                                 const char *FilenameEnd) {
+  // Framework includes look like "Cocoa/Cocoa.h": the name must contain a '/'.
+  const char *SlashPos = std::find(FilenameStart, FilenameEnd, '/');
+  if (SlashPos == FilenameEnd) return 0;
+  // Find or create the cache entry keyed by the framework name (e.g. "Cocoa").
+  llvm::StringMapEntry<const DirectoryEntry *> &CacheLookup =
+    FrameworkMap.GetOrCreateValue(FilenameStart, SlashPos);
+  
+  // If this framework was already found in some other directory, fail.
+  if (CacheLookup.getValue() && CacheLookup.getValue() != Dir)
+    return 0;
+
+  // FrameworkName = "/System/Library/Frameworks/"
+  llvm::SmallString<1024> FrameworkName;
+  FrameworkName += Dir->getName();
+  if (FrameworkName.empty() || FrameworkName.back() != '/')
+    FrameworkName.push_back('/');
+  
+  // FrameworkName = "/System/Library/Frameworks/Cocoa"
+  FrameworkName.append(FilenameStart, SlashPos);
+  
+  // FrameworkName = "/System/Library/Frameworks/Cocoa.framework/"
+  FrameworkName += ".framework/";
+ 
+  if (CacheLookup.getValue() == 0) {  // Not cached yet: check the disk.
+    ++NumFrameworkLookups;
+    
+    // If the framework dir doesn't exist, we fail.
+    if (!llvm::sys::Path(std::string(FrameworkName.begin(), 
+                                     FrameworkName.end())).exists())
+      return 0;
+    
+    // Otherwise, if it does, remember that this is the right direntry for this
+    // framework.
+    CacheLookup.setValue(Dir);
+  }
+  
+  // Check "/System/Library/Frameworks/Cocoa.framework/Headers/file.h"
+  unsigned OrigSize = FrameworkName.size();  // Length through ".framework/".
+  
+  FrameworkName += "Headers/";
+  FrameworkName.append(SlashPos+1, FilenameEnd);
+  if (const FileEntry *FE = FileMgr.getFile(FrameworkName.begin(),
+                                            FrameworkName.end())) {
+    return FE;
+  }
+  // Not in Headers/: insert "Private" at OrigSize to form the path below.
+  // Check "/System/Library/Frameworks/Cocoa.framework/PrivateHeaders/file.h"
+  const char *Private = "Private";
+  FrameworkName.insert(FrameworkName.begin()+OrigSize, Private, 
+                       Private+strlen(Private));
+  return FileMgr.getFile(FrameworkName.begin(), FrameworkName.end());
+}
+
+/// LookupFile - Given a "foo" or <foo> reference, look up the indicated file,
+/// returning null on failure.  isAngled is true for <> references.  CurFileEnt,
+/// if non-null, is the #including file; its directory is tried first for ""
+/// references.  FromDir, if non-null, is the SearchDirs entry to start from.
+/// CurDir receives the matching search dir or null (untouched on includer hit).
+const FileEntry *HeaderSearch::LookupFile(const char *FilenameStart,
+                                          const char *FilenameEnd, 
+                                          bool isAngled,
+                                          const DirectoryLookup *FromDir,
+                                          const DirectoryLookup *&CurDir,
+                                          const FileEntry *CurFileEnt) {
+  // If 'Filename' is absolute, check to see if it exists and do no searching.
+  // FIXME: Portability.  This should be a sys::Path interface, this doesn't
+  // handle things like C:\foo.txt right, nor win32 \\network\device\blah.
+  if (FilenameStart[0] == '/') {
+    CurDir = 0;
+
+    // If this was an #include_next "/absolute/file", fail.
+    if (FromDir) return 0;
+    
+    // Otherwise, just return the file.
+    return FileMgr.getFile(FilenameStart, FilenameEnd);
+  }
+  
+  llvm::SmallString<1024> TmpDir;
+  
+  // Step #0, unless disabled, check to see if the file is in the #includer's
+  // directory.  This search is not done for <> headers.
+  if (CurFileEnt && !isAngled && !NoCurDirSearch) {
+    // Concatenate the requested file onto the directory.
+    // FIXME: Portability.  Filename concatenation should be in sys::Path.
+    TmpDir += CurFileEnt->getDir()->getName();
+    TmpDir.push_back('/');
+    TmpDir.append(FilenameStart, FilenameEnd);
+    if (const FileEntry *FE = FileMgr.getFile(TmpDir.begin(), TmpDir.end())) {
+      // Leave CurDir unset.
+      
+      // This file is a system header or C++ unfriendly if the old file is.
+      getFileInfo(FE).DirInfo = getFileInfo(CurFileEnt).DirInfo;
+      return FE;
+    }
+    TmpDir.clear();
+  }
+  
+  CurDir = 0;
+
+  // If this is a system #include, ignore the user #include locs.
+  unsigned i = isAngled ? SystemDirIdx : 0;
+  
+  // If this is a #include_next request, start searching at FromDir itself.
+  // NOTE(review): presumably the caller passes the entry after the current one.
+  if (FromDir)
+    i = FromDir-&SearchDirs[0];
+  
+  // Check each directory in sequence to see if it contains this file.
+  for (; i != SearchDirs.size(); ++i) {
+    const FileEntry *FE = 0;
+    if (!SearchDirs[i].isFramework()) {
+      // FIXME: Portability.  Adding file to dir should be in sys::Path.
+      // Concatenate the requested file onto the directory.
+      TmpDir.clear();
+      TmpDir += SearchDirs[i].getDir()->getName();
+      TmpDir.push_back('/');
+      TmpDir.append(FilenameStart, FilenameEnd);
+      FE = FileMgr.getFile(TmpDir.begin(), TmpDir.end());
+    } else {
+      FE = DoFrameworkLookup(SearchDirs[i].getDir(), FilenameStart,FilenameEnd);
+    }
+    
+    if (FE) {
+      CurDir = &SearchDirs[i];
+      
+      // This file is a system header or C++ unfriendly if the dir is.
+      getFileInfo(FE).DirInfo = CurDir->getDirCharacteristic();
+      return FE;
+    }
+  }
+  
+  // Otherwise, didn't find it.
+  return 0;
+}
+
+/// LookupSubframeworkHeader - Look up a subframework for the specified
+/// #include file.  For example, if #include'ing <HIToolbox/HIToolbox.h> from
+/// within ".../Carbon.framework/Headers/Carbon.h", check to see if HIToolbox
+/// is a subframework within Carbon.framework.  If so, return the FileEntry
+/// for the designated file, otherwise return null.
+const FileEntry *HeaderSearch::
+LookupSubframeworkHeader(const char *FilenameStart,
+                         const char *FilenameEnd,
+                         const FileEntry *ContextFileEnt) {
+  // Framework names must have a '/' in the filename.  Find it.
+  const char *SlashPos = std::find(FilenameStart, FilenameEnd, '/');
+  if (SlashPos == FilenameEnd) return 0;
+  
+  // Look up the base framework name of the ContextFileEnt.
+  const char *ContextName = ContextFileEnt->getName();
+    
+  // If the context file is not inside a framework, this can't be a subframework.
+  const char *FrameworkPos = strstr(ContextName, ".framework/");
+  if (FrameworkPos == 0)
+    return 0;
+  // Copy the context path up to and including its first ".framework/".
+  llvm::SmallString<1024> FrameworkName(ContextName, 
+                                        FrameworkPos+strlen(".framework/"));
+
+  // Append Frameworks/HIToolbox.framework/
+  FrameworkName += "Frameworks/";
+  FrameworkName.append(FilenameStart, SlashPos);
+  FrameworkName += ".framework/";
+
+  llvm::StringMapEntry<const DirectoryEntry *> &CacheLookup =
+    FrameworkMap.GetOrCreateValue(FilenameStart, SlashPos);
+  // Some other location?  NOTE(review): this appears to compare the map key
+  // (the bare framework name) with the full FrameworkName path -- confirm.
+  if (CacheLookup.getValue() &&
+      CacheLookup.getKeyLength() == FrameworkName.size() &&
+      memcmp(CacheLookup.getKeyData(), &FrameworkName[0],
+             CacheLookup.getKeyLength()) != 0)
+    return 0;
+  
+  // Cache subframework.
+  if (CacheLookup.getValue() == 0) {
+    ++NumSubFrameworkLookups;
+    
+    // If the framework dir doesn't exist, we fail.
+    const DirectoryEntry *Dir = FileMgr.getDirectory(FrameworkName.begin(),
+                                                     FrameworkName.end());
+    if (Dir == 0) return 0;
+    
+    // Otherwise, if it does, remember that this is the right direntry for this
+    // framework.
+    CacheLookup.setValue(Dir);
+  }
+  
+  const FileEntry *FE = 0;
+
+  // Check ".../Frameworks/HIToolbox.framework/Headers/HIToolbox.h"
+  llvm::SmallString<1024> HeadersFilename(FrameworkName);
+  HeadersFilename += "Headers/";
+  HeadersFilename.append(SlashPos+1, FilenameEnd);
+  if (!(FE = FileMgr.getFile(HeadersFilename.begin(),
+                             HeadersFilename.end()))) {
+    
+    // Check ".../Frameworks/HIToolbox.framework/PrivateHeaders/HIToolbox.h"
+    HeadersFilename = FrameworkName;
+    HeadersFilename += "PrivateHeaders/";
+    HeadersFilename.append(SlashPos+1, FilenameEnd);
+    if (!(FE = FileMgr.getFile(HeadersFilename.begin(), HeadersFilename.end())))
+      return 0;
+  }
+  
+  // This file is a system header or C++ unfriendly if the old file is.
+  getFileInfo(FE).DirInfo = getFileInfo(ContextFileEnt).DirInfo;
+  return FE;
+}
+
+//===----------------------------------------------------------------------===//
+// File Info Management.
+//===----------------------------------------------------------------------===//
+
+
+/// getFileInfo - Return the PerFileInfo structure for the specified
+/// FileEntry, growing the UID-indexed FileInfo table first if necessary.
+HeaderSearch::PerFileInfo &HeaderSearch::getFileInfo(const FileEntry *FE) {
+  if (FE->getUID() >= FileInfo.size())
+    FileInfo.resize(FE->getUID()+1);
+  return FileInfo[FE->getUID()];
+}  
+
+/// ShouldEnterIncludeFile - Mark the specified file as a target of a
+/// #include, #include_next, or #import directive.  Return false if #including
+/// the file will have no effect or true if we should include it.
+bool HeaderSearch::ShouldEnterIncludeFile(const FileEntry *File, bool isImport){
+  ++NumIncluded; // Count # of attempted #includes.
+
+  // Get information about this file.
+  PerFileInfo &FileInfo = getFileInfo(File);  // Shadows the FileInfo member.
+  
+  // If this is a #import directive, check that we have not already imported
+  // this header.
+  if (isImport) {
+    // Remember the #import so that later includes of this file are skipped.
+    FileInfo.isImport = true;
+    
+    // Has this already been #import'ed or #include'd?
+    if (FileInfo.NumIncludes) return false;
+  } else {
+    // Otherwise, if this is a #include of a file that was previously #import'd
+    // or if this is the second #include of a #pragma once file, ignore it.
+    if (FileInfo.isImport)
+      return false;
+  }
+  
+  // Next, check to see if the file is wrapped with #ifndef guards.  If so, and
+  // if the macro that guards it is defined, we know the #include has no effect.
+  if (FileInfo.ControllingMacro && FileInfo.ControllingMacro->getMacroInfo()) {
+    ++NumMultiIncludeFileOptzn;
+    return false;
+  }
+  
+  // Increment the number of times this file has been included.
+  ++FileInfo.NumIncludes;
+  
+  return true;
+}
+
+
diff --git a/Lex/IdentifierTable.cpp b/Lex/IdentifierTable.cpp
new file mode 100644
index 0000000000..e671af9839
--- /dev/null
+++ b/Lex/IdentifierTable.cpp
@@ -0,0 +1,188 @@
+//===--- IdentifierTable.cpp - Hash table for identifier lookup -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IdentifierInfo, IdentifierVisitor, and
+// IdentifierTable interfaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/IdentifierTable.h"
+#include "clang/Lex/MacroInfo.h"
+#include "clang/Basic/LangOptions.h"
+using namespace clang;
+
+//===----------------------------------------------------------------------===//
+// IdentifierInfo Implementation
+//===----------------------------------------------------------------------===//
+
+IdentifierInfo::IdentifierInfo() {  // Start as a plain, non-keyword identifier.
+  Macro = 0;                        // Nothing for the destructor to delete yet.
+  TokenID = tok::identifier;        // Until retagged through setTokenID().
+  PPID = tok::pp_not_keyword;
+  ObjCID = tok::objc_not_keyword;
+  BuiltinID = 0;
+  IsExtension = false;
+  IsPoisoned = false;
+  IsOtherTargetMacro = false;
+  IsCPPOperatorKeyword = false;
+  FETokenInfo = 0;
+}
+
+IdentifierInfo::~IdentifierInfo() {
+  delete Macro;  // Macro starts out null, in which case this is a no-op.
+}
+
+//===----------------------------------------------------------------------===//
+// IdentifierTable Implementation
+//===----------------------------------------------------------------------===//
+
+IdentifierTable::IdentifierTable(const LangOptions &LangOpts)
+  // Start with space for 8K identifiers.
+  : HashTable(8192) {
+
+  // Populate the identifier table with info about keywords for the current
+  // language.
+  AddKeywords(LangOpts);  // Also registers preprocessor and ObjC @keywords.
+}
+
+//===----------------------------------------------------------------------===//
+// Language Keyword Implementation
+//===----------------------------------------------------------------------===//
+
+/// AddKeyword - This method is used to associate a token ID with specific
+/// identifiers because they are language keywords.  This causes the lexer to
+/// automatically map matching identifiers to specialized token codes.
+///
+/// The C90/C99/CXX flags are set to 0 if the token should be enabled in the
+/// specified language, set to 1 if it is an extension in the specified
+/// language, and set to 2 if disabled in the specified language.
+static void AddKeyword(const char *Keyword, unsigned KWLen,
+                       tok::TokenKind TokenCode,
+                       int C90, int C99, int CXX,
+                       const LangOptions &LangOpts, IdentifierTable &Table) {
+  int Flags = LangOpts.CPlusPlus ? CXX : (LangOpts.C99 ? C99 : C90);
+  
+  // Don't add this keyword if disabled in this language or if an extension
+  // and extensions are disabled.
+  if (Flags + LangOpts.NoExtensions >= 2) return;  // 2+0, 2+1 and 1+1 bail.
+  
+  IdentifierInfo &Info = Table.get(Keyword, Keyword+KWLen);
+  Info.setTokenID(TokenCode);
+  Info.setIsExtensionToken(Flags == 1);
+}
+
+static void AddAlias(const char *Keyword, unsigned KWLen,  // Alias spelling.
+                     const char *AliaseeKeyword, unsigned AliaseeKWLen,
+                     const LangOptions &LangOpts, IdentifierTable &Table) {
+  IdentifierInfo &AliasInfo = Table.get(Keyword, Keyword+KWLen);
+  IdentifierInfo &AliaseeInfo = Table.get(AliaseeKeyword,
+                                          AliaseeKeyword+AliaseeKWLen);
+  AliasInfo.setTokenID(AliaseeInfo.getTokenID());  // Copy the aliasee's kind
+  AliasInfo.setIsExtensionToken(AliaseeInfo.isExtensionToken());  // and flag.
+}  // Note: LangOpts is not used by this function.
+
+/// AddPPKeyword - Register a preprocessor keyword like "define", "undef" or
+/// "elif" by storing PPID on the identifier entry for Name.
+static void AddPPKeyword(tok::PPKeywordKind PPID, 
+                         const char *Name, unsigned NameLen,
+                         IdentifierTable &Table) {
+  Table.get(Name, Name+NameLen).setPPKeywordID(PPID);
+}
+
+/// AddCXXOperatorKeyword - Register a C++ operator keyword (an alternative
+/// representation): set its token kind and flag it as an operator keyword.
+static void AddCXXOperatorKeyword(const char *Keyword, unsigned KWLen,
+                                  tok::TokenKind TokenCode,
+                                  IdentifierTable &Table) {
+  IdentifierInfo &Info = Table.get(Keyword, Keyword + KWLen);
+  Info.setTokenID(TokenCode);
+  Info.setIsCPlusplusOperatorKeyword();
+}
+
+/// AddObjCKeyword - Register an Objective-C @keyword like "class", "selector"
+/// or "property" by storing ObjCID on the identifier entry for Name.
+static void AddObjCKeyword(tok::ObjCKeywordKind ObjCID, 
+                           const char *Name, unsigned NameLen,
+                           IdentifierTable &Table) {
+  Table.get(Name, Name+NameLen).setObjCKeywordID(ObjCID);
+}
+
+/// AddKeywords - Add all keywords to the symbol table by expanding the entries
+/// of clang/Basic/TokenKinds.def through the macros defined in the body.
+void IdentifierTable::AddKeywords(const LangOptions &LangOpts) {
+  enum {
+    C90Shift = 0,
+    EXTC90   = 1 << C90Shift,
+    NOTC90   = 2 << C90Shift,
+    C99Shift = 2,
+    EXTC99   = 1 << C99Shift,
+    NOTC99   = 2 << C99Shift,
+    CPPShift = 4,
+    EXTCPP   = 1 << CPPShift,
+    NOTCPP   = 2 << CPPShift,
+    Mask     = 3
+  };
+  // A FLAGS value packs three 2-bit fields (C90, C99, C++); KEYWORD unpacks it.
+  // Add keywords and tokens for the current language.
+#define KEYWORD(NAME, FLAGS) \
+  AddKeyword(#NAME, strlen(#NAME), tok::kw_ ## NAME,  \
+             ((FLAGS) >> C90Shift) & Mask, \
+             ((FLAGS) >> C99Shift) & Mask, \
+             ((FLAGS) >> CPPShift) & Mask, LangOpts, *this);
+#define ALIAS(NAME, TOK) \
+  AddAlias(NAME, strlen(NAME), #TOK, strlen(#TOK), LangOpts, *this);
+#define PPKEYWORD(NAME) \
+  AddPPKeyword(tok::pp_##NAME, #NAME, strlen(#NAME), *this);
+#define CXX_KEYWORD_OPERATOR(NAME, ALIAS) \
+  if (LangOpts.CXXOperatorNames)          \
+    AddCXXOperatorKeyword(#NAME, strlen(#NAME), tok::ALIAS, *this);
+#define OBJC1_AT_KEYWORD(NAME) \
+  if (LangOpts.ObjC1)          \
+    AddObjCKeyword(tok::objc_##NAME, #NAME, strlen(#NAME), *this);
+#define OBJC2_AT_KEYWORD(NAME) \
+  if (LangOpts.ObjC2)          \
+    AddObjCKeyword(tok::objc_##NAME, #NAME, strlen(#NAME), *this);
+#include "clang/Basic/TokenKinds.def"
+}
+
+
+//===----------------------------------------------------------------------===//
+// Stats Implementation
+//===----------------------------------------------------------------------===//
+
+/// PrintStats - Print statistics about how well the identifier table is doing
+/// at hashing identifiers.  Output goes to stderr.
+void IdentifierTable::PrintStats() const {
+  unsigned NumBuckets = HashTable.getNumBuckets();
+  unsigned NumIdentifiers = HashTable.getNumItems();
+  unsigned NumEmptyBuckets = NumBuckets-NumIdentifiers;
+  unsigned AverageIdentifierSize = 0;  // Running total; averaged when printed.
+  unsigned MaxIdentifierLength = 0;
+  
+  // TODO: Figure out maximum times an identifier had to probe for -stats.
+  for (llvm::StringMap<IdentifierInfo, llvm::BumpPtrAllocator>::const_iterator
+       I = HashTable.begin(), E = HashTable.end(); I != E; ++I) {
+    unsigned IdLen = I->getKeyLength();
+    AverageIdentifierSize += IdLen;
+    if (MaxIdentifierLength < IdLen)
+      MaxIdentifierLength = IdLen;
+  }
+  // NOTE(review): the average below divides by NumIdentifiers unguarded.
+  fprintf(stderr, "\n*** Identifier Table Stats:\n");
+  fprintf(stderr, "# Identifiers:   %d\n", NumIdentifiers);
+  fprintf(stderr, "# Empty Buckets: %d\n", NumEmptyBuckets);
+  fprintf(stderr, "Hash density (#identifiers per bucket): %f\n",
+          NumIdentifiers/(double)NumBuckets);
+  fprintf(stderr, "Ave identifier length: %f\n",
+          (AverageIdentifierSize/(double)NumIdentifiers));
+  fprintf(stderr, "Max identifier length: %d\n", MaxIdentifierLength);
+  
+  // Compute statistics about the memory allocated for identifiers.
+  HashTable.getAllocator().PrintStats();
+}
diff --git a/Lex/Lexer.cpp b/Lex/Lexer.cpp
new file mode 100644
index 0000000000..1775b2f7bf
--- /dev/null
+++ b/Lex/Lexer.cpp
@@ -0,0 +1,1491 @@
+//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements the Lexer and LexerToken interfaces.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: GCC Diagnostics emitted by the lexer:
+// PEDWARN: (form feed|vertical tab) in preprocessing directive
+//
+// Universal characters, unicode, char mapping:
+// WARNING: `%.*s' is not in NFKC
+// WARNING: `%.*s' is not in NFC
+//
+// Other:
+// TODO: Options to support:
+//    -fexec-charset,-fwide-exec-charset
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/SourceLocation.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cctype>
+using namespace clang;
+
+static void InitCharacterInfo();
+
+Lexer::Lexer(const llvm::MemoryBuffer *File, unsigned fileid, Preprocessor &pp,
+             const char *BufStart, const char *BufEnd)
+  : BufferEnd(BufEnd ? BufEnd : File->getBufferEnd()),
+    InputFile(File), CurFileID(fileid), PP(pp), Features(PP.getLangOptions()) {
+  Is_PragmaLexer = false;
+  IsMainFile = false;
+  InitCharacterInfo();  // Fills the CharInfo table on the first call only.
+      
+  assert(BufferEnd[0] == 0 &&
+         "We assume that the input buffer has a null character at the end"
+         " to simplify lexing!");
+  // As with BufEnd above, a null BufStart selects the File's own bound.
+  BufferPtr = BufStart ? BufStart : File->getBufferStart();
+
+  // Start of the file is a start of line.
+  IsAtStartOfLine = true;
+
+  // We are not after parsing a #.
+  ParsingPreprocessorDirective = false;
+
+  // We are not after parsing #include.
+  ParsingFilename = false;
+
+  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
+  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
+  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
+  // or otherwise skipping over tokens.
+  LexingRawMode = false;
+  
+  // Default to keeping comments if requested.
+  KeepCommentMode = PP.getCommentRetentionState();
+}
+
+/// Stringify - Return Str with a '\' inserted before each '\' and each quote
+/// ('"', or '\'' if Charify is set).  No surrounding quotes are added.
+std::string Lexer::Stringify(const std::string &Str, bool Charify) {
+  std::string Result = Str;
+  char Quote = Charify ? '\'' : '"';
+  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
+    if (Result[i] == '\\' || Result[i] == Quote) {
+      Result.insert(Result.begin()+i, '\\');
+      ++i; ++e;
+    }
+  }
+  return Result;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Character information.
+//===----------------------------------------------------------------------===//
+
+static unsigned char CharInfo[256];  // CHAR_* flag for each byte value.
+
+enum {  // Values stored in CharInfo; characters not listed here stay 0.
+  CHAR_HORZ_WS  = 0x01,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
+  CHAR_VERT_WS  = 0x02,  // '\r', '\n'
+  CHAR_LETTER   = 0x04,  // a-z,A-Z
+  CHAR_NUMBER   = 0x08,  // 0-9
+  CHAR_UNDER    = 0x10,  // _
+  CHAR_PERIOD   = 0x20   // .
+};
+
+static void InitCharacterInfo() {
+  static bool isInited = false;
+  if (isInited) return;  // Only the first call fills the table.
+  isInited = true;
+  
+  // Initialize the CharInfo table.
+  // TODO: statically initialize this.
+  CharInfo[(int)' '] = CharInfo[(int)'\t'] = 
+  CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS;
+  CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS;
+  
+  CharInfo[(int)'_'] = CHAR_UNDER;
+  CharInfo[(int)'.'] = CHAR_PERIOD;
+  for (unsigned i = 'a'; i <= 'z'; ++i)
+    CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER;  // Lower and upper case.
+  for (unsigned i = '0'; i <= '9'; ++i)
+    CharInfo[i] = CHAR_NUMBER;
+}
+
+/// isIdentifierBody - Return true if this is the body character of an
+/// identifier, which is [a-zA-Z0-9_].  Relies on the CharInfo table.
+static inline bool isIdentifierBody(unsigned char c) {
+  return CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER);
+}
+
+/// isHorizontalWhitespace - Return true if this character is horizontal
+/// whitespace: ' ', '\t', '\f', '\v'.  Note that this returns false for '\0'.
+static inline bool isHorizontalWhitespace(unsigned char c) {
+  return CharInfo[c] & CHAR_HORZ_WS;  // Nonzero flag converts to true.
+}
+
+/// isWhitespace - Return true if this character is horizontal or vertical
+/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.  Note that this returns false
+/// for '\0'.
+static inline bool isWhitespace(unsigned char c) {
+  return CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS);  // Either kind matches.
+}
+
+/// isNumberBody - Return true if this is the body character of a
+/// preprocessing number, which is [a-zA-Z0-9_.].
+static inline bool isNumberBody(unsigned char c) {
+  return CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Diagnostics forwarding code.
+//===----------------------------------------------------------------------===//
+
+/// getSourceLocation - Return a source location identifier for the specified
+/// offset in the current file.  Loc must point into [buffer start, BufferEnd].
+SourceLocation Lexer::getSourceLocation(const char *Loc) const {
+  assert(Loc >= InputFile->getBufferStart() && Loc <= BufferEnd &&
+         "Location out of range for this buffer!");
+  return SourceLocation(CurFileID, Loc-InputFile->getBufferStart());
+}
+
+
+/// Diag - Forwarding function for diagnostics.  This translates a source
+/// position in the current buffer into a SourceLocation object for rendering.
+void Lexer::Diag(const char *Loc, unsigned DiagID,
+                 const std::string &Msg) const {
+  if (LexingRawMode && Diagnostic::isNoteWarningOrExtension(DiagID))
+    return;  // Raw mode silences these diagnostics.
+  PP.Diag(getSourceLocation(Loc), DiagID, Msg);
+}
+void Lexer::Diag(SourceLocation Loc, unsigned DiagID,  // Same, given a Loc.
+                 const std::string &Msg) const {
+  if (LexingRawMode && Diagnostic::isNoteWarningOrExtension(DiagID))
+    return;
+  PP.Diag(Loc, DiagID, Msg);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Trigraph and Escaped Newline Handling Code.
+//===----------------------------------------------------------------------===//
+
+/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
+/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
+static char GetTrigraphCharForLetter(char Letter) {
+  switch (Letter) {
+  default:   return 0;     // Not the third character of any trigraph.
+  case '=':  return '#';   // ??=
+  case ')':  return ']';   // ??)
+  case '(':  return '[';   // ??(
+  case '!':  return '|';   // ??!
+  case '\'': return '^';   // ??'
+  case '>':  return '}';   // ??>
+  case '/':  return '\\';  // ??/
+  case '<':  return '{';   // ??<
+  case '-':  return '~';   // ??-
+  }
+}
+
+/// DecodeTrigraphChar - CP points at the character following a ?? pair.  Return
+/// the decoded trigraph character, or 0 if this is not a trigraph.  If L is
+/// non-null, diagnose the trigraph and return 0 when trigraphs are disabled; if
+/// L is null, no diagnostic is emitted and the Trigraphs option is not checked.
+static char DecodeTrigraphChar(const char *CP, Lexer *L) {
+  char Res = GetTrigraphCharForLetter(*CP);
+  if (Res && L) {
+    if (!L->getFeatures().Trigraphs) {
+      L->Diag(CP-2, diag::trigraph_ignored);  // CP-2 is the first '?'.
+      return 0;
+    } else {
+      L->Diag(CP-2, diag::trigraph_converted, std::string()+Res);
+    }
+  }
+  return Res;
+}
+
+/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
+/// get its size, and return it.  This is tricky in several cases:
+///   1. If currently at the start of a trigraph, we warn about the trigraph,
+///      then either return the trigraph (skipping 3 chars) or the '?',
+///      depending on whether trigraphs are enabled or not.
+///   2. If this is an escaped newline (potentially with whitespace between
+///      the backslash and newline), implicitly skip the newline and return
+///      the char after it.
+///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
+///
+/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
+/// know that we can accumulate into Size, and that we have already incremented
+/// Ptr by Size bytes.
+///
+/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
+/// be updated to match.
+/// Tok may be null, in which case no token flags or diagnostics are produced.
+char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
+                               LexerToken *Tok) {
+  // If we have a slash, look for an escaped newline.
+  if (Ptr[0] == '\\') {
+    ++Size;
+    ++Ptr;
+Slash:  // Also entered from below after decoding a ??/ trigraph.
+    // Common case, backslash-char where the char is not whitespace.
+    if (!isWhitespace(Ptr[0])) return '\\';
+    
+    // See if we have optional whitespace characters followed by a newline.
+    {
+      unsigned SizeTmp = 0;
+      do {
+        ++SizeTmp;
+        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
+          // Remember that this token needs to be cleaned.
+          if (Tok) Tok->setFlag(LexerToken::NeedsCleaning);
+
+          // Warn if there was whitespace between the backslash and newline.
+          if (SizeTmp != 1 && Tok)
+            Diag(Ptr, diag::backslash_newline_space);
+          
+          // If this is a \r\n or \n\r, skip the newlines.
+          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
+              Ptr[SizeTmp-1] != Ptr[SizeTmp])
+            ++SizeTmp;
+          
+          // Found backslash<whitespace><newline>.  Parse the char after it.
+          Size += SizeTmp;
+          Ptr  += SizeTmp;
+          // Use slow version to accumulate a correct size field.
+          return getCharAndSizeSlow(Ptr, Size, Tok);
+        }
+      } while (isWhitespace(Ptr[SizeTmp]));
+    }
+      
+    // Otherwise, this is not an escaped newline, just return the slash.
+    return '\\';
+  }
+  
+  // If this is a trigraph, process it.
+  if (Ptr[0] == '?' && Ptr[1] == '?') {
+    // If this is actually a legal trigraph (not something like "??x"), emit
+    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
+    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
+      // Remember that this token needs to be cleaned.
+      if (Tok) Tok->setFlag(LexerToken::NeedsCleaning);
+
+      Ptr += 3;
+      Size += 3;
+      if (C == '\\') goto Slash;  // ??/ may begin an escaped newline.
+      return C;
+    }
+  }
+  
+  // If this is neither, return a single character.
+  ++Size;
+  return *Ptr;
+}
+
+
+/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
+/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
+/// and that we have already incremented Ptr by Size bytes.
+/// No diagnostics are emitted; trigraphs decode only if Features.Trigraphs.
+/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
+/// be updated to match.
+char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
+                                     const LangOptions &Features) {
+  // If we have a slash, look for an escaped newline.
+  if (Ptr[0] == '\\') {
+    ++Size;
+    ++Ptr;
+Slash:  // Also entered from below after decoding a ??/ trigraph.
+    // Common case, backslash-char where the char is not whitespace.
+    if (!isWhitespace(Ptr[0])) return '\\';
+    
+    // See if we have optional whitespace characters followed by a newline.
+    {
+      unsigned SizeTmp = 0;
+      do {
+        ++SizeTmp;
+        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
+          
+          // If this is a \r\n or \n\r, skip the newlines.
+          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
+              Ptr[SizeTmp-1] != Ptr[SizeTmp])
+            ++SizeTmp;
+          
+          // Found backslash<whitespace><newline>.  Parse the char after it.
+          Size += SizeTmp;
+          Ptr  += SizeTmp;
+          
+          // Use slow version to accumulate a correct size field.
+          return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
+        }
+      } while (isWhitespace(Ptr[SizeTmp]));
+    }
+    
+    // Otherwise, this is not an escaped newline, just return the slash.
+    return '\\';
+  }
+  
+  // If this is a trigraph, process it.
+  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
+    // If this is actually a legal trigraph (not something like "??x"), return
+    // it.
+    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
+      Ptr += 3;
+      Size += 3;
+      if (C == '\\') goto Slash;  // ??/ may begin an escaped newline.
+      return C;
+    }
+  }
+  
+  // If this is neither, return a single character.
+  ++Size;
+  return *Ptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper methods for lexing.
+//===----------------------------------------------------------------------===//
+
+void Lexer::LexIdentifier(LexerToken &Result, const char *CurPtr) {
+  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
+  unsigned Size;
+  unsigned char C = *CurPtr++;
+  while (isIdentifierBody(C)) {
+    C = *CurPtr++;
+  }
+  --CurPtr;   // Back up over the skipped character.
+
+  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
+  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
+  // FIXME: UCNs.
+  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
+FinishIdentifier:
+    const char *IdStart = BufferPtr;
+    FormTokenWithChars(Result, CurPtr);
+    Result.setKind(tok::identifier);
+    
+    // If we are in raw mode, return this identifier raw.  There is no need to
+    // look up identifier information or
author	Reid Spencer <rspencer@reidspencer.com>	2007-07-11 17:01:13 +0000
committer	Reid Spencer <rspencer@reidspencer.com>	2007-07-11 17:01:13 +0000
commit	5f016e2cb5d11daeb237544de1c5d59f20fe1a6e (patch)
tree	8b6bfcb8783d16827f896d5facbd4549300e8a1e /Lex
parent	a5f182095bf2065ca94f1c86957ee91f9068964b (diff)