aboutsummaryrefslogtreecommitdiff
path: root/Lex
diff options
context:
space:
mode:
authorReid Spencer <rspencer@reidspencer.com>2007-07-11 17:01:13 +0000
committerReid Spencer <rspencer@reidspencer.com>2007-07-11 17:01:13 +0000
commit5f016e2cb5d11daeb237544de1c5d59f20fe1a6e (patch)
tree8b6bfcb8783d16827f896d5facbd4549300e8a1e /Lex
parenta5f182095bf2065ca94f1c86957ee91f9068964b (diff)
Stage two of getting CFE top correct.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@39734 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'Lex')
-rw-r--r--Lex/HeaderSearch.cpp319
-rw-r--r--Lex/IdentifierTable.cpp188
-rw-r--r--Lex/Lexer.cpp1491
-rw-r--r--Lex/LiteralSupport.cpp661
-rw-r--r--Lex/MacroExpander.cpp636
-rw-r--r--Lex/MacroInfo.cpp70
-rw-r--r--Lex/Makefile28
-rw-r--r--Lex/PPExpressions.cpp654
-rw-r--r--Lex/Pragma.cpp369
-rw-r--r--Lex/Preprocessor.cpp2087
-rw-r--r--Lex/ScratchBuffer.cpp71
11 files changed, 6574 insertions, 0 deletions
diff --git a/Lex/HeaderSearch.cpp b/Lex/HeaderSearch.cpp
new file mode 100644
index 0000000000..520205e1da
--- /dev/null
+++ b/Lex/HeaderSearch.cpp
@@ -0,0 +1,319 @@
+//===--- HeaderSearch.cpp - Resolve Header File Locations ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the DirectoryLookup and HeaderSearch interfaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/FileManager.h"
+#include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/IdentifierTable.h"
+#include "llvm/System/Path.h"
+#include "llvm/ADT/SmallString.h"
+using namespace clang;
+
+HeaderSearch::HeaderSearch(FileManager &FM) : FileMgr(FM), FrameworkMap(64) {
+ SystemDirIdx = 0;
+ NoCurDirSearch = false;
+
+ NumIncluded = 0;
+ NumMultiIncludeFileOptzn = 0;
+ NumFrameworkLookups = NumSubFrameworkLookups = 0;
+}
+
+void HeaderSearch::PrintStats() {
+ fprintf(stderr, "\n*** HeaderSearch Stats:\n");
+ fprintf(stderr, "%d files tracked.\n", (int)FileInfo.size());
+ unsigned NumOnceOnlyFiles = 0, MaxNumIncludes = 0, NumSingleIncludedFiles = 0;
+ for (unsigned i = 0, e = FileInfo.size(); i != e; ++i) {
+ NumOnceOnlyFiles += FileInfo[i].isImport;
+ if (MaxNumIncludes < FileInfo[i].NumIncludes)
+ MaxNumIncludes = FileInfo[i].NumIncludes;
+ NumSingleIncludedFiles += FileInfo[i].NumIncludes == 1;
+ }
+ fprintf(stderr, " %d #import/#pragma once files.\n", NumOnceOnlyFiles);
+ fprintf(stderr, " %d included exactly once.\n", NumSingleIncludedFiles);
+ fprintf(stderr, " %d max times a file is included.\n", MaxNumIncludes);
+
+ fprintf(stderr, " %d #include/#include_next/#import.\n", NumIncluded);
+ fprintf(stderr, " %d #includes skipped due to"
+ " the multi-include optimization.\n", NumMultiIncludeFileOptzn);
+
+ fprintf(stderr, "%d framework lookups.\n", NumFrameworkLookups);
+ fprintf(stderr, "%d subframework lookups.\n", NumSubFrameworkLookups);
+}
+
+//===----------------------------------------------------------------------===//
+// Header File Location.
+//===----------------------------------------------------------------------===//
+
+const FileEntry *HeaderSearch::DoFrameworkLookup(const DirectoryEntry *Dir,
+ const char *FilenameStart,
+ const char *FilenameEnd) {
+ // Framework names must have a '/' in the filename.
+ const char *SlashPos = std::find(FilenameStart, FilenameEnd, '/');
+ if (SlashPos == FilenameEnd) return 0;
+
+ llvm::StringMapEntry<const DirectoryEntry *> &CacheLookup =
+ FrameworkMap.GetOrCreateValue(FilenameStart, SlashPos);
+
+ // If it is some other directory, fail.
+ if (CacheLookup.getValue() && CacheLookup.getValue() != Dir)
+ return 0;
+
+ // FrameworkName = "/System/Library/Frameworks/"
+ llvm::SmallString<1024> FrameworkName;
+ FrameworkName += Dir->getName();
+ if (FrameworkName.empty() || FrameworkName.back() != '/')
+ FrameworkName.push_back('/');
+
+ // FrameworkName = "/System/Library/Frameworks/Cocoa"
+ FrameworkName.append(FilenameStart, SlashPos);
+
+ // FrameworkName = "/System/Library/Frameworks/Cocoa.framework/"
+ FrameworkName += ".framework/";
+
+ if (CacheLookup.getValue() == 0) {
+ ++NumFrameworkLookups;
+
+ // If the framework dir doesn't exist, we fail.
+ if (!llvm::sys::Path(std::string(FrameworkName.begin(),
+ FrameworkName.end())).exists())
+ return 0;
+
+ // Otherwise, if it does, remember that this is the right direntry for this
+ // framework.
+ CacheLookup.setValue(Dir);
+ }
+
+ // Check "/System/Library/Frameworks/Cocoa.framework/Headers/file.h"
+ unsigned OrigSize = FrameworkName.size();
+
+ FrameworkName += "Headers/";
+ FrameworkName.append(SlashPos+1, FilenameEnd);
+ if (const FileEntry *FE = FileMgr.getFile(FrameworkName.begin(),
+ FrameworkName.end())) {
+ return FE;
+ }
+
+ // Check "/System/Library/Frameworks/Cocoa.framework/PrivateHeaders/file.h"
+ const char *Private = "Private";
+ FrameworkName.insert(FrameworkName.begin()+OrigSize, Private,
+ Private+strlen(Private));
+ return FileMgr.getFile(FrameworkName.begin(), FrameworkName.end());
+}
+
+/// LookupFile - Given a "foo" or <foo> reference, look up the indicated file,
+/// return null on failure. isAngled indicates whether the file reference is
+/// for system #include's or not (i.e. using <> instead of ""). CurFileEnt, if
+/// non-null, indicates where the #including file is, in case a relative search
+/// is needed.
+const FileEntry *HeaderSearch::LookupFile(const char *FilenameStart,
+ const char *FilenameEnd,
+ bool isAngled,
+ const DirectoryLookup *FromDir,
+ const DirectoryLookup *&CurDir,
+ const FileEntry *CurFileEnt) {
+ // If 'Filename' is absolute, check to see if it exists and no searching.
+ // FIXME: Portability. This should be a sys::Path interface, this doesn't
+ // handle things like C:\foo.txt right, nor win32 \\network\device\blah.
+ if (FilenameStart[0] == '/') {
+ CurDir = 0;
+
+ // If this was an #include_next "/absolute/file", fail.
+ if (FromDir) return 0;
+
+ // Otherwise, just return the file.
+ return FileMgr.getFile(FilenameStart, FilenameEnd);
+ }
+
+ llvm::SmallString<1024> TmpDir;
+
+ // Step #0, unless disabled, check to see if the file is in the #includer's
+ // directory. This search is not done for <> headers.
+ if (CurFileEnt && !isAngled && !NoCurDirSearch) {
+ // Concatenate the requested file onto the directory.
+ // FIXME: Portability. Filename concatenation should be in sys::Path.
+ TmpDir += CurFileEnt->getDir()->getName();
+ TmpDir.push_back('/');
+ TmpDir.append(FilenameStart, FilenameEnd);
+ if (const FileEntry *FE = FileMgr.getFile(TmpDir.begin(), TmpDir.end())) {
+ // Leave CurDir unset.
+
+ // This file is a system header or C++ unfriendly if the old file is.
+ getFileInfo(FE).DirInfo = getFileInfo(CurFileEnt).DirInfo;
+ return FE;
+ }
+ TmpDir.clear();
+ }
+
+ CurDir = 0;
+
+ // If this is a system #include, ignore the user #include locs.
+ unsigned i = isAngled ? SystemDirIdx : 0;
+
+ // If this is a #include_next request, start searching after the directory the
+ // file was found in.
+ if (FromDir)
+ i = FromDir-&SearchDirs[0];
+
+ // Check each directory in sequence to see if it contains this file.
+ for (; i != SearchDirs.size(); ++i) {
+ const FileEntry *FE = 0;
+ if (!SearchDirs[i].isFramework()) {
+ // FIXME: Portability. Adding file to dir should be in sys::Path.
+ // Concatenate the requested file onto the directory.
+ TmpDir.clear();
+ TmpDir += SearchDirs[i].getDir()->getName();
+ TmpDir.push_back('/');
+ TmpDir.append(FilenameStart, FilenameEnd);
+ FE = FileMgr.getFile(TmpDir.begin(), TmpDir.end());
+ } else {
+ FE = DoFrameworkLookup(SearchDirs[i].getDir(), FilenameStart,FilenameEnd);
+ }
+
+ if (FE) {
+ CurDir = &SearchDirs[i];
+
+ // This file is a system header or C++ unfriendly if the dir is.
+ getFileInfo(FE).DirInfo = CurDir->getDirCharacteristic();
+ return FE;
+ }
+ }
+
+ // Otherwise, didn't find it.
+ return 0;
+}
+
+/// LookupSubframeworkHeader - Look up a subframework for the specified
+/// #include file. For example, if #include'ing <HIToolbox/HIToolbox.h> from
+/// within ".../Carbon.framework/Headers/Carbon.h", check to see if HIToolbox
+/// is a subframework within Carbon.framework. If so, return the FileEntry
+/// for the designated file, otherwise return null.
+const FileEntry *HeaderSearch::
+LookupSubframeworkHeader(const char *FilenameStart,
+ const char *FilenameEnd,
+ const FileEntry *ContextFileEnt) {
+ // Framework names must have a '/' in the filename. Find it.
+ const char *SlashPos = std::find(FilenameStart, FilenameEnd, '/');
+ if (SlashPos == FilenameEnd) return 0;
+
+ // Look up the base framework name of the ContextFileEnt.
+ const char *ContextName = ContextFileEnt->getName();
+
+ // If the context info wasn't a framework, couldn't be a subframework.
+ const char *FrameworkPos = strstr(ContextName, ".framework/");
+ if (FrameworkPos == 0)
+ return 0;
+
+ llvm::SmallString<1024> FrameworkName(ContextName,
+ FrameworkPos+strlen(".framework/"));
+
+ // Append Frameworks/HIToolbox.framework/
+ FrameworkName += "Frameworks/";
+ FrameworkName.append(FilenameStart, SlashPos);
+ FrameworkName += ".framework/";
+
+ llvm::StringMapEntry<const DirectoryEntry *> &CacheLookup =
+ FrameworkMap.GetOrCreateValue(FilenameStart, SlashPos);
+
+ // Some other location?
+ if (CacheLookup.getValue() &&
+ CacheLookup.getKeyLength() == FrameworkName.size() &&
+ memcmp(CacheLookup.getKeyData(), &FrameworkName[0],
+ CacheLookup.getKeyLength()) != 0)
+ return 0;
+
+ // Cache subframework.
+ if (CacheLookup.getValue() == 0) {
+ ++NumSubFrameworkLookups;
+
+ // If the framework dir doesn't exist, we fail.
+ const DirectoryEntry *Dir = FileMgr.getDirectory(FrameworkName.begin(),
+ FrameworkName.end());
+ if (Dir == 0) return 0;
+
+ // Otherwise, if it does, remember that this is the right direntry for this
+ // framework.
+ CacheLookup.setValue(Dir);
+ }
+
+ const FileEntry *FE = 0;
+
+ // Check ".../Frameworks/HIToolbox.framework/Headers/HIToolbox.h"
+ llvm::SmallString<1024> HeadersFilename(FrameworkName);
+ HeadersFilename += "Headers/";
+ HeadersFilename.append(SlashPos+1, FilenameEnd);
+ if (!(FE = FileMgr.getFile(HeadersFilename.begin(),
+ HeadersFilename.end()))) {
+
+ // Check ".../Frameworks/HIToolbox.framework/PrivateHeaders/HIToolbox.h"
+ HeadersFilename = FrameworkName;
+ HeadersFilename += "PrivateHeaders/";
+ HeadersFilename.append(SlashPos+1, FilenameEnd);
+ if (!(FE = FileMgr.getFile(HeadersFilename.begin(), HeadersFilename.end())))
+ return 0;
+ }
+
+ // This file is a system header or C++ unfriendly if the old file is.
+ getFileInfo(FE).DirInfo = getFileInfo(ContextFileEnt).DirInfo;
+ return FE;
+}
+
+//===----------------------------------------------------------------------===//
+// File Info Management.
+//===----------------------------------------------------------------------===//
+
+
+/// getFileInfo - Return the PerFileInfo structure for the specified
+/// FileEntry.
+HeaderSearch::PerFileInfo &HeaderSearch::getFileInfo(const FileEntry *FE) {
+ if (FE->getUID() >= FileInfo.size())
+ FileInfo.resize(FE->getUID()+1);
+ return FileInfo[FE->getUID()];
+}
+
+/// ShouldEnterIncludeFile - Mark the specified file as a target of of a
+/// #include, #include_next, or #import directive. Return false if #including
+/// the file will have no effect or true if we should include it.
+bool HeaderSearch::ShouldEnterIncludeFile(const FileEntry *File, bool isImport){
+ ++NumIncluded; // Count # of attempted #includes.
+
+ // Get information about this file.
+ PerFileInfo &FileInfo = getFileInfo(File);
+
+ // If this is a #import directive, check that we have not already imported
+ // this header.
+ if (isImport) {
+ // If this has already been imported, don't import it again.
+ FileInfo.isImport = true;
+
+ // Has this already been #import'ed or #include'd?
+ if (FileInfo.NumIncludes) return false;
+ } else {
+ // Otherwise, if this is a #include of a file that was previously #import'd
+ // or if this is the second #include of a #pragma once file, ignore it.
+ if (FileInfo.isImport)
+ return false;
+ }
+
+ // Next, check to see if the file is wrapped with #ifndef guards. If so, and
+ // if the macro that guards it is defined, we know the #include has no effect.
+ if (FileInfo.ControllingMacro && FileInfo.ControllingMacro->getMacroInfo()) {
+ ++NumMultiIncludeFileOptzn;
+ return false;
+ }
+
+ // Increment the number of times this file has been included.
+ ++FileInfo.NumIncludes;
+
+ return true;
+}
+
+
diff --git a/Lex/IdentifierTable.cpp b/Lex/IdentifierTable.cpp
new file mode 100644
index 0000000000..e671af9839
--- /dev/null
+++ b/Lex/IdentifierTable.cpp
@@ -0,0 +1,188 @@
+//===--- IdentifierTable.cpp - Hash table for identifier lookup -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IdentifierInfo, IdentifierVisitor, and
+// IdentifierTable interfaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/IdentifierTable.h"
+#include "clang/Lex/MacroInfo.h"
+#include "clang/Basic/LangOptions.h"
+using namespace clang;
+
+//===----------------------------------------------------------------------===//
+// IdentifierInfo Implementation
+//===----------------------------------------------------------------------===//
+
+IdentifierInfo::IdentifierInfo() {
+ Macro = 0;
+ TokenID = tok::identifier;
+ PPID = tok::pp_not_keyword;
+ ObjCID = tok::objc_not_keyword;
+ BuiltinID = 0;
+ IsExtension = false;
+ IsPoisoned = false;
+ IsOtherTargetMacro = false;
+ IsCPPOperatorKeyword = false;
+ FETokenInfo = 0;
+}
+
+IdentifierInfo::~IdentifierInfo() {
+ delete Macro;
+}
+
+//===----------------------------------------------------------------------===//
+// IdentifierTable Implementation
+//===----------------------------------------------------------------------===//
+
+IdentifierTable::IdentifierTable(const LangOptions &LangOpts)
+ // Start with space for 8K identifiers.
+ : HashTable(8192) {
+
+ // Populate the identifier table with info about keywords for the current
+ // language.
+ AddKeywords(LangOpts);
+}
+
+//===----------------------------------------------------------------------===//
+// Language Keyword Implementation
+//===----------------------------------------------------------------------===//
+
+/// AddKeyword - This method is used to associate a token ID with specific
+/// identifiers because they are language keywords. This causes the lexer to
+/// automatically map matching identifiers to specialized token codes.
+///
+/// The C90/C99/CPP flags are set to 0 if the token should be enabled in the
+/// specified langauge, set to 1 if it is an extension in the specified
+/// language, and set to 2 if disabled in the specified language.
+static void AddKeyword(const char *Keyword, unsigned KWLen,
+ tok::TokenKind TokenCode,
+ int C90, int C99, int CXX,
+ const LangOptions &LangOpts, IdentifierTable &Table) {
+ int Flags = LangOpts.CPlusPlus ? CXX : (LangOpts.C99 ? C99 : C90);
+
+ // Don't add this keyword if disabled in this language or if an extension
+ // and extensions are disabled.
+ if (Flags + LangOpts.NoExtensions >= 2) return;
+
+ IdentifierInfo &Info = Table.get(Keyword, Keyword+KWLen);
+ Info.setTokenID(TokenCode);
+ Info.setIsExtensionToken(Flags == 1);
+}
+
+static void AddAlias(const char *Keyword, unsigned KWLen,
+ const char *AliaseeKeyword, unsigned AliaseeKWLen,
+ const LangOptions &LangOpts, IdentifierTable &Table) {
+ IdentifierInfo &AliasInfo = Table.get(Keyword, Keyword+KWLen);
+ IdentifierInfo &AliaseeInfo = Table.get(AliaseeKeyword,
+ AliaseeKeyword+AliaseeKWLen);
+ AliasInfo.setTokenID(AliaseeInfo.getTokenID());
+ AliasInfo.setIsExtensionToken(AliaseeInfo.isExtensionToken());
+}
+
+/// AddPPKeyword - Register a preprocessor keyword like "define" "undef" or
+/// "elif".
+static void AddPPKeyword(tok::PPKeywordKind PPID,
+ const char *Name, unsigned NameLen,
+ IdentifierTable &Table) {
+ Table.get(Name, Name+NameLen).setPPKeywordID(PPID);
+}
+
+/// AddCXXOperatorKeyword - Register a C++ operator keyword alternative
+/// representations.
+static void AddCXXOperatorKeyword(const char *Keyword, unsigned KWLen,
+ tok::TokenKind TokenCode,
+ IdentifierTable &Table) {
+ IdentifierInfo &Info = Table.get(Keyword, Keyword + KWLen);
+ Info.setTokenID(TokenCode);
+ Info.setIsCPlusplusOperatorKeyword();
+}
+
+/// AddObjCKeyword - Register an Objective-C @keyword like "class" "selector" or
+/// "property".
+static void AddObjCKeyword(tok::ObjCKeywordKind ObjCID,
+ const char *Name, unsigned NameLen,
+ IdentifierTable &Table) {
+ Table.get(Name, Name+NameLen).setObjCKeywordID(ObjCID);
+}
+
+/// AddKeywords - Add all keywords to the symbol table.
+///
+void IdentifierTable::AddKeywords(const LangOptions &LangOpts) {
+ enum {
+ C90Shift = 0,
+ EXTC90 = 1 << C90Shift,
+ NOTC90 = 2 << C90Shift,
+ C99Shift = 2,
+ EXTC99 = 1 << C99Shift,
+ NOTC99 = 2 << C99Shift,
+ CPPShift = 4,
+ EXTCPP = 1 << CPPShift,
+ NOTCPP = 2 << CPPShift,
+ Mask = 3
+ };
+
+ // Add keywords and tokens for the current language.
+#define KEYWORD(NAME, FLAGS) \
+ AddKeyword(#NAME, strlen(#NAME), tok::kw_ ## NAME, \
+ ((FLAGS) >> C90Shift) & Mask, \
+ ((FLAGS) >> C99Shift) & Mask, \
+ ((FLAGS) >> CPPShift) & Mask, LangOpts, *this);
+#define ALIAS(NAME, TOK) \
+ AddAlias(NAME, strlen(NAME), #TOK, strlen(#TOK), LangOpts, *this);
+#define PPKEYWORD(NAME) \
+ AddPPKeyword(tok::pp_##NAME, #NAME, strlen(#NAME), *this);
+#define CXX_KEYWORD_OPERATOR(NAME, ALIAS) \
+ if (LangOpts.CXXOperatorNames) \
+ AddCXXOperatorKeyword(#NAME, strlen(#NAME), tok::ALIAS, *this);
+#define OBJC1_AT_KEYWORD(NAME) \
+ if (LangOpts.ObjC1) \
+ AddObjCKeyword(tok::objc_##NAME, #NAME, strlen(#NAME), *this);
+#define OBJC2_AT_KEYWORD(NAME) \
+ if (LangOpts.ObjC2) \
+ AddObjCKeyword(tok::objc_##NAME, #NAME, strlen(#NAME), *this);
+#include "clang/Basic/TokenKinds.def"
+}
+
+
+//===----------------------------------------------------------------------===//
+// Stats Implementation
+//===----------------------------------------------------------------------===//
+
+/// PrintStats - Print statistics about how well the identifier table is doing
+/// at hashing identifiers.
+void IdentifierTable::PrintStats() const {
+ unsigned NumBuckets = HashTable.getNumBuckets();
+ unsigned NumIdentifiers = HashTable.getNumItems();
+ unsigned NumEmptyBuckets = NumBuckets-NumIdentifiers;
+ unsigned AverageIdentifierSize = 0;
+ unsigned MaxIdentifierLength = 0;
+
+ // TODO: Figure out maximum times an identifier had to probe for -stats.
+ for (llvm::StringMap<IdentifierInfo, llvm::BumpPtrAllocator>::const_iterator
+ I = HashTable.begin(), E = HashTable.end(); I != E; ++I) {
+ unsigned IdLen = I->getKeyLength();
+ AverageIdentifierSize += IdLen;
+ if (MaxIdentifierLength < IdLen)
+ MaxIdentifierLength = IdLen;
+ }
+
+ fprintf(stderr, "\n*** Identifier Table Stats:\n");
+ fprintf(stderr, "# Identifiers: %d\n", NumIdentifiers);
+ fprintf(stderr, "# Empty Buckets: %d\n", NumEmptyBuckets);
+ fprintf(stderr, "Hash density (#identifiers per bucket): %f\n",
+ NumIdentifiers/(double)NumBuckets);
+ fprintf(stderr, "Ave identifier length: %f\n",
+ (AverageIdentifierSize/(double)NumIdentifiers));
+ fprintf(stderr, "Max identifier length: %d\n", MaxIdentifierLength);
+
+ // Compute statistics about the memory allocated for identifiers.
+ HashTable.getAllocator().PrintStats();
+}
diff --git a/Lex/Lexer.cpp b/Lex/Lexer.cpp
new file mode 100644
index 0000000000..1775b2f7bf
--- /dev/null
+++ b/Lex/Lexer.cpp
@@ -0,0 +1,1491 @@
+//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Lexer and LexerToken interfaces.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: GCC Diagnostics emitted by the lexer:
+// PEDWARN: (form feed|vertical tab) in preprocessing directive
+//
+// Universal characters, unicode, char mapping:
+// WARNING: `%.*s' is not in NFKC
+// WARNING: `%.*s' is not in NFC
+//
+// Other:
+// TODO: Options to support:
+// -fexec-charset,-fwide-exec-charset
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/SourceLocation.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cctype>
+using namespace clang;
+
+static void InitCharacterInfo();
+
+Lexer::Lexer(const llvm::MemoryBuffer *File, unsigned fileid, Preprocessor &pp,
+ const char *BufStart, const char *BufEnd)
+ : BufferEnd(BufEnd ? BufEnd : File->getBufferEnd()),
+ InputFile(File), CurFileID(fileid), PP(pp), Features(PP.getLangOptions()) {
+ Is_PragmaLexer = false;
+ IsMainFile = false;
+ InitCharacterInfo();
+
+ assert(BufferEnd[0] == 0 &&
+ "We assume that the input buffer has a null character at the end"
+ " to simplify lexing!");
+
+ BufferPtr = BufStart ? BufStart : File->getBufferStart();
+
+ // Start of the file is a start of line.
+ IsAtStartOfLine = true;
+
+ // We are not after parsing a #.
+ ParsingPreprocessorDirective = false;
+
+ // We are not after parsing #include.
+ ParsingFilename = false;
+
+ // We are not in raw mode. Raw mode disables diagnostics and interpretation
+ // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
+ // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
+ // or otherwise skipping over tokens.
+ LexingRawMode = false;
+
+ // Default to keeping comments if requested.
+ KeepCommentMode = PP.getCommentRetentionState();
+}
+
+/// Stringify - Convert the specified string into a C string, with surrounding
+/// ""'s, and with escaped \ and " characters.
+std::string Lexer::Stringify(const std::string &Str, bool Charify) {
+ std::string Result = Str;
+ char Quote = Charify ? '\'' : '"';
+ for (unsigned i = 0, e = Result.size(); i != e; ++i) {
+ if (Result[i] == '\\' || Result[i] == Quote) {
+ Result.insert(Result.begin()+i, '\\');
+ ++i; ++e;
+ }
+ }
+ return Result;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Character information.
+//===----------------------------------------------------------------------===//
+
+static unsigned char CharInfo[256];
+
+enum {
+ CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0'
+ CHAR_VERT_WS = 0x02, // '\r', '\n'
+ CHAR_LETTER = 0x04, // a-z,A-Z
+ CHAR_NUMBER = 0x08, // 0-9
+ CHAR_UNDER = 0x10, // _
+ CHAR_PERIOD = 0x20 // .
+};
+
+static void InitCharacterInfo() {
+ static bool isInited = false;
+ if (isInited) return;
+ isInited = true;
+
+ // Intiialize the CharInfo table.
+ // TODO: statically initialize this.
+ CharInfo[(int)' '] = CharInfo[(int)'\t'] =
+ CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS;
+ CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS;
+
+ CharInfo[(int)'_'] = CHAR_UNDER;
+ CharInfo[(int)'.'] = CHAR_PERIOD;
+ for (unsigned i = 'a'; i <= 'z'; ++i)
+ CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER;
+ for (unsigned i = '0'; i <= '9'; ++i)
+ CharInfo[i] = CHAR_NUMBER;
+}
+
+/// isIdentifierBody - Return true if this is the body character of an
+/// identifier, which is [a-zA-Z0-9_].
+static inline bool isIdentifierBody(unsigned char c) {
+ return CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER);
+}
+
+/// isHorizontalWhitespace - Return true if this character is horizontal
+/// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'.
+static inline bool isHorizontalWhitespace(unsigned char c) {
+ return CharInfo[c] & CHAR_HORZ_WS;
+}
+
+/// isWhitespace - Return true if this character is horizontal or vertical
+/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false
+/// for '\0'.
+static inline bool isWhitespace(unsigned char c) {
+ return CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS);
+}
+
+/// isNumberBody - Return true if this is the body character of an
+/// preprocessing number, which is [a-zA-Z0-9_.].
+static inline bool isNumberBody(unsigned char c) {
+ return CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Diagnostics forwarding code.
+//===----------------------------------------------------------------------===//
+
+/// getSourceLocation - Return a source location identifier for the specified
+/// offset in the current file.
+SourceLocation Lexer::getSourceLocation(const char *Loc) const {
+ assert(Loc >= InputFile->getBufferStart() && Loc <= BufferEnd &&
+ "Location out of range for this buffer!");
+ return SourceLocation(CurFileID, Loc-InputFile->getBufferStart());
+}
+
+
+/// Diag - Forwarding function for diagnostics. This translate a source
+/// position in the current buffer into a SourceLocation object for rendering.
+void Lexer::Diag(const char *Loc, unsigned DiagID,
+ const std::string &Msg) const {
+ if (LexingRawMode && Diagnostic::isNoteWarningOrExtension(DiagID))
+ return;
+ PP.Diag(getSourceLocation(Loc), DiagID, Msg);
+}
+void Lexer::Diag(SourceLocation Loc, unsigned DiagID,
+ const std::string &Msg) const {
+ if (LexingRawMode && Diagnostic::isNoteWarningOrExtension(DiagID))
+ return;
+ PP.Diag(Loc, DiagID, Msg);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Trigraph and Escaped Newline Handling Code.
+//===----------------------------------------------------------------------===//
+
+/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
+/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
+static char GetTrigraphCharForLetter(char Letter) {
+ switch (Letter) {
+ default: return 0;
+ case '=': return '#';
+ case ')': return ']';
+ case '(': return '[';
+ case '!': return '|';
+ case '\'': return '^';
+ case '>': return '}';
+ case '/': return '\\';
+ case '<': return '{';
+ case '-': return '~';
+ }
+}
+
+/// DecodeTrigraphChar - If the specified character is a legal trigraph when
+/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
+/// return the result character. Finally, emit a warning about trigraph use
+/// whether trigraphs are enabled or not.
+static char DecodeTrigraphChar(const char *CP, Lexer *L) {
+ char Res = GetTrigraphCharForLetter(*CP);
+ if (Res && L) {
+ if (!L->getFeatures().Trigraphs) {
+ L->Diag(CP-2, diag::trigraph_ignored);
+ return 0;
+ } else {
+ L->Diag(CP-2, diag::trigraph_converted, std::string()+Res);
+ }
+ }
+ return Res;
+}
+
+/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
+/// get its size, and return it. This is tricky in several cases:
+/// 1. If currently at the start of a trigraph, we warn about the trigraph,
+/// then either return the trigraph (skipping 3 chars) or the '?',
+/// depending on whether trigraphs are enabled or not.
+/// 2. If this is an escaped newline (potentially with whitespace between
+/// the backslash and newline), implicitly skip the newline and return
+/// the char after it.
+/// 3. If this is a UCN, return it. FIXME: C++ UCN's?
+///
+/// This handles the slow/uncommon case of the getCharAndSize method. Here we
+/// know that we can accumulate into Size, and that we have already incremented
+/// Ptr by Size bytes.
+///
+/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
+/// be updated to match.
+///
+char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
+ LexerToken *Tok) {
+ // If we have a slash, look for an escaped newline.
+ if (Ptr[0] == '\\') {
+ ++Size;
+ ++Ptr;
+Slash:
+ // Common case, backslash-char where the char is not whitespace.
+ if (!isWhitespace(Ptr[0])) return '\\';
+
+ // See if we have optional whitespace characters followed by a newline.
+ {
+ unsigned SizeTmp = 0;
+ do {
+ ++SizeTmp;
+ if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
+ // Remember that this token needs to be cleaned.
+ if (Tok) Tok->setFlag(LexerToken::NeedsCleaning);
+
+ // Warn if there was whitespace between the backslash and newline.
+ if (SizeTmp != 1 && Tok)
+ Diag(Ptr, diag::backslash_newline_space);
+
+ // If this is a \r\n or \n\r, skip the newlines.
+ if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
+ Ptr[SizeTmp-1] != Ptr[SizeTmp])
+ ++SizeTmp;
+
+ // Found backslash<whitespace><newline>. Parse the char after it.
+ Size += SizeTmp;
+ Ptr += SizeTmp;
+ // Use slow version to accumulate a correct size field.
+ return getCharAndSizeSlow(Ptr, Size, Tok);
+ }
+ } while (isWhitespace(Ptr[SizeTmp]));
+ }
+
+ // Otherwise, this is not an escaped newline, just return the slash.
+ return '\\';
+ }
+
+ // If this is a trigraph, process it.
+ if (Ptr[0] == '?' && Ptr[1] == '?') {
+ // If this is actually a legal trigraph (not something like "??x"), emit
+ // a trigraph warning. If so, and if trigraphs are enabled, return it.
+ if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
+ // Remember that this token needs to be cleaned.
+ if (Tok) Tok->setFlag(LexerToken::NeedsCleaning);
+
+ Ptr += 3;
+ Size += 3;
+ if (C == '\\') goto Slash;
+ return C;
+ }
+ }
+
+ // If this is neither, return a single character.
+ ++Size;
+ return *Ptr;
+}
+
+
+/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
+/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
+/// and that we have already incremented Ptr by Size bytes.
+///
+/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
+/// be updated to match.
+char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
+ const LangOptions &Features) {
+ // If we have a slash, look for an escaped newline.
+ if (Ptr[0] == '\\') {
+ ++Size;
+ ++Ptr;
+Slash:
+ // Common case, backslash-char where the char is not whitespace.
+ if (!isWhitespace(Ptr[0])) return '\\';
+
+ // See if we have optional whitespace characters followed by a newline.
+ {
+ unsigned SizeTmp = 0;
+ do {
+ ++SizeTmp;
+ if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
+
+ // If this is a \r\n or \n\r, skip the newlines.
+ if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
+ Ptr[SizeTmp-1] != Ptr[SizeTmp])
+ ++SizeTmp;
+
+ // Found backslash<whitespace><newline>. Parse the char after it.
+ Size += SizeTmp;
+ Ptr += SizeTmp;
+
+ // Use slow version to accumulate a correct size field.
+ return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
+ }
+ } while (isWhitespace(Ptr[SizeTmp]));
+ }
+
+ // Otherwise, this is not an escaped newline, just return the slash.
+ return '\\';
+ }
+
+ // If this is a trigraph, process it.
+ if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
+ // If this is actually a legal trigraph (not something like "??x"), return
+ // it.
+ if (char C = GetTrigraphCharForLetter(Ptr[2])) {
+ Ptr += 3;
+ Size += 3;
+ if (C == '\\') goto Slash;
+ return C;
+ }
+ }
+
+ // If this is neither, return a single character.
+ ++Size;
+ return *Ptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper methods for lexing.
+//===----------------------------------------------------------------------===//
+
+void Lexer::LexIdentifier(LexerToken &Result, const char *CurPtr) {
+ // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
+ unsigned Size;
+ unsigned char C = *CurPtr++;
+ while (isIdentifierBody(C)) {
+ C = *CurPtr++;
+ }
+ --CurPtr; // Back up over the skipped character.
+
+ // Fast path, no $,\,? in identifier found. '\' might be an escaped newline
+ // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
+ // FIXME: UCNs.
+ if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
+FinishIdentifier:
+ const char *IdStart = BufferPtr;
+ FormTokenWithChars(Result, CurPtr);
+ Result.setKind(tok::identifier);
+
+ // If we are in raw mode, return this identifier raw. There is no need to
+ // look up identifier information or