diff options
author | Reid Spencer <rspencer@reidspencer.com> | 2007-07-11 17:01:13 +0000 |
---|---|---|
committer | Reid Spencer <rspencer@reidspencer.com> | 2007-07-11 17:01:13 +0000 |
commit | 5f016e2cb5d11daeb237544de1c5d59f20fe1a6e (patch) | |
tree | 8b6bfcb8783d16827f896d5facbd4549300e8a1e /Lex | |
parent | a5f182095bf2065ca94f1c86957ee91f9068964b (diff) |
Stage two of getting CFE top correct.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@39734 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'Lex')
-rw-r--r-- | Lex/HeaderSearch.cpp | 319 | ||||
-rw-r--r-- | Lex/IdentifierTable.cpp | 188 | ||||
-rw-r--r-- | Lex/Lexer.cpp | 1491 | ||||
-rw-r--r-- | Lex/LiteralSupport.cpp | 661 | ||||
-rw-r--r-- | Lex/MacroExpander.cpp | 636 | ||||
-rw-r--r-- | Lex/MacroInfo.cpp | 70 | ||||
-rw-r--r-- | Lex/Makefile | 28 | ||||
-rw-r--r-- | Lex/PPExpressions.cpp | 654 | ||||
-rw-r--r-- | Lex/Pragma.cpp | 369 | ||||
-rw-r--r-- | Lex/Preprocessor.cpp | 2087 | ||||
-rw-r--r-- | Lex/ScratchBuffer.cpp | 71 |
11 files changed, 6574 insertions, 0 deletions
diff --git a/Lex/HeaderSearch.cpp b/Lex/HeaderSearch.cpp new file mode 100644 index 0000000000..520205e1da --- /dev/null +++ b/Lex/HeaderSearch.cpp @@ -0,0 +1,319 @@ +//===--- HeaderSearch.cpp - Resolve Header File Locations ---===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the DirectoryLookup and HeaderSearch interfaces. +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/FileManager.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/IdentifierTable.h" +#include "llvm/System/Path.h" +#include "llvm/ADT/SmallString.h" +using namespace clang; + +HeaderSearch::HeaderSearch(FileManager &FM) : FileMgr(FM), FrameworkMap(64) { + SystemDirIdx = 0; + NoCurDirSearch = false; + + NumIncluded = 0; + NumMultiIncludeFileOptzn = 0; + NumFrameworkLookups = NumSubFrameworkLookups = 0; +} + +void HeaderSearch::PrintStats() { + fprintf(stderr, "\n*** HeaderSearch Stats:\n"); + fprintf(stderr, "%d files tracked.\n", (int)FileInfo.size()); + unsigned NumOnceOnlyFiles = 0, MaxNumIncludes = 0, NumSingleIncludedFiles = 0; + for (unsigned i = 0, e = FileInfo.size(); i != e; ++i) { + NumOnceOnlyFiles += FileInfo[i].isImport; + if (MaxNumIncludes < FileInfo[i].NumIncludes) + MaxNumIncludes = FileInfo[i].NumIncludes; + NumSingleIncludedFiles += FileInfo[i].NumIncludes == 1; + } + fprintf(stderr, " %d #import/#pragma once files.\n", NumOnceOnlyFiles); + fprintf(stderr, " %d included exactly once.\n", NumSingleIncludedFiles); + fprintf(stderr, " %d max times a file is included.\n", MaxNumIncludes); + + fprintf(stderr, " %d #include/#include_next/#import.\n", NumIncluded); + fprintf(stderr, " %d #includes skipped due to" + " the multi-include optimization.\n", NumMultiIncludeFileOptzn); + + fprintf(stderr, "%d framework lookups.\n", NumFrameworkLookups); + fprintf(stderr, "%d subframework lookups.\n", NumSubFrameworkLookups); +} + +//===----------------------------------------------------------------------===// +// Header File Location. +//===----------------------------------------------------------------------===// + +const FileEntry *HeaderSearch::DoFrameworkLookup(const DirectoryEntry *Dir, + const char *FilenameStart, + const char *FilenameEnd) { + // Framework names must have a '/' in the filename. + const char *SlashPos = std::find(FilenameStart, FilenameEnd, '/'); + if (SlashPos == FilenameEnd) return 0; + + llvm::StringMapEntry<const DirectoryEntry *> &CacheLookup = + FrameworkMap.GetOrCreateValue(FilenameStart, SlashPos); + + // If it is some other directory, fail. + if (CacheLookup.getValue() && CacheLookup.getValue() != Dir) + return 0; + + // FrameworkName = "/System/Library/Frameworks/" + llvm::SmallString<1024> FrameworkName; + FrameworkName += Dir->getName(); + if (FrameworkName.empty() || FrameworkName.back() != '/') + FrameworkName.push_back('/'); + + // FrameworkName = "/System/Library/Frameworks/Cocoa" + FrameworkName.append(FilenameStart, SlashPos); + + // FrameworkName = "/System/Library/Frameworks/Cocoa.framework/" + FrameworkName += ".framework/"; + + if (CacheLookup.getValue() == 0) { + ++NumFrameworkLookups; + + // If the framework dir doesn't exist, we fail. + if (!llvm::sys::Path(std::string(FrameworkName.begin(), + FrameworkName.end())).exists()) + return 0; + + // Otherwise, if it does, remember that this is the right direntry for this + // framework. + CacheLookup.setValue(Dir); + } + + // Check "/System/Library/Frameworks/Cocoa.framework/Headers/file.h" + unsigned OrigSize = FrameworkName.size(); + + FrameworkName += "Headers/"; + FrameworkName.append(SlashPos+1, FilenameEnd); + if (const FileEntry *FE = FileMgr.getFile(FrameworkName.begin(), + FrameworkName.end())) { + return FE; + } + + // Check "/System/Library/Frameworks/Cocoa.framework/PrivateHeaders/file.h" + const char *Private = "Private"; + FrameworkName.insert(FrameworkName.begin()+OrigSize, Private, + Private+strlen(Private)); + return FileMgr.getFile(FrameworkName.begin(), FrameworkName.end()); +} + +/// LookupFile - Given a "foo" or <foo> reference, look up the indicated file, +/// return null on failure. isAngled indicates whether the file reference is +/// for system #include's or not (i.e. using <> instead of ""). CurFileEnt, if +/// non-null, indicates where the #including file is, in case a relative search +/// is needed. +const FileEntry *HeaderSearch::LookupFile(const char *FilenameStart, + const char *FilenameEnd, + bool isAngled, + const DirectoryLookup *FromDir, + const DirectoryLookup *&CurDir, + const FileEntry *CurFileEnt) { + // If 'Filename' is absolute, check to see if it exists and no searching. + // FIXME: Portability. This should be a sys::Path interface, this doesn't + // handle things like C:\foo.txt right, nor win32 \\network\device\blah. + if (FilenameStart[0] == '/') { + CurDir = 0; + + // If this was an #include_next "/absolute/file", fail. + if (FromDir) return 0; + + // Otherwise, just return the file. + return FileMgr.getFile(FilenameStart, FilenameEnd); + } + + llvm::SmallString<1024> TmpDir; + + // Step #0, unless disabled, check to see if the file is in the #includer's + // directory. This search is not done for <> headers. + if (CurFileEnt && !isAngled && !NoCurDirSearch) { + // Concatenate the requested file onto the directory. + // FIXME: Portability. Filename concatenation should be in sys::Path. + TmpDir += CurFileEnt->getDir()->getName(); + TmpDir.push_back('/'); + TmpDir.append(FilenameStart, FilenameEnd); + if (const FileEntry *FE = FileMgr.getFile(TmpDir.begin(), TmpDir.end())) { + // Leave CurDir unset. + + // This file is a system header or C++ unfriendly if the old file is. + getFileInfo(FE).DirInfo = getFileInfo(CurFileEnt).DirInfo; + return FE; + } + TmpDir.clear(); + } + + CurDir = 0; + + // If this is a system #include, ignore the user #include locs. + unsigned i = isAngled ? SystemDirIdx : 0; + + // If this is a #include_next request, start searching after the directory the + // file was found in. + if (FromDir) + i = FromDir-&SearchDirs[0]; + + // Check each directory in sequence to see if it contains this file. + for (; i != SearchDirs.size(); ++i) { + const FileEntry *FE = 0; + if (!SearchDirs[i].isFramework()) { + // FIXME: Portability. Adding file to dir should be in sys::Path. + // Concatenate the requested file onto the directory. + TmpDir.clear(); + TmpDir += SearchDirs[i].getDir()->getName(); + TmpDir.push_back('/'); + TmpDir.append(FilenameStart, FilenameEnd); + FE = FileMgr.getFile(TmpDir.begin(), TmpDir.end()); + } else { + FE = DoFrameworkLookup(SearchDirs[i].getDir(), FilenameStart,FilenameEnd); + } + + if (FE) { + CurDir = &SearchDirs[i]; + + // This file is a system header or C++ unfriendly if the dir is. + getFileInfo(FE).DirInfo = CurDir->getDirCharacteristic(); + return FE; + } + } + + // Otherwise, didn't find it. + return 0; +} + +/// LookupSubframeworkHeader - Look up a subframework for the specified +/// #include file. For example, if #include'ing <HIToolbox/HIToolbox.h> from +/// within ".../Carbon.framework/Headers/Carbon.h", check to see if HIToolbox +/// is a subframework within Carbon.framework. If so, return the FileEntry +/// for the designated file, otherwise return null. +const FileEntry *HeaderSearch:: +LookupSubframeworkHeader(const char *FilenameStart, + const char *FilenameEnd, + const FileEntry *ContextFileEnt) { + // Framework names must have a '/' in the filename. Find it. + const char *SlashPos = std::find(FilenameStart, FilenameEnd, '/'); + if (SlashPos == FilenameEnd) return 0; + + // Look up the base framework name of the ContextFileEnt. + const char *ContextName = ContextFileEnt->getName(); + + // If the context info wasn't a framework, couldn't be a subframework. + const char *FrameworkPos = strstr(ContextName, ".framework/"); + if (FrameworkPos == 0) + return 0; + + llvm::SmallString<1024> FrameworkName(ContextName, + FrameworkPos+strlen(".framework/")); + + // Append Frameworks/HIToolbox.framework/ + FrameworkName += "Frameworks/"; + FrameworkName.append(FilenameStart, SlashPos); + FrameworkName += ".framework/"; + + llvm::StringMapEntry<const DirectoryEntry *> &CacheLookup = + FrameworkMap.GetOrCreateValue(FilenameStart, SlashPos); + + // Some other location? + if (CacheLookup.getValue() && + CacheLookup.getKeyLength() == FrameworkName.size() && + memcmp(CacheLookup.getKeyData(), &FrameworkName[0], + CacheLookup.getKeyLength()) != 0) + return 0; + + // Cache subframework. + if (CacheLookup.getValue() == 0) { + ++NumSubFrameworkLookups; + + // If the framework dir doesn't exist, we fail. + const DirectoryEntry *Dir = FileMgr.getDirectory(FrameworkName.begin(), + FrameworkName.end()); + if (Dir == 0) return 0; + + // Otherwise, if it does, remember that this is the right direntry for this + // framework. + CacheLookup.setValue(Dir); + } + + const FileEntry *FE = 0; + + // Check ".../Frameworks/HIToolbox.framework/Headers/HIToolbox.h" + llvm::SmallString<1024> HeadersFilename(FrameworkName); + HeadersFilename += "Headers/"; + HeadersFilename.append(SlashPos+1, FilenameEnd); + if (!(FE = FileMgr.getFile(HeadersFilename.begin(), + HeadersFilename.end()))) { + + // Check ".../Frameworks/HIToolbox.framework/PrivateHeaders/HIToolbox.h" + HeadersFilename = FrameworkName; + HeadersFilename += "PrivateHeaders/"; + HeadersFilename.append(SlashPos+1, FilenameEnd); + if (!(FE = FileMgr.getFile(HeadersFilename.begin(), HeadersFilename.end()))) + return 0; + } + + // This file is a system header or C++ unfriendly if the old file is. + getFileInfo(FE).DirInfo = getFileInfo(ContextFileEnt).DirInfo; + return FE; +} + +//===----------------------------------------------------------------------===// +// File Info Management. +//===----------------------------------------------------------------------===// + + +/// getFileInfo - Return the PerFileInfo structure for the specified +/// FileEntry. +HeaderSearch::PerFileInfo &HeaderSearch::getFileInfo(const FileEntry *FE) { + if (FE->getUID() >= FileInfo.size()) + FileInfo.resize(FE->getUID()+1); + return FileInfo[FE->getUID()]; +} + +/// ShouldEnterIncludeFile - Mark the specified file as a target of of a +/// #include, #include_next, or #import directive. Return false if #including +/// the file will have no effect or true if we should include it. +bool HeaderSearch::ShouldEnterIncludeFile(const FileEntry *File, bool isImport){ + ++NumIncluded; // Count # of attempted #includes. + + // Get information about this file. + PerFileInfo &FileInfo = getFileInfo(File); + + // If this is a #import directive, check that we have not already imported + // this header. + if (isImport) { + // If this has already been imported, don't import it again. + FileInfo.isImport = true; + + // Has this already been #import'ed or #include'd? + if (FileInfo.NumIncludes) return false; + } else { + // Otherwise, if this is a #include of a file that was previously #import'd + // or if this is the second #include of a #pragma once file, ignore it. + if (FileInfo.isImport) + return false; + } + + // Next, check to see if the file is wrapped with #ifndef guards. If so, and + // if the macro that guards it is defined, we know the #include has no effect. + if (FileInfo.ControllingMacro && FileInfo.ControllingMacro->getMacroInfo()) { + ++NumMultiIncludeFileOptzn; + return false; + } + + // Increment the number of times this file has been included. + ++FileInfo.NumIncludes; + + return true; +} + + diff --git a/Lex/IdentifierTable.cpp b/Lex/IdentifierTable.cpp new file mode 100644 index 0000000000..e671af9839 --- /dev/null +++ b/Lex/IdentifierTable.cpp @@ -0,0 +1,188 @@ +//===--- IdentifierTable.cpp - Hash table for identifier lookup -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the IdentifierInfo, IdentifierVisitor, and +// IdentifierTable interfaces. +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/IdentifierTable.h" +#include "clang/Lex/MacroInfo.h" +#include "clang/Basic/LangOptions.h" +using namespace clang; + +//===----------------------------------------------------------------------===// +// IdentifierInfo Implementation +//===----------------------------------------------------------------------===// + +IdentifierInfo::IdentifierInfo() { + Macro = 0; + TokenID = tok::identifier; + PPID = tok::pp_not_keyword; + ObjCID = tok::objc_not_keyword; + BuiltinID = 0; + IsExtension = false; + IsPoisoned = false; + IsOtherTargetMacro = false; + IsCPPOperatorKeyword = false; + FETokenInfo = 0; +} + +IdentifierInfo::~IdentifierInfo() { + delete Macro; +} + +//===----------------------------------------------------------------------===// +// IdentifierTable Implementation +//===----------------------------------------------------------------------===// + +IdentifierTable::IdentifierTable(const LangOptions &LangOpts) + // Start with space for 8K identifiers. + : HashTable(8192) { + + // Populate the identifier table with info about keywords for the current + // language. + AddKeywords(LangOpts); +} + +//===----------------------------------------------------------------------===// +// Language Keyword Implementation +//===----------------------------------------------------------------------===// + +/// AddKeyword - This method is used to associate a token ID with specific +/// identifiers because they are language keywords. This causes the lexer to +/// automatically map matching identifiers to specialized token codes. +/// +/// The C90/C99/CPP flags are set to 0 if the token should be enabled in the +/// specified langauge, set to 1 if it is an extension in the specified +/// language, and set to 2 if disabled in the specified language. +static void AddKeyword(const char *Keyword, unsigned KWLen, + tok::TokenKind TokenCode, + int C90, int C99, int CXX, + const LangOptions &LangOpts, IdentifierTable &Table) { + int Flags = LangOpts.CPlusPlus ? CXX : (LangOpts.C99 ? C99 : C90); + + // Don't add this keyword if disabled in this language or if an extension + // and extensions are disabled. + if (Flags + LangOpts.NoExtensions >= 2) return; + + IdentifierInfo &Info = Table.get(Keyword, Keyword+KWLen); + Info.setTokenID(TokenCode); + Info.setIsExtensionToken(Flags == 1); +} + +static void AddAlias(const char *Keyword, unsigned KWLen, + const char *AliaseeKeyword, unsigned AliaseeKWLen, + const LangOptions &LangOpts, IdentifierTable &Table) { + IdentifierInfo &AliasInfo = Table.get(Keyword, Keyword+KWLen); + IdentifierInfo &AliaseeInfo = Table.get(AliaseeKeyword, + AliaseeKeyword+AliaseeKWLen); + AliasInfo.setTokenID(AliaseeInfo.getTokenID()); + AliasInfo.setIsExtensionToken(AliaseeInfo.isExtensionToken()); +} + +/// AddPPKeyword - Register a preprocessor keyword like "define" "undef" or +/// "elif". +static void AddPPKeyword(tok::PPKeywordKind PPID, + const char *Name, unsigned NameLen, + IdentifierTable &Table) { + Table.get(Name, Name+NameLen).setPPKeywordID(PPID); +} + +/// AddCXXOperatorKeyword - Register a C++ operator keyword alternative +/// representations. +static void AddCXXOperatorKeyword(const char *Keyword, unsigned KWLen, + tok::TokenKind TokenCode, + IdentifierTable &Table) { + IdentifierInfo &Info = Table.get(Keyword, Keyword + KWLen); + Info.setTokenID(TokenCode); + Info.setIsCPlusplusOperatorKeyword(); +} + +/// AddObjCKeyword - Register an Objective-C @keyword like "class" "selector" or +/// "property". +static void AddObjCKeyword(tok::ObjCKeywordKind ObjCID, + const char *Name, unsigned NameLen, + IdentifierTable &Table) { + Table.get(Name, Name+NameLen).setObjCKeywordID(ObjCID); +} + +/// AddKeywords - Add all keywords to the symbol table. +/// +void IdentifierTable::AddKeywords(const LangOptions &LangOpts) { + enum { + C90Shift = 0, + EXTC90 = 1 << C90Shift, + NOTC90 = 2 << C90Shift, + C99Shift = 2, + EXTC99 = 1 << C99Shift, + NOTC99 = 2 << C99Shift, + CPPShift = 4, + EXTCPP = 1 << CPPShift, + NOTCPP = 2 << CPPShift, + Mask = 3 + }; + + // Add keywords and tokens for the current language. +#define KEYWORD(NAME, FLAGS) \ + AddKeyword(#NAME, strlen(#NAME), tok::kw_ ## NAME, \ + ((FLAGS) >> C90Shift) & Mask, \ + ((FLAGS) >> C99Shift) & Mask, \ + ((FLAGS) >> CPPShift) & Mask, LangOpts, *this); +#define ALIAS(NAME, TOK) \ + AddAlias(NAME, strlen(NAME), #TOK, strlen(#TOK), LangOpts, *this); +#define PPKEYWORD(NAME) \ + AddPPKeyword(tok::pp_##NAME, #NAME, strlen(#NAME), *this); +#define CXX_KEYWORD_OPERATOR(NAME, ALIAS) \ + if (LangOpts.CXXOperatorNames) \ + AddCXXOperatorKeyword(#NAME, strlen(#NAME), tok::ALIAS, *this); +#define OBJC1_AT_KEYWORD(NAME) \ + if (LangOpts.ObjC1) \ + AddObjCKeyword(tok::objc_##NAME, #NAME, strlen(#NAME), *this); +#define OBJC2_AT_KEYWORD(NAME) \ + if (LangOpts.ObjC2) \ + AddObjCKeyword(tok::objc_##NAME, #NAME, strlen(#NAME), *this); +#include "clang/Basic/TokenKinds.def" +} + + +//===----------------------------------------------------------------------===// +// Stats Implementation +//===----------------------------------------------------------------------===// + +/// PrintStats - Print statistics about how well the identifier table is doing +/// at hashing identifiers. +void IdentifierTable::PrintStats() const { + unsigned NumBuckets = HashTable.getNumBuckets(); + unsigned NumIdentifiers = HashTable.getNumItems(); + unsigned NumEmptyBuckets = NumBuckets-NumIdentifiers; + unsigned AverageIdentifierSize = 0; + unsigned MaxIdentifierLength = 0; + + // TODO: Figure out maximum times an identifier had to probe for -stats. + for (llvm::StringMap<IdentifierInfo, llvm::BumpPtrAllocator>::const_iterator + I = HashTable.begin(), E = HashTable.end(); I != E; ++I) { + unsigned IdLen = I->getKeyLength(); + AverageIdentifierSize += IdLen; + if (MaxIdentifierLength < IdLen) + MaxIdentifierLength = IdLen; + } + + fprintf(stderr, "\n*** Identifier Table Stats:\n"); + fprintf(stderr, "# Identifiers: %d\n", NumIdentifiers); + fprintf(stderr, "# Empty Buckets: %d\n", NumEmptyBuckets); + fprintf(stderr, "Hash density (#identifiers per bucket): %f\n", + NumIdentifiers/(double)NumBuckets); + fprintf(stderr, "Ave identifier length: %f\n", + (AverageIdentifierSize/(double)NumIdentifiers)); + fprintf(stderr, "Max identifier length: %d\n", MaxIdentifierLength); + + // Compute statistics about the memory allocated for identifiers. + HashTable.getAllocator().PrintStats(); +} diff --git a/Lex/Lexer.cpp b/Lex/Lexer.cpp new file mode 100644 index 0000000000..1775b2f7bf --- /dev/null +++ b/Lex/Lexer.cpp @@ -0,0 +1,1491 @@ +//===--- Lexer.cpp - C Language Family Lexer ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Lexer and LexerToken interfaces. +// +//===----------------------------------------------------------------------===// +// +// TODO: GCC Diagnostics emitted by the lexer: +// PEDWARN: (form feed|vertical tab) in preprocessing directive +// +// Universal characters, unicode, char mapping: +// WARNING: `%.*s' is not in NFKC +// WARNING: `%.*s' is not in NFC +// +// Other: +// TODO: Options to support: +// -fexec-charset,-fwide-exec-charset +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/Lexer.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/SourceLocation.h" +#include "llvm/Support/MemoryBuffer.h" +#include <cctype> +using namespace clang; + +static void InitCharacterInfo(); + +Lexer::Lexer(const llvm::MemoryBuffer *File, unsigned fileid, Preprocessor &pp, + const char *BufStart, const char *BufEnd) + : BufferEnd(BufEnd ? BufEnd : File->getBufferEnd()), + InputFile(File), CurFileID(fileid), PP(pp), Features(PP.getLangOptions()) { + Is_PragmaLexer = false; + IsMainFile = false; + InitCharacterInfo(); + + assert(BufferEnd[0] == 0 && + "We assume that the input buffer has a null character at the end" + " to simplify lexing!"); + + BufferPtr = BufStart ? BufStart : File->getBufferStart(); + + // Start of the file is a start of line. + IsAtStartOfLine = true; + + // We are not after parsing a #. + ParsingPreprocessorDirective = false; + + // We are not after parsing #include. + ParsingFilename = false; + + // We are not in raw mode. Raw mode disables diagnostics and interpretation + // of tokens (e.g. identifiers, thus disabling macro expansion). It is used + // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block + // or otherwise skipping over tokens. + LexingRawMode = false; + + // Default to keeping comments if requested. + KeepCommentMode = PP.getCommentRetentionState(); +} + +/// Stringify - Convert the specified string into a C string, with surrounding +/// ""'s, and with escaped \ and " characters. +std::string Lexer::Stringify(const std::string &Str, bool Charify) { + std::string Result = Str; + char Quote = Charify ? '\'' : '"'; + for (unsigned i = 0, e = Result.size(); i != e; ++i) { + if (Result[i] == '\\' || Result[i] == Quote) { + Result.insert(Result.begin()+i, '\\'); + ++i; ++e; + } + } + return Result; +} + + +//===----------------------------------------------------------------------===// +// Character information. +//===----------------------------------------------------------------------===// + +static unsigned char CharInfo[256]; + +enum { + CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' + CHAR_VERT_WS = 0x02, // '\r', '\n' + CHAR_LETTER = 0x04, // a-z,A-Z + CHAR_NUMBER = 0x08, // 0-9 + CHAR_UNDER = 0x10, // _ + CHAR_PERIOD = 0x20 // . +}; + +static void InitCharacterInfo() { + static bool isInited = false; + if (isInited) return; + isInited = true; + + // Intiialize the CharInfo table. + // TODO: statically initialize this. + CharInfo[(int)' '] = CharInfo[(int)'\t'] = + CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS; + CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS; + + CharInfo[(int)'_'] = CHAR_UNDER; + CharInfo[(int)'.'] = CHAR_PERIOD; + for (unsigned i = 'a'; i <= 'z'; ++i) + CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER; + for (unsigned i = '0'; i <= '9'; ++i) + CharInfo[i] = CHAR_NUMBER; +} + +/// isIdentifierBody - Return true if this is the body character of an +/// identifier, which is [a-zA-Z0-9_]. +static inline bool isIdentifierBody(unsigned char c) { + return CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER); +} + +/// isHorizontalWhitespace - Return true if this character is horizontal +/// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'. +static inline bool isHorizontalWhitespace(unsigned char c) { + return CharInfo[c] & CHAR_HORZ_WS; +} + +/// isWhitespace - Return true if this character is horizontal or vertical +/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false +/// for '\0'. +static inline bool isWhitespace(unsigned char c) { + return CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS); +} + +/// isNumberBody - Return true if this is the body character of an +/// preprocessing number, which is [a-zA-Z0-9_.]. +static inline bool isNumberBody(unsigned char c) { + return CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD); +} + + +//===----------------------------------------------------------------------===// +// Diagnostics forwarding code. +//===----------------------------------------------------------------------===// + +/// getSourceLocation - Return a source location identifier for the specified +/// offset in the current file. +SourceLocation Lexer::getSourceLocation(const char *Loc) const { + assert(Loc >= InputFile->getBufferStart() && Loc <= BufferEnd && + "Location out of range for this buffer!"); + return SourceLocation(CurFileID, Loc-InputFile->getBufferStart()); +} + + +/// Diag - Forwarding function for diagnostics. This translate a source +/// position in the current buffer into a SourceLocation object for rendering. +void Lexer::Diag(const char *Loc, unsigned DiagID, + const std::string &Msg) const { + if (LexingRawMode && Diagnostic::isNoteWarningOrExtension(DiagID)) + return; + PP.Diag(getSourceLocation(Loc), DiagID, Msg); +} +void Lexer::Diag(SourceLocation Loc, unsigned DiagID, + const std::string &Msg) const { + if (LexingRawMode && Diagnostic::isNoteWarningOrExtension(DiagID)) + return; + PP.Diag(Loc, DiagID, Msg); +} + + +//===----------------------------------------------------------------------===// +// Trigraph and Escaped Newline Handling Code. +//===----------------------------------------------------------------------===// + +/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, +/// return the decoded trigraph letter it corresponds to, or '\0' if nothing. +static char GetTrigraphCharForLetter(char Letter) { + switch (Letter) { + default: return 0; + case '=': return '#'; + case ')': return ']'; + case '(': return '['; + case '!': return '|'; + case '\'': return '^'; + case '>': return '}'; + case '/': return '\\'; + case '<': return '{'; + case '-': return '~'; + } +} + +/// DecodeTrigraphChar - If the specified character is a legal trigraph when +/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, +/// return the result character. Finally, emit a warning about trigraph use +/// whether trigraphs are enabled or not. +static char DecodeTrigraphChar(const char *CP, Lexer *L) { + char Res = GetTrigraphCharForLetter(*CP); + if (Res && L) { + if (!L->getFeatures().Trigraphs) { + L->Diag(CP-2, diag::trigraph_ignored); + return 0; + } else { + L->Diag(CP-2, diag::trigraph_converted, std::string()+Res); + } + } + return Res; +} + +/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, +/// get its size, and return it. This is tricky in several cases: +/// 1. If currently at the start of a trigraph, we warn about the trigraph, +/// then either return the trigraph (skipping 3 chars) or the '?', +/// depending on whether trigraphs are enabled or not. +/// 2. If this is an escaped newline (potentially with whitespace between +/// the backslash and newline), implicitly skip the newline and return +/// the char after it. +/// 3. If this is a UCN, return it. FIXME: C++ UCN's? +/// +/// This handles the slow/uncommon case of the getCharAndSize method. Here we +/// know that we can accumulate into Size, and that we have already incremented +/// Ptr by Size bytes. +/// +/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should +/// be updated to match. +/// +char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, + LexerToken *Tok) { + // If we have a slash, look for an escaped newline. + if (Ptr[0] == '\\') { + ++Size; + ++Ptr; +Slash: + // Common case, backslash-char where the char is not whitespace. + if (!isWhitespace(Ptr[0])) return '\\'; + + // See if we have optional whitespace characters followed by a newline. + { + unsigned SizeTmp = 0; + do { + ++SizeTmp; + if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') { + // Remember that this token needs to be cleaned. + if (Tok) Tok->setFlag(LexerToken::NeedsCleaning); + + // Warn if there was whitespace between the backslash and newline. + if (SizeTmp != 1 && Tok) + Diag(Ptr, diag::backslash_newline_space); + + // If this is a \r\n or \n\r, skip the newlines. + if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') && + Ptr[SizeTmp-1] != Ptr[SizeTmp]) + ++SizeTmp; + + // Found backslash<whitespace><newline>. Parse the char after it. + Size += SizeTmp; + Ptr += SizeTmp; + // Use slow version to accumulate a correct size field. + return getCharAndSizeSlow(Ptr, Size, Tok); + } + } while (isWhitespace(Ptr[SizeTmp])); + } + + // Otherwise, this is not an escaped newline, just return the slash. + return '\\'; + } + + // If this is a trigraph, process it. + if (Ptr[0] == '?' && Ptr[1] == '?') { + // If this is actually a legal trigraph (not something like "??x"), emit + // a trigraph warning. If so, and if trigraphs are enabled, return it. + if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) { + // Remember that this token needs to be cleaned. + if (Tok) Tok->setFlag(LexerToken::NeedsCleaning); + + Ptr += 3; + Size += 3; + if (C == '\\') goto Slash; + return C; + } + } + + // If this is neither, return a single character. + ++Size; + return *Ptr; +} + + +/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the +/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, +/// and that we have already incremented Ptr by Size bytes. +/// +/// NOTE: When this method is updated, getCharAndSizeSlow (above) should +/// be updated to match. +char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, + const LangOptions &Features) { + // If we have a slash, look for an escaped newline. + if (Ptr[0] == '\\') { + ++Size; + ++Ptr; +Slash: + // Common case, backslash-char where the char is not whitespace. + if (!isWhitespace(Ptr[0])) return '\\'; + + // See if we have optional whitespace characters followed by a newline. + { + unsigned SizeTmp = 0; + do { + ++SizeTmp; + if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') { + + // If this is a \r\n or \n\r, skip the newlines. + if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') && + Ptr[SizeTmp-1] != Ptr[SizeTmp]) + ++SizeTmp; + + // Found backslash<whitespace><newline>. Parse the char after it. + Size += SizeTmp; + Ptr += SizeTmp; + + // Use slow version to accumulate a correct size field. + return getCharAndSizeSlowNoWarn(Ptr, Size, Features); + } + } while (isWhitespace(Ptr[SizeTmp])); + } + + // Otherwise, this is not an escaped newline, just return the slash. + return '\\'; + } + + // If this is a trigraph, process it. + if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { + // If this is actually a legal trigraph (not something like "??x"), return + // it. + if (char C = GetTrigraphCharForLetter(Ptr[2])) { + Ptr += 3; + Size += 3; + if (C == '\\') goto Slash; + return C; + } + } + + // If this is neither, return a single character. + ++Size; + return *Ptr; +} + +//===----------------------------------------------------------------------===// +// Helper methods for lexing. +//===----------------------------------------------------------------------===// + +void Lexer::LexIdentifier(LexerToken &Result, const char *CurPtr) { + // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] + unsigned Size; + unsigned char C = *CurPtr++; + while (isIdentifierBody(C)) { + C = *CurPtr++; + } + --CurPtr; // Back up over the skipped character. + + // Fast path, no $,\,? in identifier found. '\' might be an escaped newline + // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. + // FIXME: UCNs. + if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) { +FinishIdentifier: + const char *IdStart = BufferPtr; + FormTokenWithChars(Result, CurPtr); + Result.setKind(tok::identifier); + + // If we are in raw mode, return this identifier raw. There is no need to + // look up identifier information or |