diff options
author | Michael J. Spencer <bigcheesegs@gmail.com> | 2012-04-03 23:09:22 +0000 |
---|---|---|
committer | Michael J. Spencer <bigcheesegs@gmail.com> | 2012-04-03 23:09:22 +0000 |
commit | 93210e847a1496b24cef881723e57c489082dcfe (patch) | |
tree | 83d1f8828d8b6835a6511d28cf3c63fad8b06aef /lib/Support/YAMLParser.cpp | |
parent | 2ce63c73520cd6e715f9114589f802938b5db01f (diff) |
Add YAML parser to Support.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@153977 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Support/YAMLParser.cpp')
-rw-r--r-- | lib/Support/YAMLParser.cpp | 2115 |
1 files changed, 2115 insertions, 0 deletions
diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp new file mode 100644 index 0000000000..3e302d0eb1 --- /dev/null +++ b/lib/Support/YAMLParser.cpp @@ -0,0 +1,2115 @@ +//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a YAML parser. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/YAMLParser.h" + +#include "llvm/ADT/ilist.h" +#include "llvm/ADT/ilist_node.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SourceMgr.h" + +using namespace llvm; +using namespace yaml; + +enum UnicodeEncodingForm { + UEF_UTF32_LE, //< UTF-32 Little Endian + UEF_UTF32_BE, //< UTF-32 Big Endian + UEF_UTF16_LE, //< UTF-16 Little Endian + UEF_UTF16_BE, //< UTF-16 Big Endian + UEF_UTF8, //< UTF-8 or ascii. + UEF_Unknown //< Not a valid Unicode encoding. +}; + +/// EncodingInfo - Holds the encoding type and length of the byte order mark if +/// it exists. Length is in {0, 2, 3, 4}. +typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; + +/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode +/// encoding form of \a Input. +/// +/// @param Input A string of length 0 or more. +/// @returns An EncodingInfo indicating the Unicode encoding form of the input +/// and how long the byte order mark is if one exists. +static EncodingInfo getUnicodeEncoding(StringRef Input) { + if (Input.size() == 0) + return std::make_pair(UEF_Unknown, 0); + + switch (uint8_t(Input[0])) { + case 0x00: + if (Input.size() >= 4) { + if ( Input[1] == 0 + && uint8_t(Input[2]) == 0xFE + && uint8_t(Input[3]) == 0xFF) + return std::make_pair(UEF_UTF32_BE, 4); + if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) + return std::make_pair(UEF_UTF32_BE, 0); + } + + if (Input.size() >= 2 && Input[1] != 0) + return std::make_pair(UEF_UTF16_BE, 0); + return std::make_pair(UEF_Unknown, 0); + case 0xFF: + if ( Input.size() >= 4 + && uint8_t(Input[1]) == 0xFE + && Input[2] == 0 + && Input[3] == 0) + return std::make_pair(UEF_UTF32_LE, 4); + + if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) + return std::make_pair(UEF_UTF16_LE, 2); + return std::make_pair(UEF_Unknown, 0); + case 0xFE: + if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) + return std::make_pair(UEF_UTF16_BE, 2); + return std::make_pair(UEF_Unknown, 0); + case 0xEF: + if ( Input.size() >= 3 + && uint8_t(Input[1]) == 0xBB + && uint8_t(Input[2]) == 0xBF) + return std::make_pair(UEF_UTF8, 3); + return std::make_pair(UEF_Unknown, 0); + } + + // It could still be utf-32 or utf-16. + if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) + return std::make_pair(UEF_UTF32_LE, 0); + + if (Input.size() >= 2 && Input[1] == 0) + return std::make_pair(UEF_UTF16_LE, 0); + + return std::make_pair(UEF_UTF8, 0); +} + +namespace llvm { +namespace yaml { +/// Token - A single YAML token. +struct Token : ilist_node<Token> { + enum TokenKind { + TK_Error, // Uninitialized token. + TK_StreamStart, + TK_StreamEnd, + TK_VersionDirective, + TK_TagDirective, + TK_DocumentStart, + TK_DocumentEnd, + TK_BlockEntry, + TK_BlockEnd, + TK_BlockSequenceStart, + TK_BlockMappingStart, + TK_FlowEntry, + TK_FlowSequenceStart, + TK_FlowSequenceEnd, + TK_FlowMappingStart, + TK_FlowMappingEnd, + TK_Key, + TK_Value, + TK_Scalar, + TK_Alias, + TK_Anchor, + TK_Tag + } Kind; + + /// A string of length 0 or more whose begin() points to the logical location + /// of the token in the input. + StringRef Range; + + Token() : Kind(TK_Error) {} +}; +} +} + +template<> +struct ilist_sentinel_traits<Token> { + Token *createSentinel() const { + return &Sentinel; + } + static void destroySentinel(Token*) {} + + Token *provideInitialHead() const { return createSentinel(); } + Token *ensureHead(Token*) const { return createSentinel(); } + static void noteHead(Token*, Token*) {} + +private: + mutable Token Sentinel; +}; + +template<> +struct ilist_node_traits<Token> { + Token *createNode(const Token &V) { + return new (Alloc.Allocate<Token>()) Token(V); + } + static void deleteNode(Token *V) {} + + void addNodeToList(Token *) {} + void removeNodeFromList(Token *) {} + void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, + ilist_iterator<Token> /*first*/, + ilist_iterator<Token> /*last*/) {} + + BumpPtrAllocator Alloc; +}; + +typedef ilist<Token> TokenQueueT; + +namespace { +/// @brief This struct is used to track simple keys. +/// +/// Simple keys are handled by creating an entry in SimpleKeys for each Token +/// which could legally be the start of a simple key. When peekNext is called, +/// if the Token To be returned is referenced by a SimpleKey, we continue +/// tokenizing until that potential simple key has either been found to not be +/// a simple key (we moved on to the next line or went further than 1024 chars). +/// Or when we run into a Value, and then insert a Key token (and possibly +/// others) before the SimpleKey's Tok. +struct SimpleKey { + TokenQueueT::iterator Tok; + unsigned Column; + unsigned Line; + unsigned FlowLevel; + bool IsRequired; + + bool operator ==(const SimpleKey &Other) { + return Tok == Other.Tok; + } +}; +} + +/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit +/// subsequence and the subsequence's length in code units (uint8_t). +/// A length of 0 represents an error. +typedef std::pair<uint32_t, unsigned> UTF8Decoded; + +static UTF8Decoded decodeUTF8(StringRef Range) { + StringRef::iterator Position= Range.begin(); + StringRef::iterator End = Range.end(); + // 1 byte: [0x00, 0x7f] + // Bit pattern: 0xxxxxxx + if ((*Position & 0x80) == 0) { + return std::make_pair(*Position, 1); + } + // 2 bytes: [0x80, 0x7ff] + // Bit pattern: 110xxxxx 10xxxxxx + if (Position + 1 != End && + ((*Position & 0xE0) == 0xC0) && + ((*(Position + 1) & 0xC0) == 0x80)) { + uint32_t codepoint = ((*Position & 0x1F) << 6) | + (*(Position + 1) & 0x3F); + if (codepoint >= 0x80) + return std::make_pair(codepoint, 2); + } + // 3 bytes: [0x8000, 0xffff] + // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx + if (Position + 2 != End && + ((*Position & 0xF0) == 0xE0) && + ((*(Position + 1) & 0xC0) == 0x80) && + ((*(Position + 2) & 0xC0) == 0x80)) { + uint32_t codepoint = ((*Position & 0x0F) << 12) | + ((*(Position + 1) & 0x3F) << 6) | + (*(Position + 2) & 0x3F); + // Codepoints between 0xD800 and 0xDFFF are invalid, as + // they are high / low surrogate halves used by UTF-16. + if (codepoint >= 0x800 && + (codepoint < 0xD800 || codepoint > 0xDFFF)) + return std::make_pair(codepoint, 3); + } + // 4 bytes: [0x10000, 0x10FFFF] + // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + if (Position + 3 != End && + ((*Position & 0xF8) == 0xF0) && + ((*(Position + 1) & 0xC0) == 0x80) && + ((*(Position + 2) & 0xC0) == 0x80) && + ((*(Position + 3) & 0xC0) == 0x80)) { + uint32_t codepoint = ((*Position & 0x07) << 18) | + ((*(Position + 1) & 0x3F) << 12) | + ((*(Position + 2) & 0x3F) << 6) | + (*(Position + 3) & 0x3F); + if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) + return std::make_pair(codepoint, 4); + } + return std::make_pair(0, 0); +} + +namespace llvm { +namespace yaml { +/// @brief Scans YAML tokens from a MemoryBuffer. +class Scanner { +public: + Scanner(const StringRef Input, SourceMgr &SM); + + /// @brief Parse the next token and return it without popping it. + Token &peekNext(); + + /// @brief Parse the next token and pop it from the queue. + Token getNext(); + + void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, + ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) { + SM.PrintMessage(Loc, Kind, Message, Ranges); + } + + void setError(const Twine &Message, StringRef::iterator Position) { + if (Current >= End) + Current = End - 1; + + // Don't print out more errors after the first one we encounter. The rest + // are just the result of the first, and have no meaning. + if (!Failed) + printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); + Failed = true; + } + + void setError(const Twine &Message) { + setError(Message, Current); + } + + /// @brief Returns true if an error occurred while parsing. + bool failed() { + return Failed; + } + +private: + StringRef currentInput() { + return StringRef(Current, End - Current); + } + + /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting + /// at \a Position. + /// + /// If the UTF-8 code units starting at Position do not form a well-formed + /// code unit subsequence, then the Unicode scalar value is 0, and the length + /// is 0. + UTF8Decoded decodeUTF8(StringRef::iterator Position) { + return ::decodeUTF8(StringRef(Position, End - Position)); + } + + // The following functions are based on the gramar rules in the YAML spec. The + // style of the function names it meant to closely match how they are written + // in the spec. The number within the [] is the number of the grammar rule in + // the spec. + // + // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. + // + // c- + // A production starting and ending with a special character. + // b- + // A production matching a single line break. + // nb- + // A production starting and ending with a non-break character. + // s- + // A production starting and ending with a white space character. + // ns- + // A production starting and ending with a non-space character. + // l- + // A production matching complete line(s). + + /// @brief Skip a single nb-char[27] starting at Position. + /// + /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] + /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] + /// + /// @returns The code unit after the nb-char, or Position if it's not an + /// nb-char. + StringRef::iterator skip_nb_char(StringRef::iterator Position); + + /// @brief Skip a single b-break[28] starting at Position. + /// + /// A b-break is 0xD 0xA | 0xD | 0xA + /// + /// @returns The code unit after the b-break, or Position if it's not a + /// b-break. + StringRef::iterator skip_b_break(StringRef::iterator Position); + + /// @brief Skip a single s-white[33] starting at Position. + /// + /// A s-white is 0x20 | 0x9 + /// + /// @returns The code unit after the s-white, or Position if it's not a + /// s-white. + StringRef::iterator skip_s_white(StringRef::iterator Position); + + /// @brief Skip a single ns-char[34] starting at Position. + /// + /// A ns-char is nb-char - s-white + /// + /// @returns The code unit after the ns-char, or Position if it's not a + /// ns-char. + StringRef::iterator skip_ns_char(StringRef::iterator Position); + + typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); + /// @brief Skip minimal well-formed code unit subsequences until Func + /// returns its input. + /// + /// @returns The code unit after the last minimal well-formed code unit + /// subsequence that Func accepted. + StringRef::iterator skip_while( SkipWhileFunc Func + , StringRef::iterator Position); + + /// @brief Scan ns-uri-char[39]s starting at Cur. + /// + /// This updates Cur and Column while scanning. + /// + /// @returns A StringRef starting at Cur which covers the longest contiguous + /// sequence of ns-uri-char. + StringRef scan_ns_uri_char(); + + /// @brief Scan ns-plain-one-line[133] starting at \a Cur. + StringRef scan_ns_plain_one_line(); + + /// @brief Consume a minimal well-formed code unit subsequence starting at + /// \a Cur. Return false if it is not the same Unicode scalar value as + /// \a Expected. This updates \a Column. + bool consume(uint32_t Expected); + + /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. + void skip(uint32_t Distance); + + /// @brief Return true if the minimal well-formed code unit subsequence at + /// Pos is whitespace or a new line + bool isBlankOrBreak(StringRef::iterator Position); + + /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. + void saveSimpleKeyCandidate( TokenQueueT::iterator Tok + , unsigned AtColumn + , bool IsRequired); + + /// @brief Remove simple keys that can no longer be valid simple keys. + /// + /// Invalid simple keys are not on the current line or are further than 1024 + /// columns back. + void removeStaleSimpleKeyCandidates(); + + /// @brief Remove all simple keys on FlowLevel \a Level. + void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); + + /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd + /// tokens if needed. + bool unrollIndent(int ToColumn); + + /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint + /// if needed. + bool rollIndent( int ToColumn + , Token::TokenKind Kind + , TokenQueueT::iterator InsertPoint); + + /// @brief Skip whitespace and comments until the start of the next token. + void scanToNextToken(); + + /// @brief Must be the first token generated. + bool scanStreamStart(); + + /// @brief Generate tokens needed to close out the stream. + bool scanStreamEnd(); + + /// @brief Scan a %BLAH directive. + bool scanDirective(); + + /// @brief Scan a ... or ---. + bool scanDocumentIndicator(bool IsStart); + + /// @brief Scan a [ or { and generate the proper flow collection start token. + bool scanFlowCollectionStart(bool IsSequence); + + /// @brief Scan a ] or } and generate the proper flow collection end token. + bool scanFlowCollectionEnd(bool IsSequence); + + /// @brief Scan the , that separates entries in a flow collection. + bool scanFlowEntry(); + + /// @brief Scan the - that starts block sequence entries. + bool scanBlockEntry(); + + /// @brief Scan an explicit ? indicating a key. + bool scanKey(); + + /// @brief Scan an explicit : indicating a value. + bool scanValue(); + + /// @brief Scan a quoted scalar. + bool scanFlowScalar(bool IsDoubleQuoted); + + /// @brief Scan an unquoted scalar. + bool scanPlainScalar(); + + /// @brief Scan an Alias or Anchor starting with * or &. + bool scanAliasOrAnchor(bool IsAlias); + + /// @brief Scan a block scalar starting with | or >. + bool scanBlockScalar(bool IsLiteral); + + /// @brief Scan a tag of the form !stuff. + bool scanTag(); + + /// @brief Dispatch to the next scanning function based on \a *Cur. + bool fetchMoreTokens(); + + /// @brief The SourceMgr used for diagnostics and buffer management. + SourceMgr &SM; + + /// @brief The original input. + MemoryBuffer *InputBuffer; + + /// @brief The current position of the scanner. + StringRef::iterator Current; + + /// @brief The end of the input (one past the last character). + StringRef::iterator End; + + /// @brief Current YAML indentation level in spaces. + int Indent; + + /// @brief Current column number in Unicode code points. + unsigned Column; + + /// @brief Current line number. + unsigned Line; + + /// @brief How deep we are in flow style containers. 0 Means at block level. + unsigned FlowLevel; + + /// @brief Are we at the start of the stream? + bool IsStartOfStream; + + /// @brief Can the next token be the start of a simple key? + bool IsSimpleKeyAllowed; + + /// @brief Is the next token required to start a simple key? + bool IsSimpleKeyRequired; + + /// @brief True if an error has occurred. + bool Failed; + + /// @brief Queue of tokens. This is required to queue up tokens while looking + /// for the end of a simple key. And for cases where a single character + /// can produce multiple tokens (e.g. BlockEnd). + TokenQueueT TokenQueue; + + /// @brief Indentation levels. + SmallVector<int, 4> Indents; + + /// @brief Potential simple keys. + SmallVector<SimpleKey, 4> SimpleKeys; +}; + +} // end namespace yaml +} // end namespace llvm + +/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. +static void encodeUTF8( uint32_t UnicodeScalarValue + , SmallVectorImpl<char> &Result) { + if (UnicodeScalarValue <= 0x7F) { + Result.push_back(UnicodeScalarValue & 0x7F); + } else if (UnicodeScalarValue <= 0x7FF) { + uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); + uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); + Result.push_back(FirstByte); + Result.push_back(SecondByte); + } else if (UnicodeScalarValue <= 0xFFFF) { + uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); + uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); + uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); + Result.push_back(FirstByte); + Result.push_back(SecondByte); + Result.push_back(ThirdByte); + } else if (UnicodeScalarValue <= 0x10FFFF) { + uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); + uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); + uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); + uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); + Result.push_back(FirstByte); + Result.push_back(SecondByte); + Result.push_back(ThirdByte); + Result.push_back(FourthByte); + } +} + +bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { + SourceMgr SM; + Scanner scanner(Input, SM); + while (true) { + Token T = scanner.getNext(); + switch (T.Kind) { + case Token::TK_StreamStart: + OS << "Stream-Start: "; + break; + case Token::TK_StreamEnd: + OS << "Stream-End: "; + break; + case Token::TK_VersionDirective: + OS << "Version-Directive: "; + break; + case Token::TK_TagDirective: + OS << "Tag-Directive: "; + break; + case Token::TK_DocumentStart: + OS << "Document-Start: "; + break; + case Token::TK_DocumentEnd: + OS << "Document-End: "; + break; + case Token::TK_BlockEntry: + OS << "Block-Entry: "; + break; + case Token::TK_BlockEnd: + OS << "Block-End: "; + break; + case Token::TK_BlockSequenceStart: + OS << "Block-Sequence-Start: "; + break; + case Token::TK_BlockMappingStart: + OS << "Block-Mapping-Start: "; + break; + case Token::TK_FlowEntry: + OS << "Flow-Entry: "; + break; + case Token::TK_FlowSequenceStart: + OS << "Flow-Sequence-Start: "; + break; + case Token::TK_FlowSequenceEnd: + OS << "Flow-Sequence-End: "; + break; + case Token::TK_FlowMappingStart: + OS << "Flow-Mapping-Start: "; + break; + case Token::TK_FlowMappingEnd: + OS << "Flow-Mapping-End: "; + break; + case Token::TK_Key: + OS << "Key: "; + break; + case Token::TK_Value: + OS << "Value: "; + break; + case Token::TK_Scalar: + OS << "Scalar: "; + break; + case Token::TK_Alias: + OS << "Alias: "; + break; + case Token::TK_Anchor: + OS << "Anchor: "; + break; + case Token::TK_Tag: + OS << "Tag: "; + break; + case Token::TK_Error: + break; + } + OS << T.Range << "\n"; + if (T.Kind == Token::TK_StreamEnd) + break; + else if (T.Kind == Token::TK_Error) + return false; + } + return true; +} + +bool yaml::scanTokens(StringRef Input) { + llvm::SourceMgr SM; + llvm::yaml::Scanner scanner(Input, SM); + for (;;) { + llvm::yaml::Token T = scanner.getNext(); + if (T.Kind == Token::TK_StreamEnd) + break; + else if (T.Kind == Token::TK_Error) + return false; + } + return true; +} + +std::string yaml::escape(StringRef Input) { + std::string EscapedInput; + for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { + if (*i == '\\') + EscapedInput += "\\\\"; + else if (*i == '"') + EscapedInput += "\\\""; + else if (*i == 0) + EscapedInput += "\\0"; + else if (*i == 0x07) + EscapedInput += "\\a"; + else if (*i == 0x08) + EscapedInput += "\\b"; + else if (*i == 0x09) + EscapedInput += "\\t"; + else if (*i == 0x0A) + EscapedInput += "\\n"; + else if (*i == 0x0B) + EscapedInput += "\\v"; + else if (*i == 0x0C) + EscapedInput += "\\f"; + else if (*i == 0x0D) + EscapedInput += "\\r"; + else if (*i == 0x1B) + EscapedInput += "\\e"; + else if (*i >= 0 && *i < 0x20) { // Control characters not handled above. + std::string HexStr = utohexstr(*i); + EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; + } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. + UTF8Decoded UnicodeScalarValue + = decodeUTF8(StringRef(i, Input.end() - i)); + if (UnicodeScalarValue.second == 0) { + // Found invalid char. + SmallString<4> Val; + encodeUTF8(0xFFFD, Val); + EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); + // FIXME: Error reporting. + return EscapedInput; + } + if (UnicodeScalarValue.first == 0x85) + EscapedInput += "\\N"; + else if (UnicodeScalarValue.first == 0xA0) + EscapedInput += "\\_"; + else if (UnicodeScalarValue.first == 0x2028) + EscapedInput += "\\L"; + else if (UnicodeScalarValue.first == 0x2029) + EscapedInput += "\\P"; + else { + std::string HexStr = utohexstr(UnicodeScalarValue.first); + if (HexStr.size() <= 2) + EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; + else if (HexStr.size() <= 4) + EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; + else if (HexStr.size() <= 8) + EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; + } + i += UnicodeScalarValue.second - 1; + } else + EscapedInput.push_back(*i); + } + return EscapedInput; +} + +Scanner::Scanner(StringRef Input, SourceMgr &sm) + : SM(sm) + , Indent(-1) + , Column(0) + , Line(0) + , FlowLevel(0) + , IsStartOfStream(true) + , IsSimpleKeyAllowed(true) + , IsSimpleKeyRequired(false) + , Failed(false) { + InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); + SM.AddNewSourceBuffer(InputBuffer, SMLoc()); + Current = InputBuffer->getBufferStart(); + End = InputBuffer->getBufferEnd(); +} + +Token &Scanner::peekNext() { + // If the current token is a possible simple key, keep parsing until we + // can confirm. + bool NeedMore = false; + while (true) { + if (TokenQueue.empty() || NeedMore) { + if (!fetchMoreTokens()) { + TokenQueue.clear(); + TokenQueue.push_back(Token()); + return TokenQueue.front(); + } + } + assert(!TokenQueue.empty() && + "fetchMoreTokens lied about getting tokens!"); + + removeStaleSimpleKeyCandidates(); + SimpleKey SK; + SK.Tok = TokenQueue.front(); + if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) + == SimpleKeys.end()) + break; + else + NeedMore = true; + } + return TokenQueue.front(); +} + +Token Scanner::getNext() { + Token Ret = peekNext(); + // TokenQueue can be empty if there was an error getting the next token. + if (!TokenQueue.empty()) + TokenQueue.pop_front(); + + // There cannot be any referenced Token's if the TokenQueue is empty. So do a + // quick deallocation of them all. + if (TokenQueue.empty()) { + TokenQueue.Alloc.Reset(); + } + + return Ret; +} + +StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { + // Check 7 bit c-printable - b-char. + if ( *Position == 0x09 + || (*Position >= 0x20 && *Position <= 0x7E)) + return Position + 1; + + // Check for valid UTF-8. + if (uint8_t(*Position) & 0x80) { + UTF8Decoded u8d = decodeUTF8(Position); + if ( u8d.second != 0 + && u8d.first != 0xFEFF + && ( u8d.first == 0x85 + || ( u8d.first >= 0xA0 + && u8d.first <= 0xD7FF) + || ( u8d.first >= 0xE000 + && u8d.first <= 0xFFFD) + || ( u8d.first >= 0x10000 + && u8d.first <= 0x10FFFF))) + return Position + u8d.second; + } + return Position; +} + +StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { + if (*Position == 0x0D) { + if (Position + 1 != End && *(Position + 1) == 0x0A) + return Position + 2; + return Position + 1; + } + + if (*Position == 0x0A) + return Position + 1; + return Position; +} + + +StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { + if (Position == End) + return Position; + if (*Position == ' ' || *Position == '\t') + return Position + 1; + return Position; +} + +StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { + if (Position == End) + return Position; + if (*Position == ' ' || *Position == '\t') + return Position; + return skip_nb_char(Position); +} + +StringRef::iterator Scanner::skip_while( SkipWhileFunc Func + , StringRef::iterator Position) { + while (true) { + StringRef::iterator i = (this->*Func)(Position); + if (i == Position) + break; + Position = i; + } + return Position; +} + +static bool is_ns_hex_digit(const char C) { + return (C >= '0' && C <= '9') + || (C >= 'a' && C <= 'z') + || (C >= 'A' && C <= 'Z'); +} + +static bool is_ns_word_char(const char C) { + return C == '-' + || (C >= 'a' && C <= 'z') + || (C >= 'A' && C <= 'Z'); +} + +StringRef Scanner::scan_ns_uri_char() { + StringRef::iterator Start = Current; + while (true) { + if (Current == End) + break; + if (( *Current == '%' + && Current + 2 < End + && is_ns_hex_digit(*(Current + 1)) + && is_ns_hex_digit(*(Current + 2))) + || is_ns_word_char(*Current) + || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") + != StringRef::npos) { + ++Current; + ++Column; + } else + break; + } + return StringRef(Start, Current - Start); +} + +StringRef Scanner::scan_ns_plain_one_line() { + StringRef::iterator start = Current; + // The first character must already be verified. + ++Current; + while (true) { + if (Current == End) { + break; + } else if (*Current == ':') { + // Check if the next character is a ns-char. + if (Current + 1 == End) + break; + StringRef::iterator i = skip_ns_char(Current + 1); + if (Current + 1 != i) { + Current = i; + Column += 2; // Consume both the ':' and ns-char. + } else + break; + } else if (*Current == '#') { + // Check if the previous character was a ns-char. + // The & 0x80 check is to check for the trailing byte of a utf-8 + if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) { + ++Current; + ++Column; + } else + break; + } else { + StringRef::iterator i = skip_nb_char(Current); + if (i == Current) + break; + Current = i; + ++Column; + } + } + return StringRef(start, Current - start); +} + +bool Scanner::consume(uint32_t Expected) { + if (Expected >= 0x80) + report_fatal_error("Not dealing with this yet"); + if (Current == End) + return false; + if (uint8_t(*Current) >= 0x80) + report_fatal_error("Not dealing with this yet"); + if (uint8_t(*Current) == Expected) { + ++Current; + ++Column; + return true; + } + return false; +} + +void Scanner::skip(uint32_t Distance) { + Current += Distance; + Column += Distance; +} + +bool Scanner::isBlankOrBreak(StringRef::iterator Position) { + if (Position == End) + return false; + if ( *Position == ' ' || *Position == '\t' + || *Position == '\r' || *Position == '\n') + return true; + return false; +} + +void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok + , unsigned AtColumn + , bool IsRequired) { + if (IsSimpleKeyAllowed) { + SimpleKey SK; + SK.Tok = Tok; + SK.Line = Line; + SK.Column = AtColumn; + SK.IsRequired = IsRequired; + SK.FlowLevel = FlowLevel; + SimpleKeys.push_back(SK); + } +} + +void Scanner::removeStaleSimpleKeyCandidates() { + for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); + i != SimpleKeys.end();) { + if (i->Line != Line || i->Column + 1024 < Column) { + if (i->IsRequired) + setError( "Could not find expected : for simple key" + , i->Tok->Range.begin()); + i = SimpleKeys.erase(i); + } else + ++i; + } +} + +void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { + if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) + SimpleKeys.pop_back(); +} + +bool Scanner::unrollIndent(int ToColumn) { + Token T; + // Indentation is ignored in flow. + if (FlowLevel != 0) + return true; + + while (Indent > ToColumn) { + T.Kind = Token::TK_BlockEnd; + T.Range = StringRef(Current, 1); + TokenQueue.push_back(T); + Indent = Indents.pop_back_val(); + } + + return true; +} + +bool Scanner::rollIndent( int ToColumn + , Token::TokenKind Kind + , TokenQueueT::iterator InsertPoint) { + if (FlowLevel) + return true; + if (Indent < ToColumn) { + Indents.push_back(Indent); + Indent = ToColumn; + + Token T; + T.Kind = Kind; + T.Range = StringRef(Current, 0); + TokenQueue.insert(InsertPoint, T); + } + return true; +} + +void Scanner::scanToNextToken() { + while (true) { + while (*Current == ' ' || *Current == '\t') { + skip(1); + } + + // Skip comment. + if (*Current == '#') { + while (true) { + // This may skip more than one byte, thus Column is only incremented + // for code points. + StringRef::iterator i = skip_nb_char(Current); + if (i == Current) + break; + Current = i; + ++Column; + } + } + + // Skip EOL. + StringRef::iterator i = skip_b_break(Current); + if (i == Current) + break; + Current = i; + ++Line; + Column = 0; + // New lines may start a simple key. + if (!FlowLevel) + IsSimpleKeyAllowed = true; + } +} + +bool Scanner::scanStreamStart() { + IsStartOfStream = false; + + EncodingInfo EI = getUnicodeEncoding(currentInput()); + + Token T; + T.Kind = Token::TK_StreamStart; + T.Range = StringRef(Current, EI.second); + TokenQueue.push_back(T); + Current += EI.second; + return true; +} + +bool Scanner::scanStreamEnd() { + // Force an ending new line if one isn't present. + if (Column != 0) { + Column = 0; + ++Line; + } + + unrollIndent(-1); + SimpleKeys.clear(); + IsSimpleKeyAllowed = false; + + Token T; + T.Kind = Token::TK_StreamEnd; + T.Range = StringRef(Current, 0); + TokenQueue.push_back(T); + return true; +} + +bool Scanner::scanDirective() { + // Reset the indentation level. + unrollIndent(-1); + SimpleKeys.clear(); + IsSimpleKeyAllowed = false; + + StringRef::iterator Start = Current; + consume('%'); + StringRef::iterator NameStart = Current; + Current = skip_while(&Scanner::skip_ns_char, Current); + StringRef Name(NameStart, Current - NameStart); + Current = skip_while(&Scanner::skip_s_white, Current); + + if (Name == "YAML") { + Cu |