diff options
author | alon@honor <none@none> | 2010-10-23 13:53:46 -0700 |
---|---|---|
committer | alon@honor <none@none> | 2010-10-23 13:53:46 -0700 |
commit | de6934db38e16a6572f7813531e5966ac115e744 (patch) | |
tree | 828438813d4318c9cfbf76672535e0874d5f0e2d | |
parent | bdf551e4e041814bab99f0f774684fb3c5bd812f (diff) |
regex-ize tokenizer
-rw-r--r-- | src/intertyper.js | 102 |
1 file changed, 64 insertions, 38 deletions
diff --git a/src/intertyper.js b/src/intertyper.js index f5302829..be2045f2 100644 --- a/src/intertyper.js +++ b/src/intertyper.js @@ -37,18 +37,21 @@ function intertyper(data) { }, }); + var ENCLOSER_STARTERS = set('[', '(', '<'); + var ENCLOSER_ENDERS = { + '[': ']', + '(': ')', + '<': '>', + }; + // Line tokenizer var tokenizer = substrate.addZyme('Tokenizer', { processItem: function(item, inner) { //assert(item.lineNum != 40000); //if (item.lineNum) print(item.lineNum); - var lineText = item.lineText + " "; var tokens = []; - var tokenStart = -1; - var indent = -1; var quotes = 0; var lastToken = null; - var i = 0; var CHUNKSIZE = 64; // How much forward to peek forward. Too much means too many string segments copied // Note: '{' is not an encloser, as its use in functions is split over many lines var enclosers = { @@ -59,17 +62,14 @@ function intertyper(data) { '<': 0, '>': '<', }; - var ENCLOSER_STARTERS = set('[', '(', '<'); var totalEnclosing = 0; var that = this; - function finishToken(includeThis) { - var text = lineText.substr(tokenStart, i-tokenStart + (includeThis ? 
1 : 0)); - + function makeToken(text) { + if (text.length == 0) return; // merge certain tokens if ( (lastToken && lastToken.text == '%' && text[0] == '"' ) || (lastToken && text.replace(/\*/g, '') == '') ) { lastToken.text += text; - tokenStart = -1; return; } @@ -82,9 +82,6 @@ function intertyper(data) { }, true); token.type = text[0]; } - if (indent == -1) { - indent = tokenStart; - } // merge certain tokens if (lastToken && isType(lastToken.text) && isFunctionDef(token)) { lastToken.text += ' ' + text; @@ -100,53 +97,82 @@ function intertyper(data) { tokens.push(token); lastToken = token; } - tokenStart = -1; } - for (; i < lineText.length; i++) { - var letter = lineText[i]; + // Split using meaningful characters + var lineText = item.lineText + ' '; + var re = /[\[\]\(\)<>, "]/g; + var segments = lineText.split(re); + segments.pop(); + var len = segments.length; + var i = -1; + var curr = ''; + var segment, letter; + for (var s = 0; s < len; s++) { + segment = segments[s]; + i += segment.length + 1; + letter = lineText[i]; + curr += segment; switch (letter) { case ' ': - if (tokenStart >= 0 && totalEnclosing == 0 && quotes == 0) finishToken(); + if (totalEnclosing == 0 && quotes == 0) { + makeToken(curr); + curr = ''; + } else { + curr += ' '; + } break; case '"': - if (tokenStart >= 0 && totalEnclosing == 0 && quotes == 0) finishToken(); - if (tokenStart == -1 && totalEnclosing == 0 && quotes == 0) tokenStart = i; - quotes = 1-quotes; - if (tokenStart == i) { - i += Math.max(0, lineText.substr(i+1, i+1+CHUNKSIZE).search(/[!"]/)); + if (totalEnclosing == 0) { + if (quotes == 0) { + makeToken(curr); + curr = '"'; + } else { + makeToken(curr + '"'); + curr = ''; + } + } else { + curr += '"'; } + quotes = 1-quotes; break; case ',': - if (tokenStart >= 0 && totalEnclosing == 0 && quotes == 0) finishToken(); if (totalEnclosing == 0 && quotes == 0) { + makeToken(curr); + curr = ''; tokens.push({ text: ',' }); + } else { + curr += ','; } break; default: - if (letter 
in enclosers && quotes == 0) { - if (letter in ENCLOSER_STARTERS) { - if (tokenStart >= 0 && totalEnclosing == 0 && quotes == 0) finishToken(); - if (tokenStart == -1 && totalEnclosing == 0 && quotes == 0) { - tokenStart = i; - } - enclosers[letter]++; - totalEnclosing++; - } else { - enclosers[enclosers[letter]]--; - totalEnclosing--; - if (tokenStart >= 0 && totalEnclosing == 0 && quotes == 0) finishToken(true); + assert(letter in enclosers); + if (quotes) { + curr += letter; + break; + } + if (letter in ENCLOSER_STARTERS) { + if (totalEnclosing == 0) { + makeToken(curr); + curr = ''; } + curr += letter; + enclosers[letter]++; + totalEnclosing++; } else { - if (tokenStart == -1 && totalEnclosing == 0 && quotes == 0) { - tokenStart = i; + enclosers[enclosers[letter]]--; + totalEnclosing--; + if (totalEnclosing == 0) { + makeToken(curr + letter); + curr = ''; + } else { + curr += letter; } - i += Math.max(0, lineText.substr(i+1, i+1+CHUNKSIZE).search(/[\[\]\(\)<>, "]/)); } } } var item = { tokens: tokens, - indent: indent, + indent: lineText.search(/[^ ]/), lineNum: item.lineNum, }; if (inner) { |