1 files changed, 368 insertions, 0 deletions
diff --git a/src/parseTools.js b/src/parseTools.js
new file mode 100644
index 00000000..2a689889
--- /dev/null
+++ b/src/parseTools.js
@@ -0,0 +1,368 @@
+// Various tools for parsing llvm
+
+// Simple #if/else/endif preprocessing for a file. Checks if the
+// ident checked is true in our global.
+function preprocess(text) {
+  var lines = text.split('\n');
+  var ret = '';
+  var show = true;
+  for (var i = 0; i < lines.length; i++) {
+    var line = lines[i];
+    if (line[0] != '#') {
+      if (show) {
+        ret += line + '\n';
+      }
+    } else {
+      if (line[1] == 'i') { // if
+        var ident = line.substr(4);
+        show = !!this[ident];
+      } else if (line[2] == 'l') { // else
+        show = !show;
+      } else if (line[2] == 'n') { // endif
+        show = true;
+      } else {
+        throw "Unclear preprocessor command: " + line;
+      }
+    }
+  }
+  return ret;
+}
+
+function addPointing(type) { return type + '*' }
+function removePointing(type, num) {
+  if (num === 0) return type;
+  return type.substr(0, type.length-(num ? num : 1))
+}
+
+function pointingLevels(type) {
+  if (!type) return 0;
+  var ret = 0;
+  var len1 = type.length - 1;
+  while (type[len1-ret] === '*') {
+    ret ++;
+  }
+  return ret;
+}
+
+function toNiceIdent(ident) {
+  if (parseFloat(ident) == ident) return ident;
+  if (ident == 'null') return '0'; // see parseNumerical
+  return ident.replace(/[" \.@%:<>,\*]/g, '_');
+}
+
+function isNumberType(type) {
+  var types = ['i1', 'i8', 'i32', 'i64', 'float', 'double'];
+  return types.indexOf(type) != -1;
+}
+
+function isStructPointerType(type) {
+  // This test is necessary for clang - in llvm-gcc, we
+  // could check for %struct. The downside is that %1 can
+  // be either a variable or a structure, and we guess it is
+  // a struct, which can lead to |call i32 %5()| having
+  // |%5()| as a function call (like |i32 (i8*)| etc.). So
+  // we must check later on, in call(), where we have more
+  // context, to differentiate such cases.
+  // A similar thing happns in isStructType()
+  return !isNumberType(type) && type[0] == '%';
+}
+
+function isStructType(type) {
+  if (isPointerType(type)) return false;
+  if (new RegExp(/^\[\d+\ x\ (.*)\]/g).test(type)) return true; // [15 x ?] blocks. Like structs
+  // See comment in isStructPointerType()
+  return !isNumberType(type) && type[0] == '%';
+}
+
+function isPointerType(type) { // TODO!
+  return pointingLevels(type) > 0;
+}
+
+function isVoidType(type) {
+  return type == 'void';
+}
+
+function isType(type) { // TODO!
+  return isVoidType(type) || isNumberType(type) || isStructType(type) || isPointerType(type);
+}
+
+// Detects a function definition, ([...|type,[type,...]])
+function isFunctionDef(token) {
+  var text = token.text;
+  var pointing = pointingLevels(text);
+  var nonPointing = removePointing(text, pointing);
+  if (nonPointing[0] != '(' || nonPointing.substr(-1) != ')')
+    return false;
+  if (nonPointing == '(...)') return true;
+  if (!token.item) return false;
+  var fail = false;
+  splitTokenList(token.item[0].tokens).forEach(function(segment) {
+    var subtoken = segment[0];
+    fail = fail || !isType(subtoken.text) || segment.length > 1;
+  });
+  return !fail;
+}
+
+function addIdent(token) {
+  token.ident = token.text;
+  return token;
+}
+
+function combineTokens(tokens) {
+  var ret = {
+    lineNum: tokens[0].lineNum,
+    text: '',
+    tokens: [],
+  };
+  tokens.forEach(function(token) {
+    ret.text += token.text;
+    ret.tokens.push(token);
+  });
+  return ret;
+}
+
+function compareTokens(a, b) {
+  var aId = a.__uid__;
+  var bId = b.__uid__;
+  a.__uid__ = 0;
+  b.__uid__ = 0;
+  var ret = JSON.stringify(a) == JSON.stringify(b);
+  a.__uid__ = aId;
+  b.__uid__ = bId;
+  return ret;
+}
+
+function getTokenIndexByText(tokens, text) {
+  var i = 0;
+  while (tokens[i].text != ';') i++;
+  return i;
+}
+
+function findTokenText(item, text) {
+  for (var i = 0; i < item.tokens.length; i++) {
+    if (item.tokens[i].text == text) return i;
+  }
+  return -1;
+}
+
+// Splits a list of tokens separated by commas. For example, a list of arguments in a function call
+function splitTokenList(tokens) {
+  if (tokens.length == 0) return [];
+  if (tokens.slice(-1)[0].text != ',') tokens.push({text:','});
+  var ret = [];
+  var seg = [];
+  tokens.forEach(function(token) {
+    if (token.text == ',') {
+      ret.push(seg);
+      seg = [];
+    } else {
+      seg.push(token);
+    }
+  });
+  return ret;
+}
+
+// Splits an item, with the intent of later reintegration
+function splitItem(parent, childSlot, copySlots) {
+  if (!copySlots) copySlots = [];
+  if (!parent[childSlot]) parent[childSlot] = {};
+  var child = parent[childSlot];
+  parent[childSlot] = null;
+  child.parentUid = parent.__uid__;
+  child.parentSlot = childSlot;
+  child.parentLineNum = child.lineNum = parent.lineNum;
+  copySlots.forEach(function(slot) { child[slot] = parent[slot] });
+  return {
+    parent: parent,
+    child: child,
+  };
+}
+
+function makeReintegrator(afterFunc) {
+  // reintegration - find intermediate representation-parsed items and
+  // place back in parents TODO: Optimize this code to optimal O(..)
+  return {
+    process: function(items) {
+      var ret = [];
+      for (var i = 0; i < items.length; i++) {
+        var found = false;
+        if (items[i] && items[i].parentSlot) {
+          var child = items[i];
+          for (var j = 0; j < items.length; j++) {
+            if (items[j] && items[j].lineNum == items[i].parentLineNum) {
+              var parent = items[j];
+              // process the pair
+              parent[child.parentSlot] = child;
+              delete child.parentLineNum;
+              afterFunc.call(this, parent, child);
+
+              items[i] = null;
+              items[j] = null;
+              found = true;
+              break;
+            }
+          }
+        }
+      }
+      this.forwardItems(items.filter(function(item) { return !!item }), this.name_); // next time hopefully
+      return ret;
+    }
+  };
+}
+
+function parseParamTokens(params) {
+  if (params.length === 0) return [];
+  var ret = [];
+  if (params[params.length-1].text != ',') {
+    params.push({ text: ',' });
+  }
+  var absIndex = 0;
+  while (params.length > 0) {
+    var i = 0;
+    while (params[i].text != ',') i++;
+    var segment = params.slice(0, i);
+    params = params.slice(i+1);
+    segment = cleanSegment(segment);
+    if (segment.length == 1) {
+      if (segment[0].text == '...') {
+        ret.push({
+          intertype: 'varargs',
+        });
+      } else {
+        // Clang sometimes has a parameter with just a type,
+        // no name... the name is implied to be %{the index}
+        ret.push({
+          intertype: 'value',
+          type: segment[0],
+          value: null,
+          ident: '_' + absIndex,
+        });
+      }
+    } else if (segment[1].text === 'getelementptr') {
+      ret.push(parseGetElementPtr(segment));
+    } else if (segment[1].text === 'bitcast') {
+      ret.push(parseBitcast(segment));
+    } else {
+      if (segment[2] && segment[2].text == 'to') { // part of bitcast params
+        segment = segment.slice(0, 2);
+      }
+      while (segment.length > 2) {
+        segment[0].text += segment[1].text;
+        segment.splice(1, 1); // TODO: merge tokens nicely
+      }
+      ret.push({
+        intertype: 'value',
+        type: segment[0],
+        value: segment[1],
+        ident: segment[1].text,
+      });
+      //          } else {
+      //            throw "what is this params token? " + JSON.stringify(segment);
+    }
+    absIndex ++;
+  }
+  return ret;
+}
+
+function cleanSegment(segment) {
+  if (segment.length == 1) return segment;
+  while (['noalias', 'sret', 'nocapture', 'nest', 'zeroext', 'signext'].indexOf(segment[1].text) != -1) {
+    segment.splice(1, 1);
+  }
+  return segment;
+}
+
+// Expects one of the several LVM getelementptr formats:
+// a qualifier, a type, a null, then an () item with tokens
+function parseGetElementPtr(segment) {
+//print("Parse GTP: " + dump(segment));
+  segment = segment.slice(0);
+  segment = cleanSegment(segment);
+  assertTrue(['inreg', 'byval'].indexOf(segment[1].text) == -1);
+  //dprint('// zz: ' + dump(segment) + '\n\n\n');
+  var ret = {
+    intertype: 'getelementptr',
+    type: segment[0],
+    params: parseParamTokens(segment[3].item[0].tokens),
+  };
+  ret.ident = toNiceIdent(ret.params[0].ident);
+  return ret;
+}
+
+// TODO: use this
+function parseBitcast(segment) {
+  //print('zz parseBC pre: ' + dump(segment));
+  var ret = {
+    intertype: 'bitcast',
+    type: segment[0],
+    params: parseParamTokens(segment[2].item[0].tokens),
+  };
+  ret.ident = toNiceIdent(ret.params[0].ident);
+//print('zz parseBC: ' + dump(ret));
+  return ret;
+}
+
+function cleanOutTokens(filterOut, tokens, index) {
+  while (filterOut.indexOf(tokens[index].text) != -1) {
+    tokens.splice(index, 1);
+  }
+}
+
+function _HexToInt(stringy) {
+  var ret = 0;
+  var mul = 1;
+  var base;
+  for (var i = (stringy.length - 1); i >= 0; i = i - 1) {
+    if (stringy.charCodeAt(i) >= "A".charCodeAt(0)) {
+      base = "A".charCodeAt(0) - 10;
+    } else {
+      base = "0".charCodeAt(0);
+    }
+    ret = ret + (mul*(stringy.charCodeAt(i) - base));
+    mul = mul * 16;
+  }
+  return ret;
+}
+
+function IEEEUnHex(stringy) {
+  var a = _HexToInt(stringy.substr(2, 8));
+  var b = _HexToInt(stringy.substr(10));
+  var e = (a >> ((52 - 32) & 0x7ff)) - 1023;
+  return ((((a & 0xfffff | 0x100000) * 1.0) / Math.pow(2,52-32)) * Math.pow(2, e)) + (((b * 1.0) / Math.pow(2, 52)) * Math.pow(2, e));
+}
+
+function parseNumerical(value, type) {
+  if ((!type || type == 'double' || type == 'float') && value.substr(0,2) == '0x') {
+    // Hexadecimal double value, as the llvm docs say,
+    // "The one non-intuitive notation for constants is the hexadecimal form of floating point constants."
+    return IEEEUnHex(value);
+  }
+  if (value == 'null') {
+    // NULL *is* 0, in C/C++. No JS null! (null == 0 is false, etc.)
+    return '0';
+  }
+  return value;
+}
+
+// \0Dsometext is really '\r', then sometext
+// This function returns an array of int values
+function parseLLVMString(str) {
+  var ret = [];
+  var i = 0;
+  while (i < str.length) {
+    var chr = str[i];
+    if (chr != '\\') {
+      ret.push(chr.charCodeAt(0));
+      i++;
+    } else {
+      ret.push(_HexToInt(str[i+1]+str[i+2]));
+      i += 3;
+    }
+  }
+  return ret;
+}
+
+function getLabelIds(labels) {
+  return labels.map(function(label) { return label.ident });
+}
+