aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJukka Jylänki <jujjyl@gmail.com>2014-01-10 14:40:24 +0200
committerJukka Jylänki <jujjyl@gmail.com>2014-01-10 14:54:24 +0200
commit4e3c1b3e862ffd8e324cd1f24ed84692bd50a83b (patch)
tree6f127371c5a63284856d4fcee427ba20b512c2b3
parentdfd9cf8cc063158cbaeecc81bc0e4b27ef6bec20 (diff)
Optimize GL emulation prepareClientAttributes. Fix issues where the slow path was not able to handle unaligned source data. Annotate some unsupported cases. Remove liveClientAttributes, which does not seem to be a win in profiles.
-rw-r--r--src/library_gl.js174
1 files changed, 98 insertions, 76 deletions
diff --git a/src/library_gl.js b/src/library_gl.js
index 0c601673..14ea8795 100644
--- a/src/library_gl.js
+++ b/src/library_gl.js
@@ -3367,7 +3367,6 @@ var LibraryGL = {
totalEnabledClientAttributes: 0,
enabledClientAttributes: [0, 0],
clientAttributes: [], // raw data, including possible unneeded ones
- liveClientAttributes: [], // the ones actually alive in the current computation, sorted
currentRenderer: null, // Caches the currently active FFP emulation renderer, so that it does not have to be re-looked up unless relevant state changes.
modifiedClientAttributes: false,
clientActiveTexture: 0,
@@ -3430,17 +3429,17 @@ var LibraryGL = {
if (GLImmediate.currentRenderer) {
return GLImmediate.currentRenderer;
}
- // return a renderer object given the liveClientAttributes
// we maintain a cache of renderers, optimized to not generate garbage
- var attributes = GLImmediate.liveClientAttributes;
var cacheMap = GLImmediate.rendererCache;
var temp;
var keyView = cacheMap.getStaticKeyView().reset();
// By attrib state:
var enabledAttributesKey = 0;
- for (var i = 0; i < attributes.length; i++) {
- enabledAttributesKey |= 1 << attributes[i].name;
+ for (var i = 0; i < GLImmediate.MAX_TEXTURES+3; i++) {
+ if (GLImmediate.enabledClientAttributes[i]) {
+ enabledAttributesKey |= 1 << i;
+ }
}
keyView.next(enabledAttributesKey);
@@ -3471,7 +3470,13 @@ var LibraryGL = {
var renderer = keyView.get();
if (!renderer) {
#if GL_DEBUG
- Module.printErr('generating renderer for ' + JSON.stringify(attributes));
+ var liveClientAttributes = [];
+ for (var i = 0; i < GLImmediate.MAX_TEXTURES+3; i++) {
+ if (GLImmediate.enabledClientAttributes[i]) {
+ liveClientAttributes.push(clientAttributes[i]);
+ }
+ }
+ Module.printErr('generating renderer for ' + JSON.stringify(liveClientAttributes));
#endif
renderer = GLImmediate.createRenderer();
GLImmediate.currentRenderer = renderer;
@@ -4077,96 +4082,113 @@ var LibraryGL = {
// does not work for glBegin/End, where we generate renderer components dynamically and then
// disable them ourselves, but it does help with glDrawElements/Arrays.
if (!GLImmediate.modifiedClientAttributes) {
+#if GL_ASSERTIONS
+ if ((GLImmediate.stride & 3) != 0) {
+ Runtime.warnOnce('Warning: Rendering from client side vertex arrays where stride (' + GLImmediate.stride + ') is not a multiple of four! This is not currently supported!');
+ }
+#endif
GLImmediate.vertexCounter = (GLImmediate.stride * count) / 4; // XXX assuming float
return;
}
GLImmediate.modifiedClientAttributes = false;
- var stride = 0, start;
- var attributes = GLImmediate.liveClientAttributes;
- attributes.length = 0;
- for (var i = 0; i < GLImmediate.NUM_ATTRIBUTES; i++) {
- if (GLImmediate.enabledClientAttributes[i]) attributes.push(GLImmediate.clientAttributes[i]);
- }
- attributes.sort(function(x, y) { return !x ? (!y ? 0 : 1) : (!y ? -1 : (x.pointer - y.pointer)) });
- start = GL.currArrayBuffer ? 0 : attributes[0].pointer;
- var multiStrides = false;
- for (var i = 0; i < attributes.length; i++) {
- var attribute = attributes[i];
- if (!attribute) break;
- if (stride != 0 && stride != attribute.stride) multiStrides = true;
- if (attribute.stride) stride = attribute.stride;
+ // The role of prepareClientAttributes is to examine the set of client-side vertex attribute buffers
+ // that user code has submitted, and to prepare them to be uploaded to a VBO in GPU memory
+ // (since WebGL does not support client-side rendering, i.e. rendering from vertex data in CPU memory)
+ // User can submit vertex data generally in three different configurations:
+ // 1. Fully planar: all attributes are in their own separate tightly-packed arrays in CPU memory.
+ // 2. Fully interleaved: all attributes share a single array where data is interleaved something like (pos,uv,normal), (pos,uv,normal), ...
+ // 3. Complex hybrid: Multiple separate arrays that either are sparsely strided, and/or partially interleave vertex attributes.
+
+ // For simplicity, we support the case (2) as the fast case. For (1) and (3), we do a memory copy of the
+ // vertex data here to prepare a relayouted buffer that is of the structure in case (2). The reason
+ // for this is that it allows the emulation code to get away with using just one VBO buffer for rendering,
+ // and not have to maintain multiple ones. Therefore cases (1) and (3) will be very slow, and case (2) is fast.
+
+ // Detect which case we are in by using a quick heuristic by examining the strides of the buffers. If all the buffers have identical
+ // stride, we assume we have case (2), otherwise we have something more complex.
+ var clientStartPointer = 0x7FFFFFFF;
+ var bytes = 0; // Total number of bytes taken up by a single vertex.
+ var minStride = 0x7FFFFFFF;
+ var maxStride = 0;
+ for (var i = 0; i < 3+GLImmediate.MAX_TEXTURES; i++) {
+ if (GLImmediate.enabledClientAttributes[i]) {
+ var attr = GLImmediate.clientAttributes[i];
+ clientStartPointer = Math.min(clientStartPointer, attr.pointer);
+ attr.sizeBytes = attr.size * GL.byteSizeByType[attr.type - GL.byteSizeByTypeRoot];
+ bytes += attr.sizeBytes;
+ minStride = Math.min(minStride, attr.stride);
+ maxStride = Math.max(maxStride, attr.stride);
+ }
}
- if (multiStrides) stride = 0; // we will need to restride
- var bytes = 0; // total size in bytes
- if (!stride && !beginEnd) {
- // beginEnd can not have stride in the attributes, that is fine. otherwise,
- // no stride means that all attributes are in fact packed. to keep the rest of
- // our emulation code simple, we perform unpacking/restriding here. this adds overhead, so
- // it is a good idea to not hit this!
-#if ASSERTIONS
- Runtime.warnOnce('Unpacking/restriding attributes, this is slow and dangerous');
+ if ((minStride != maxStride || maxStride < bytes) && !beginEnd) {
+ // We are in cases (1) or (3): slow path, shuffle the data around into a single interleaved vertex buffer.
+ // The immediate-mode glBegin()/glEnd() vertex submission gets automatically generated in appropriate layout,
+ // so never need to come down this path if that was used.
+#if GL_ASSERTIONS
+ Runtime.warnOnce('Rendering from planar client-side vertex arrays. This is a very slow emulation path! Use interleaved vertex arrays for best performance.');
#endif
if (!GLImmediate.restrideBuffer) GLImmediate.restrideBuffer = _malloc(GL.MAX_TEMP_BUFFER_SIZE);
- start = GLImmediate.restrideBuffer;
-#if ASSERTIONS
- assert(start % 4 == 0);
-#endif
+ var start = GLImmediate.restrideBuffer;
+ bytes = 0;
// calculate restrided offsets and total size
- for (var i = 0; i < attributes.length; i++) {
- var attribute = attributes[i];
- if (!attribute) break;
- var size = attribute.size * GL.byteSizeByType[attribute.type - GL.byteSizeByTypeRoot];
- if (size % 4 != 0) size += 4 - (size % 4); // align everything
- attribute.offset = bytes;
- bytes += size;
+ for (var i = 0; i < GLImmediate.clientAttributes.length; i++) {
+ if (GLImmediate.enabledClientAttributes[i]) {
+ var attribute = GLImmediate.clientAttributes[i];
+ var size = attribute.sizeBytes;
+ if (size % 4 != 0) size += 4 - (size % 4); // align everything
+ attribute.offset = bytes;
+ bytes += size;
+ }
}
-#if ASSERTIONS
- assert(count*bytes <= GL.MAX_TEMP_BUFFER_SIZE);
-#endif
- // copy out the data (we need to know the stride for that, and define attribute.pointer
- for (var i = 0; i < attributes.length; i++) {
- var attribute = attributes[i];
- if (!attribute) break;
- var size4 = Math.floor((attribute.size * GL.byteSizeByType[attribute.type - GL.byteSizeByTypeRoot])/4);
- for (var j = 0; j < count; j++) {
- for (var k = 0; k < size4; k++) { // copy in chunks of 4 bytes, our alignment makes this possible
- HEAP32[((start + attribute.offset + bytes*j)>>2) + k] = HEAP32[(attribute.pointer>>2) + j*size4 + k];
+ // copy out the data (we need to know the stride for that, and define attribute.pointer)
+ for (var i = 0; i < GLImmediate.clientAttributes.length; i++) {
+ if (GLImmediate.enabledClientAttributes[i]) {
+ var attribute = GLImmediate.clientAttributes[i];
+ var srcStride = Math.max(attribute.sizeBytes, attribute.stride);
+ if ((srcStride & 3) == 0 && (attribute.sizeBytes & 3) == 0) {
+ var size4 = attribute.sizeBytes>>2;
+ var srcStride4 = Math.max(attribute.sizeBytes, attribute.stride)>>2;
+ for (var j = 0; j < count; j++) {
+ for (var k = 0; k < size4; k++) { // copy in chunks of 4 bytes, our alignment makes this possible
+ HEAP32[((start + attribute.offset + bytes*j)>>2) + k] = HEAP32[(attribute.pointer>>2) + j*srcStride4 + k];
+ }
+ }
+ } else {
+ for (var j = 0; j < count; j++) {
+ for (var k = 0; k < attribute.sizeBytes; k++) { // source data was not aligned to multiples of 4, must copy byte by byte.
+ HEAP8[start + attribute.offset + bytes*j + k] = HEAP8[attribute.pointer + j*srcStride + k];
+ }
+ }
}
+ attribute.pointer = start + attribute.offset;
}
- attribute.pointer = start + attribute.offset;
}
+ GLImmediate.stride = bytes;
+ GLImmediate.vertexPointer = start;
} else {
- // normal situation, everything is strided and in the same buffer
- for (var i = 0; i < attributes.length; i++) {
- var attribute = attributes[i];
- if (!attribute) break;
- attribute.offset = attribute.pointer - start;
- if (attribute.offset > bytes) { // ensure we start where we should
-#if ASSERTIONS
- assert((attribute.offset - bytes)%4 == 0); // XXX assuming 4-alignment
-#endif
- bytes += attribute.offset - bytes;
- }
- bytes += attribute.size * GL.byteSizeByType[attribute.type - GL.byteSizeByTypeRoot];
- if (bytes % 4 != 0) bytes += 4 - (bytes % 4); // XXX assuming 4-alignment
+ // case (2): fast path, all data is interleaved to a single vertex array so we can get away with a single VBO upload.
+ if (GL.currArrayBuffer) {
+ GLImmediate.vertexPointer = 0;
+ } else {
+ GLImmediate.vertexPointer = clientStartPointer;
}
-#if ASSERTIONS
- assert(beginEnd || bytes <= stride); // if not begin-end, explicit stride should make sense with total byte size
-#endif
- if (bytes < stride) { // ensure the size is that of the stride
- bytes = stride;
+ for (var i = 0; i < 3+GLImmediate.MAX_TEXTURES; i++) {
+ if (GLImmediate.enabledClientAttributes[i]) {
+ var attr = GLImmediate.clientAttributes[i];
+ attr.offset = attr.pointer - clientStartPointer; // Compute what will be the offset of this attribute in the VBO after we upload.
+ }
}
+ GLImmediate.stride = Math.max(maxStride, bytes);
}
- GLImmediate.stride = bytes;
-
if (!beginEnd) {
- bytes *= count;
- if (!GL.currArrayBuffer) {
- GLImmediate.vertexPointer = start;
+#if GL_ASSERTIONS
+ if ((GLImmediate.stride & 3) != 0) {
+ Runtime.warnOnce('Warning: Rendering from client side vertex arrays where stride (' + GLImmediate.stride + ') is not a multiple of four! This is not currently supported!');
}
- GLImmediate.vertexCounter = bytes / 4; // XXX assuming float
+#endif
+ GLImmediate.vertexCounter = (GLImmediate.stride * count) / 4; // XXX assuming float
}
},