diff options
author | Jukka Jylänki <jujjyl@gmail.com> | 2014-01-10 14:40:24 +0200 |
---|---|---|
committer | Jukka Jylänki <jujjyl@gmail.com> | 2014-01-10 14:54:24 +0200 |
commit | 4e3c1b3e862ffd8e324cd1f24ed84692bd50a83b (patch) | |
tree | 6f127371c5a63284856d4fcee427ba20b512c2b3 | |
parent | dfd9cf8cc063158cbaeecc81bc0e4b27ef6bec20 (diff) |
Optimize GL emulation prepareClientAttributes. Fix issues where the slow path was not able to handle unaligned source data. Annotate some unsupported cases. Remove liveClientAttributes, which does not seem to be a win in profiles.
-rw-r--r-- | src/library_gl.js | 174 |
1 file changed, 98 insertions, 76 deletions
diff --git a/src/library_gl.js b/src/library_gl.js index 0c601673..14ea8795 100644 --- a/src/library_gl.js +++ b/src/library_gl.js @@ -3367,7 +3367,6 @@ var LibraryGL = { totalEnabledClientAttributes: 0, enabledClientAttributes: [0, 0], clientAttributes: [], // raw data, including possible unneeded ones - liveClientAttributes: [], // the ones actually alive in the current computation, sorted currentRenderer: null, // Caches the currently active FFP emulation renderer, so that it does not have to be re-looked up unless relevant state changes. modifiedClientAttributes: false, clientActiveTexture: 0, @@ -3430,17 +3429,17 @@ var LibraryGL = { if (GLImmediate.currentRenderer) { return GLImmediate.currentRenderer; } - // return a renderer object given the liveClientAttributes // we maintain a cache of renderers, optimized to not generate garbage - var attributes = GLImmediate.liveClientAttributes; var cacheMap = GLImmediate.rendererCache; var temp; var keyView = cacheMap.getStaticKeyView().reset(); // By attrib state: var enabledAttributesKey = 0; - for (var i = 0; i < attributes.length; i++) { - enabledAttributesKey |= 1 << attributes[i].name; + for (var i = 0; i < GLImmediate.MAX_TEXTURES+3; i++) { + if (GLImmediate.enabledClientAttributes[i]) { + enabledAttributesKey |= 1 << i; + } } keyView.next(enabledAttributesKey); @@ -3471,7 +3470,13 @@ var LibraryGL = { var renderer = keyView.get(); if (!renderer) { #if GL_DEBUG - Module.printErr('generating renderer for ' + JSON.stringify(attributes)); + var liveClientAttributes = []; + for (var i = 0; i < GLImmediate.MAX_TEXTURES+3; i++) { + if (GLImmediate.enabledClientAttributes[i]) { + liveClientAttributes.push(clientAttributes[i]); + } + } + Module.printErr('generating renderer for ' + JSON.stringify(liveClientAttributes)); #endif renderer = GLImmediate.createRenderer(); GLImmediate.currentRenderer = renderer; @@ -4077,96 +4082,113 @@ var LibraryGL = { // does not work for glBegin/End, where we generate renderer components 
dynamically and then // disable them ourselves, but it does help with glDrawElements/Arrays. if (!GLImmediate.modifiedClientAttributes) { +#if GL_ASSERTIONS + if ((GLImmediate.stride & 3) != 0) { + Runtime.warnOnce('Warning: Rendering from client side vertex arrays where stride (' + GLImmediate.stride + ') is not a multiple of four! This is not currently supported!'); + } +#endif GLImmediate.vertexCounter = (GLImmediate.stride * count) / 4; // XXX assuming float return; } GLImmediate.modifiedClientAttributes = false; - var stride = 0, start; - var attributes = GLImmediate.liveClientAttributes; - attributes.length = 0; - for (var i = 0; i < GLImmediate.NUM_ATTRIBUTES; i++) { - if (GLImmediate.enabledClientAttributes[i]) attributes.push(GLImmediate.clientAttributes[i]); - } - attributes.sort(function(x, y) { return !x ? (!y ? 0 : 1) : (!y ? -1 : (x.pointer - y.pointer)) }); - start = GL.currArrayBuffer ? 0 : attributes[0].pointer; - var multiStrides = false; - for (var i = 0; i < attributes.length; i++) { - var attribute = attributes[i]; - if (!attribute) break; - if (stride != 0 && stride != attribute.stride) multiStrides = true; - if (attribute.stride) stride = attribute.stride; + // The role of prepareClientAttributes is to examine the set of client-side vertex attribute buffers + // that user code has submitted, and to prepare them to be uploaded to a VBO in GPU memory + // (since WebGL does not support client-side rendering, i.e. rendering from vertex data in CPU memory) + // User can submit vertex data generally in three different configurations: + // 1. Fully planar: all attributes are in their own separate tightly-packed arrays in CPU memory. + // 2. Fully interleaved: all attributes share a single array where data is interleaved something like (pos,uv,normal), (pos,uv,normal), ... + // 3. Complex hybrid: Multiple separate arrays that either are sparsely strided, and/or partially interleave vertex attributes. 
+ + // For simplicity, we support the case (2) as the fast case. For (1) and (3), we do a memory copy of the + // vertex data here to prepare a relayouted buffer that is of the structure in case (2). The reason + // for this is that it allows the emulation code to get away with using just one VBO buffer for rendering, + // and not have to maintain multiple ones. Therefore cases (1) and (3) will be very slow, and case (2) is fast. + + // Detect which case we are in by using a quick heuristic by examining the strides of the buffers. If all the buffers have identical + // stride, we assume we have case (2), otherwise we have something more complex. + var clientStartPointer = 0x7FFFFFFF; + var bytes = 0; // Total number of bytes taken up by a single vertex. + var minStride = 0x7FFFFFFF; + var maxStride = 0; + for (var i = 0; i < 3+GLImmediate.MAX_TEXTURES; i++) { + if (GLImmediate.enabledClientAttributes[i]) { + var attr = GLImmediate.clientAttributes[i]; + clientStartPointer = Math.min(clientStartPointer, attr.pointer); + attr.sizeBytes = attr.size * GL.byteSizeByType[attr.type - GL.byteSizeByTypeRoot]; + bytes += attr.sizeBytes; + minStride = Math.min(minStride, attr.stride); + maxStride = Math.max(maxStride, attr.stride); + } } - if (multiStrides) stride = 0; // we will need to restride - var bytes = 0; // total size in bytes - if (!stride && !beginEnd) { - // beginEnd can not have stride in the attributes, that is fine. otherwise, - // no stride means that all attributes are in fact packed. to keep the rest of - // our emulation code simple, we perform unpacking/restriding here. this adds overhead, so - // it is a good idea to not hit this! -#if ASSERTIONS - Runtime.warnOnce('Unpacking/restriding attributes, this is slow and dangerous'); + if ((minStride != maxStride || maxStride < bytes) && !beginEnd) { + // We are in cases (1) or (3): slow path, shuffle the data around into a single interleaved vertex buffer. 
+ // The immediate-mode glBegin()/glEnd() vertex submission gets automatically generated in appropriate layout, + // so never need to come down this path if that was used. +#if GL_ASSERTIONS + Runtime.warnOnce('Rendering from planar client-side vertex arrays. This is a very slow emulation path! Use interleaved vertex arrays for best performance.'); #endif if (!GLImmediate.restrideBuffer) GLImmediate.restrideBuffer = _malloc(GL.MAX_TEMP_BUFFER_SIZE); - start = GLImmediate.restrideBuffer; -#if ASSERTIONS - assert(start % 4 == 0); -#endif + var start = GLImmediate.restrideBuffer; + bytes = 0; // calculate restrided offsets and total size - for (var i = 0; i < attributes.length; i++) { - var attribute = attributes[i]; - if (!attribute) break; - var size = attribute.size * GL.byteSizeByType[attribute.type - GL.byteSizeByTypeRoot]; - if (size % 4 != 0) size += 4 - (size % 4); // align everything - attribute.offset = bytes; - bytes += size; + for (var i = 0; i < GLImmediate.clientAttributes.length; i++) { + if (GLImmediate.enabledClientAttributes[i]) { + var attribute = GLImmediate.clientAttributes[i]; + var size = attribute.sizeBytes; + if (size % 4 != 0) size += 4 - (size % 4); // align everything + attribute.offset = bytes; + bytes += size; + } } -#if ASSERTIONS - assert(count*bytes <= GL.MAX_TEMP_BUFFER_SIZE); -#endif - // copy out the data (we need to know the stride for that, and define attribute.pointer - for (var i = 0; i < attributes.length; i++) { - var attribute = attributes[i]; - if (!attribute) break; - var size4 = Math.floor((attribute.size * GL.byteSizeByType[attribute.type - GL.byteSizeByTypeRoot])/4); - for (var j = 0; j < count; j++) { - for (var k = 0; k < size4; k++) { // copy in chunks of 4 bytes, our alignment makes this possible - HEAP32[((start + attribute.offset + bytes*j)>>2) + k] = HEAP32[(attribute.pointer>>2) + j*size4 + k]; + // copy out the data (we need to know the stride for that, and define attribute.pointer) + for (var i = 0; i < 
GLImmediate.clientAttributes.length; i++) { + if (GLImmediate.enabledClientAttributes[i]) { + var attribute = GLImmediate.clientAttributes[i]; + var srcStride = Math.max(attribute.sizeBytes, attribute.stride); + if ((srcStride & 3) == 0 && (attribute.sizeBytes & 3) == 0) { + var size4 = attribute.sizeBytes>>2; + var srcStride4 = Math.max(attribute.sizeBytes, attribute.stride)>>2; + for (var j = 0; j < count; j++) { + for (var k = 0; k < size4; k++) { // copy in chunks of 4 bytes, our alignment makes this possible + HEAP32[((start + attribute.offset + bytes*j)>>2) + k] = HEAP32[(attribute.pointer>>2) + j*srcStride4 + k]; + } + } + } else { + for (var j = 0; j < count; j++) { + for (var k = 0; k < attribute.sizeBytes; k++) { // source data was not aligned to multiples of 4, must copy byte by byte. + HEAP8[start + attribute.offset + bytes*j + k] = HEAP8[attribute.pointer + j*srcStride + k]; + } + } } + attribute.pointer = start + attribute.offset; } - attribute.pointer = start + attribute.offset; } + GLImmediate.stride = bytes; + GLImmediate.vertexPointer = start; } else { - // normal situation, everything is strided and in the same buffer - for (var i = 0; i < attributes.length; i++) { - var attribute = attributes[i]; - if (!attribute) break; - attribute.offset = attribute.pointer - start; - if (attribute.offset > bytes) { // ensure we start where we should -#if ASSERTIONS - assert((attribute.offset - bytes)%4 == 0); // XXX assuming 4-alignment -#endif - bytes += attribute.offset - bytes; - } - bytes += attribute.size * GL.byteSizeByType[attribute.type - GL.byteSizeByTypeRoot]; - if (bytes % 4 != 0) bytes += 4 - (bytes % 4); // XXX assuming 4-alignment + // case (2): fast path, all data is interleaved to a single vertex array so we can get away with a single VBO upload. 
+ if (GL.currArrayBuffer) { + GLImmediate.vertexPointer = 0; + } else { + GLImmediate.vertexPointer = clientStartPointer; } -#if ASSERTIONS - assert(beginEnd || bytes <= stride); // if not begin-end, explicit stride should make sense with total byte size -#endif - if (bytes < stride) { // ensure the size is that of the stride - bytes = stride; + for (var i = 0; i < 3+GLImmediate.MAX_TEXTURES; i++) { + if (GLImmediate.enabledClientAttributes[i]) { + var attr = GLImmediate.clientAttributes[i]; + attr.offset = attr.pointer - clientStartPointer; // Compute what will be the offset of this attribute in the VBO after we upload. + } } + GLImmediate.stride = Math.max(maxStride, bytes); } - GLImmediate.stride = bytes; - if (!beginEnd) { - bytes *= count; - if (!GL.currArrayBuffer) { - GLImmediate.vertexPointer = start; +#if GL_ASSERTIONS + if ((GLImmediate.stride & 3) != 0) { + Runtime.warnOnce('Warning: Rendering from client side vertex arrays where stride (' + GLImmediate.stride + ') is not a multiple of four! This is not currently supported!'); } - GLImmediate.vertexCounter = bytes / 4; // XXX assuming float +#endif + GLImmediate.vertexCounter = (GLImmediate.stride * count) / 4; // XXX assuming float } }, |