/*
* inffast.S is a hand tuned assembler version of:
*
* inffast.c -- fast decoding
* Copyright (C) 1995-2003 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Copyright (C) 2003 Chris Anderson <christop@charm.net>
* Please use the copyright conditions above.
*
* This version (Jan-23-2003) of inflate_fast was coded and tested under
* GNU/Linux on a pentium 3, using the gcc-3.2 compiler distribution. On that
* machine, I found that gzip style archives decompressed about 20% faster than
* the gcc-3.2 -O3 -fomit-frame-pointer compiled version. Your results will
* depend on how large of a buffer is used for z_stream.next_in & next_out
* (8K-32K worked best for my 256K cpu cache) and how much overhead there is in
* stream processing I/O and crc32/adler32. In my case, this routine used
* 70% of the cpu time and crc32 used 20%.
*
* I am confident that this version will work in the general case, but I have
* not tested a wide variety of datasets or a wide variety of platforms.
*
* Jan-24-2003 -- Added -DUSE_MMX define for slightly faster inflating.
* It should be a runtime flag instead of compile time flag...
*
* Jan-26-2003 -- Added runtime check for MMX support with cpuid instruction.
* With -DUSE_MMX, only MMX code is compiled. With -DNO_MMX, only non-MMX code
* is compiled. Without either option, runtime detection is enabled. Runtime
* detection should work on all modern cpus and the recommended algorithm (flip
* ID bit on eflags and then use the cpuid instruction) is used in many
* multimedia applications. Tested under win2k with gcc-2.95 and gas-2.12
* distributed with cygwin3. Compiling with gcc-2.95 -c inffast.S -o
* inffast.obj generates a COFF object which can then be linked with MSVC++
* compiled code. Tested under FreeBSD 4.7 with gcc-2.95.
*
* Jan-28-2003 -- Tested Athlon XP... MMX mode is slower than no MMX (and
* slower than compiler generated code). Adjusted cpuid check to use the MMX
* code only for Pentiums < P4 until I have more data on the P4. Speed
* improvement is only about 15% on the Athlon when compared with code generated
* with MSVC++. Not sure yet, but I think the P4 will also be slower using the
* MMX mode because many of its x86 ALU instructions execute in .5 cycles and
* have less latency than MMX ops. Added code to buffer the last 11 bytes of
* the input stream since the MMX code grabs bits in chunks of 32, which
* differs from the inffast.c algorithm. I don't think there would have been
* read overruns where a page boundary was crossed (a segfault), but there
* could have been overruns when next_in ends on unaligned memory (uninitialized
* memory read).
*
* Mar-13-2003 -- P4 MMX is slightly slower than P4 NO_MMX. I created a C
* version of the non-MMX code so that it doesn't depend on zstrm and zstate
* structure offsets which are hard coded in this file. This was last tested
* with zlib-1.2.0 which is currently in beta testing, newer versions of this
* and inffas86.c can be found at http://www.eetbeetee.com/zlib/ and
* http://www.charm.net/~christop/zlib/
*/
/*
* if you have underscore linking problems (_inflate_fast undefined), try
* using -DGAS_COFF
*/
/*
 * Object-format selection.  GAS_COFF or GAS_ELF may be predefined on the
 * command line; when neither is given, GAS_COFF is chosen if WIN32 or
 * __CYGWIN__ is defined and GAS_ELF is chosen otherwise.
 */
#if !( defined( GAS_COFF ) || defined( GAS_ELF ) )
#if defined( __CYGWIN__ ) || defined( WIN32 )
#define GAS_COFF /* windows object format */
#else
#define GAS_ELF
#endif
#endif /* neither GAS_COFF nor GAS_ELF was predefined */
/*
 * COFF externals carry a leading underscore, so the two symbol names
 * below are remapped to their underscored spellings.
 */
#ifdef GAS_COFF
#define inflate_fast         _inflate_fast
#define inflate_fast_use_mmx _inflate_fast_use_mmx
#endif /* GAS_COFF */
/* Logical source file name, and the inflate_fast symbol made global. */
.file "inffast.S"
.global inflate_fast
/*
 * NUL-terminated error message strings, emitted into .text.  Each one is
 * preceded by an .align 4 request whose padding fill value is 0.
 */
.text
.align 4,0
.L_invalid_literal_length_code_msg:
.asciz "invalid literal/length code"
.align 4,0
.L_invalid_distance_code_msg:
.asciz "invalid distance code"
.align 4,0
.L_invalid_distance_too_far_msg:
.asciz "invalid distance too far back"
#if ! defined( NO_MMX )
.align 4,0
.L_mask: /* mask[N] = ( 1 << N ) - 1 */
.long 0
.long 1
.long 3
.long 7
.long 15
.long 31
.long 63
.long 127
.long 255