diff options
Diffstat (limited to 'regex')
-rw-r--r-- | regex/COPYRIGHT | 20 | ||||
-rw-r--r-- | regex/Makefile | 130 | ||||
-rw-r--r-- | regex/README | 32 | ||||
-rw-r--r-- | regex/WHATSNEW | 108 | ||||
-rw-r--r-- | regex/cclass.h | 27 | ||||
-rw-r--r-- | regex/cname.h | 102 | ||||
-rw-r--r-- | regex/debug.c | 242 | ||||
-rw-r--r-- | regex/debug.ih | 14 | ||||
-rw-r--r-- | regex/engine.c | 1019 | ||||
-rw-r--r-- | regex/engine.ih | 35 | ||||
-rw-r--r-- | regex/main.c | 510 | ||||
-rw-r--r-- | regex/main.ih | 19 | ||||
-rwxr-xr-x | regex/mkh | 76 | ||||
-rw-r--r-- | regex/regcomp.c | 1603 | ||||
-rw-r--r-- | regex/regcomp.ih | 51 | ||||
-rw-r--r-- | regex/regerror.c | 126 | ||||
-rw-r--r-- | regex/regerror.ih | 12 | ||||
-rw-r--r-- | regex/regex.3 | 509 | ||||
-rw-r--r-- | regex/regex.7 | 235 | ||||
-rw-r--r-- | regex/regex.h | 74 | ||||
-rw-r--r-- | regex/regex2.h | 134 | ||||
-rw-r--r-- | regex/regexec.c | 138 | ||||
-rw-r--r-- | regex/regfree.c | 37 | ||||
-rw-r--r-- | regex/split.c | 316 | ||||
-rw-r--r-- | regex/tests | 477 | ||||
-rw-r--r-- | regex/utils.h | 22 |
26 files changed, 6068 insertions, 0 deletions
diff --git a/regex/COPYRIGHT b/regex/COPYRIGHT new file mode 100644 index 0000000..30c1f7a --- /dev/null +++ b/regex/COPYRIGHT @@ -0,0 +1,20 @@ +Copyright 1992, 1993, 1994, 1997 Henry Spencer. All rights reserved. +This software is not subject to any license of the American Telephone +and Telegraph Company or of the Regents of the University of California. + +Permission is granted to anyone to use this software for any purpose on +any computer system, and to alter it and redistribute it, subject +to the following restrictions: + +1. The author is not responsible for the consequences of use of this + software, no matter how awful, even if they arise from flaws in it. + +2. The origin of this software must not be misrepresented, either by + explicit claim or by omission. Since few users ever read sources, + credits must appear in the documentation. + +3. Altered versions must be plainly marked as such, and must not be + misrepresented as being the original software. Since few users + ever read sources, credits must appear in the documentation. + +4. This notice may not be removed or altered. diff --git a/regex/Makefile b/regex/Makefile new file mode 100644 index 0000000..3882b37 --- /dev/null +++ b/regex/Makefile @@ -0,0 +1,130 @@ +# You probably want to take -DREDEBUG out of CFLAGS, and put something like +# -O in, *after* testing (-DREDEBUG strengthens testing by enabling a lot of +# internal assertion checking and some debugging facilities). +# Put -Dconst= in for a pre-ANSI compiler. +# Do not take -DPOSIX_MISTAKE out. +# REGCFLAGS isn't important to you (it's for my use in some special contexts). +CFLAGS=-I. -DPOSIX_MISTAKE -DREDEBUG $(REGCFLAGS) + +# If you have a pre-ANSI compiler, put -o into MKHFLAGS. If you want +# the Berkeley __P macro, put -b in. +MKHFLAGS= + +# Flags for linking but not compiling, if any. +LDFLAGS= + +# Extra libraries for linking, if any. +LIBS= + +# Internal stuff, should not need changing. +OBJPRODN=regcomp.o regexec.o regerror.o regfree.o +OBJS=$(OBJPRODN) split.o debug.o main.o +H=cclass.h cname.h regex2.h utils.h +REGSRC=regcomp.c regerror.c regexec.c regfree.c +ALLSRC=$(REGSRC) engine.c debug.c main.c split.c + +# Stuff that matters only if you're trying to lint the package. +LINTFLAGS=-I. -Dstatic= -Dconst= -DREDEBUG +LINTC=regcomp.c regexec.c regerror.c regfree.c debug.c main.c +JUNKLINT=possible pointer alignment|null effect + +# arrangements to build forward-reference header files +.SUFFIXES: .ih .h +.c.ih: + sh ./mkh $(MKHFLAGS) -p $< >$@ + +default: r + +lib: purge $(OBJPRODN) + rm -f libregex.a + ar crv libregex.a $(OBJPRODN) + +purge: + rm -f *.o + +# stuff to build regex.h +REGEXH=regex.h +REGEXHSRC=regex2.h $(REGSRC) +$(REGEXH): $(REGEXHSRC) mkh + sh ./mkh $(MKHFLAGS) -i _REGEX_H_ $(REGEXHSRC) >regex.tmp + cmp -s regex.tmp regex.h 2>/dev/null || cp regex.tmp regex.h + rm -f regex.tmp + +# dependencies +$(OBJPRODN) debug.o: utils.h regex.h regex2.h +regcomp.o: cclass.h cname.h regcomp.ih +regexec.o: engine.c engine.ih +regerror.o: regerror.ih +debug.o: debug.ih +main.o: main.ih + +# tester +re: $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@ + +# regression test +r: re tests + ./re <tests + ./re -el <tests + ./re -er <tests + +# 57 variants, and other stuff, for development use -- not useful to you +ra: ./re tests + -./re <tests + -./re -el <tests + -./re -er <tests + +rx: ./re tests + ./re -x <tests + ./re -x -el <tests + ./re -x -er <tests + +t: ./re tests + -time ./re <tests + -time ./re -cs <tests + -time ./re -el <tests + -time ./re -cs -el <tests + +l: $(LINTC) + lint $(LINTFLAGS) -h $(LINTC) 2>&1 | egrep -v '$(JUNKLINT)' | tee lint + +fullprint: + ti README WHATSNEW notes todo | list + ti *.h | list + list *.c + list regex.3 regex.7 + +print: + ti README WHATSNEW notes todo | list + ti *.h | list + list reg*.c engine.c + + +mf.tmp: Makefile + sed '/^REGEXH=/s/=.*/=regex.h/' Makefile | sed '/#DEL$$/d' >$@ + +DTRH=cclass.h cname.h regex2.h utils.h +PRE=COPYRIGHT README WHATSNEW +POST=mkh regex.3 regex.7 tests $(DTRH) $(ALLSRC) fake/*.[ch] +FILES=$(PRE) Makefile $(POST) +DTR=$(PRE) Makefile=mf.tmp $(POST) +dtr: $(FILES) mf.tmp + makedtr $(DTR) >$@ + rm mf.tmp + +cio: $(FILES) + cio $(FILES) + +rdf: $(FILES) + rcsdiff -c $(FILES) 2>&1 | p + +# various forms of cleanup +tidy: + rm -f junk* core core.* *.core dtr *.tmp lint + +clean: tidy + rm -f *.o *.s *.ih re libregex.a + +# don't do this one unless you know what you're doing +spotless: clean + rm -f mkh regex.h diff --git a/regex/README b/regex/README new file mode 100644 index 0000000..e6ce373 --- /dev/null +++ b/regex/README @@ -0,0 +1,32 @@ +alpha3.8 release. +Tue Aug 10 15:51:48 EDT 1999 +henry@spsystems.net (formerly henry@zoo.toronto.edu) + +See WHATSNEW for change listing. + +installation notes: +-------- +Read the comments at the beginning of Makefile before running. + +Utils.h contains some things that just might have to be modified on +some systems, as well as a nested include (ugh) of <assert.h>. + +The "fake" directory contains quick-and-dirty fakes for some header +files and routines that old systems may not have. Note also that +-DUSEBCOPY will make utils.h substitute bcopy() for memmove(). + +After that, "make r" will build regcomp.o, regexec.o, regfree.o, +and regerror.o (the actual routines), bundle them together into a test +program, and run regression tests on them. No output is good output. + +"make lib" builds just the .o files for the actual routines (when +you're happy with testing and have adjusted CFLAGS for production), +and puts them together into libregex.a. You can pick up either the +library or *.o ("make lib" makes sure there are no other .o files left +around to confuse things). + +Main.c, debug.c, split.c are used for regression testing but are not part +of the RE routines themselves. + +Regex.h goes in /usr/include. All other .h files are internal only. +-------- diff --git a/regex/WHATSNEW b/regex/WHATSNEW new file mode 100644 index 0000000..1295343 --- /dev/null +++ b/regex/WHATSNEW @@ -0,0 +1,108 @@ +New in alpha3.8: Bug fix for signed/unsigned mixup, found and fixed +by the FreeBSD folks. + +New in alpha3.7: A bit of cleanup aimed at maximizing portability, +possibly at slight cost in efficiency. "ul" suffixes and "unsigned long" +no longer appear, in particular. + +New in alpha3.6: A couple more portability glitches fixed. + +New in alpha3.5: Active development of this code has been stopped -- +I'm working on a complete reimplementation -- but folks have found some +minor portability glitches and the like, hence this release to fix them. +One penalty: slightly reduced compatibility with old compilers, because +the ANSI C `unsigned long' type and `ul' constant suffix are used in a +few places (I could avoid this but it would be considerably more work). + +New in alpha3.4: The complex bug alluded to below has been fixed (in a +slightly kludgey temporary way that may hurt efficiency a bit; this is +another "get it out the door for 4.4" release). The tests at the end of +the tests file have accordingly been uncommented. The primary sign of +the bug was that something like a?b matching ab matched b rather than ab. +(The bug was essentially specific to this exact situation, else it would +have shown up earlier.) + +New in alpha3.3: The definition of word boundaries has been altered +slightly, to more closely match the usual programming notion that "_" +is an alphabetic. Stuff used for pre-ANSI systems is now in a subdir, +and the makefile no longer alludes to it in mysterious ways. The +makefile has generally been cleaned up some. Fixes have been made +(again!) so that the regression test will run without -DREDEBUG, at +the cost of weaker checking. A workaround for a bug in some folks' +<assert.h> has been added. And some more things have been added to +tests, including a couple right at the end which are commented out +because the code currently flunks them (complex bug; fix coming). +Plus the usual minor cleanup. + +New in alpha3.2: Assorted bits of cleanup and portability improvement +(the development base is now a BSDI system using GCC instead of an ancient +Sun system, and the newer compiler exposed some glitches). Fix for a +serious bug that affected REs using many [] (including REG_ICASE REs +because of the way they are implemented), *sometimes*, depending on +memory-allocation patterns. The header-file prototypes no longer name +the parameters, avoiding possible name conflicts. The possibility that +some clot has defined CHAR_MIN as (say) `-128' instead of `(-128)' is +now handled gracefully. "uchar" is no longer used as an internal type +name (too many people have the same idea). Still the same old lousy +performance, alas. + +New in alpha3.1: Basically nothing, this release is just a bookkeeping +convenience. Stay tuned. + +New in alpha3.0: Performance is no better, alas, but some fixes have been +made and some functionality has been added. (This is basically the "get +it out the door in time for 4.4" release.) One bug fix: regfree() didn't +free the main internal structure (how embarrassing). It is now possible +to put NULs in either the RE or the target string, using (resp.) a new +REG_PEND flag and the old REG_STARTEND flag. The REG_NOSPEC flag to +regcomp() makes all characters ordinary, so you can match a literal +string easily (this will become more useful when performance improves!). +There are now primitives to match beginnings and ends of words, although +the syntax is disgusting and so is the implementation. The REG_ATOI +debugging interface has changed a bit. And there has been considerable +internal cleanup of various kinds. + +New in alpha2.3: Split change list out of README, and moved flags notes +into Makefile. Macro-ized the name of regex(7) in regex(3), since it has +to change for 4.4BSD. Cleanup work in engine.c, and some new regression +tests to catch tricky cases thereof. + +New in alpha2.2: Out-of-date manpages updated. Regerror() acquires two +small extensions -- REG_ITOA and REG_ATOI -- which avoid debugging kludges +in my own test program and might be useful to others for similar purposes. +The regression test will now compile (and run) without REDEBUG. The +BRE \$ bug is fixed. Most uses of "uchar" are gone; it's all chars now. +Char/uchar parameters are now written int/unsigned, to avoid possible +portability problems with unpromoted parameters. Some unsigned casts have +been introduced to minimize portability problems with shifting into sign +bits. + +New in alpha2.1: Lots of little stuff, cleanup and fixes. The one big +thing is that regex.h is now generated, using mkh, rather than being +supplied in the distribution; due to circularities in dependencies, +you have to build regex.h explicitly by "make h". The two known bugs +have been fixed (and the regression test now checks for them), as has a +problem with assertions not being suppressed in the absence of REDEBUG. +No performance work yet. + +New in alpha2: Backslash-anything is an ordinary character, not an +error (except, of course, for the handful of backslashed metacharacters +in BREs), which should reduce script breakage. The regression test +checks *where* null strings are supposed to match, and has generally +been tightened up somewhat. Small bug fixes in parameter passing (not +harmful, but technically errors) and some other areas. Debugging +invoked by defining REDEBUG rather than not defining NDEBUG. + +New in alpha+3: full prototyping for internal routines, using a little +helper program, mkh, which extracts prototypes given in stylized comments. +More minor cleanup. Buglet fix: it's CHAR_BIT, not CHAR_BITS. Simple +pre-screening of input when a literal string is known to be part of the +RE; this does wonders for performance. + +New in alpha+2: minor bits of cleanup. Notably, the number "32" for the +word width isn't hardwired into regexec.c any more, the public header +file prototypes the functions if __STDC__ is defined, and some small typos +in the manpages have been fixed. + +New in alpha+1: improvements to the manual pages, and an important +extension, the REG_STARTEND option to regexec(). diff --git a/regex/cclass.h b/regex/cclass.h new file mode 100644 index 0000000..d892103 --- /dev/null +++ b/regex/cclass.h @@ -0,0 +1,27 @@ +/* character-class table */ +static struct cclass { + char *name; + char *chars; + char *multis; +} cclasses[] = { + {"alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", ""}, + {"alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", + ""}, + {"blank", " \t", ""}, + {"cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177", ""}, + {"digit", "0123456789", ""}, + {"graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", + ""}, + {"lower", "abcdefghijklmnopqrstuvwxyz", + ""}, + {"print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", + ""}, + {"punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", + ""}, + {"space", "\t\n\v\f\r ", ""}, + {"upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", + ""}, + {"xdigit", "0123456789ABCDEFabcdef", + ""}, + {NULL, 0, ""} +}; diff --git a/regex/cname.h b/regex/cname.h new file mode 100644 index 0000000..a4af252 --- /dev/null +++ b/regex/cname.h @@ -0,0 +1,102 @@ +/* character-name table */ +static struct cname { + char *name; + char code; +} cnames[] = { + {"NUL", '\0'}, + {"SOH", '\001'}, + {"STX", '\002'}, + {"ETX", '\003'}, + {"EOT", '\004'}, + {"ENQ", '\005'}, + {"ACK", '\006'}, + {"BEL", '\007'}, + {"alert", '\007'}, + {"BS", '\010'}, + {"backspace", '\b'}, + {"HT", '\011'}, + {"tab", '\t'}, + {"LF", '\012'}, + {"newline", '\n'}, + {"VT", '\013'}, + {"vertical-tab", '\v'}, + {"FF", '\014'}, + {"form-feed", '\f'}, + {"CR", '\015'}, + {"carriage-return", '\r'}, + {"SO", '\016'}, + {"SI", '\017'}, + {"DLE", '\020'}, + {"DC1", '\021'}, + {"DC2", '\022'}, + {"DC3", '\023'}, + {"DC4", '\024'}, + {"NAK", '\025'}, + {"SYN", '\026'}, + {"ETB", '\027'}, + {"CAN", '\030'}, + {"EM", '\031'}, + {"SUB", '\032'}, + {"ESC", '\033'}, + {"IS4", '\034'}, + {"FS", '\034'}, + {"IS3", '\035'}, + {"GS", '\035'}, + {"IS2", '\036'}, + {"RS", '\036'}, + {"IS1", '\037'}, + {"US", '\037'}, + {"space", ' '}, + {"exclamation-mark", '!'}, + {"quotation-mark", '"'}, + {"number-sign", '#'}, + {"dollar-sign", '$'}, + {"percent-sign", '%'}, + {"ampersand", '&'}, + {"apostrophe", '\''}, + {"left-parenthesis", '('}, + {"right-parenthesis", ')'}, + {"asterisk", '*'}, + {"plus-sign", '+'}, + {"comma", ','}, + {"hyphen", '-'}, + {"hyphen-minus", '-'}, + {"period", '.'}, + {"full-stop", '.'}, + {"slash", '/'}, + {"solidus", '/'}, + {"zero", '0'}, + {"one", '1'}, + {"two", '2'}, + {"three", '3'}, + {"four", '4'}, + {"five", '5'}, + {"six", '6'}, + {"seven", '7'}, + {"eight", '8'}, + {"nine", '9'}, + {"colon", ':'}, + {"semicolon", ';'}, + {"less-than-sign", '<'}, + {"equals-sign", '='}, + {"greater-than-sign", '>'}, + {"question-mark", '?'}, + {"commercial-at", '@'}, + {"left-square-bracket", '['}, + {"backslash", '\\'}, + {"reverse-solidus", '\\'}, + {"right-square-bracket", ']'}, + {"circumflex", '^'}, + {"circumflex-accent", '^'}, + {"underscore", '_'}, + {"low-line", '_'}, + {"grave-accent", '`'}, + {"left-brace", '{'}, + {"left-curly-bracket", '{'}, + {"vertical-line", '|'}, + {"right-brace", '}'}, + {"right-curly-bracket", '}'}, + {"tilde", '~'}, + {"DEL", '\177'}, + {NULL, 0} +}; diff --git a/regex/debug.c b/regex/debug.c new file mode 100644 index 0000000..99ce7da --- /dev/null +++ b/regex/debug.c @@ -0,0 +1,242 @@ +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <limits.h> +#include <stdlib.h> +#include <sys/types.h> +#include <regex.h> + +#include "utils.h" +#include "regex2.h" +#include "debug.ih" + +/* + - regprint - print a regexp for debugging + == void regprint(regex_t *r, FILE *d); + */ +void +regprint(r, d) +regex_t *r; +FILE *d; +{ + register struct re_guts *g = r->re_g; + register int i; + register int c; + register int last; + int nincat[NC]; + + fprintf(d, "%ld states, %d categories", (long)g->nstates, + g->ncategories); + fprintf(d, ", first %ld last %ld", (long)g->firststate, + (long)g->laststate); + if (g->iflags&USEBOL) + fprintf(d, ", USEBOL"); + if (g->iflags&USEEOL) + fprintf(d, ", USEEOL"); + if (g->iflags&BAD) + fprintf(d, ", BAD"); + if (g->nsub > 0) + fprintf(d, ", nsub=%ld", (long)g->nsub); + if (g->must != NULL) + fprintf(d, ", must(%ld) `%*s'", (long)g->mlen, (int)g->mlen, + g->must); + if (g->backrefs) + fprintf(d, ", backrefs"); + if (g->nplus > 0) + fprintf(d, ", nplus %ld", (long)g->nplus); + fprintf(d, "\n"); + s_print(g, d); + for (i = 0; i < g->ncategories; i++) { + nincat[i] = 0; + for (c = CHAR_MIN; c <= CHAR_MAX; c++) + if (g->categories[c] == i) + nincat[i]++; + } + fprintf(d, "cc0#%d", nincat[0]); + for (i = 1; i < g->ncategories; i++) + if (nincat[i] == 1) { + for (c = CHAR_MIN; c <= CHAR_MAX; c++) + if (g->categories[c] == i) + break; + fprintf(d, ", %d=%s", i, regchar(c)); + } + fprintf(d, "\n"); + for (i = 1; i < g->ncategories; i++) + if (nincat[i] != 1) { + fprintf(d, "cc%d\t", i); + last = -1; + for (c = CHAR_MIN; c <= CHAR_MAX+1; c++) /* +1 does flush */ + if (c <= CHAR_MAX && g->categories[c] == i) { + if (last < 0) { + fprintf(d, "%s", regchar(c)); + last = c; + } + } else { + if (last >= 0) { + if (last != c-1) + fprintf(d, "-%s", + regchar(c-1)); + last = -1; + } + } + fprintf(d, "\n"); + } +} + +/* + - s_print - print the strip for debugging + == static void s_print(register struct re_guts *g, FILE *d); + */ +static void +s_print(g, d) +register struct re_guts *g; +FILE *d; +{ + register sop *s; + register cset *cs; + register int i; + register int done = 0; + register sop opnd; + register int col = 0; + register int last; + register sopno offset = 2; +# define GAP() { if (offset % 5 == 0) { \ + if (col > 40) { \ + fprintf(d, "\n\t"); \ + col = 0; \ + } else { \ + fprintf(d, " "); \ + col++; \ + } \ + } else \ + col++; \ + offset++; \ + } + + if (OP(g->strip[0]) != OEND) + fprintf(d, "missing initial OEND!\n"); + for (s = &g->strip[1]; !done; s++) { + opnd = OPND(*s); + switch (OP(*s)) { + case OEND: + fprintf(d, "\n"); + done = 1; + break; + case OCHAR: + if (strchr("\\|()^$.[+*?{}!<> ", (char)opnd) != NULL) + fprintf(d, "\\%c", (char)opnd); + else + fprintf(d, "%s", regchar((char)opnd)); + break; + case OBOL: + fprintf(d, "^"); + break; + case OEOL: + fprintf(d, "$"); + break; + case OBOW: + fprintf(d, "\\{"); + break; + case OEOW: + fprintf(d, "\\}"); + break; + case OANY: + fprintf(d, "."); + break; + case OANYOF: + fprintf(d, "[(%ld)", (long)opnd); + cs = &g->sets[opnd]; + last = -1; + for (i = 0; i < g->csetsize+1; i++) /* +1 flushes */ + if (CHIN(cs, i) && i < g->csetsize) { + if (last < 0) { + fprintf(d, "%s", regchar(i)); + last = i; + } + } else { + if (last >= 0) { + if (last != i-1) + fprintf(d, "-%s", + regchar(i-1)); + last = -1; + } + } + fprintf(d, "]"); + break; + case OBACK_: + fprintf(d, "(\\<%ld>", (long)opnd); + break; + case O_BACK: + fprintf(d, "<%ld>\\)", (long)opnd); + break; + case OPLUS_: + fprintf(d, "(+"); + if (OP(*(s+opnd)) != O_PLUS) + fprintf(d, "<%ld>", (long)opnd); + break; + case O_PLUS: + if (OP(*(s-opnd)) != OPLUS_) + fprintf(d, "<%ld>", (long)opnd); + fprintf(d, "+)"); + break; + case OQUEST_: + fprintf(d, "(?"); + if (OP(*(s+opnd)) != O_QUEST) + fprintf(d, "<%ld>", (long)opnd); + break; + case O_QUEST: + if (OP(*(s-opnd)) != OQUEST_) + fprintf(d, "<%ld>", (long)opnd); + fprintf(d, "?)"); + break; + case OLPAREN: + fprintf(d, "((<%ld>", (long)opnd); + break; + case ORPAREN: + fprintf(d, "<%ld>))", (long)opnd); + break; + case OCH_: + fprintf(d, "<"); + if (OP(*(s+opnd)) != OOR2) + fprintf(d, "<%ld>", (long)opnd); + break; + case OOR1: + if (OP(*(s-opnd)) != OOR1 && OP(*(s-opnd)) != OCH_) + fprintf(d, "<%ld>", (long)opnd); + fprintf(d, "|"); + break; + case OOR2: + fprintf(d, "|"); + if (OP(*(s+opnd)) != OOR2 && OP(*(s+opnd)) != O_CH) + fprintf(d, "<%ld>", (long)opnd); + break; + case O_CH: + if (OP(*(s-opnd)) != OOR1) + fprintf(d, "<%ld>", (long)opnd); + fprintf(d, ">"); + break; + default: + fprintf(d, "!%d(%d)!", OP(*s), opnd); + break; + } + if (!done) + GAP(); + } +} + +/* + - regchar - make a character printable + == static char *regchar(int ch); + */ +static char * /* -> representation */ +regchar(ch) +int ch; +{ + static char buf[10]; + + if (isprint(ch) || ch == ' ') + sprintf(buf, "%c", ch); + else + sprintf(buf, "\\%o", ch); + return(buf); +} diff --git a/regex/debug.ih b/regex/debug.ih new file mode 100644 index 0000000..5f40ff7 --- /dev/null +++ b/regex/debug.ih @@ -0,0 +1,14 @@ +/* ========= begin header generated by ./mkh ========= */ +#ifdef __cplusplus +extern "C" { +#endif + +/* === debug.c === */ +void regprint(regex_t *r, FILE *d); +static void s_print(register struct re_guts *g, FILE *d); +static char *regchar(int ch); + +#ifdef __cplusplus +} +#endif +/* ========= end header generated by ./mkh ========= */ diff --git a/regex/engine.c b/regex/engine.c new file mode 100644 index 0000000..919fe3f --- /dev/null +++ b/regex/engine.c @@ -0,0 +1,1019 @@ +/* + * The matching engine and friends. This file is #included by regexec.c + * after suitable #defines of a variety of macros used herein, so that + * different state representations can be used without duplicating masses + * of code. + */ + +#ifdef SNAMES +#define matcher smatcher +#define fast sfast +#define slow sslow +#define dissect sdissect +#define backref sbackref +#define step sstep +#define print sprint +#define at sat +#define match smat +#endif +#ifdef LNAMES +#define matcher lmatcher +#define fast lfast +#define slow lslow +#define dissect ldissect +#define backref lbackref +#define step lstep +#define print lprint +#define at lat +#define match lmat +#endif + +/* another structure passed up and down to avoid zillions of parameters */ +struct match { + struct re_guts *g; + int eflags; + regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ + char *offp; /* offsets work from here */ + char *beginp; /* start of string -- virtual NUL precedes */ + char *endp; /* end of string -- virtual NUL here */ + char *coldp; /* can be no match starting before here */ + char **lastpos; /* [nplus+1] */ + STATEVARS; + states st; /* current states */ + states fresh; /* states for a fresh start */ + states tmp; /* temporary */ + states empty; /* empty set of states */ +}; + +#include "engine.ih" + +#ifdef REDEBUG +#define SP(t, s, c) print(m, t, s, c, stdout) +#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) +#define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } +#else +#define SP(t, s, c) /* nothing */ +#define AT(t, p1, p2, s1, s2) /* nothing */ +#define NOTE(s) /* nothing */ +#endif + +/* + - matcher - the actual matching engine + == static int matcher(register struct re_guts *g, char *string, \ + == size_t nmatch, regmatch_t pmatch[], int eflags); + */ +static int /* 0 success, REG_NOMATCH failure */ +matcher(g, string, nmatch, pmatch, eflags) +register struct re_guts *g; +char *string; +size_t nmatch; +regmatch_t pmatch[]; +int eflags; +{ + register char *endp; + register int i; + struct match mv; + register struct match *m = &mv; + register char *dp; + const register sopno gf = g->firststate+1; /* +1 for OEND */ + const register sopno gl = g->laststate; + char *start; + char *stop; + + /* simplify the situation where possible */ + if (g->cflags®_NOSUB) + nmatch = 0; + if (eflags®_STARTEND) { + start = string + pmatch[0].rm_so; + stop = string + pmatch[0].rm_eo; + } else { + start = string; + stop = start + strlen(start); + } + if (stop < start) + return(REG_INVARG); + + /* prescreening; this does wonders for this rather slow code */ + if (g->must != NULL) { + for (dp = start; dp < stop; dp++) + if (*dp == g->must[0] && stop - dp >= g->mlen && + memcmp(dp, g->must, (size_t)g->mlen) == 0) + break; + if (dp == stop) /* we didn't find g->must */ + return(REG_NOMATCH); + } + + /* match struct setup */ + m->g = g; + m->eflags = eflags; + m->pmatch = NULL; + m->lastpos = NULL; + m->offp = string; + m->beginp = start; + m->endp = stop; + STATESETUP(m, 4); + SETUP(m->st); + SETUP(m->fresh); + SETUP(m->tmp); + SETUP(m->empty); + CLEAR(m->empty); + + /* this loop does only one repetition except for backrefs */ + for (;;) { + endp = fast(m, start, stop, gf, gl); + if (endp == NULL) { /* a miss */ + STATETEARDOWN(m); + return(REG_NOMATCH); + } + if (nmatch == 0 && !g->backrefs) + break; /* no further info needed */ + + /* where? */ + assert(m->coldp != NULL); + for (;;) { + NOTE("finding start"); + endp = slow(m, m->coldp, stop, gf, gl); + if (endp != NULL) + break; + assert(m->coldp < m->endp); + m->coldp++; + } + if (nmatch == 1 && !g->backrefs) + break; /* no further info needed */ + + /* oh my, he wants the subexpressions... */ + if (m->pmatch == NULL) + m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * + sizeof(regmatch_t)); + if (m->pmatch == NULL) { + STATETEARDOWN(m); + return(REG_ESPACE); + } + for (i = 1; i <= m->g->nsub; i++) + m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; + if (!g->backrefs && !(m->eflags®_BACKR)) { + NOTE("dissecting"); + dp = dissect(m, m->coldp, endp, gf, gl); + } else { + if (g->nplus > 0 && m->lastpos == NULL) + m->lastpos = (char **)malloc((g->nplus+1) * + sizeof(char *)); + if (g->nplus > 0 && m->lastpos == NULL) { + free(m->pmatch); + STATETEARDOWN(m); + return(REG_ESPACE); + } + NOTE("backref dissect"); + dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); + } + if (dp != NULL) + break; + + /* uh-oh... we couldn't find a subexpression-level match */ + assert(g->backrefs); /* must be back references doing it */ + assert(g->nplus == 0 || m->lastpos != NULL); + for (;;) { + if (dp != NULL || endp <= m->coldp) + break; /* defeat */ + NOTE("backoff"); + endp = slow(m, m->coldp, endp-1, gf, gl); + if (endp == NULL) + break; /* defeat */ + /* try it on a shorter possibility */ +#ifndef NDEBUG + for (i = 1; i <= m->g->nsub; i++) { + assert(m->pmatch[i].rm_so == -1); + assert(m->pmatch[i].rm_eo == -1); + } +#endif + NOTE("backoff dissect"); + dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); + } + assert(dp == NULL || dp == endp); + if (dp != NULL) /* found a shorter one */ + break; + + /* despite initial appearances, there is no match here */ + NOTE("false alarm"); + start = m->coldp + 1; /* recycle starting later */ + assert(start <= stop); + } + + /* fill in the details if requested */ + if (nmatch > 0) { + pmatch[0].rm_so = m->coldp - m->offp; + pmatch[0].rm_eo = endp - m->offp; + } + if (nmatch > 1) { + assert(m->pmatch != NULL); + for (i = 1; i < nmatch; i++) + if (i <= m->g->nsub) + pmatch[i] = m->pmatch[i]; + else { + pmatch[i].rm_so = -1; + pmatch[i].rm_eo = -1; + } + } + + if (m->pmatch != NULL) + free((char *)m->pmatch); + if (m->lastpos != NULL) + free((char *)m->lastpos); + STATETEARDOWN(m); + return(0); +} + +/* + - dissect - figure out what matched what, no back references + == static char *dissect(register struct match *m, char *start, \ + == char *stop, sopno startst, sopno stopst); + */ +static char * /* == stop (success) always */ +dissect(m, start, stop, startst, stopst) +register struct match *m; +char *start; +char *stop; +sopno startst; +sopno stopst; +{ + register int i; + register sopno ss; /* start sop of current subRE */ + register sopno es; /* end sop of current subRE */ + register char *sp; /* start of string matched by it */ + register char *stp; /* string matched by it cannot pass here */ + register char *rest; /* start of rest of string */ + register char *tail; /* string unmatched by rest of RE */ + register sopno ssub; /* start sop of subsubRE */ + register sopno esub; /* end sop of subsubRE */ + register char *ssp; /* start of string matched by subsubRE */ + register char *sep; /* end of string matched by subsubRE */ + register char *oldssp; /* previous ssp */ + register char *dp; + + AT("diss", start, stop, startst, stopst); + sp = |