aboutsummaryrefslogtreecommitdiff
path: root/lib/Support
diff options
context:
space:
mode:
authorTorok Edwin <edwintorok@gmail.com>2009-08-30 08:24:09 +0000
committerTorok Edwin <edwintorok@gmail.com>2009-08-30 08:24:09 +0000
commitce0c81e7dd321e9f94f628daa5528f56cab0ab88 (patch)
tree29b76548d9f780040e5ef64de07c626114d28fa5 /lib/Support
parent743810620742c92be90a30c7fc7d5e6631baff1f (diff)
Add regular expression matching support, based on OpenBSD regexec()/regcomp()
implementation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@80493 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Support')
-rw-r--r--lib/Support/CMakeLists.txt6
-rw-r--r--lib/Support/COPYRIGHT.regex54
-rw-r--r--lib/Support/Regex.cpp97
-rw-r--r--lib/Support/regcclass.h70
-rw-r--r--lib/Support/regcname.h139
-rw-r--r--lib/Support/regcomp.c1524
-rw-r--r--lib/Support/regengine.inc1021
-rw-r--r--lib/Support/regerror.c131
-rw-r--r--lib/Support/regex2.h157
-rw-r--r--lib/Support/regex_impl.h108
-rw-r--r--lib/Support/regexec.c161
-rw-r--r--lib/Support/regfree.c72
-rw-r--r--lib/Support/regstrlcpy.c52
-rw-r--r--lib/Support/regutils.h55
14 files changed, 3647 insertions, 0 deletions
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 05f1ac7d98..0144b28d2e 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -32,6 +32,12 @@ add_llvm_library(LLVMSupport
Twine.cpp
raw_os_ostream.cpp
raw_ostream.cpp
+ Regex.cpp
+ regcomp.c
+ regerror.c
+ regexec.c
+ regfree.c
+ regstrlcpy.c
)
target_link_libraries (LLVMSupport LLVMSystem)
diff --git a/lib/Support/COPYRIGHT.regex b/lib/Support/COPYRIGHT.regex
new file mode 100644
index 0000000000..a6392fd37c
--- /dev/null
+++ b/lib/Support/COPYRIGHT.regex
@@ -0,0 +1,54 @@
+$OpenBSD: COPYRIGHT,v 1.3 2003/06/02 20:18:36 millert Exp $
+
+Copyright 1992, 1993, 1994 Henry Spencer. All rights reserved.
+This software is not subject to any license of the American Telephone
+and Telegraph Company or of the Regents of the University of California.
+
+Permission is granted to anyone to use this software for any purpose on
+any computer system, and to alter it and redistribute it, subject
+to the following restrictions:
+
+1. The author is not responsible for the consequences of use of this
+ software, no matter how awful, even if they arise from flaws in it.
+
+2. The origin of this software must not be misrepresented, either by
+ explicit claim or by omission. Since few users ever read sources,
+ credits must appear in the documentation.
+
+3. Altered versions must be plainly marked as such, and must not be
+ misrepresented as being the original software. Since few users
+ ever read sources, credits must appear in the documentation.
+
+4. This notice may not be removed or altered.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+/*-
+ * Copyright (c) 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)COPYRIGHT 8.1 (Berkeley) 3/16/94
+ */
diff --git a/lib/Support/Regex.cpp b/lib/Support/Regex.cpp
new file mode 100644
index 0000000000..f8b2446e3c
--- /dev/null
+++ b/lib/Support/Regex.cpp
@@ -0,0 +1,97 @@
+//===-- Regex.cpp - Regular Expression matcher implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a POSIX regular expression matcher.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "regex_impl.h"
+#include <string>
+
+using namespace llvm;
+/// Compile \p regex as a POSIX extended regular expression.
+///
+/// \p Flags is a bitmask of IgnoreCase, NoSub and Newline; each set bit is
+/// translated to the matching REG_* compile flag. The status returned by
+/// llvm_regcomp() is kept in `error` (0 means success) and is reported
+/// through isValid().
+Regex::Regex(const StringRef &regex, unsigned Flags)
+{
+  unsigned flags = 0;
+  // Value-initialize the struct: llvm_regcomp() can return early (bad
+  // arguments, allocation failure) without writing to preg, yet the
+  // destructor and match() still read its fields afterwards.
+  preg = new struct llvm_regex();
+  // With REG_PEND the pattern ends at re_endp, so the StringRef does not
+  // have to be NUL-terminated.
+  preg->re_endp = regex.end();
+  if (Flags & IgnoreCase)
+    flags |= REG_ICASE;
+  if (Flags & NoSub) {
+    flags |= REG_NOSUB;
+    sub = false;
+  } else {
+    sub = true;
+  }
+  if (Flags & Newline)
+    flags |= REG_NEWLINE;
+  error = llvm_regcomp(preg, regex.data(), flags|REG_EXTENDED|REG_PEND);
+}
+
+/// Report whether the pattern held by this object is usable.
+///
+/// Returns true when no error code is recorded. Otherwise the message that
+/// llvm_regerror() produces for the recorded code is stored in \p Error and
+/// false is returned.
+bool Regex::isValid(std::string &Error)
+{
+  if (!error)
+    return true;
+
+  // Following the POSIX regerror() convention, a call with no buffer is
+  // used to learn how many bytes the message needs.
+  size_t len = llvm_regerror(error, preg, NULL, 0);
+  char *errbuff = new char[len];
+  llvm_regerror(error, preg, errbuff, len);
+  Error.assign(errbuff);
+  // Error now owns a copy of the text; free the scratch buffer, which was
+  // previously leaked on every call.
+  delete[] errbuff;
+  return false;
+}
+
+Regex::~Regex() // Tears down the pattern state created by the constructor.
+{
+ llvm_regfree(preg); // presumably releases what llvm_regcomp() attached to preg -- confirm in regfree.c
+ delete preg; // the struct itself was allocated with new in the constructor
+}
+
+/// Search \p String for the compiled pattern.
+///
+/// \param String   text to search; it is handed to llvm_regexec() together
+///                 with an explicit [0, size) range via REG_STARTEND.
+/// \param Matches  optional output. When non-null it is cleared and, on
+///                 success, filled with re_nsub+1 entries: entry 0 is the
+///                 whole match and entry i the i-th parenthesized group. A
+///                 group that did not participate is an empty StringRef.
+///                 Must not be requested if the pattern was built with NoSub.
+/// \returns true if the pattern matched. Returns false both for "no match"
+///          and for an execution failure; in the latter case the failure
+///          code is stored in `error`.
+bool Regex::match(const StringRef &String, SmallVectorImpl<StringRef> *Matches)
+{
+  unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
+
+  if (Matches) {
+    assert(sub && "Substring matching requested but pattern compiled without");
+    Matches->clear();
+  }
+
+  // pmatch needs to have at least one element.
+  SmallVector<llvm_regmatch_t, 2> pm;
+  pm.resize(nmatch > 0 ? nmatch : 1);
+  // REG_STARTEND reads the search range from pm[0].
+  pm[0].rm_so = 0;
+  pm[0].rm_eo = String.size();
+
+  int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
+
+  if (rc == REG_NOMATCH)
+    return false;
+  if (rc != 0) {
+    // regexec can fail due to invalid pattern or running out of memory.
+    error = rc;
+    return false;
+  }
+
+  // There was a match.
+
+  if (Matches) { // match position requested
+    for (unsigned i=0;i<nmatch; i++) {
+      if (pm[i].rm_so == -1) {
+        // this group didn't match
+        Matches->push_back(StringRef());
+        continue;
+      }
+      // A match may legitimately be empty (rm_eo == rm_so), e.g. "a*"
+      // against "b"; only an end that precedes the start is a bug.
+      assert(pm[i].rm_eo >= pm[i].rm_so);
+      Matches->push_back(StringRef(String.data()+pm[i].rm_so,
+                                   pm[i].rm_eo-pm[i].rm_so));
+    }
+  }
+
+  return true;
+}
diff --git a/lib/Support/regcclass.h b/lib/Support/regcclass.h
new file mode 100644
index 0000000000..2cea3e4e54
--- /dev/null
+++ b/lib/Support/regcclass.h
@@ -0,0 +1,70 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)cclass.h 8.3 (Berkeley) 3/20/94
+ */
+
+/* character-class table
+ *
+ * Each entry pairs a class name with a string listing the characters that
+ * belong to that class; `multis` is the empty string in every entry below.
+ * The table is terminated by an entry whose name is NULL.
+ * NOTE(review): presumably the names are the NAME part of a [:NAME:]
+ * bracket term and are looked up by p_b_cclass() -- confirm there.
+ */
+static struct cclass {
+ const char *name;
+ const char *chars;
+ const char *multis;
+} cclasses[] = {
+ { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789", ""} ,
+ { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
+ ""} ,
+ { "blank", " \t", ""} ,
+ { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
+\25\26\27\30\31\32\33\34\35\36\37\177", ""} ,
+ { "digit", "0123456789", ""} ,
+ { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+ ""} ,
+ { "lower", "abcdefghijklmnopqrstuvwxyz",
+ ""} ,
+ { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
+ ""} ,
+ { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+ ""} ,
+ { "space", "\t\n\v\f\r ", ""} ,
+ { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
+ ""} ,
+ { "xdigit", "0123456789ABCDEFabcdef",
+ ""} ,
+ { NULL, 0, "" }
+};
diff --git a/lib/Support/regcname.h b/lib/Support/regcname.h
new file mode 100644
index 0000000000..3c0bb248ff
--- /dev/null
+++ b/lib/Support/regcname.h
@@ -0,0 +1,139 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)cname.h 8.3 (Berkeley) 3/20/94
+ */
+
+/* character-name table
+ *
+ * Maps a symbolic character name to the single char it denotes. Several
+ * names may share one code (for example "BEL" and "alert" are both \007,
+ * "hyphen" and "hyphen-minus" are both '-'). The table is terminated by
+ * an entry whose name is NULL.
+ * NOTE(review): presumably consulted when resolving [.name.] collating
+ * elements in p_b_coll_elem() -- confirm there.
+ */
+static struct cname {
+ const char *name;
+ char code;
+} cnames[] = {
+ { "NUL", '\0' },
+ { "SOH", '\001' },
+ { "STX", '\002' },
+ { "ETX", '\003' },
+ { "EOT", '\004' },
+ { "ENQ", '\005' },
+ { "ACK", '\006' },
+ { "BEL", '\007' },
+ { "alert", '\007' },
+ { "BS", '\010' },
+ { "backspace", '\b' },
+ { "HT", '\011' },
+ { "tab", '\t' },
+ { "LF", '\012' },
+ { "newline", '\n' },
+ { "VT", '\013' },
+ { "vertical-tab", '\v' },
+ { "FF", '\014' },
+ { "form-feed", '\f' },
+ { "CR", '\015' },
+ { "carriage-return", '\r' },
+ { "SO", '\016' },
+ { "SI", '\017' },
+ { "DLE", '\020' },
+ { "DC1", '\021' },
+ { "DC2", '\022' },
+ { "DC3", '\023' },
+ { "DC4", '\024' },
+ { "NAK", '\025' },
+ { "SYN", '\026' },
+ { "ETB", '\027' },
+ { "CAN", '\030' },
+ { "EM", '\031' },
+ { "SUB", '\032' },
+ { "ESC", '\033' },
+ { "IS4", '\034' },
+ { "FS", '\034' },
+ { "IS3", '\035' },
+ { "GS", '\035' },
+ { "IS2", '\036' },
+ { "RS", '\036' },
+ { "IS1", '\037' },
+ { "US", '\037' },
+ { "space", ' ' },
+ { "exclamation-mark", '!' },
+ { "quotation-mark", '"' },
+ { "number-sign", '#' },
+ { "dollar-sign", '$' },
+ { "percent-sign", '%' },
+ { "ampersand", '&' },
+ { "apostrophe", '\'' },
+ { "left-parenthesis", '(' },
+ { "right-parenthesis", ')' },
+ { "asterisk", '*' },
+ { "plus-sign", '+' },
+ { "comma", ',' },
+ { "hyphen", '-' },
+ { "hyphen-minus", '-' },
+ { "period", '.' },
+ { "full-stop", '.' },
+ { "slash", '/' },
+ { "solidus", '/' },
+ { "zero", '0' },
+ { "one", '1' },
+ { "two", '2' },
+ { "three", '3' },
+ { "four", '4' },
+ { "five", '5' },
+ { "six", '6' },
+ { "seven", '7' },
+ { "eight", '8' },
+ { "nine", '9' },
+ { "colon", ':' },
+ { "semicolon", ';' },
+ { "less-than-sign", '<' },
+ { "equals-sign", '=' },
+ { "greater-than-sign", '>' },
+ { "question-mark", '?' },
+ { "commercial-at", '@' },
+ { "left-square-bracket", '[' },
+ { "backslash", '\\' },
+ { "reverse-solidus", '\\' },
+ { "right-square-bracket", ']' },
+ { "circumflex", '^' },
+ { "circumflex-accent", '^' },
+ { "underscore", '_' },
+ { "low-line", '_' },
+ { "grave-accent", '`' },
+ { "left-brace", '{' },
+ { "left-curly-bracket", '{' },
+ { "vertical-line", '|' },
+ { "right-brace", '}' },
+ { "right-curly-bracket", '}' },
+ { "tilde", '~' },
+ { "DEL", '\177' },
+ { NULL, 0 }
+};
diff --git a/lib/Support/regcomp.c b/lib/Support/regcomp.c
new file mode 100644
index 0000000000..4fd88b1249
--- /dev/null
+++ b/lib/Support/regcomp.c
@@ -0,0 +1,1524 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdlib.h>
+#include "regex_impl.h"
+
+#include "regutils.h"
+#include "regex2.h"
+
+#include "regcclass.h"
+#include "regcname.h"
+
+/*
+ * parse structure, passed up and down to avoid global variables and
+ * other clumsinesses
+ *
+ * llvm_regcomp() keeps one instance on its stack for the duration of a
+ * compile. next/end delimit the part of the pattern still to be read;
+ * strip/ssize/slen describe the output buffer of compiled operators.
+ * The parser macros defined below in this file reach this structure
+ * through a local variable that must be named `p'.
+ */
+struct parse {
+ char *next; /* next character in RE */
+ char *end; /* end of string (-> NUL normally) */
+ int error; /* has an error been seen? */
+ sop *strip; /* malloced strip */
+ sopno ssize; /* malloced strip size (allocated) */
+ sopno slen; /* malloced strip length (used) */
+ int ncsalloc; /* number of csets allocated */
+ struct re_guts *g; /* regex internals being filled in; set by llvm_regcomp() */
+# define NPAREN 10 /* we need to remember () 1-9 for back refs */
+ sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
+ sopno pend[NPAREN]; /* -> ) ([0] unused) */
+};
+
+static void p_ere(struct parse *, int);
+static void p_ere_exp(struct parse *);
+static void p_str(struct parse *);
+static void p_bre(struct parse *, int, int);
+static int p_simp_re(struct parse *, int);
+static int p_count(struct parse *);
+static void p_bracket(struct parse *);
+static void p_b_term(struct parse *, cset *);
+static void p_b_cclass(struct parse *, cset *);
+static void p_b_eclass(struct parse *, cset *);
+static char p_b_symbol(struct parse *);
+static char p_b_coll_elem(struct parse *, int);
+static char othercase(int);
+static void bothcases(struct parse *, int);
+static void ordinary(struct parse *, int);
+static void nonnewline(struct parse *);
+static void repeat(struct parse *, sopno, int, int);
+static int seterr(struct parse *, int);
+static cset *allocset(struct parse *);
+static void freeset(struct parse *, cset *);
+static int freezeset(struct parse *, cset *);
+static int firstch(struct parse *, cset *);
+static int nch(struct parse *, cset *);
+static void mcadd(struct parse *, cset *, const char *);
+static void mcinvert(struct parse *, cset *);
+static void mccase(struct parse *, cset *);
+static int isinsets(struct re_guts *, int);
+static int samesets(struct re_guts *, int, int);
+static void categorize(struct parse *, struct re_guts *);
+static sopno dupl(struct parse *, sopno, sopno);
+static void doemit(struct parse *, sop, size_t);
+static void doinsert(struct parse *, sop, size_t, sopno);
+static void dofwd(struct parse *, sopno, sop);
+static void enlarge(struct parse *, sopno);
+static void stripsnug(struct parse *, struct re_guts *);
+static void findmust(struct parse *, struct re_guts *);
+static sopno pluscount(struct parse *, struct re_guts *);
+
+static char nuls[10]; /* place to point scanner in event of error */
+
+/*
+ * macros for use with parse structure
+ * BEWARE: these know that the parse structure is named `p' !!!
+ *
+ * Three groups follow:
+ *  - input cursor: PEEK/PEEK2 read without consuming, MORE/MORE2 test
+ *    against p->end, SEE/SEETWO combine both, EAT/EATTWO consume on a
+ *    match and yield 1 or 0, NEXT/NEXT2/NEXTn/GETNEXT advance p->next.
+ *  - error reporting: SETERROR forwards to seterr(); REQUIRE and the
+ *    MUST* forms call it only when their condition is false.
+ *  - output strip: EMIT/INSERT/AHEAD/ASTERN forward to doemit(),
+ *    doinsert() and dofwd(); HERE/THERE/THERETHERE/DROP are arithmetic
+ *    on p->slen.
+ * PEEK, PEEK2 and GETNEXT do no bounds check of their own; callers are
+ * expected to test MORE()/MORE2() first.
+ */
+#define PEEK() (*p->next)
+#define PEEK2() (*(p->next+1))
+#define MORE() (p->next < p->end)
+#define MORE2() (p->next+1 < p->end)
+#define SEE(c) (MORE() && PEEK() == (c))
+#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
+#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
+#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
+#define NEXT() (p->next++)
+#define NEXT2() (p->next += 2)
+#define NEXTn(n) (p->next += (n))
+#define GETNEXT() (*p->next++)
+#define SETERROR(e) seterr(p, (e))
+#define REQUIRE(co, e) (void)((co) || SETERROR(e)) /* short-circuit: seterr() runs only if co is false */
+#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
+#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) /* note: consumes a character even when it is not c */
+#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
+#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
+#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
+#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
+#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) /* operand is the distance from pos to the current end */
+#define HERE() (p->slen)
+#define THERE() (p->slen - 1)
+#define THERETHERE() (p->slen - 2)
+#define DROP(n) (p->slen -= (n))
+
+#ifdef _POSIX2_RE_DUP_MAX
+#define DUPMAX _POSIX2_RE_DUP_MAX
+#else
+#define DUPMAX 255
+#endif
+
+#ifndef NDEBUG
+static int never = 0; /* for use in asserts; shuts lint up */
+#else
+#define never 0 /* some <assert.h>s have bugs too */
+#endif
+
+/*
+ - llvm_regcomp - interface for parser and compilation
+ *
+ * Compiles `pattern' into `preg'.
+ *
+ * cflags selects the dialect: REG_EXTENDED runs p_ere(), REG_NOSPEC runs
+ * p_str(), otherwise p_bre() is used. REG_EXTENDED together with
+ * REG_NOSPEC is rejected with REG_INVARG. With REG_PEND the pattern
+ * length is taken from preg->re_endp (which must not precede `pattern')
+ * instead of strlen(), so the pattern need not be NUL-terminated.
+ *
+ * Returns 0 on success, after storing re_nsub, re_g and re_magic in preg.
+ * Returns REG_INVARG or REG_ESPACE from the early checks without touching
+ * preg. If the parser recorded an error, llvm_regfree(preg) is called and
+ * that error code is returned.
+ */
+int /* 0 success, otherwise REG_something */
+llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
+{
+ struct parse pa;
+ struct re_guts *g;
+ struct parse *p = &pa;
+ int i;
+ size_t len;
+#ifdef REDEBUG
+# define GOODFLAGS(f) (f)
+#else
+# define GOODFLAGS(f) ((f)&~REG_DUMP)
+#endif
+
+ cflags = GOODFLAGS(cflags);
+ if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
+ return(REG_INVARG);
+
+ if (cflags&REG_PEND) {
+ if (preg->re_endp < pattern)
+ return(REG_INVARG);
+ len = preg->re_endp - pattern;
+ } else
+ len = strlen((const char *)pattern);
+
+ /* do the mallocs early so failure handling is easy */
+ g = (struct re_guts *)malloc(sizeof(struct re_guts) +
+ (NC-1)*sizeof(cat_t));
+ if (g == NULL)
+ return(REG_ESPACE);
+ p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
+ p->strip = (sop *)calloc(p->ssize, sizeof(sop));
+ p->slen = 0;
+ if (p->strip == NULL) {
+ free((char *)g);
+ return(REG_ESPACE);
+ }
+
+ /* set things up */
+ p->g = g;
+ p->next = (char *)pattern; /* convenience; we do not modify it */
+ p->end = p->next + len;
+ p->error = 0;
+ p->ncsalloc = 0;
+ for (i = 0; i < NPAREN; i++) {
+ p->pbegin[i] = 0;
+ p->pend[i] = 0;
+ }
+ g->csetsize = NC;
+ g->sets = NULL;
+ g->setbits = NULL;
+ g->ncsets = 0;
+ g->cflags = cflags;
+ g->iflags = 0;
+ g->nbol = 0;
+ g->neol = 0;
+ g->must = NULL;
+ g->mlen = 0;
+ g->nsub = 0;
+ g->ncategories = 1; /* category 0 is "everything else" */
+ g->categories = &g->catspace[-(CHAR_MIN)];
+ (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
+ g->backrefs = 0;
+
+ /* do it */
+ EMIT(OEND, 0);
+ g->firststate = THERE();
+ if (cflags&REG_EXTENDED)
+ p_ere(p, OUT);
+ else if (cflags&REG_NOSPEC)
+ p_str(p);
+ else
+ p_bre(p, OUT, OUT);
+ EMIT(OEND, 0);
+ g->laststate = THERE();
+
+ /* tidy up loose ends and fill things in */
+ categorize(p, g);
+ stripsnug(p, g);
+ findmust(p, g);
+ g->nplus = pluscount(p, g);
+ g->magic = MAGIC2;
+ preg->re_nsub = g->nsub;
+ preg->re_g = g;
+ preg->re_magic = MAGIC1;
+#ifndef REDEBUG
+ /* not debugging, so can't rely on the assert() in llvm_regexec() */
+ if (g->iflags&REGEX_BAD)
+ SETERROR(REG_ASSERT);
+#endif
+
+ /* win or lose, we're done */
+ if (p->error != 0) /* lose */
+ llvm_regfree(preg);
+ return(p->error);
+}
+
+/*
+ - p_ere - ERE parser top level, concatenation and alternation
+ *
+ * Parses alternatives separated by '|' until the input ends or the `stop'
+ * character is seen. Each alternative is a run of p_ere_exp() items and
+ * must be non-empty (REG_EMPTY otherwise). When at least one '|' is
+ * present, the alternatives are wrapped in OCH_ ... OOR1/OOR2 ... O_CH,
+ * with the forward/backward offsets patched as each branch is closed.
+ */
+static void
+p_ere(struct parse *p, int stop)	/* character this ERE should end at */
+{
+	char c;
+	/*
+	 * Both are assigned from conc on the first '|' before they are ever
+	 * read. Start them at 0 instead of the former self-initialization
+	 * (x = x), which read an indeterminate value just to quiet a warning.
+	 */
+	sopno prevback = 0;
+	sopno prevfwd = 0;
+	sopno conc;
+	int first = 1;		/* is this the first alternative? */
+
+	for (;;) {
+		/* do a bunch of concatenated expressions */
+		conc = HERE();
+		while (MORE() && (c = PEEK()) != '|' && c != stop)
+			p_ere_exp(p);
+		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
+
+		if (!EAT('|'))
+			break;		/* NOTE BREAK OUT */
+
+		if (first) {
+			INSERT(OCH_, conc);	/* offset is wrong */
+			prevfwd = conc;
+			prevback = conc;
+			first = 0;
+		}
+		ASTERN(OOR1, prevback);
+		prevback = THERE();
+		AHEAD(prevfwd);			/* fix previous offset */
+		prevfwd = HERE();
+		EMIT(OOR2, 0);			/* offset is very wrong */
+	}
+
+	if (!first) {		/* tail-end fixups */
+		AHEAD(prevfwd);
+		ASTERN(O_CH, prevback);
+	}
+
+	assert(!MORE() || SEE(stop));
+}
+
+/*
+ - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
+ *
+ * Consumes one atom (group, anchor, '.', bracket expression, escaped or
+ * ordinary character) and emits its operators. If the atom is followed by
+ * '*', '+', '?' or a '{' that starts a bound (a '{' directly followed by a
+ * digit), the operators emitted since `pos' are wrapped accordingly. A
+ * repetition applied to '^', or two repetition operators in a row, is
+ * reported as REG_BADRPT.
+ *
+ * The caller must guarantee MORE() on entry.
+ */
+static void
+p_ere_exp(struct parse *p)
+{
+	char c;
+	sopno pos;
+	int count;
+	int count2;
+	sopno subno;
+	int wascaret = 0;
+
+	assert(MORE());		/* caller should have ensured this */
+	c = GETNEXT();
+
+	pos = HERE();
+	switch (c) {
+	case '(':
+		REQUIRE(MORE(), REG_EPAREN);
+		p->g->nsub++;
+		subno = p->g->nsub;
+		/* only the first NPAREN-1 groups are remembered for back refs */
+		if (subno < NPAREN)
+			p->pbegin[subno] = HERE();
+		EMIT(OLPAREN, subno);
+		if (!SEE(')'))
+			p_ere(p, ')');
+		if (subno < NPAREN) {
+			p->pend[subno] = HERE();
+			assert(p->pend[subno] != 0);
+		}
+		EMIT(ORPAREN, subno);
+		MUSTEAT(')', REG_EPAREN);
+		break;
+#ifndef POSIX_MISTAKE
+	case ')':		/* happens only if no current unmatched ( */
+		/*
+		 * You may ask, why the ifndef?  Because I didn't notice
+		 * this until slightly too late for 1003.2, and none of the
+		 * other 1003.2 regular-expression reviewers noticed it at
+		 * all.  So an unmatched ) is legal POSIX, at least until
+		 * we can get it fixed.
+		 */
+		SETERROR(REG_EPAREN);
+		break;
+#endif
+	case '^':
+		EMIT(OBOL, 0);
+		p->g->iflags |= USEBOL;
+		p->g->nbol++;
+		wascaret = 1;
+		break;
+	case '$':
+		EMIT(OEOL, 0);
+		p->g->iflags |= USEEOL;
+		p->g->neol++;
+		break;
+	case '|':
+		SETERROR(REG_EMPTY);
+		break;
+	case '*':
+	case '+':
+	case '?':
+		SETERROR(REG_BADRPT);
+		break;
+	case '.':
+		if (p->g->cflags&REG_NEWLINE)
+			nonnewline(p);
+		else
+			EMIT(OANY, 0);
+		break;
+	case '[':
+		p_bracket(p);
+		break;
+	case '\\':
+		REQUIRE(MORE(), REG_EESCAPE);
+		c = GETNEXT();
+		ordinary(p, c);
+		break;
+	case '{':		/* okay as ordinary except if digit follows */
+		REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
+		/* FALLTHROUGH */
+	default:
+		ordinary(p, c);
+		break;
+	}
+
+	if (!MORE())
+		return;
+	c = PEEK();
+	/* we call { a repetition if followed by a digit */
+	if (!( c == '*' || c == '+' || c == '?' ||
+				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
+		return;		/* no repetition, we're done */
+	NEXT();
+
+	REQUIRE(!wascaret, REG_BADRPT);
+	switch (c) {
+	case '*':	/* implemented as +? */
+		/* this case does not require the (y|) trick, noKLUDGE */
+		INSERT(OPLUS_, pos);
+		ASTERN(O_PLUS, pos);
+		INSERT(OQUEST_, pos);
+		ASTERN(O_QUEST, pos);
+		break;
+	case '+':
+		INSERT(OPLUS_, pos);
+		ASTERN(O_PLUS, pos);
+		break;
+	case '?':
+		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
+		INSERT(OCH_, pos);		/* offset slightly wrong */
+		ASTERN(OOR1, pos);		/* this one's right */
+		AHEAD(pos);			/* fix the OCH_ */
+		EMIT(OOR2, 0);			/* offset very wrong... */
+		AHEAD(THERE());			/* ...so fix it */
+		ASTERN(O_CH, THERETHERE());
+		break;
+	case '{':
+		count = p_count(p);
+		if (EAT(',')) {
+			/*
+			 * Check MORE() before peeking: with REG_PEND the
+			 * pattern need not be NUL-terminated, so a pattern
+			 * ending in "{n," would otherwise read one byte
+			 * past its end.
+			 */
+			if (MORE() && isdigit((uch)PEEK())) {
+				count2 = p_count(p);
+				REQUIRE(count <= count2, REG_BADBR);
+			} else		/* single number with comma */
+				count2 = INFINITY;
+		} else		/* just a single number */
+			count2 = count;
+		repeat(p, pos, count, count2);
+		if (!EAT('}')) {	/* error heuristics */
+			while (MORE() && PEEK() != '}')
+				NEXT();
+			REQUIRE(MORE(), REG_EBRACE);
+			SETERROR(REG_BADBR);
+		}
+		break;
+	}
+
+	/* a second repetition operator directly after the first is an error */
+	if (!MORE())
+		return;
+	c = PEEK();
+	if (!( c == '*' || c == '+' || c == '?' ||
+				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
+		return;
+	SETERROR(REG_BADRPT);
+}
+
+/*
+ - p_str - string (no metacharacters) "parser"
+ *
+ * Used for REG_NOSPEC patterns: every remaining character of the pattern
+ * is emitted as an ordinary character. An empty pattern is REG_EMPTY.
+ */
+static void
+p_str(struct parse *p)
+{
+	if (!MORE())
+		SETERROR(REG_EMPTY);
+
+	for (; MORE(); ) {
+		char ch = GETNEXT();
+
+		ordinary(p, ch);
+	}
+}
+
+/*
+ - p_bre - BRE parser top level, anchoring and concatenation
+ * Passing OUT for end1 effectively disables the end1/end2 terminator test.
+ *
+ * A trailing $ is handled with a small kludge: it is first parsed as an
+ * ordinary character and only afterwards rewritten into an end-of-line
+ * anchor. The one unwanted consequence is that '$' ends up registered as
+ * a character category in that situation, which is harmless and not worth
+ * the considerable lookahead that avoiding it would take.
+ */
+static void
+p_bre(struct parse *p,
+    int end1,		/* first terminating character */
+    int end2)		/* second terminating character */
+{
+	const sopno origin = HERE();	/* strip position on entry */
+	int atstart = 1;		/* no simple RE parsed yet? */
+	int trailingdollar = 0;		/* last simple RE was a bare $ */
+
+	/* optional leading anchor */
+	if (EAT('^')) {
+		EMIT(OBOL, 0);
+		p->g->nbol++;
+		p->g->iflags |= USEBOL;
+	}
+
+	/* concatenation of simple REs up to the end1/end2 pair */
+	for (; MORE() && !SEETWO(end1, end2); atstart = 0)
+		trailingdollar = p_simp_re(p, atstart);
+
+	if (trailingdollar) {
+		/* replace the ordinary '$' just emitted by an anchor */
+		DROP(1);
+		EMIT(OEOL, 0);
+		p->g->neol++;
+		p->g->iflags |= USEEOL;
+	}
+
+	/* an empty BRE is not allowed */
+	if (HERE() == origin)
+		SETERROR(REG_EMPTY);
+}
+
+/*
+ - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
+ */
+static int /* was the simple RE an unbackslashed $? */
+p_simp_re(struct parse *p,
+ int starordinary) /* is a leading * an ordinary character? */
+{
+ int c;
+ int count;
+ int count2;
+ sopno pos;
+ int i;
+ sopno subno;
+# define BACKSL (1<<CHAR_BIT)
+
+ pos = HERE(); /* repetion op, if any, covers from here */
+
+ assert(MORE()); /* caller should have ensured this */
+ c = GETNEXT();
+ if (c == '\\') {
+ REQUIRE(MORE(), REG_EESCAPE);
+ c = BACKSL | GETNEXT();
+ }
+ switch (c) {
+ case '.':
+ if (p->g->cflags&REG_NEWLINE)
+ nonnewline(p);
+ else
+ EMIT(OANY, 0);
+ break;
+ case '[':
+ p_bracket(p);
+ break;
+ case BACKSL|'{':
+ SETERROR(REG_BADRPT);
+ break;
+ case BACKSL|'(':
+ p->g->nsub++;
+ subno = p->g->nsub;
+ if (subno < NPAREN)
+ p->pbegin[subno] = HERE();
+ EMIT(OLPAREN, subno);
+ /* the MORE here is an error heuristic */
+ if (MORE() && !SEETWO('\\', ')'))
+ p_bre(p, '\\', ')');
+ if (subno < NPAREN) {
+ p->pend[subno] = HERE();
+ assert(p->pend[subno] != 0);
+ }
+ EMIT(ORPAREN, subno);
+ REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
+ break;
+ case BACKSL|')': /* should not get here -- must be user */
+ case BACKSL|'}':
+ SETERROR(REG_EPAREN);
+ break;
+ case BACKSL|'1':
+ case BACKSL|'2':
+ case BACKSL|'3':
+ case BACKSL|'4':
+ case BACKSL|'5':
+ case BACKSL|'6':
+ case BACKSL|'7':
+ case BACKSL|'8':
+ case BACKSL|'9':
+ i = (c&~BACKSL) - '0';
+ assert(i < NPAREN);
+ if (p->pend[i] != 0) {
+ assert(i <= p->g->nsub);
+ EMIT(OBACK_, i);
+ asse