aboutsummaryrefslogtreecommitdiff
path: root/treexpr.c
diff options
context:
space:
mode:
authorDavid Barksdale <amatus.amongus@gmail.com>2006-01-12 04:24:02 +0000
committerDavid Barksdale <amatus.amongus@gmail.com>2006-01-12 04:24:02 +0000
commitaeb110d90df145ac99009c1a7b9e3077d6b457a6 (patch)
tree165a254743e0bdc143eabbd74520a4e90f93a9e1 /treexpr.c
Initial revision
Diffstat (limited to 'treexpr.c')
-rw-r--r--treexpr.c1145
1 files changed, 1145 insertions, 0 deletions
diff --git a/treexpr.c b/treexpr.c
new file mode 100644
index 0000000..eb68f2e
--- /dev/null
+++ b/treexpr.c
@@ -0,0 +1,1145 @@
+/* treexpr.c - Tree expression language source file
+ + Copyright (C) 2005 Dell, Inc.
+ + Authors: David Barksdale <amatus@ocgnet.org>
+ +
+ + This library is free software; you can redistribute it and/or
+ + modify it under the terms of the GNU Lesser General Public
+ + License as published by the Free Software Foundation; either
+ + version 2.1 of the License, or (at your option) any later version.
+ +
+ + This library is distributed in the hope that it will be useful,
+ + but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ + Lesser General Public License for more details.
+ +
+ + You should have received a copy of the GNU Lesser General Public
+ + License along with this library; if not, write to the Free Software
+ + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+-- Syntax --
+ Expr ::= Term | Expr "|" Term
+ Term ::= Factor | Term Factor
+ Factor ::= Symbol | Symbol "*" | "~" | "(" Expr ")" | "(" Expr ")" "*"
+ | Symbol "->" Expr | Symbol ":" String | Symbol Attrs | Symbol Attrs "->" Expr
+ Attrs ::= "<" Attr* ">"
+ Attr ::= Symbol | Symbol "=" String
+ Symbol ::= any alphanumeric string with '_' and '-'
+ String ::= quoted string (skips over \")
+
+-- Semantics --
+A tree can be thought of as a hierarchical list, each element of the list can have a list of
+children. Consider the following tree:
+
+ html
+ / \
+ head body
+ / / \
+title h1 p
+
+root list: html
+list of html's children: head body
+list of head's children: title
+list of body's children: h1 p
+list of h1's chlidren: ~ (where ~ denotes the empty list)
+list of p's children: ~
+
+By the same token, a tree expression can be thought of as a hierarchical regular expression,
+each symbol in the expression can have a child expression. The "->" operator binds a child
+expression to a symbol in the parent expression. If a symbol has no child expression it is
+equivalent to having a child expression that matches everything. In other words the children
+of a symbol are ignored if there is no child expression. It can also be said that "->" adds
+a restriction to the symbol on it's left-hand side: the symbol will only be matched if the
+symbol's children match the child expression. Other restriction operators are discussed later to
+improve matching of HTML. Consider the following expressions:
+
+ These match the tree above:
+ html - matches the root list and ignores html's children
+ html -> head body - also matches html's children and ignores head and body's children
+ html -> (head -> title) body - also matches head's children and ignores title's children
+ html -> (head -> title -> ~) body - also matches title's children
+ html -> (head -> title -> ~) (body -> h1 p) - also matches body's children
+ html -> (head -> title -> ~) (body -> (h1 -> ~) (p -> ~)) - matches the tree exactly
+
+ These do not match the tree above:
+ body - does not match root list
+ html -> body - matches root list but does not match html's children (must match all)
+ html -> body head - matches root list but html's children are out of order
+
+Note the grouping around "->". In the second expression "->" binds the symbol "html" to the
+expression "head body". In the third expression we want the symbol "head" to be bound to the
+expression "title" and not "title body" so we must group them together using parenthesis.
+
+Tree expressions use the alternation "|" and closure "*" operators from reqular expressions.
+Consider the following expressions:
+ html* - matches the root list because it consists of zero or more "html" symbols
+ html | xml - matches the root list because it consists of "html" or "xml"
+ html -> (head -> title | title meta) body - matches
+
+In addition to the "~" special symbol there is "." which matches any symbol:
+ html -> .* - matches the root list and html's children because html has zero or more
+ html -> .* (body -> .* p .*) .* - matches a tree that has "html" at the root, html has
+ atleast one child named "body", and body has atleast
+ one child named "p".
+
+-- Extra restriction operators for HTML matching --
+
+The ":" operator binds a regular expression to a symbol, the symbol matches only if it's
+contents matches the regular expression. In HTML the only tags that have contents are "text"
+and "comment". So most of the time it will be used like:
+ p -> text:"ab*c" - matches the HTML '<p>abbbbbc</p>'
+
+The "<", "=", and ">" operators are used together to bind a symbol to an attribute list,
+the symbol matches only if every attribute matches an attribute of the HTML tag. An attribute
+restriction is a list of name and value pairs. A name without a value is matched only if it
+appears in the HTML without a value. Consider the following:
+ table <bgcolor="blue"> - matches the HTML '<table bgcolor="blue">'
+ and also the HTML '<table bgcolor="blue" border="1">'
+ foo <bar> - matches the HTML '<foo bar>' but not '<foo bar="">' or '<foo bar="baz">'
+ foo <bar> | foo <bar=".*"> - matches all three
+
+It is possible to combine the attribute restriction operators with the "->" operator (for example
+option <selected> -> text:"blue"), but it is not possible to combine the content restriction
+operator with the "->" operator (for example text:"blue" -> br) because "text" and "comment"
+symbols never have children.
+
+-- Extracting values from an HTML page --
+This section has been up for debate because it can be very confusing. The way we extract strings
+from the tree expressions is from the contents and attribute rescrictions. These restrictions
+contain regular expressions that match against some of the data contained in an HTML page. Most
+regular expression libraries allow you to use something called back references where each
+parenthasized sub-expression is given a number and the string matching that sub-expression can be
+referenced later. For example the expression "foo" has no sub-expressions, the expression "fo(o*)"
+has one sub-expression, namely "o*". When "fo(o*)" is matched against "fooo", the fist
+sub-expression references the string "oo". The sub-expressions are numbered by the position of
+their open parenthesis. For example "(b(a)(r))" has three sub-expressions, 1. "b(a)(r)" 2. "a"
+3. "r". If a sub-expression matches multiple strings then the last match is what is referenced.
+For example "(ab|ac)*" matches "ababac" and the string referenced will be "ac".
+
+A tree expression can have several regular expressions embedded in it. In order to find a specific
+sub-expression they are numbered in the order they appear in the tree expression. For example:
+ form -> input<value="([0-9]+)"> text:"."
+ input<value="([0-9]+)"> text:"."
+ input<value="([0-9]+)"> text:"."
+ input<value="([0-9]+)"> input
+has four sub-expressions. This tree expression would match against an HTML form that might be used
+for changing an IP address:
+<form method="POST" action="/cgi-bin/setip">
+ <input type="text" name="first" value="192"> .
+ <input type="text" name="second" value="168"> .
+ <input type="text" name="third" value="1"> .
+ <input type="text" name="fourth" value="42"> <input type="submit" value="Set IP">
+</form>
+In this case the extracted values will be:
+1: "192"
+2: "168"
+3: "1"
+4: "42"
+It's simple to specify a method for combining the strings together to form a usefull output.
+For example something like "\1.\2.\3.\4" would produce "192.168.1.42".
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <libxml/tree.h>
+#include <sys/types.h>
+#include "regex.h"
+#include "treexpr.h"
+
+#ifdef _WINDOWS
+#define strcasecmp stricmp
+#endif
+
+// allocator that never returns NULL, cheap hack to make code shorter
+void *zalloc( size_t size )
+{
+ void *p;
+
+ do { p = calloc( size, 1 );
+ } while( p == NULL );
+ return p;
+}
+
+// builds a machine that matches a symbol
+struct machine *symbol( char *name )
+{
+ struct machine *m;
+ struct state *start, *final;
+ struct trans *tr;
+
+ // alloc stuff
+ m = zalloc( sizeof( struct machine ));
+ start = zalloc( sizeof( struct state ));
+ final = zalloc( sizeof( struct state ));
+ tr = zalloc( sizeof( struct trans ));
+
+ // insert transition
+ tr->name = name;
+ tr->st = final;
+ start->tr = tr;
+
+ // maintain linked list
+ start->next = final;
+
+ // finish machine
+ m->start = start;
+ m->final = final;
+ return m;
+}
+
+// builds a machine that matches the empty string
+struct machine *epsilon( void )
+{
+ struct machine *m;
+ struct state *start, *final;
+ struct epsilon *cat;
+
+ // alloc stuff
+ m = zalloc( sizeof( struct machine ));
+ start = zalloc( sizeof( struct state ));
+ final = zalloc( sizeof( struct state ));
+ cat = zalloc( sizeof( struct epsilon ));
+
+ // insert epsilon transition
+ cat->st = final;
+ start->ep = cat;
+
+ // maintain liked list
+ start->next = final;
+
+ // finish machine
+ m->start = start;
+ m->final = final;
+ return m;
+}
+
+// builds a machine that matches nothing (not used at present, but here for completeness)
+struct machine *null( void )
+{
+ struct machine *m;
+ struct state *start, *final;
+
+ // alloc stuff
+ m = zalloc( sizeof( struct machine ));
+ start = zalloc( sizeof( struct state ));
+ final = zalloc( sizeof( struct state ));
+
+ // maintain linked list
+ start->next = final;
+
+ // finish machine
+ m->start = start;
+ m->final = final;
+ return m;
+}
+
+// concatanates two machines
+struct machine *concat( struct machine *a, struct machine *b )
+{
+ struct epsilon *cat;
+
+ if( a == NULL || b == NULL )
+ return NULL;
+
+ // alloc stuff
+ cat = zalloc( sizeof( struct epsilon ));
+
+ // insert epsilon transition
+ cat->st = b->start;
+ cat->next = a->final->ep;
+ a->final->ep = cat;
+
+ // maintain linked list
+ a->final->next = b->start;
+
+ // use machine 'a' as our new machine and free 'b'
+ a->final = b->final;
+ free( b );
+ return a;
+}
+
+// alternates two machines (spike)
+struct machine *alternate( struct machine *a, struct machine *b )
+{
+ struct state *start, *final;
+ struct epsilon *s1, *s2, *f1, *f2;
+
+ if( a == NULL || b == NULL )
+ return NULL;
+
+ // alloc stuff
+ start = zalloc( sizeof( struct state ));
+ final = zalloc( sizeof( struct state ));
+ s1 = zalloc( sizeof( struct epsilon ));
+ s2 = zalloc( sizeof( struct epsilon ));
+ f1 = zalloc( sizeof( struct epsilon ));
+ f2 = zalloc( sizeof( struct epsilon ));
+
+ // insert epsilon transitions from our new start state
+ // to the start states of the two machines
+ s1->st = a->start;
+ s2->st = b->start;
+ s1->next = s2;
+ start->ep = s1;
+
+ // insert epsilon transitions from the final states of the
+ // new machines to our new final state
+ f1->st = final;
+ f2->st = final;
+ f1->next = a->final->ep;
+ a->final->ep = f1;
+ f2->next = b->final->ep;
+ b->final->ep = f2;
+
+ // maintain linked list
+ start->next = a->start;
+ a->final->next = b->start;
+ b->final->next = final;
+
+ // use machine 'a' as our new machine and free 'b'
+ a->start = start;
+ a->final = final;
+ free( b );
+ return a;
+}
+
+// creates the closure of a machine (splat)
+struct machine *closure( struct machine *a )
+{
+ struct state *start, *final;
+ struct epsilon *s1, *s2, *f1, *f2;
+
+ if( a == NULL )
+ return NULL;
+
+ // alloc stuff
+ start = zalloc( sizeof( struct state ));
+ final = zalloc( sizeof( struct state ));
+ s1 = zalloc( sizeof( struct epsilon ));
+ s2 = zalloc( sizeof( struct epsilon ));
+ f1 = zalloc( sizeof( struct epsilon ));
+ f2 = zalloc( sizeof( struct epsilon ));
+
+ // insert epsilon transitions from our new start state to
+ // the machine's start state and to our new final state
+ s1->st = final;
+ s2->st = a->start;
+ s1->next = s2;
+ start->ep = s1;
+
+ // insert epsilon transitions from the machine's final state
+ // to the machine's start state and our new final state
+ f1->st = a->start;
+ f2->st = final;
+ f1->next = f2;
+ f2->next = a->final->ep;
+ a->final->ep = f1;
+
+ // maintain linked list
+ start->next = a->start;
+ a->final->next = final;
+
+ // use 'a' as our new machine
+ a->start = start;
+ a->final = final;
+ return a;
+}
+
+void free_machine( struct machine *m );
+
+// free each state in a linked list
+void free_states( struct state *sl )
+{
+ struct trans *tr;
+ struct epsilon *ep, *epn;
+ struct state *next;
+ struct attribute *at, *atn;
+
+ for( ; sl != NULL; sl = next )
+ {
+ next = sl->next;
+
+ // free each transition off this state
+ tr = sl->tr;
+ if( tr != NULL )
+ {
+ // free all the junk that can hang off of a transition
+ free_machine( tr->ptr );
+ free( tr->name );
+ if( tr->re.re_magic != 0 )
+ notbuiltin_regfree( &tr->re );
+ for( at = tr->attrs; at != NULL; at = atn )
+ {
+ atn = at->next;
+ free( at->name );
+ if( at->re.re_magic != 0 )
+ notbuiltin_regfree( &at->re );
+ free( at );
+ }
+ free( tr );
+ }
+
+ // free each epsilon transition (easy)
+ for( ep = sl->ep; ep != NULL; ep = epn )
+ {
+ epn = ep->next;
+ free( ep );
+ }
+
+ // finally free the state
+ free( sl );
+ }
+}
+
+void free_machine( struct machine *m )
+{
+ int i;
+
+ if( m == NULL )
+ return;
+
+ // free every state in the machine
+ free_states( m->start );
+
+ // if we have an E function free it
+ if( m->E != NULL )
+ {
+ for( i = 0; i < m->states; i++ )
+ free( m->E[i] );
+ free( m->E );
+ }
+
+ // free bitmasks
+ if( m->cur_state != NULL )
+ free( m->cur_state );
+ if( m->next_state != NULL )
+ free( m->next_state );
+
+ // finally free the machine
+ free( m );
+}
+
+/*
+ * Tokenizer
+ */
+struct token
+{
+ int t;
+ char *name;
+};
+
+#define T_EOL ( -2 ) // not really paid attention to
+#define T_ERROR ( -1 ) // tokenizing error, look at token->error and token->buf
+#define T_SYMBOL ( 0 ) // symbol
+#define T_SQUIGGLE ( 1 ) // ~
+#define T_WAX ( 2 ) // (
+#define T_WANE ( 3 ) // )
+#define T_SPIKE ( 4 ) // |
+#define T_SPLAT ( 5 ) // *
+#define T_PTR ( 6 ) // ->
+#define T_TWOSPOT ( 7 ) // :
+#define T_ANGLE ( 8 ) // <
+#define T_RIGHTANGLE ( 9 ) // >
+#define T_HALFMESH ( 10 ) // =
+#define T_STRING ( 11 ) // quoted string
+
+// parse the next token in a string and return a pointer into the string were we stoped
+const char *get_tok( const char *str, struct token *tk )
+{
+ static char *name;
+ const char *p;
+ int i, len = 0;
+
+ if( str == NULL )
+ {
+ tk->t = T_ERROR;
+ return NULL;
+ }
+
+ // skip whitespace
+ for( p = str; *p != 0 && isspace( *p ); p++ );
+
+ switch( *p )
+ {
+ case 0:
+ tk->t = T_EOL;
+ return p;
+ case '~':
+ tk->t = T_SQUIGGLE;
+ return p + 1;
+ case '(':
+ tk->t = T_WAX;
+ return p + 1;
+ case ')':
+ tk->t = T_WANE;
+ return p + 1;
+ case '|':
+ tk->t = T_SPIKE;
+ return p + 1;
+ case '*':
+ tk->t = T_SPLAT;
+ return p + 1;
+ case '-':
+ if( p[1] == '>' )
+ {
+ tk->t = T_PTR;
+ return p + 2;
+ }
+ tk->t = T_ERROR;
+ return p;
+ case ':':
+ tk->t = T_TWOSPOT;
+ return p + 1;
+ case '<':
+ tk->t = T_ANGLE;
+ return p + 1;
+ case '>':
+ tk->t = T_RIGHTANGLE;
+ return p + 1;
+ case '=':
+ tk->t = T_HALFMESH;
+ return p + 1;
+ case '\"':
+ p++;
+ len = 16;
+ name = zalloc( len );
+ for( i = 0; p[i] != 0 && p[i] != '\"'; i++ )
+ {
+ if( i + 1 >= len )
+ name = realloc( name, len *= 2 );
+ if( p[i] == '\\' && p[i + 1] == '\"' && i < 254 )
+ {
+ name[i] = p[i];
+ i++;
+ }
+ name[i] = p[i];
+ }
+ name[i] = 0;
+ tk->t = p[i] == 0 ? T_ERROR : T_STRING;
+ tk->name = name;
+ return p + i + ( p[i] == '\"' ? 1 : 0 );
+ case '.':
+ name = zalloc( 2 );
+ name[0] = '.';
+ name[1] = 0;
+ tk->t = T_SYMBOL;
+ tk->name = name;
+ return p + 1;
+ default:
+ len = 16;
+ name = zalloc( len );
+ for( i = 0; isalnum( p[i] ) || p[i] == '_' || p[i] == '-'; i++ )
+ {
+ if( i + 1 >= len )
+ name = realloc( name, len *= 2 );
+ name[i] = p[i];
+ }
+ name[i] = 0;
+ tk->t = i > 0 ? T_SYMBOL : T_ERROR;
+ tk->name = name;
+ return p + i;
+ }
+}
+
+const char *parse_treexpr( const char *expr, struct machine **m );
+
+// parses an attribute construction: <foo="bar" baz="quux" quuux>
+// the opening angle has already been consumed
+const char *parse_attrs( const char *expr, struct attribute **a )
+{
+ struct token tk;
+ struct attribute *prev = NULL, *attr;
+ const char *next;
+
+ // build a list of attributes in order
+ for( next = get_tok( expr, &tk ); tk.t == T_SYMBOL; )
+ {
+ // fist we grab the name
+ attr = zalloc( sizeof( struct attribute ));
+ if( prev == NULL )
+ prev = *a = attr;
+ else
+ prev = prev->next = attr;
+ attr->name = tk.name;
+
+ // check for optional =
+ next = get_tok( next, &tk );
+ if( tk.t != T_HALFMESH )
+ continue;
+
+ // grab regex and compile it
+ next = get_tok( next, &tk );
+ if( tk.t != T_STRING )
+ return NULL;
+ if( notbuiltin_regcomp( &attr->re, tk.name, REG_EXTENDED
+ | REG_ICASE ) != 0 )
+ {
+ notbuiltin_regfree( &attr->re );
+ attr->re.re_magic = 0;
+ return NULL;
+ }
+ free( tk.name );
+ next = get_tok( next, &tk );
+ }
+ if( tk.t == T_RIGHTANGLE )
+ return next;
+ return NULL;
+}
+
+// parse a tree expression "factor"
+// a factor is what I'm calling the operands to concatenation
+// so it's either a symbol (with optional restrictions), a parenthasized expression
+// or a factor with a *
+// this does not allow foolishness like "foo**" (equivalent to "foo*") or "~*" (equivalent to "~")
+// even though they are technically valid
+const char *parse_factor( const char *expr, struct machine **m )
+{
+ struct token tk;
+ struct machine *r;
+ const char *next, *cur;
+
+ // this token should either be a symbol, a ~, or a (
+ cur = get_tok( expr, &tk );
+ if( tk.t == T_ERROR )
+ {
+ *m = zalloc( sizeof( struct machine ));
+ ( *m )->error = "Tokenizing error";
+ ( *m )->buf = expr;
+ return NULL;
+ }
+ if( tk.t == T_SYMBOL )
+ {
+ // build a machine to match the symbol
+ *m = symbol( tk.name );
+ next = get_tok( cur, &tk );
+ if( tk.t == T_ERROR )
+ {
+ ( *m )->error = "Tokenizing error";
+ ( *m )->buf = cur;
+ return NULL;
+ }
+
+ // if there's a * build the closure
+ if( tk.t == T_SPLAT )
+ {
+ *m = closure( *m );
+ return next;
+ }
+
+ // if there's a -> then parse the expression on the rhs and add it as a restriction
+ if( tk.t == T_PTR )
+ {
+ next = parse_treexpr( next, &r );
+ ( *m )->start->tr->ptr = r;
+ if( next == NULL )
+ {
+ ( *m )->error = r->error;
+ ( *m )->buf = r->buf;
+ return NULL;
+ }
+ return next;
+ }
+
+ // if there's a : then compile the regex and add it as a restriction
+ if( tk.t == T_TWOSPOT )
+ {
+ cur = next;
+ next = get_tok( cur, &tk );
+ if( tk.t == T_ERROR )
+ {
+ ( *m )->error = "Tokenizing error";
+ ( *m )->buf = cur;
+ return NULL;
+ }
+ if( tk.t != T_STRING )
+ {
+ ( *m )->error = "Expecting a \"-delimited string";
+ ( *m )->buf = cur;
+ return NULL;
+ }
+ if( notbuiltin_regcomp( &( *m )->start->tr->re, tk.name,
+ REG_EXTENDED | REG_ICASE ) != 0 )
+ {
+ notbuiltin_regfree( &( *m )->start->tr->re );
+ ( *m )->start->tr->re.re_magic = 0;
+ ( *m )->error = "Error parsing regular expression";
+ ( *m )->buf = cur;
+ }
+ free( tk.name );
+ return next;
+ }
+
+ // if there's a < then parse attributes and add it as a restriction
+ if( tk.t == T_ANGLE )
+ {
+ next = parse_attrs( next, &( *m )->start->tr->attrs );
+ if( next == NULL )
+ {
+ ( *m )->error = "Expecting attribute list, ie <name=\"value\" name2=\"value2\">";
+ ( *m )->buf = cur;
+ return NULL;
+ }
+ cur = next;
+
+ // we also allow a -> restriction in this case
+ next = get_tok( cur, &tk );
+ if( tk.t == T_PTR )
+ {
+ next = parse_treexpr( next, &r );
+ ( *m )->start->tr->ptr = r;
+ if( next == NULL )
+ {
+ ( *m )->error = r->error;
+ ( *m )->buf = r->buf;
+ return NULL;
+ }
+ return next;
+ }
+ return cur;
+ }
+ return cur;
+ }
+ if( tk.t == T_SQUIGGLE )
+ {
+ // build epsilon machine
+ *m = epsilon( );
+ return cur;
+ }
+ if( tk.t == T_WAX )
+ {
+ // parse expression
+ cur = parse_treexpr( cur, m );
+ if( cur == NULL )
+ return NULL;
+
+ // make sure we end in )
+ next = get_tok( cur, &tk );
+ if( tk.t == T_ERROR )
+ {
+ ( *m )->error = "Tokenizing error";
+ ( *m )->buf = cur;
+ return NULL;
+ }
+ if( tk.t != T_WANE )
+ {
+ ( *m )->error = "Expected ')'";
+ ( *m )->buf = cur;
+ return NULL;
+ }
+
+ // check for * and build closure
+ cur = next;
+ next = get_tok( cur, &tk );
+ if( tk.t == T_ERROR )
+ {
+ ( *m )->error = "Tokenizing error";
+ ( *m )->buf = cur;
+ return NULL;
+ }
+ if( tk.t == T_SPLAT )
+ {
+ *m = closure( *m );
+ return next;
+ }
+ return cur;
+ }
+
+ // invalid symbol
+ *m = zalloc( sizeof( struct machine ));
+ ( *m )->error = "Expected symbol or '~' or '('";
+ ( *m )->buf = expr;
+ return NULL;
+}
+
+// parse a tree expression "term"
+// a term is what I'm calling the operands to alternation
+// a term is either a factor, or a list of factors concatenated together
+const char *parse_term( const char *expr, struct machine **m )
+{
+ struct token tk;
+ struct machine *r;
+ const char *cur;
+
+ // grab the first factor
+ cur = parse_factor( expr, m );
+ if( cur == NULL )
+ return NULL;
+
+ // grab zero or more factors
+ for( get_tok( cur, &tk ); tk.t == T_SYMBOL || tk.t == T_WAX || tk.t == T_SQUIGGLE;
+ get_tok( cur, &tk ))
+ {
+ cur = parse_factor( cur, &r );
+ if( cur == NULL )
+ {
+ ( *m )->error = r->error;
+ ( *m )->buf = r->buf;
+ free_machine( r );
+ return NULL;
+ }
+
+ // concatenate the factors
+ *m = concat( *m, r );
+ }
+ if( tk.t == T_ERROR )
+ {
+ ( *m )->error = "Tokenizing error";
+ ( *m )->buf = cur;
+ return NULL;
+ }
+ return cur;
+}
+
+// parse a tree expression
+// an expression is just a term or a list of terms seperated by |
+// this looks almost exactly like parse_term()
+const char *parse_treexpr( const char *expr, struct machine **m )
+{
+ struct token tk;
+ const char *next, *cur;
+ struct machine *r;
+
+ // grab the first term
+ cur = parse_term( expr, m );
+ if( cur == NULL )
+ return NULL;
+
+ // grab zero or more terms
+ for( next = get_tok( cur, &tk ); tk.t == T_SPIKE; next = get_tok( cur, &tk ))
+ {
+ cur = parse_term( next, &r );
+ if( cur == NULL )
+ {
+ ( *m )->error = r->error;
+ ( *m )->buf = r->buf;
+ free_machine( r );
+ return NULL;
+ }
+
+ // combine the terms using alternation
+ *m = alternate( *m, r );
+ }
+ if( tk.t == T_ERROR )
+ {
+ ( *m )->error = "Tokenizing error";
+ ( *m )->buf = cur;
+ return NULL;
+ }
+ return cur;
+}
+
+
+// process a regex restriction (basically just executes the regex)
+int regex_process( struct trans *tr, char *content )
+{
+ if( content == NULL )
+ return 0;
+ if( notbuiltin_regexec( &tr->re, content, RESUBR, tr->match, 0 ) == 0 )
+ {
+ tr->str = content;
+ return 1;
+ }
+ tr->str = NULL;
+ return 0;
+}
+
+// process an attribute restriction
+// it's important that we don't save the matches for all regexes until we know all of them
+// match. otherwise if you were trying to match <foo="bar" bar="baz"> and you had a list like
+// <foo="bar" bar="baz"> (both regexes match)
+// <foo="barr" bar="quux"> (the first one matches and overwrites the previous match for foo)
+// then you would be left with foo="barr" bar="baz" as your matches
+int attrs_process( struct trans *tr, struct _xmlAttr *properties )
+{
+ struct attribute *attr;
+ struct _xmlAttr *cur;
+ regmatch_t match[RESUBR];
+
+ // first pass makes sure each attribute matches
+ for( attr = tr->attrs; attr != NULL; attr = attr->next )
+ {
+ for( cur = properties; cur != NULL; cur = cur->next )
+ {
+ if( strcasecmp( attr->name, (char *)cur->name ) == 0 )
+ {
+ // if there's no value it will only match if we didn't specify a regex
+ if( cur->children == NULL )
+ {
+ if( attr->re.re_magic == 0 )
+ break;
+ return 0;
+ }
+ // otherwise the regex has to match the value
+ if( notbuiltin_regexec( &attr->re,
+ (char *)cur->children->content, RESUBR,
+ match, 0 ) == 0 )
+ break;
+ else
+ attr->str = NULL;
+ return 0;
+ }
+ }
+ if( cur == NULL )
+ return 0;
+ }
+ // second pass saves the matches
+ for( attr = tr->attrs; attr != NULL; attr = attr->next )
+ {
+ for( cur = properties; cur != NULL; cur = cur->next )
+ {
+ if( strcasecmp( attr->name, (char *)cur->name ) == 0 )
+ {
+ // if there's no value it will only match if we didn't specify a regex
+ if( cur->children == NULL )
+ {
+ if( attr->re.re_magic == 0 )
+ break;
+ return 0;
+ }
+ // otherwise the regex has to match the value
+ if( notbuiltin_regexec( &attr->re,
+ (char *)cur->children->content, RESUBR,
+ attr->match, 0 ) == 0 )
+ {
+ attr->str = (char *)cur->children->content;
+ break;
+ }
+ else
+ attr->str = NULL;
+ return 0;
+ }
+ }
+ if( cur == NULL )
+ return 0;
+ }
+ return 1;
+}
+
+// some handy macros:
+// number of bits in type pointed to by x
+#define B( x ) (sizeof(*(x))*8)
+
+// number of elements in array pointed to by x in order to contain n bits
+#define N( x, n ) (((n)+B(x)-1)/B(x))
+
+// test bit b of array pointed to by x
+#define TEST_BIT( x, b ) (((x)[(b)/B(x)]>>((b)%B(x)))&1)
+
+// set bit b of array pointed to by x
+#define SET_BIT( x, b ) ((x)[(b)/B(x)]|=1<<((b)%B(x)))
+
+// compute the disjunction of two bitfields of n bits
+#define OR( x, y, n ) {unsigned int _i;for(_i=0;_i<N(x,n);(x)[_i]|=(y)[_i],_i++);}
+
+// applies a machine to an xml tree
+// returns true iff the machine accepts
+// all matches to regexes are contained within the machine
+int tree_process( struct machine *m, xmlNodePtr node )
+{
+ int *e, done;
+ struct state *cur, *cur2;
+ struct trans *tr;
+ struct epsilon *ep;
+
+ if( m == NULL )
+ return 1;
+ if( m->E == NULL )
+ {
+ // Generate E function
+ // the E function is an array of bitmasks indexed by state number
+ // the bitmask represents the states you can reach by epsilon transitions
+
+ // number states
+ m->states = 0;
+ for( cur = m->start; cur != NULL; cur = cur->next )
+ cur->num = m->states++;
+
+ // fill E
+ m->E = zalloc( sizeof( *m->E ) * m->states );
+ for( cur = m->start; cur != NULL; cur = cur->next )
+ {
+ e = m->E[cur->num] = zalloc( N( *m->E, m->states ) * sizeof( **m->E ));
+
+ // we can reach ourself
+ SET_BIT( e, cur->num );
+
+ // try to add new states until an iteration of this loop makes no progress
+ done = 0;
+ while( !done )
+ {
+ done = 1;
+ // for each state that is already in the bitmask, add states you can reach
+ // by epsilon transitions
+ for( cur2 = m->start; cur2 != NULL; cur2 = cur2->next )
+ if( TEST_BIT( e, cur2->num ))
+ for( ep = cur2->ep; ep != NULL; ep = ep->next )
+ if( !TEST_BIT( e, ep->st->num ))
+ {
+ SET_BIT( e, ep->st->num );
+ done = 0;
+ }
+ }
+ }
+ }
+
+ // allocate current state and next state bitmasks
+ if( m->cur_state == NULL )
+ m->cur_state = zalloc( N( m->cur_state, m->states ) * sizeof( *m->cur_state ));
+ if( m->next_state == NULL )
+ m->next_state = zalloc( N( m->next_state, m->states ) * sizeof( *m->next_state ));
+
+ // our inital current state is E(start)
+ OR( m->cur_state, m->E[m->start->num], m->states );
+
+ // main loop, terminate when we run out of input or when there's no states in the cur_state bitmask
+ while( node != NULL )
+ {
+ unsigned int i;
+ int sum = 0;
+
+ // are we still alive?
+ for( i = 0; i < N( m->cur_state, m->states ); i++ )
+ sum |= m->cur_state[i];
+ if( sum == 0 ) break;
+
+ // zero out next_state and start adding states we can reach by normal transitions
+ memset( m->next_state, 0, N( m->next_state, m->states ) * sizeof( *m->next_state ));
+
+ // for each state in the cur_state bitmask we try a transition
+ for( cur = m->start; cur != NULL; cur = cur->next )
+ if( TEST_BIT( m->cur_state, cur->num ))
+ {
+ tr = cur->tr;
+ if( tr != NULL )
+ {
+ // first we must match the name and attributes
+ if( strcmp( tr->name, "." ) != 0 &&
+ ( node->name == NULL || strcasecmp( tr->name, (char *)node->name ) != 0 ))
+ continue;
+ if( tr->attrs != NULL && !attrs_process( tr, node->properties ))
+ continue;
+ // second we can match a machine and regexp
+ if( tr->ptr != NULL && !tree_process( tr->ptr, node->children ))
+ continue;
+ if( tr->re.re_magic != 0 && !regex_process( tr, (char *)node->content ))
+ continue;
+ // we have a winner! add E(st) to the next state bitmap
+ OR( m->next_state, m->E[tr->st->num], m->states );
+ }
+ }
+ // advance input
+ node = node->next;
+
+ // swap cur_state and next_state pointers, saves us a copy operation
+ e = m->cur_state;
+ m->cur_state = m->next_state;
+ m->next_state = e;
+ }
+
+ // the machine accepts the input if we end up in the final state
+ done = TEST_BIT( m->cur_state, m->final->num );
+ return done;
+}
+
+// extracts regex matches from a machine after it has been run
+struct regex_match *find_matches( struct state *s, struct regex_match *m )
+{
+ struct regex_match *cur, *head;
+ struct trans *tr;
+ struct attribute *attr;
+ int i;
+
+ if( s == NULL )
+ return m;
+
+ // recurse to next state first, so we can add the matches in the current state to the
+ // front of the list
+ m = find_matches( s->next, m );
+
+ // grab matches in this state
+ tr = s->tr;
+ if( tr != NULL )
+ {
+ // the -> comes after <foo> in the expression, so search it first
+ if( tr->ptr != NULL )
+ m = find_matches( tr->ptr->start, m );
+
+ // next search : "foo"
+ if( tr->str != NULL )
+ {
+ // loop backwards so that list builds forwards
+ for( i = RESUBR - 1; i > 0; i-- )
+ if( tr->match[i].rm_eo != -1 )
+ {
+ cur = zalloc( sizeof( struct regex_match ));
+ cur->match = tr->match[i];
+ cur->str = tr->str;
+ cur->next = m;
+ m = cur;
+ }
+ }
+
+ // last search <foo>
+ // loop forwards because we have no choice, but build list backwards
+ head = NULL;
+ cur = NULL;
+ for( attr = tr->attrs; attr != NULL; attr = attr->next )
+ {
+ if( attr->str == NULL )
+ continue;
+ for( i = 1; i < RESUBR; i++ )
+ if( attr->match[i].rm_eo != -1 )
+ {
+ if( head == NULL )
+ cur = head = zalloc( sizeof( struct regex_match ));
+ else
+ cur = cur->next = zalloc( sizeof( struct regex_match ));
+ cur->match = attr->match[i];
+ cur->str = attr->str;
+ cur->next = NULL;
+ }
+ }
+ if( head != NULL )
+ {
+ cur->next = m;
+ m = head;
+ }
+ }
+ return m;
+}
+
+// runs machine m on each xml node at this level, then recurses to it's children, building a
+// list of matches
+struct match *node_recurse( struct machine *m, xmlNodePtr node, struct match *n )
+{
+ xmlNodePtr cur, next;
+ struct match *xml;
+
+ if( node == NULL )
+ return n;
+ for( cur = node; cur != NULL; cur = cur->next )
+ {
+ // this will consider each node by itself (without siblings)
+ next = cur->next;
+ cur->next = NULL;
+ if( tree_process( m, cur ))
+ {
+ xml = zalloc( sizeof( struct match ));
+ xml->next = n;
+ xml->node = cur;
+ xml->re = find_matches( m->start, NULL );
+ n = xml;
+ }
+ cur->next = next;
+ // recurse to children
+ n = node_recurse( m, cur->children, n );
+ }
+ return n;
+}
+
+// run a machine on each node in an xml document and return a list of matches
+struct match *document_process( struct machine *m, xmlDocPtr doc )
+{
+ return node_recurse( m, doc->children->next, NULL );
+}