diff options
author | David Barksdale <amatus.amongus@gmail.com> | 2006-01-12 04:24:02 +0000 |
---|---|---|
committer | David Barksdale <amatus.amongus@gmail.com> | 2006-01-12 04:24:02 +0000 |
commit | aeb110d90df145ac99009c1a7b9e3077d6b457a6 (patch) | |
tree | 165a254743e0bdc143eabbd74520a4e90f93a9e1 /treexpr.c |
Initial revision
Diffstat (limited to 'treexpr.c')
-rw-r--r-- | treexpr.c | 1145 |
1 files changed, 1145 insertions, 0 deletions
diff --git a/treexpr.c b/treexpr.c new file mode 100644 index 0000000..eb68f2e --- /dev/null +++ b/treexpr.c @@ -0,0 +1,1145 @@ +/* treexpr.c - Tree expression language source file + + Copyright (C) 2005 Dell, Inc. + + Authors: David Barksdale <amatus@ocgnet.org> + + + + This library is free software; you can redistribute it and/or + + modify it under the terms of the GNU Lesser General Public + + License as published by the Free Software Foundation; either + + version 2.1 of the License, or (at your option) any later version. + + + + This library is distributed in the hope that it will be useful, + + but WITHOUT ANY WARRANTY; without even the implied warranty of + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + + Lesser General Public License for more details. + + + + You should have received a copy of the GNU Lesser General Public + + License along with this library; if not, write to the Free Software + + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +-- Syntax -- + Expr ::= Term | Expr "|" Term + Term ::= Factor | Term Factor + Factor ::= Symbol | Symbol "*" | "~" | "(" Expr ")" | "(" Expr ")" "*" + | Symbol "->" Expr | Symbol ":" String | Symbol Attrs | Symbol Attrs "->" Expr + Attrs ::= "<" Attr* ">" + Attr ::= Symbol | Symbol "=" String + Symbol ::= any alphanumeric string with '_' and '-' + String ::= quoted string (skips over \") + +-- Semantics -- +A tree can be thought of as a hierarchical list, each element of the list can have a list of +children. Consider the following tree: + + html + / \ + head body + / / \ +title h1 p + +root list: html +list of html's children: head body +list of head's children: title +list of body's children: h1 p +list of h1's chlidren: ~ (where ~ denotes the empty list) +list of p's children: ~ + +By the same token, a tree expression can be thought of as a hierarchical regular expression, +each symbol in the expression can have a child expression. The "->" operator binds a child +expression to a symbol in the parent expression. If a symbol has no child expression it is +equivalent to having a child expression that matches everything. In other words the children +of a symbol are ignored if there is no child expression. It can also be said that "->" adds +a restriction to the symbol on it's left-hand side: the symbol will only be matched if the +symbol's children match the child expression. Other restriction operators are discussed later to +improve matching of HTML. Consider the following expressions: + + These match the tree above: + html - matches the root list and ignores html's children + html -> head body - also matches html's children and ignores head and body's children + html -> (head -> title) body - also matches head's children and ignores title's children + html -> (head -> title -> ~) body - also matches title's children + html -> (head -> title -> ~) (body -> h1 p) - also matches body's children + html -> (head -> title -> ~) (body -> (h1 -> ~) (p -> ~)) - matches the tree exactly + + These do not match the tree above: + body - does not match root list + html -> body - matches root list but does not match html's children (must match all) + html -> body head - matches root list but html's children are out of order + +Note the grouping around "->". In the second expression "->" binds the symbol "html" to the +expression "head body". In the third expression we want the symbol "head" to be bound to the +expression "title" and not "title body" so we must group them together using parenthesis. + +Tree expressions use the alternation "|" and closure "*" operators from reqular expressions. +Consider the following expressions: + html* - matches the root list because it consists of zero or more "html" symbols + html | xml - matches the root list because it consists of "html" or "xml" + html -> (head -> title | title meta) body - matches + +In addition to the "~" special symbol there is "." which matches any symbol: + html -> .* - matches the root list and html's children because html has zero or more + html -> .* (body -> .* p .*) .* - matches a tree that has "html" at the root, html has + atleast one child named "body", and body has atleast + one child named "p". + +-- Extra restriction operators for HTML matching -- + +The ":" operator binds a regular expression to a symbol, the symbol matches only if it's +contents matches the regular expression. In HTML the only tags that have contents are "text" +and "comment". So most of the time it will be used like: + p -> text:"ab*c" - matches the HTML '<p>abbbbbc</p>' + +The "<", "=", and ">" operators are used together to bind a symbol to an attribute list, +the symbol matches only if every attribute matches an attribute of the HTML tag. An attribute +restriction is a list of name and value pairs. A name without a value is matched only if it +appears in the HTML without a value. Consider the following: + table <bgcolor="blue"> - matches the HTML '<table bgcolor="blue">' + and also the HTML '<table bgcolor="blue" border="1">' + foo <bar> - matches the HTML '<foo bar>' but not '<foo bar="">' or '<foo bar="baz">' + foo <bar> | foo <bar=".*"> - matches all three + +It is possible to combine the attribute restriction operators with the "->" operator (for example +option <selected> -> text:"blue"), but it is not possible to combine the content restriction +operator with the "->" operator (for example text:"blue" -> br) because "text" and "comment" +symbols never have children. + +-- Extracting values from an HTML page -- +This section has been up for debate because it can be very confusing. The way we extract strings +from the tree expressions is from the contents and attribute rescrictions. These restrictions +contain regular expressions that match against some of the data contained in an HTML page. Most +regular expression libraries allow you to use something called back references where each +parenthasized sub-expression is given a number and the string matching that sub-expression can be +referenced later. For example the expression "foo" has no sub-expressions, the expression "fo(o*)" +has one sub-expression, namely "o*". When "fo(o*)" is matched against "fooo", the fist +sub-expression references the string "oo". The sub-expressions are numbered by the position of +their open parenthesis. For example "(b(a)(r))" has three sub-expressions, 1. "b(a)(r)" 2. "a" +3. "r". If a sub-expression matches multiple strings then the last match is what is referenced. +For example "(ab|ac)*" matches "ababac" and the string referenced will be "ac". + +A tree expression can have several regular expressions embedded in it. In order to find a specific +sub-expression they are numbered in the order they appear in the tree expression. For example: + form -> input<value="([0-9]+)"> text:"." + input<value="([0-9]+)"> text:"." + input<value="([0-9]+)"> text:"." + input<value="([0-9]+)"> input +has four sub-expressions. This tree expression would match against an HTML form that might be used +for changing an IP address: +<form method="POST" action="/cgi-bin/setip"> + <input type="text" name="first" value="192"> . + <input type="text" name="second" value="168"> . + <input type="text" name="third" value="1"> . + <input type="text" name="fourth" value="42"> <input type="submit" value="Set IP"> +</form> +In this case the extracted values will be: +1: "192" +2: "168" +3: "1" +4: "42" +It's simple to specify a method for combining the strings together to form a usefull output. +For example something like "\1.\2.\3.\4" would produce "192.168.1.42". +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <libxml/tree.h> +#include <sys/types.h> +#include "regex.h" +#include "treexpr.h" + +#ifdef _WINDOWS +#define strcasecmp stricmp +#endif + +// allocator that never returns NULL, cheap hack to make code shorter +void *zalloc( size_t size ) +{ + void *p; + + do { p = calloc( size, 1 ); + } while( p == NULL ); + return p; +} + +// builds a machine that matches a symbol +struct machine *symbol( char *name ) +{ + struct machine *m; + struct state *start, *final; + struct trans *tr; + + // alloc stuff + m = zalloc( sizeof( struct machine )); + start = zalloc( sizeof( struct state )); + final = zalloc( sizeof( struct state )); + tr = zalloc( sizeof( struct trans )); + + // insert transition + tr->name = name; + tr->st = final; + start->tr = tr; + + // maintain linked list + start->next = final; + + // finish machine + m->start = start; + m->final = final; + return m; +} + +// builds a machine that matches the empty string +struct machine *epsilon( void ) +{ + struct machine *m; + struct state *start, *final; + struct epsilon *cat; + + // alloc stuff + m = zalloc( sizeof( struct machine )); + start = zalloc( sizeof( struct state )); + final = zalloc( sizeof( struct state )); + cat = zalloc( sizeof( struct epsilon )); + + // insert epsilon transition + cat->st = final; + start->ep = cat; + + // maintain liked list + start->next = final; + + // finish machine + m->start = start; + m->final = final; + return m; +} + +// builds a machine that matches nothing (not used at present, but here for completeness) +struct machine *null( void ) +{ + struct machine *m; + struct state *start, *final; + + // alloc stuff + m = zalloc( sizeof( struct machine )); + start = zalloc( sizeof( struct state )); + final = zalloc( sizeof( struct state )); + + // maintain linked list + start->next = final; + + // finish machine + m->start = start; + m->final = final; + return m; +} + +// concatanates two machines +struct machine *concat( struct machine *a, struct machine *b ) +{ + struct epsilon *cat; + + if( a == NULL || b == NULL ) + return NULL; + + // alloc stuff + cat = zalloc( sizeof( struct epsilon )); + + // insert epsilon transition + cat->st = b->start; + cat->next = a->final->ep; + a->final->ep = cat; + + // maintain linked list + a->final->next = b->start; + + // use machine 'a' as our new machine and free 'b' + a->final = b->final; + free( b ); + return a; +} + +// alternates two machines (spike) +struct machine *alternate( struct machine *a, struct machine *b ) +{ + struct state *start, *final; + struct epsilon *s1, *s2, *f1, *f2; + + if( a == NULL || b == NULL ) + return NULL; + + // alloc stuff + start = zalloc( sizeof( struct state )); + final = zalloc( sizeof( struct state )); + s1 = zalloc( sizeof( struct epsilon )); + s2 = zalloc( sizeof( struct epsilon )); + f1 = zalloc( sizeof( struct epsilon )); + f2 = zalloc( sizeof( struct epsilon )); + + // insert epsilon transitions from our new start state + // to the start states of the two machines + s1->st = a->start; + s2->st = b->start; + s1->next = s2; + start->ep = s1; + + // insert epsilon transitions from the final states of the + // new machines to our new final state + f1->st = final; + f2->st = final; + f1->next = a->final->ep; + a->final->ep = f1; + f2->next = b->final->ep; + b->final->ep = f2; + + // maintain linked list + start->next = a->start; + a->final->next = b->start; + b->final->next = final; + + // use machine 'a' as our new machine and free 'b' + a->start = start; + a->final = final; + free( b ); + return a; +} + +// creates the closure of a machine (splat) +struct machine *closure( struct machine *a ) +{ + struct state *start, *final; + struct epsilon *s1, *s2, *f1, *f2; + + if( a == NULL ) + return NULL; + + // alloc stuff + start = zalloc( sizeof( struct state )); + final = zalloc( sizeof( struct state )); + s1 = zalloc( sizeof( struct epsilon )); + s2 = zalloc( sizeof( struct epsilon )); + f1 = zalloc( sizeof( struct epsilon )); + f2 = zalloc( sizeof( struct epsilon )); + + // insert epsilon transitions from our new start state to + // the machine's start state and to our new final state + s1->st = final; + s2->st = a->start; + s1->next = s2; + start->ep = s1; + + // insert epsilon transitions from the machine's final state + // to the machine's start state and our new final state + f1->st = a->start; + f2->st = final; + f1->next = f2; + f2->next = a->final->ep; + a->final->ep = f1; + + // maintain linked list + start->next = a->start; + a->final->next = final; + + // use 'a' as our new machine + a->start = start; + a->final = final; + return a; +} + +void free_machine( struct machine *m ); + +// free each state in a linked list +void free_states( struct state *sl ) +{ + struct trans *tr; + struct epsilon *ep, *epn; + struct state *next; + struct attribute *at, *atn; + + for( ; sl != NULL; sl = next ) + { + next = sl->next; + + // free each transition off this state + tr = sl->tr; + if( tr != NULL ) + { + // free all the junk that can hang off of a transition + free_machine( tr->ptr ); + free( tr->name ); + if( tr->re.re_magic != 0 ) + notbuiltin_regfree( &tr->re ); + for( at = tr->attrs; at != NULL; at = atn ) + { + atn = at->next; + free( at->name ); + if( at->re.re_magic != 0 ) + notbuiltin_regfree( &at->re ); + free( at ); + } + free( tr ); + } + + // free each epsilon transition (easy) + for( ep = sl->ep; ep != NULL; ep = epn ) + { + epn = ep->next; + free( ep ); + } + + // finally free the state + free( sl ); + } +} + +void free_machine( struct machine *m ) +{ + int i; + + if( m == NULL ) + return; + + // free every state in the machine + free_states( m->start ); + + // if we have an E function free it + if( m->E != NULL ) + { + for( i = 0; i < m->states; i++ ) + free( m->E[i] ); + free( m->E ); + } + + // free bitmasks + if( m->cur_state != NULL ) + free( m->cur_state ); + if( m->next_state != NULL ) + free( m->next_state ); + + // finally free the machine + free( m ); +} + +/* + * Tokenizer + */ +struct token +{ + int t; + char *name; +}; + +#define T_EOL ( -2 ) // not really paid attention to +#define T_ERROR ( -1 ) // tokenizing error, look at token->error and token->buf +#define T_SYMBOL ( 0 ) // symbol +#define T_SQUIGGLE ( 1 ) // ~ +#define T_WAX ( 2 ) // ( +#define T_WANE ( 3 ) // ) +#define T_SPIKE ( 4 ) // | +#define T_SPLAT ( 5 ) // * +#define T_PTR ( 6 ) // -> +#define T_TWOSPOT ( 7 ) // : +#define T_ANGLE ( 8 ) // < +#define T_RIGHTANGLE ( 9 ) // > +#define T_HALFMESH ( 10 ) // = +#define T_STRING ( 11 ) // quoted string + +// parse the next token in a string and return a pointer into the string were we stoped +const char *get_tok( const char *str, struct token *tk ) +{ + static char *name; + const char *p; + int i, len = 0; + + if( str == NULL ) + { + tk->t = T_ERROR; + return NULL; + } + + // skip whitespace + for( p = str; *p != 0 && isspace( *p ); p++ ); + + switch( *p ) + { + case 0: + tk->t = T_EOL; + return p; + case '~': + tk->t = T_SQUIGGLE; + return p + 1; + case '(': + tk->t = T_WAX; + return p + 1; + case ')': + tk->t = T_WANE; + return p + 1; + case '|': + tk->t = T_SPIKE; + return p + 1; + case '*': + tk->t = T_SPLAT; + return p + 1; + case '-': + if( p[1] == '>' ) + { + tk->t = T_PTR; + return p + 2; + } + tk->t = T_ERROR; + return p; + case ':': + tk->t = T_TWOSPOT; + return p + 1; + case '<': + tk->t = T_ANGLE; + return p + 1; + case '>': + tk->t = T_RIGHTANGLE; + return p + 1; + case '=': + tk->t = T_HALFMESH; + return p + 1; + case '\"': + p++; + len = 16; + name = zalloc( len ); + for( i = 0; p[i] != 0 && p[i] != '\"'; i++ ) + { + if( i + 1 >= len ) + name = realloc( name, len *= 2 ); + if( p[i] == '\\' && p[i + 1] == '\"' && i < 254 ) + { + name[i] = p[i]; + i++; + } + name[i] = p[i]; + } + name[i] = 0; + tk->t = p[i] == 0 ? T_ERROR : T_STRING; + tk->name = name; + return p + i + ( p[i] == '\"' ? 1 : 0 ); + case '.': + name = zalloc( 2 ); + name[0] = '.'; + name[1] = 0; + tk->t = T_SYMBOL; + tk->name = name; + return p + 1; + default: + len = 16; + name = zalloc( len ); + for( i = 0; isalnum( p[i] ) || p[i] == '_' || p[i] == '-'; i++ ) + { + if( i + 1 >= len ) + name = realloc( name, len *= 2 ); + name[i] = p[i]; + } + name[i] = 0; + tk->t = i > 0 ? T_SYMBOL : T_ERROR; + tk->name = name; + return p + i; + } +} + +const char *parse_treexpr( const char *expr, struct machine **m ); + +// parses an attribute construction: <foo="bar" baz="quux" quuux> +// the opening angle has already been consumed +const char *parse_attrs( const char *expr, struct attribute **a ) +{ + struct token tk; + struct attribute *prev = NULL, *attr; + const char *next; + + // build a list of attributes in order + for( next = get_tok( expr, &tk ); tk.t == T_SYMBOL; ) + { + // fist we grab the name + attr = zalloc( sizeof( struct attribute )); + if( prev == NULL ) + prev = *a = attr; + else + prev = prev->next = attr; + attr->name = tk.name; + + // check for optional = + next = get_tok( next, &tk ); + if( tk.t != T_HALFMESH ) + continue; + + // grab regex and compile it + next = get_tok( next, &tk ); + if( tk.t != T_STRING ) + return NULL; + if( notbuiltin_regcomp( &attr->re, tk.name, REG_EXTENDED + | REG_ICASE ) != 0 ) + { + notbuiltin_regfree( &attr->re ); + attr->re.re_magic = 0; + return NULL; + } + free( tk.name ); + next = get_tok( next, &tk ); + } + if( tk.t == T_RIGHTANGLE ) + return next; + return NULL; +} + +// parse a tree expression "factor" +// a factor is what I'm calling the operands to concatenation +// so it's either a symbol (with optional restrictions), a parenthasized expression +// or a factor with a * +// this does not allow foolishness like "foo**" (equivalent to "foo*") or "~*" (equivalent to "~") +// even though they are technically valid +const char *parse_factor( const char *expr, struct machine **m ) +{ + struct token tk; + struct machine *r; + const char *next, *cur; + + // this token should either be a symbol, a ~, or a ( + cur = get_tok( expr, &tk ); + if( tk.t == T_ERROR ) + { + *m = zalloc( sizeof( struct machine )); + ( *m )->error = "Tokenizing error"; + ( *m )->buf = expr; + return NULL; + } + if( tk.t == T_SYMBOL ) + { + // build a machine to match the symbol + *m = symbol( tk.name ); + next = get_tok( cur, &tk ); + if( tk.t == T_ERROR ) + { + ( *m )->error = "Tokenizing error"; + ( *m )->buf = cur; + return NULL; + } + + // if there's a * build the closure + if( tk.t == T_SPLAT ) + { + *m = closure( *m ); + return next; + } + + // if there's a -> then parse the expression on the rhs and add it as a restriction + if( tk.t == T_PTR ) + { + next = parse_treexpr( next, &r ); + ( *m )->start->tr->ptr = r; + if( next == NULL ) + { + ( *m )->error = r->error; + ( *m )->buf = r->buf; + return NULL; + } + return next; + } + + // if there's a : then compile the regex and add it as a restriction + if( tk.t == T_TWOSPOT ) + { + cur = next; + next = get_tok( cur, &tk ); + if( tk.t == T_ERROR ) + { + ( *m )->error = "Tokenizing error"; + ( *m )->buf = cur; + return NULL; + } + if( tk.t != T_STRING ) + { + ( *m )->error = "Expecting a \"-delimited string"; + ( *m )->buf = cur; + return NULL; + } + if( notbuiltin_regcomp( &( *m )->start->tr->re, tk.name, + REG_EXTENDED | REG_ICASE ) != 0 ) + { + notbuiltin_regfree( &( *m )->start->tr->re ); + ( *m )->start->tr->re.re_magic = 0; + ( *m )->error = "Error parsing regular expression"; + ( *m )->buf = cur; + } + free( tk.name ); + return next; + } + + // if there's a < then parse attributes and add it as a restriction + if( tk.t == T_ANGLE ) + { + next = parse_attrs( next, &( *m )->start->tr->attrs ); + if( next == NULL ) + { + ( *m )->error = "Expecting attribute list, ie <name=\"value\" name2=\"value2\">"; + ( *m )->buf = cur; + return NULL; + } + cur = next; + + // we also allow a -> restriction in this case + next = get_tok( cur, &tk ); + if( tk.t == T_PTR ) + { + next = parse_treexpr( next, &r ); + ( *m )->start->tr->ptr = r; + if( next == NULL ) + { + ( *m )->error = r->error; + ( *m )->buf = r->buf; + return NULL; + } + return next; + } + return cur; + } + return cur; + } + if( tk.t == T_SQUIGGLE ) + { + // build epsilon machine + *m = epsilon( ); + return cur; + } + if( tk.t == T_WAX ) + { + // parse expression + cur = parse_treexpr( cur, m ); + if( cur == NULL ) + return NULL; + + // make sure we end in ) + next = get_tok( cur, &tk ); + if( tk.t == T_ERROR ) + { + ( *m )->error = "Tokenizing error"; + ( *m )->buf = cur; + return NULL; + } + if( tk.t != T_WANE ) + { + ( *m )->error = "Expected ')'"; + ( *m )->buf = cur; + return NULL; + } + + // check for * and build closure + cur = next; + next = get_tok( cur, &tk ); + if( tk.t == T_ERROR ) + { + ( *m )->error = "Tokenizing error"; + ( *m )->buf = cur; + return NULL; + } + if( tk.t == T_SPLAT ) + { + *m = closure( *m ); + return next; + } + return cur; + } + + // invalid symbol + *m = zalloc( sizeof( struct machine )); + ( *m )->error = "Expected symbol or '~' or '('"; + ( *m )->buf = expr; + return NULL; +} + +// parse a tree expression "term" +// a term is what I'm calling the operands to alternation +// a term is either a factor, or a list of factors concatenated together +const char *parse_term( const char *expr, struct machine **m ) +{ + struct token tk; + struct machine *r; + const char *cur; + + // grab the first factor + cur = parse_factor( expr, m ); + if( cur == NULL ) + return NULL; + + // grab zero or more factors + for( get_tok( cur, &tk ); tk.t == T_SYMBOL || tk.t == T_WAX || tk.t == T_SQUIGGLE; + get_tok( cur, &tk )) + { + cur = parse_factor( cur, &r ); + if( cur == NULL ) + { + ( *m )->error = r->error; + ( *m )->buf = r->buf; + free_machine( r ); + return NULL; + } + + // concatenate the factors + *m = concat( *m, r ); + } + if( tk.t == T_ERROR ) + { + ( *m )->error = "Tokenizing error"; + ( *m )->buf = cur; + return NULL; + } + return cur; +} + +// parse a tree expression +// an expression is just a term or a list of terms seperated by | +// this looks almost exactly like parse_term() +const char *parse_treexpr( const char *expr, struct machine **m ) +{ + struct token tk; + const char *next, *cur; + struct machine *r; + + // grab the first term + cur = parse_term( expr, m ); + if( cur == NULL ) + return NULL; + + // grab zero or more terms + for( next = get_tok( cur, &tk ); tk.t == T_SPIKE; next = get_tok( cur, &tk )) + { + cur = parse_term( next, &r ); + if( cur == NULL ) + { + ( *m )->error = r->error; + ( *m )->buf = r->buf; + free_machine( r ); + return NULL; + } + + // combine the terms using alternation + *m = alternate( *m, r ); + } + if( tk.t == T_ERROR ) + { + ( *m )->error = "Tokenizing error"; + ( *m )->buf = cur; + return NULL; + } + return cur; +} + + +// process a regex restriction (basically just executes the regex) +int regex_process( struct trans *tr, char *content ) +{ + if( content == NULL ) + return 0; + if( notbuiltin_regexec( &tr->re, content, RESUBR, tr->match, 0 ) == 0 ) + { + tr->str = content; + return 1; + } + tr->str = NULL; + return 0; +} + +// process an attribute restriction +// it's important that we don't save the matches for all regexes until we know all of them +// match. otherwise if you were trying to match <foo="bar" bar="baz"> and you had a list like +// <foo="bar" bar="baz"> (both regexes match) +// <foo="barr" bar="quux"> (the first one matches and overwrites the previous match for foo) +// then you would be left with foo="barr" bar="baz" as your matches +int attrs_process( struct trans *tr, struct _xmlAttr *properties ) +{ + struct attribute *attr; + struct _xmlAttr *cur; + regmatch_t match[RESUBR]; + + // first pass makes sure each attribute matches + for( attr = tr->attrs; attr != NULL; attr = attr->next ) + { + for( cur = properties; cur != NULL; cur = cur->next ) + { + if( strcasecmp( attr->name, (char *)cur->name ) == 0 ) + { + // if there's no value it will only match if we didn't specify a regex + if( cur->children == NULL ) + { + if( attr->re.re_magic == 0 ) + break; + return 0; + } + // otherwise the regex has to match the value + if( notbuiltin_regexec( &attr->re, + (char *)cur->children->content, RESUBR, + match, 0 ) == 0 ) + break; + else + attr->str = NULL; + return 0; + } + } + if( cur == NULL ) + return 0; + } + // second pass saves the matches + for( attr = tr->attrs; attr != NULL; attr = attr->next ) + { + for( cur = properties; cur != NULL; cur = cur->next ) + { + if( strcasecmp( attr->name, (char *)cur->name ) == 0 ) + { + // if there's no value it will only match if we didn't specify a regex + if( cur->children == NULL ) + { + if( attr->re.re_magic == 0 ) + break; + return 0; + } + // otherwise the regex has to match the value + if( notbuiltin_regexec( &attr->re, + (char *)cur->children->content, RESUBR, + attr->match, 0 ) == 0 ) + { + attr->str = (char *)cur->children->content; + break; + } + else + attr->str = NULL; + return 0; + } + } + if( cur == NULL ) + return 0; + } + return 1; +} + +// some handy macros: +// number of bits in type pointed to by x +#define B( x ) (sizeof(*(x))*8) + +// number of elements in array pointed to by x in order to contain n bits +#define N( x, n ) (((n)+B(x)-1)/B(x)) + +// test bit b of array pointed to by x +#define TEST_BIT( x, b ) (((x)[(b)/B(x)]>>((b)%B(x)))&1) + +// set bit b of array pointed to by x +#define SET_BIT( x, b ) ((x)[(b)/B(x)]|=1<<((b)%B(x))) + +// compute the disjunction of two bitfields of n bits +#define OR( x, y, n ) {unsigned int _i;for(_i=0;_i<N(x,n);(x)[_i]|=(y)[_i],_i++);} + +// applies a machine to an xml tree +// returns true iff the machine accepts +// all matches to regexes are contained within the machine +int tree_process( struct machine *m, xmlNodePtr node ) +{ + int *e, done; + struct state *cur, *cur2; + struct trans *tr; + struct epsilon *ep; + + if( m == NULL ) + return 1; + if( m->E == NULL ) + { + // Generate E function + // the E function is an array of bitmasks indexed by state number + // the bitmask represents the states you can reach by epsilon transitions + + // number states + m->states = 0; + for( cur = m->start; cur != NULL; cur = cur->next ) + cur->num = m->states++; + + // fill E + m->E = zalloc( sizeof( *m->E ) * m->states ); + for( cur = m->start; cur != NULL; cur = cur->next ) + { + e = m->E[cur->num] = zalloc( N( *m->E, m->states ) * sizeof( **m->E )); + + // we can reach ourself + SET_BIT( e, cur->num ); + + // try to add new states until an iteration of this loop makes no progress + done = 0; + while( !done ) + { + done = 1; + // for each state that is already in the bitmask, add states you can reach + // by epsilon transitions + for( cur2 = m->start; cur2 != NULL; cur2 = cur2->next ) + if( TEST_BIT( e, cur2->num )) + for( ep = cur2->ep; ep != NULL; ep = ep->next ) + if( !TEST_BIT( e, ep->st->num )) + { + SET_BIT( e, ep->st->num ); + done = 0; + } + } + } + } + + // allocate current state and next state bitmasks + if( m->cur_state == NULL ) + m->cur_state = zalloc( N( m->cur_state, m->states ) * sizeof( *m->cur_state )); + if( m->next_state == NULL ) + m->next_state = zalloc( N( m->next_state, m->states ) * sizeof( *m->next_state )); + + // our inital current state is E(start) + OR( m->cur_state, m->E[m->start->num], m->states ); + + // main loop, terminate when we run out of input or when there's no states in the cur_state bitmask + while( node != NULL ) + { + unsigned int i; + int sum = 0; + + // are we still alive? + for( i = 0; i < N( m->cur_state, m->states ); i++ ) + sum |= m->cur_state[i]; + if( sum == 0 ) break; + + // zero out next_state and start adding states we can reach by normal transitions + memset( m->next_state, 0, N( m->next_state, m->states ) * sizeof( *m->next_state )); + + // for each state in the cur_state bitmask we try a transition + for( cur = m->start; cur != NULL; cur = cur->next ) + if( TEST_BIT( m->cur_state, cur->num )) + { + tr = cur->tr; + if( tr != NULL ) + { + // first we must match the name and attributes + if( strcmp( tr->name, "." ) != 0 && + ( node->name == NULL || strcasecmp( tr->name, (char *)node->name ) != 0 )) + continue; + if( tr->attrs != NULL && !attrs_process( tr, node->properties )) + continue; + // second we can match a machine and regexp + if( tr->ptr != NULL && !tree_process( tr->ptr, node->children )) + continue; + if( tr->re.re_magic != 0 && !regex_process( tr, (char *)node->content )) + continue; + // we have a winner! add E(st) to the next state bitmap + OR( m->next_state, m->E[tr->st->num], m->states ); + } + } + // advance input + node = node->next; + + // swap cur_state and next_state pointers, saves us a copy operation + e = m->cur_state; + m->cur_state = m->next_state; + m->next_state = e; + } + + // the machine accepts the input if we end up in the final state + done = TEST_BIT( m->cur_state, m->final->num ); + return done; +} + +// extracts regex matches from a machine after it has been run +struct regex_match *find_matches( struct state *s, struct regex_match *m ) +{ + struct regex_match *cur, *head; + struct trans *tr; + struct attribute *attr; + int i; + + if( s == NULL ) + return m; + + // recurse to next state first, so we can add the matches in the current state to the + // front of the list + m = find_matches( s->next, m ); + + // grab matches in this state + tr = s->tr; + if( tr != NULL ) + { + // the -> comes after <foo> in the expression, so search it first + if( tr->ptr != NULL ) + m = find_matches( tr->ptr->start, m ); + + // next search : "foo" + if( tr->str != NULL ) + { + // loop backwards so that list builds forwards + for( i = RESUBR - 1; i > 0; i-- ) + if( tr->match[i].rm_eo != -1 ) + { + cur = zalloc( sizeof( struct regex_match )); + cur->match = tr->match[i]; + cur->str = tr->str; + cur->next = m; + m = cur; + } + } + + // last search <foo> + // loop forwards because we have no choice, but build list backwards + head = NULL; + cur = NULL; + for( attr = tr->attrs; attr != NULL; attr = attr->next ) + { + if( attr->str == NULL ) + continue; + for( i = 1; i < RESUBR; i++ ) + if( attr->match[i].rm_eo != -1 ) + { + if( head == NULL ) + cur = head = zalloc( sizeof( struct regex_match )); + else + cur = cur->next = zalloc( sizeof( struct regex_match )); + cur->match = attr->match[i]; + cur->str = attr->str; + cur->next = NULL; + } + } + if( head != NULL ) + { + cur->next = m; + m = head; + } + } + return m; +} + +// runs machine m on each xml node at this level, then recurses to it's children, building a +// list of matches +struct match *node_recurse( struct machine *m, xmlNodePtr node, struct match *n ) +{ + xmlNodePtr cur, next; + struct match *xml; + + if( node == NULL ) + return n; + for( cur = node; cur != NULL; cur = cur->next ) + { + // this will consider each node by itself (without siblings) + next = cur->next; + cur->next = NULL; + if( tree_process( m, cur )) + { + xml = zalloc( sizeof( struct match )); + xml->next = n; + xml->node = cur; + xml->re = find_matches( m->start, NULL ); + n = xml; + } + cur->next = next; + // recurse to children + n = node_recurse( m, cur->children, n ); + } + return n; +} + +// run a machine on each node in an xml document and return a list of matches +struct match *document_process( struct machine *m, xmlDocPtr doc ) +{ + return node_recurse( m, doc->children->next, NULL ); +} |