diff options
Diffstat (limited to 'src/regex/regex_internal.h')
-rw-r--r-- | src/regex/regex_internal.h | 484 |
1 files changed, 484 insertions, 0 deletions
diff --git a/src/regex/regex_internal.h b/src/regex/regex_internal.h new file mode 100644 index 0000000..00badc5 --- /dev/null +++ b/src/regex/regex_internal.h @@ -0,0 +1,484 @@ +/* + This file is part of GNUnet + (C) 2012 Christian Grothoff (and other contributing authors) + + GNUnet is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GNUnet is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GNUnet; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. +*/ +/** + * @file src/regex/regex_internal.h + * @brief common internal definitions for regex library. + * @author Maximilian Szengel + */ +#ifndef REGEX_INTERNAL_H +#define REGEX_INTERNAL_H + +#include "gnunet_regex_lib.h" + +#ifdef __cplusplus +extern "C" +{ +#if 0 /* keep Emacsens' auto-indent happy */ +} +#endif +#endif + +/** + * char array of literals that are allowed inside a regex (apart from the + * operators) + */ +#define ALLOWED_LITERALS "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + + +/** + * Transition between two states. Transitions are stored at the states from + * which they origin ('from_state'). Each state can have 0-n transitions. + * If label is NULL, this is considered to be an epsilon transition. + */ +struct GNUNET_REGEX_Transition +{ + /** + * This is a linked list. + */ + struct GNUNET_REGEX_Transition *prev; + + /** + * This is a linked list. + */ + struct GNUNET_REGEX_Transition *next; + + /** + * Unique id of this transition. + */ + unsigned int id; + + /** + * Label for this transition. This is basically the edge label for the graph. + */ + char *label; + + /** + * State to which this transition leads. + */ + struct GNUNET_REGEX_State *to_state; + + /** + * State from which this transition origins. + */ + struct GNUNET_REGEX_State *from_state; +}; + + +/** + * A state. Can be used in DFA and NFA automatons. + */ +struct GNUNET_REGEX_State; + + +/** + * Set of states. + */ +struct GNUNET_REGEX_StateSet +{ + /** + * Array of states. + */ + struct GNUNET_REGEX_State **states; + + /** + * Number of entries in *use* in the 'states' array. + */ + unsigned int off; + + /** + * Length of the 'states' array. + */ + unsigned int size; +}; + + +/** + * A state. Can be used in DFA and NFA automatons. + */ +struct GNUNET_REGEX_State +{ + /** + * This is a linked list to keep states in an automaton. + */ + struct GNUNET_REGEX_State *prev; + + /** + * This is a linked list to keep states in an automaton. + */ + struct GNUNET_REGEX_State *next; + + /** + * This is a multi DLL for StateSet_MDLL. + */ + struct GNUNET_REGEX_State *prev_SS; + + /** + * This is a multi DLL for StateSet_MDLL. + */ + struct GNUNET_REGEX_State *next_SS; + + /** + * This is a multi DLL for StateSet_MDLL Stack. + */ + struct GNUNET_REGEX_State *prev_ST; + + /** + * This is a multi DLL for StateSet_MDLL Stack. + */ + struct GNUNET_REGEX_State *next_ST; + + /** + * Unique state id. + */ + unsigned int id; + + /** + * Unique state id that is used for traversing the automaton. It is guaranteed + * to be > 0 and < state_count. + */ + unsigned int traversal_id; + + /** + * If this is an accepting state or not. + */ + int accepting; + + /** + * Marking of the state. This is used for marking all visited states when + * traversing all states of an automaton and for cases where the state id + * cannot be used (dfa minimization). + */ + int marked; + + /** + * Marking the state as contained. This is used for checking, if the state is + * contained in a set in constant time. + */ + int contained; + + /** + * Marking the state as part of an SCC (Strongly Connected Component). All + * states with the same scc_id are part of the same SCC. scc_id is 0, if state + * is not a part of any SCC. + */ + unsigned int scc_id; + + /** + * Used for SCC detection. + */ + int index; + + /** + * Used for SCC detection. + */ + int lowlink; + + /** + * Human readable name of the state. Used for debugging and graph + * creation. + */ + char *name; + + /** + * Hash of the state. + */ + struct GNUNET_HashCode hash; + + /** + * Linear state ID accquired by depth-first-search. This ID should be used for + * storing information about the state in an array, because the 'id' of the + * state is not guaranteed to be linear. The 'dfs_id' is guaranteed to be > 0 + * and < 'state_count'. + */ + unsigned int dfs_id; + + /** + * Proof for this state. + */ + char *proof; + + /** + * Number of transitions from this state to other states. + */ + unsigned int transition_count; + + /** + * DLL of transitions. + */ + struct GNUNET_REGEX_Transition *transitions_head; + + /** + * DLL of transitions. + */ + struct GNUNET_REGEX_Transition *transitions_tail; + + /** + * Number of incoming transitions. Used for compressing DFA paths. + */ + unsigned int incoming_transition_count; + + /** + * Set of states on which this state is based on. Used when creating a DFA out + * of several NFA states. + */ + struct GNUNET_REGEX_StateSet nfa_set; +}; + + +/** + * Type of an automaton. + */ +enum GNUNET_REGEX_AutomatonType +{ + NFA, + DFA +}; + + +/** + * Automaton representation. + */ +struct GNUNET_REGEX_Automaton +{ + /** + * Linked list of NFAs used for partial NFA creation. + */ + struct GNUNET_REGEX_Automaton *prev; + + /** + * Linked list of NFAs used for partial NFA creation. + */ + struct GNUNET_REGEX_Automaton *next; + + /** + * First state of the automaton. This is mainly used for constructing an NFA, + * where each NFA itself consists of one or more NFAs linked together. + */ + struct GNUNET_REGEX_State *start; + + /** + * End state of the partial NFA. This is undefined for DFAs + */ + struct GNUNET_REGEX_State *end; + + /** + * Number of states in the automaton. + */ + unsigned int state_count; + + /** + * DLL of states. + */ + struct GNUNET_REGEX_State *states_head; + + /** + * DLL of states + */ + struct GNUNET_REGEX_State *states_tail; + + /** + * Type of the automaton. + */ + enum GNUNET_REGEX_AutomatonType type; + + /** + * Regex + */ + char *regex; + + /** + * Canonical regex (result of RX->NFA->DFA->RX) + */ + char *canonical_regex; + + /** + * GNUNET_YES, if multi strides have been added to the Automaton. + */ + int is_multistrided; +}; + + +/** + * Construct an NFA by parsing the regex string of length 'len'. + * + * @param regex regular expression string. + * @param len length of the string. + * + * @return NFA, needs to be freed using GNUNET_REGEX_automaton_destroy. + */ +struct GNUNET_REGEX_Automaton * +GNUNET_REGEX_construct_nfa (const char *regex, const size_t len); + + +/** + * Function that get's passed to automaton traversal and is called before each + * next traversal from state 's' using transition 't' to check if traversal + * should proceed. Return GNUNET_NO to stop traversal or GNUNET_YES to continue. + * + * @param cls closure for the check. + * @param s current state in the traversal. + * @param t current transition from state 's' that will be used for the next + * step. + * + * @return GNUNET_YES to proceed traversal, GNUNET_NO to stop. + */ +typedef int (*GNUNET_REGEX_traverse_check) (void *cls, + struct GNUNET_REGEX_State * s, + struct GNUNET_REGEX_Transition * t); + + +/** + * Function that is called with each state, when traversing an automaton. + * + * @param cls closure. + * @param count current count of the state, from 0 to a->state_count -1. + * @param s state. + */ +typedef void (*GNUNET_REGEX_traverse_action) (void *cls, + const unsigned int count, + struct GNUNET_REGEX_State * s); + + +/** + * Traverses the given automaton using depth-first-search (DFS) from it's start + * state, visiting all reachable states and calling 'action' on each one of + * them. + * + * @param a automaton to be traversed. + * @param start start state, pass a->start or NULL to traverse the whole automaton. + * @param check function that is checked before advancing on each transition + * in the DFS. + * @param check_cls closure for check. + * @param action action to be performed on each state. + * @param action_cls closure for action + */ +void +GNUNET_REGEX_automaton_traverse (const struct GNUNET_REGEX_Automaton *a, + struct GNUNET_REGEX_State *start, + GNUNET_REGEX_traverse_check check, + void *check_cls, + GNUNET_REGEX_traverse_action action, + void *action_cls); + +/** + * Get the canonical regex of the given automaton. + * When constructing the automaton a proof is computed for each state, + * consisting of the regular expression leading to this state. A complete + * regex for the automaton can be computed by combining these proofs. + * As of now this function is only useful for testing. + * + * @param a automaton for which the canonical regex should be returned. + * + * @return canonical regex string. + */ +const char * +GNUNET_REGEX_get_canonical_regex (struct GNUNET_REGEX_Automaton *a); + + +/** + * Get the number of transitions that are contained in the given automaton. + * + * @param a automaton for which the number of transitions should be returned. + * + * @return number of transitions in the given automaton. + */ +unsigned int +GNUNET_REGEX_get_transition_count (struct GNUNET_REGEX_Automaton *a); + + +/** + * Context that contains an id counter for states and transitions as well as a + * DLL of automatons used as a stack for NFA construction. + */ +struct GNUNET_REGEX_Context +{ + /** + * Unique state id. + */ + unsigned int state_id; + + /** + * Unique transition id. + */ + unsigned int transition_id; + + /** + * DLL of GNUNET_REGEX_Automaton's used as a stack. + */ + struct GNUNET_REGEX_Automaton *stack_head; + + /** + * DLL of GNUNET_REGEX_Automaton's used as a stack. + */ + struct GNUNET_REGEX_Automaton *stack_tail; +}; + + +/** + * Adds multi-strided transitions to the given 'dfa'. + * + * @param regex_ctx regex context needed to add transitions to the automaton. + * @param dfa DFA to which the multi strided transitions should be added. + * @param stride_len length of the strides. + */ +void +GNUNET_REGEX_dfa_add_multi_strides (struct GNUNET_REGEX_Context *regex_ctx, + struct GNUNET_REGEX_Automaton *dfa, + const unsigned int stride_len); + + +/** + * Generate a (pseudo) random regular expression of length 'rx_length', as well + * as a (optional) string that will be matched by the generated regex. The + * returned regex needs to be freed. + * + * @param rx_length length of the random regex. + * @param matching_str (optional) pointer to a string that will contain a string + * that will be matched by the generated regex, if + * 'matching_str' pointer was not NULL. + * + * @return NULL if 'rx_length' is 0, a random regex of length 'rx_length', which + * needs to be freed, otherwise. + */ +char * +GNUNET_REGEX_generate_random_regex (size_t rx_length, char *matching_str); + + +/** + * Generate a random string of maximum length 'max_len' that only contains literals allowed + * in a regular expression. The string might be 0 chars long but is garantueed + * to be shorter or equal to 'max_len'. + * + * @param max_len maximum length of the string that should be generated. + * + * @return random string that needs to be freed. + */ +char * +GNUNET_REGEX_generate_random_string (size_t max_len); + + +#if 0 /* keep Emacsens' auto-indent happy */ +{ +#endif +#ifdef __cplusplus +} +#endif + +#endif |