about summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x GrokHtml.c 146
-rw-r--r-- Makefile 13
-rw-r--r-- treexpr.c 18
-rw-r--r-- treexpr.h 1
4 files changed, 116 insertions, 62 deletions
diff --git a/GrokHtml.c b/GrokHtml.c
index 2314336..7623047 100755
--- a/GrokHtml.c
+++ b/GrokHtml.c
@@ -26,7 +26,8 @@
#include "GrokHtml.h"
#ifdef HTML_PARSE_RECOVER
-#define PARSER_OPTIONS ( HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING )
+#define PARSER_OPTIONS ( HTML_PARSE_RECOVER | HTML_PARSE_NOERROR \
+ | HTML_PARSE_NOWARNING )
#else
#define PARSER_OPTIONS ( HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING )
#endif
@@ -46,7 +47,8 @@
* Method: FreeMachine
* Signature: (J)V
*/
-JNIEXPORT void JNICALL Java_GrokHtml_FreeMachine( JNIEnv *env, jobject obj, jlong Machine )
+JNIEXPORT void JNICALL Java_GrokHtml_FreeMachine( JNIEnv *env, jobject obj,
+ jlong Machine )
{
free_machine((struct machine *)JLONG_TO_POINTER( Machine ));
}
@@ -56,7 +58,8 @@ JNIEXPORT void JNICALL Java_GrokHtml_FreeMachine( JNIEnv *env, jobject obj, jlon
* Method: ParseExpression
* Signature: (Ljava/lang/String;)J
*/
-JNIEXPORT jlong JNICALL Java_GrokHtml_ParseExpression( JNIEnv *env, jobject obj, jstring Expression )
+JNIEXPORT jlong JNICALL Java_GrokHtml_ParseExpression( JNIEnv *env, jobject obj,
+ jstring Expression )
{
const char *expr, *residue;
struct machine *m;
@@ -79,19 +82,29 @@ JNIEXPORT jlong JNICALL Java_GrokHtml_ParseExpression( JNIEnv *env, jobject obj,
if( exc == NULL )
return 0;
- /* Grab the method ID for the constructor: ParseException( String s, int errorOffset ) */
- id = (*env)->GetMethodID( env, exc, "<init>", "(Ljava/lang/String;I)V" );
+ /*
+ * Grab the method ID for the constructor: ParseException(
+ * String s, int errorOffset )
+ */
+ id = (*env)->GetMethodID( env, exc, "<init>",
+ "(Ljava/lang/String;I)V" );
if( id == 0 )
return 0;
- /* Convert the error message from the parser to a java string object */
+ /*
+ * Convert the error message from the parser to a java string
+ * object
+ */
message = (*env)->NewStringUTF( env, m->error );
/* Create the exception object */
- /* NB: m->buf is a pointer into the origional string where the error occured, so we
- take the difference from expr to find errorOffset, this will not be accurate if
- the string contained any non-ASCII characters */
- ex = (*env)->NewObject( env, exc, id, message, (jint)( m->buf - expr ));
+ /* NB: m->buf is a pointer into the original string where the
+ * error occurred, so we take the difference from expr to find
+ * errorOffset; this will not be accurate if the string
+ * contained any non-ASCII characters
+ */
+ ex = (*env)->NewObject( env, exc, id, message,
+ (jint)( m->buf - expr ));
/* Throw it */
(*env)->Throw( env, ex );
@@ -103,9 +116,10 @@ JNIEXPORT jlong JNICALL Java_GrokHtml_ParseExpression( JNIEnv *env, jobject obj,
}
/*
- * If residue is an empty string here then the entire thing was read as an expression.
- * It might be considered an error if there is some stuff left over, but it also might
- * be usefull to ignore it to be a little more robust. For now it's ignored.
+ * If residue is an empty string here then the entire thing was read as
+ * an expression. It might be considered an error if there is some stuff
+ * left over, but it also might be useful to ignore it to be a little
+ * more robust. For now it's ignored.
*/
/* Clean up and return the pointer as a long */
@@ -118,7 +132,8 @@ JNIEXPORT jlong JNICALL Java_GrokHtml_ParseExpression( JNIEnv *env, jobject obj,
* Method: FreeDocument
* Signature: (J)V
*/
-JNIEXPORT void JNICALL Java_GrokHtml_FreeDocument( JNIEnv *env, jobject obj, jlong Document )
+JNIEXPORT void JNICALL Java_GrokHtml_FreeDocument( JNIEnv *env, jobject obj,
+ jlong Document )
{
xmlFreeDoc((htmlDocPtr)JLONG_TO_POINTER( Document ));
}
@@ -128,7 +143,8 @@ JNIEXPORT void JNICALL Java_GrokHtml_FreeDocument( JNIEnv *env, jobject obj, jlo
* Method: OpenDocumentFromURI
* Signature: (Ljava/lang/String;)J
*/
-JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromURI( JNIEnv *env, jobject obj, jstring URI )
+JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromURI( JNIEnv *env,
+ jobject obj, jstring URI )
{
const char *uri;
htmlDocPtr doc;
@@ -150,51 +166,52 @@ JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromURI( JNIEnv *env, jobject
}
return POINTER_TO_JLONG( doc );
}
-
-/*
- * Class: GrokHtml
- * Method: OpenDocumentFromBytes
- * Signature: ([B)J
- */
-JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromBytes( JNIEnv *env, jobject obj,
- jbyteArray Bytes )
-{
- jsize len;
- jbyte *bytes;
- htmlDocPtr doc;
-
- len = (*env)->GetArrayLength( env, Bytes );
- bytes = (*env)->GetByteArrayElements( env, Bytes, NULL );
- doc = htmlReadMemory((char *)bytes, len, NULL, NULL, PARSER_OPTIONS );
- (*env)->ReleaseByteArrayElements( env, Bytes, bytes, JNI_ABORT );
- if( doc == NULL )
- {
- jclass exc;
-
- /* Throw a runtime exception */
- exc = (*env)->FindClass( env, "java/lang/RuntimeException" );
- if( exc == NULL )
- return 0;
-
- (*env)->ThrowNew( env, exc, "Error opening HTML document" );
- return 0;
- }
- return POINTER_TO_JLONG( doc );
-
-}
+
+/*
+ * Class: GrokHtml
+ * Method: OpenDocumentFromBytes (parses HTML held in a Java byte array)
+ * Signature: ([B)J (result is the doc pointer as a jlong; 0 after a throw)
+ */
+JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromBytes( JNIEnv *env,
+ jobject obj, jbyteArray Bytes )
+{
+ jsize len;
+ jbyte *bytes;
+ htmlDocPtr doc;
+
+ len = (*env)->GetArrayLength( env, Bytes );
+ bytes = (*env)->GetByteArrayElements( env, Bytes, NULL ); /* NOTE(review): no NULL check */
+ doc = htmlReadMemory((char *)bytes, len, NULL, NULL, PARSER_OPTIONS );
+ (*env)->ReleaseByteArrayElements( env, Bytes, bytes, JNI_ABORT ); /* presumably no copy-back: read-only use */
+ if( doc == NULL )
+ {
+ jclass exc;
+
+ /* Throw a runtime exception */
+ exc = (*env)->FindClass( env, "java/lang/RuntimeException" );
+ if( exc == NULL )
+ return 0; /* class lookup failed, nothing to throw with */
+
+ (*env)->ThrowNew( env, exc, "Error opening HTML document" );
+ return 0;
+ }
+ return POINTER_TO_JLONG( doc ); /* hand the document back to Java as a long */
+
+}
/*
* Class: GrokHtml
* Method: OpenDocumentFromString
* Signature: (Ljava/lang/String;)J
*/
-JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromString( JNIEnv *env, jobject obj,
- jstring Document )
+JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromString( JNIEnv *env,
+ jobject obj, jstring Document )
{
const char *document;
htmlDocPtr doc;
- document = (char *)(*env)->GetStringUTFChars( env, Document, JNI_FALSE );
+ document = (char *)(*env)->GetStringUTFChars( env, Document,
+ JNI_FALSE );
doc = htmlReadMemory( document, strlen( document ), NULL,
NULL, PARSER_OPTIONS );
(*env)->ReleaseStringUTFChars( env, Document, document );
@@ -218,8 +235,8 @@ JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromString( JNIEnv *env, jobje
* Method: SearchDocument
* Signature: (JLjava/lang/String;J)Ljava/lang/String;
*/
-JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj, jlong Document,
- jstring Pattern, jlong Machine )
+JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env,
+ jobject obj, jlong Document, jstring Pattern, jlong Machine )
{
const char *pattern;
struct match *z;
@@ -243,6 +260,7 @@ JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj
(*env)->ThrowNew( env, exc, "No match found" );
/* Clean up and return */
+ free_matches( z );
return NULL;
}
@@ -250,10 +268,14 @@ JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj
pattern = (char *)(*env)->GetStringUTFChars( env, Pattern, JNI_FALSE );
sub = 0;
for( i = 0; pattern[i] != 0; i++ )
- if( pattern[i] == '\\' && isdigit( pattern[i + 1] ) && sub < pattern[i + 1] - '0' )
+ if( pattern[i] == '\\' && isdigit( pattern[i + 1] )
+ && sub < pattern[i + 1] - '0' )
sub = pattern[i + 1] - '0';
- /* Allocate an array for matches, only enough to hold what we're going to use */
+ /*
+ * Allocate an array for matches, only enough to hold what we're going
+ * to use
+ */
re = malloc( sub * sizeof( struct regex_match * ));
if( re == NULL )
{
@@ -268,6 +290,7 @@ JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj
/* Clean up and return */
(*env)->ReleaseStringUTFChars( env, Pattern, pattern );
+ free_matches( z );
return NULL;
}
@@ -279,15 +302,18 @@ JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj
jclass exc;
/* Throw an index out of bounds exception */
- exc = (*env)->FindClass( env, "java/lang/IndexOutOfBoundsException" );
+ exc = (*env)->FindClass( env,
+ "java/lang/IndexOutOfBoundsException" );
if( exc == NULL )
return NULL;
- (*env)->ThrowNew( env, exc, "Not enough matches to satisfy pattern" );
+ (*env)->ThrowNew( env, exc,
+ "Not enough matches to satisfy pattern" );
/* Clean up and return */
free( re );
(*env)->ReleaseStringUTFChars( env, Pattern, pattern );
+ free_matches( z );
return NULL;
}
@@ -297,7 +323,8 @@ JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj
if( pattern[i] == '\\' && isdigit( pattern[i + 1] ))
{
sub = pattern[++i] - '1';
- for( k = re[sub]->match.rm_so; k < re[sub]->match.rm_eo; k++ )
+ for( k = re[sub]->match.rm_so; k < re[sub]->match.rm_eo;
+ k++ )
len++;
}
else
@@ -319,6 +346,7 @@ JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj
/* Clean up and return */
free( re );
(*env)->ReleaseStringUTFChars( env, Pattern, pattern );
+ free_matches( z );
return NULL;
}
@@ -327,7 +355,8 @@ JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj
if( pattern[i] == '\\' && isdigit( pattern[i + 1] ))
{
sub = pattern[++i] - '1';
- for( k = re[sub]->match.rm_so; k < re[sub]->match.rm_eo; k++ )
+ for( k = re[sub]->match.rm_so; k < re[sub]->match.rm_eo;
+ k++ )
buf[j++] = re[sub]->str[k];
}
else
@@ -337,6 +366,7 @@ JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj
/* Clean up and return the string */
free( re );
(*env)->ReleaseStringUTFChars( env, Pattern, pattern );
+ free_matches( z );
retval = (*env)->NewStringUTF( env, buf );
free( buf );
return retval;
diff --git a/Makefile b/Makefile
index 589cbfa..624869f 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,8 @@ DOTSO = .so
# Source files
SOURCES = regex/regcomp.c regex/regerror.c regex/regexec.c regex/regfree.c \
- treexpr.c GrokHtml.c
+ treexpr.c
+JNISOURCES = $(SOURCES) GrokHtml.c
# libxml2 flags
XMLINCL = $(shell xml2-config --cflags)
@@ -30,11 +31,11 @@ JAVAINCL = -I/usr/include/sablevm
# for freebsd using jdk15 from portage:
# JAVAINCL = -I/usr/local/jdk1.5.0/include -I/usr/local/jdk1.5.0/include/freebsd
-INCL = -I./regex $(XMLINCL) $(JAVAINCL)
+INCL = -I./regex $(XMLINCL)
LIBS = $(XMLLIBS)
CFLAGS += -O2 -Wall
-all: $(LIB)GrokHtml$(DOTSO)
+all: $(LIB)GrokHtml$(DOTSO) $(LIB)treexpr$(DOTSO)
GrokHtml.h: GrokHtml.class
$(JAVAH) -classpath . -jni -o $@ GrokHtml
@@ -45,7 +46,10 @@ GrokHtml.class: GrokHtml.java
TestIt.class: TestIt.java
$(JAVAC) $<
-$(LIB)GrokHtml$(DOTSO): $(SOURCES) GrokHtml.h
+$(LIB)GrokHtml$(DOTSO): $(JNISOURCES) GrokHtml.h
+ $(CC) $(LIBS) $(CFLAGS) $(INCL) $(JAVAINCL) -shared -o $@ $(JNISOURCES)
+
+$(LIB)treexpr$(DOTSO): $(SOURCES)
$(CC) $(LIBS) $(CFLAGS) $(INCL) -shared -o $@ $(SOURCES)
test: $(LIB)GrokHtml$(DOTSO) TestIt.class
@@ -53,5 +57,6 @@ test: $(LIB)GrokHtml$(DOTSO) TestIt.class
clean:
$(RM) $(LIB)GrokHtml$(DOTSO)
+ $(RM) $(LIB)treexpr$(DOTSO)
$(RM) GrokHtml.h
$(RM) *.class
diff --git a/treexpr.c b/treexpr.c
index eb68f2e..3b6a836 100644
--- a/treexpr.c
+++ b/treexpr.c
@@ -1143,3 +1143,21 @@ struct match *document_process( struct machine *m, xmlDocPtr doc )
{
return node_recurse( m, doc->children->next, NULL );
}
+
+/* Free a match list from document_process: every match node and its re chain */
+void free_matches( struct match *z )
+{
+ struct match *nextz; /* z->next, read before z is freed */
+ struct regex_match *nextre; /* z->re->next, read before z->re is freed */
+
+ for( ; z != NULL; z = nextz )
+ {
+ nextz = z->next;
+ for( ; z->re != NULL; z->re = nextre ) /* z->re itself walks the chain */
+ {
+ nextre = z->re->next;
+ free( z->re ); /* NOTE(review): ->str is left alone; confirm owner */
+ }
+ free( z );
+ }
+}
diff --git a/treexpr.h b/treexpr.h
index c424a54..d1705cc 100644
--- a/treexpr.h
+++ b/treexpr.h
@@ -103,5 +103,6 @@ struct match
const char *parse_treexpr( const char *expr, struct machine **m );
void free_machine( struct machine *m );
struct match *document_process( struct machine *m, xmlDocPtr doc );
+void free_matches( struct match *z );
#endif