aboutsummaryrefslogtreecommitdiff
path: root/GrokHtml.c
diff options
context:
space:
mode:
Diffstat (limited to 'GrokHtml.c')
-rwxr-xr-xGrokHtml.c343
1 files changed, 343 insertions, 0 deletions
diff --git a/GrokHtml.c b/GrokHtml.c
new file mode 100755
index 0000000..2314336
--- /dev/null
+++ b/GrokHtml.c
@@ -0,0 +1,343 @@
+/* GrokHtml.c - Tree expression language JNI interface source file
+ + Copyright (C) 2005 Dell, Inc.
+ + Authors: David Barksdale <amatus@ocgnet.org>
+ +
+ + This library is free software; you can redistribute it and/or
+ + modify it under the terms of the GNU Lesser General Public
+ + License as published by the Free Software Foundation; either
+ + version 2.1 of the License, or (at your option) any later version.
+ +
+ + This library is distributed in the hope that it will be useful,
+ + but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ + Lesser General Public License for more details.
+ +
+ + You should have received a copy of the GNU Lesser General Public
+ + License along with this library; if not, write to the Free Software
+ + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <ctype.h>
+#include <string.h>
+#include <jni.h>
+#include <libxml/HTMLtree.h>
+#include <libxml/HTMLparser.h>
+#include "treexpr.h"
+#include "GrokHtml.h"
+
+#ifdef HTML_PARSE_RECOVER
+#define PARSER_OPTIONS ( HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING )
+#else
+#define PARSER_OPTIONS ( HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING )
+#endif
+
+#if 0
+/* 64-bit machine */
+#define JLONG_TO_POINTER( x ) ((void *)( x ))
+#define POINTER_TO_JLONG( x ) ((jlong)( x ))
+#else
+/* 32-bit machine */
+#define JLONG_TO_POINTER( x ) ((void *)(int)( x ))
+#define POINTER_TO_JLONG( x ) ((jlong)(int)( x ))
+#endif
+
+/*
+ * Class: GrokHtml
+ * Method: FreeMachine
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_GrokHtml_FreeMachine( JNIEnv *env, jobject obj, jlong Machine )
+{
+ free_machine((struct machine *)JLONG_TO_POINTER( Machine ));
+}
+
+/*
+ * Class: GrokHtml
+ * Method: ParseExpression
+ * Signature: (Ljava/lang/String;)J
+ */
+JNIEXPORT jlong JNICALL Java_GrokHtml_ParseExpression( JNIEnv *env, jobject obj, jstring Expression )
+{
+ const char *expr, *residue;
+ struct machine *m;
+
+ expr = (char *)(*env)->GetStringUTFChars( env, Expression, JNI_FALSE );
+ residue = parse_treexpr( expr, &m );
+ if( residue == NULL )
+ {
+ jclass exc;
+ jmethodID id;
+ jstring message;
+ jobject ex;
+
+ /*
+ * There was an error parsing, throw Java exception
+ */
+
+ /* First lookup the java.text.ParseException class */
+ exc = (*env)->FindClass( env, "java/text/ParseException" );
+ if( exc == NULL )
+ return 0;
+
+ /* Grab the method ID for the constructor: ParseException( String s, int errorOffset ) */
+ id = (*env)->GetMethodID( env, exc, "<init>", "(Ljava/lang/String;I)V" );
+ if( id == 0 )
+ return 0;
+
+ /* Convert the error message from the parser to a java string object */
+ message = (*env)->NewStringUTF( env, m->error );
+
+ /* Create the exception object */
+ /* NB: m->buf is a pointer into the origional string where the error occured, so we
+ take the difference from expr to find errorOffset, this will not be accurate if
+ the string contained any non-ASCII characters */
+ ex = (*env)->NewObject( env, exc, id, message, (jint)( m->buf - expr ));
+
+ /* Throw it */
+ (*env)->Throw( env, ex );
+
+ /* Free the left-over memory used by the parser */
+ free_machine( m );
+ (*env)->ReleaseStringUTFChars( env, Expression, expr );
+ return 0;
+ }
+
+ /*
+ * If residue is an empty string here then the entire thing was read as an expression.
+ * It might be considered an error if there is some stuff left over, but it also might
+ * be usefull to ignore it to be a little more robust. For now it's ignored.
+ */
+
+ /* Clean up and return the pointer as a long */
+ (*env)->ReleaseStringUTFChars( env, Expression, expr );
+ return POINTER_TO_JLONG( m );
+}
+
+/*
+ * Class: GrokHtml
+ * Method: FreeDocument
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_GrokHtml_FreeDocument( JNIEnv *env, jobject obj, jlong Document )
+{
+ xmlFreeDoc((htmlDocPtr)JLONG_TO_POINTER( Document ));
+}
+
+/*
+ * Class: GrokHtml
+ * Method: OpenDocumentFromURI
+ * Signature: (Ljava/lang/String;)J
+ */
+JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromURI( JNIEnv *env, jobject obj, jstring URI )
+{
+ const char *uri;
+ htmlDocPtr doc;
+
+ uri = (char *)(*env)->GetStringUTFChars( env, URI, JNI_FALSE );
+ doc = htmlReadFile( uri, NULL, PARSER_OPTIONS );
+ (*env)->ReleaseStringUTFChars( env, URI, uri );
+ if( doc == NULL )
+ {
+ jclass exc;
+
+ /* Throw a runtime exception */
+ exc = (*env)->FindClass( env, "java/lang/RuntimeException" );
+ if( exc == NULL )
+ return 0;
+
+ (*env)->ThrowNew( env, exc, "Error opening HTML document" );
+ return 0;
+ }
+ return POINTER_TO_JLONG( doc );
+}
+
+/*
+ * Class: GrokHtml
+ * Method: OpenDocumentFromBytes
+ * Signature: ([B)J
+ */
+JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromBytes( JNIEnv *env, jobject obj,
+ jbyteArray Bytes )
+{
+ jsize len;
+ jbyte *bytes;
+ htmlDocPtr doc;
+
+ len = (*env)->GetArrayLength( env, Bytes );
+ bytes = (*env)->GetByteArrayElements( env, Bytes, NULL );
+ doc = htmlReadMemory((char *)bytes, len, NULL, NULL, PARSER_OPTIONS );
+ (*env)->ReleaseByteArrayElements( env, Bytes, bytes, JNI_ABORT );
+ if( doc == NULL )
+ {
+ jclass exc;
+
+ /* Throw a runtime exception */
+ exc = (*env)->FindClass( env, "java/lang/RuntimeException" );
+ if( exc == NULL )
+ return 0;
+
+ (*env)->ThrowNew( env, exc, "Error opening HTML document" );
+ return 0;
+ }
+ return POINTER_TO_JLONG( doc );
+
+}
+
+/*
+ * Class: GrokHtml
+ * Method: OpenDocumentFromString
+ * Signature: (Ljava/lang/String;)J
+ */
+JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromString( JNIEnv *env, jobject obj,
+ jstring Document )
+{
+ const char *document;
+ htmlDocPtr doc;
+
+ document = (char *)(*env)->GetStringUTFChars( env, Document, JNI_FALSE );
+ doc = htmlReadMemory( document, strlen( document ), NULL,
+ NULL, PARSER_OPTIONS );
+ (*env)->ReleaseStringUTFChars( env, Document, document );
+ if( doc == NULL )
+ {
+ jclass exc;
+
+ /* Throw a runtime exception */
+ exc = (*env)->FindClass( env, "java/lang/RuntimeException" );
+ if( exc == NULL )
+ return 0;
+
+ (*env)->ThrowNew( env, exc, "Error opening HTML document" );
+ return 0;
+ }
+ return POINTER_TO_JLONG( doc );
+}
+
+/*
+ * Class: GrokHtml
+ * Method: SearchDocument
+ * Signature: (JLjava/lang/String;J)Ljava/lang/String;
+ */
+JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj, jlong Document,
+ jstring Pattern, jlong Machine )
+{
+ const char *pattern;
+ struct match *z;
+ struct regex_match **re, *cur;
+ char *buf;
+ jstring retval;
+ int i, j, k, sub, len;
+
+ /* Search the document */
+ z = document_process((struct machine *)JLONG_TO_POINTER( Machine ),
+ (htmlDocPtr)JLONG_TO_POINTER( Document ));
+ if( z == NULL )
+ {
+ jclass exc;
+
+ /* Throw a runtime exception */
+ exc = (*env)->FindClass( env, "java/lang/RuntimeException" );
+ if( exc == NULL )
+ return NULL;
+
+ (*env)->ThrowNew( env, exc, "No match found" );
+
+ /* Clean up and return */
+ return NULL;
+ }
+
+ /* Count subexpressions */
+ pattern = (char *)(*env)->GetStringUTFChars( env, Pattern, JNI_FALSE );
+ sub = 0;
+ for( i = 0; pattern[i] != 0; i++ )
+ if( pattern[i] == '\\' && isdigit( pattern[i + 1] ) && sub < pattern[i + 1] - '0' )
+ sub = pattern[i + 1] - '0';
+
+ /* Allocate an array for matches, only enough to hold what we're going to use */
+ re = malloc( sub * sizeof( struct regex_match * ));
+ if( re == NULL )
+ {
+ jclass exc;
+
+ /* Throw an out of memory error */
+ exc = (*env)->FindClass( env, "java/lang/OutOfMemoryError" );
+ if( exc == NULL )
+ return NULL;
+
+ (*env)->ThrowNew( env, exc, "Unable to allocate memory" );
+
+ /* Clean up and return */
+ (*env)->ReleaseStringUTFChars( env, Pattern, pattern );
+ return NULL;
+ }
+
+ /* Fill in array */
+ for( i = 0, cur = z->re; cur != NULL && i < sub; cur = cur->next, i++ )
+ re[i] = cur;
+ if( i < sub )
+ {
+ jclass exc;
+
+ /* Throw an index out of bounds exception */
+ exc = (*env)->FindClass( env, "java/lang/IndexOutOfBoundsException" );
+ if( exc == NULL )
+ return NULL;
+
+ (*env)->ThrowNew( env, exc, "Not enough matches to satisfy pattern" );
+
+ /* Clean up and return */
+ free( re );
+ (*env)->ReleaseStringUTFChars( env, Pattern, pattern );
+ return NULL;
+ }
+
+ /* Calculate size of output buffer */
+ len = 1;
+ for( i = j = 0; pattern[i] != 0; i++ )
+ if( pattern[i] == '\\' && isdigit( pattern[i + 1] ))
+ {
+ sub = pattern[++i] - '1';
+ for( k = re[sub]->match.rm_so; k < re[sub]->match.rm_eo; k++ )
+ len++;
+ }
+ else
+ len++;
+
+ /* Allocate output buffer */
+ buf = malloc( len );
+ if( buf == NULL )
+ {
+ jclass exc;
+
+ /* Throw an out of memory error */
+ exc = (*env)->FindClass( env, "java/lang/OutOfMemoryError" );
+ if( exc == NULL )
+ return NULL;
+
+ (*env)->ThrowNew( env, exc, "Unable to allocate memory" );
+
+ /* Clean up and return */
+ free( re );
+ (*env)->ReleaseStringUTFChars( env, Pattern, pattern );
+ return NULL;
+ }
+
+ /* Fill in output buffer */
+ for( i = j = 0; pattern[i] != 0; i++ )
+ if( pattern[i] == '\\' && isdigit( pattern[i + 1] ))
+ {
+ sub = pattern[++i] - '1';
+ for( k = re[sub]->match.rm_so; k < re[sub]->match.rm_eo; k++ )
+ buf[j++] = re[sub]->str[k];
+ }
+ else
+ buf[j++] = pattern[i];
+ buf[j++] = 0;
+
+ /* Clean up and return the string */
+ free( re );
+ (*env)->ReleaseStringUTFChars( env, Pattern, pattern );
+ retval = (*env)->NewStringUTF( env, buf );
+ free( buf );
+ return retval;
+}