/* GrokHtml.c - Tree expression language JNI interface source file
   Copyright (C) 2005 Dell, Inc.
   Authors: David Barksdale

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include
#include
#include
#include
#include
#include <stdint.h>
#include "treexpr.h"
#include "GrokHtml.h"
/*
 * Option flags handed to htmlReadFile()/htmlReadMemory() by the
 * OpenDocumentFrom* functions in this file: parser errors and warnings are
 * always suppressed, and HTML_PARSE_RECOVER is added when the preprocessor
 * can see it.
 *
 * NOTE(review): if the parser header declares HTML_PARSE_RECOVER as an
 * enumerator rather than a macro, this test never succeeds and the recover
 * flag is silently left out -- confirm against the libxml2 headers in use.
 */
#if !defined( HTML_PARSE_RECOVER )
#define PARSER_OPTIONS ( HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING )
#else
#define PARSER_OPTIONS ( HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING )
#endif
/*
 * Conversions between native pointers and the Java long (jlong) used to
 * carry them across the JNI boundary.
 *
 * The casts go through intptr_t, the integer type defined to round-trip an
 * object pointer, so the same definitions are correct on 32-bit and 64-bit
 * targets.  The earlier hand-selected "32-bit machine" variant cast through
 * int and therefore truncated pointers when built for a 64-bit machine.
 */
#define JLONG_TO_POINTER( x ) ((void *)(intptr_t)( x ))
#define POINTER_TO_JLONG( x ) ((jlong)(intptr_t)( x ))
/*
 * Class:     GrokHtml
 * Method:    FreeMachine
 * Signature: (J)V
 *
 * Converts the jlong handle in Machine back to a struct machine pointer and
 * passes it to free_machine().
 */
JNIEXPORT void JNICALL Java_GrokHtml_FreeMachine( JNIEnv *env, jobject obj, jlong Machine )
{
    struct machine *handle = (struct machine *)JLONG_TO_POINTER( Machine );

    free_machine( handle );
}
/*
 * Class:     GrokHtml
 * Method:    ParseExpression
 * Signature: (Ljava/lang/String;)J
 *
 * Parses the tree expression in Expression with parse_treexpr() and returns
 * the resulting machine as a jlong handle.
 *
 * Returns 0 with a Java exception pending on failure:
 *   - java.text.ParseException when parse_treexpr() reports an error (it
 *     returns NULL); the message is m->error and the offset is the distance
 *     from the start of the expression to m->buf;
 *   - whatever exception the JVM raised if a JNI call itself failed.
 *
 * All failure paths release the machine and the string obtained from the JVM.
 */
JNIEXPORT jlong JNICALL Java_GrokHtml_ParseExpression( JNIEnv *env, jobject obj, jstring Expression )
{
    const char *expr, *residue;
    struct machine *m;
    jclass exc;
    jmethodID id;
    jstring message;
    jobject ex;

    /* The third argument is an optional jboolean *isCopy out-parameter; we do not need it */
    expr = (*env)->GetStringUTFChars( env, Expression, NULL );
    if( expr == NULL )
        return 0; /* the JVM could not provide the characters */

    residue = parse_treexpr( expr, &m );
    if( residue != NULL )
    {
        /*
         * If residue is an empty string here then the entire thing was read as an
         * expression. It might be considered an error if there is some stuff left over,
         * but it also might be useful to ignore it to be a little more robust. For now
         * it is ignored.
         */
        (*env)->ReleaseStringUTFChars( env, Expression, expr );
        return POINTER_TO_JLONG( m );
    }

    /*
     * There was an error parsing, throw a Java exception. This relies on
     * parse_treexpr() having stored a machine in m even though it failed,
     * since the error text and position are read from it.
     */

    /* First look up the java.text.ParseException class */
    exc = (*env)->FindClass( env, "java/text/ParseException" );
    if( exc == NULL )
        goto cleanup;

    /* Constructors are looked up under the special name "<init>":
       ParseException( String s, int errorOffset ) */
    id = (*env)->GetMethodID( env, exc, "<init>", "(Ljava/lang/String;I)V" );
    if( id == NULL )
        goto cleanup;

    /* Convert the error message from the parser to a Java string object */
    message = (*env)->NewStringUTF( env, m->error );
    if( message == NULL )
        goto cleanup;

    /* Create the exception object */
    /* NB: m->buf is a pointer into the original string where the error occurred, so we
       take the difference from expr to find errorOffset; this will not be accurate if
       the string contained any non-ASCII characters */
    ex = (*env)->NewObject( env, exc, id, message, (jint)( m->buf - expr ));
    if( ex != NULL )
        (*env)->Throw( env, ex );

cleanup:
    /* Free the left-over memory used by the parser and give the string back */
    free_machine( m );
    (*env)->ReleaseStringUTFChars( env, Expression, expr );
    return 0;
}
/*
 * Class:     GrokHtml
 * Method:    FreeDocument
 * Signature: (J)V
 *
 * Converts the jlong handle in Document back to an htmlDocPtr and passes it
 * to xmlFreeDoc().
 */
JNIEXPORT void JNICALL Java_GrokHtml_FreeDocument( JNIEnv *env, jobject obj, jlong Document )
{
    htmlDocPtr handle = (htmlDocPtr)JLONG_TO_POINTER( Document );

    xmlFreeDoc( handle );
}
/*
 * Class:     GrokHtml
 * Method:    OpenDocumentFromURI
 * Signature: (Ljava/lang/String;)J
 *
 * Reads the HTML document named by URI with htmlReadFile() using
 * PARSER_OPTIONS and returns it as a jlong handle.
 *
 * Returns 0 with a Java exception pending on failure: a
 * java.lang.RuntimeException ("Error opening HTML document") when the parser
 * returns no document, or whatever the JVM raised if a JNI call failed.
 */
JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromURI( JNIEnv *env, jobject obj, jstring URI )
{
    const char *uri;
    htmlDocPtr doc;
    jclass exc;

    /* The third argument is an optional jboolean *isCopy out-parameter; we do not need it */
    uri = (*env)->GetStringUTFChars( env, URI, NULL );
    if( uri == NULL )
        return 0; /* the JVM could not provide the characters */

    doc = htmlReadFile( uri, NULL, PARSER_OPTIONS );
    (*env)->ReleaseStringUTFChars( env, URI, uri );
    if( doc != NULL )
        return POINTER_TO_JLONG( doc );

    /* Throw a runtime exception; if the class lookup fails its own exception is pending */
    exc = (*env)->FindClass( env, "java/lang/RuntimeException" );
    if( exc != NULL )
        (*env)->ThrowNew( env, exc, "Error opening HTML document" );
    return 0;
}
/*
 * Class:     GrokHtml
 * Method:    OpenDocumentFromBytes
 * Signature: ([B)J
 *
 * Parses the contents of the byte array Bytes as an HTML document with
 * htmlReadMemory() using PARSER_OPTIONS and returns it as a jlong handle.
 *
 * Returns 0 with a Java exception pending on failure: a
 * java.lang.RuntimeException ("Error opening HTML document") when the parser
 * returns no document, or whatever the JVM raised if a JNI call failed.
 */
JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromBytes( JNIEnv *env, jobject obj,
    jbyteArray Bytes )
{
    jsize len;
    jbyte *bytes;
    htmlDocPtr doc;
    jclass exc;

    len = (*env)->GetArrayLength( env, Bytes );
    bytes = (*env)->GetByteArrayElements( env, Bytes, NULL );
    if( bytes == NULL )
        return 0; /* the JVM could not provide the array contents */

    doc = htmlReadMemory((const char *)bytes, len, NULL, NULL, PARSER_OPTIONS );
    /* JNI_ABORT: the buffer was only read, so nothing needs copying back */
    (*env)->ReleaseByteArrayElements( env, Bytes, bytes, JNI_ABORT );
    if( doc != NULL )
        return POINTER_TO_JLONG( doc );

    /* Throw a runtime exception; if the class lookup fails its own exception is pending */
    exc = (*env)->FindClass( env, "java/lang/RuntimeException" );
    if( exc != NULL )
        (*env)->ThrowNew( env, exc, "Error opening HTML document" );
    return 0;
}
/*
 * Class:     GrokHtml
 * Method:    OpenDocumentFromString
 * Signature: (Ljava/lang/String;)J
 *
 * Parses the text of the Java string Document as an HTML document with
 * htmlReadMemory() using PARSER_OPTIONS and returns it as a jlong handle.
 *
 * Returns 0 with a Java exception pending on failure: a
 * java.lang.RuntimeException ("Error opening HTML document") when the parser
 * returns no document, or whatever the JVM raised if a JNI call failed.
 *
 * NOTE(review): the bytes given to the parser come from GetStringUTFChars()
 * but no encoding hint is passed (the encoding argument is NULL) -- confirm
 * that non-ASCII documents are decoded as intended.
 */
JNIEXPORT jlong JNICALL Java_GrokHtml_OpenDocumentFromString( JNIEnv *env, jobject obj,
    jstring Document )
{
    const char *document;
    htmlDocPtr doc;
    jclass exc;

    /* The third argument is an optional jboolean *isCopy out-parameter; we do not need it */
    document = (*env)->GetStringUTFChars( env, Document, NULL );
    if( document == NULL )
        return 0; /* the JVM could not provide the characters */

    /* htmlReadMemory() takes the buffer size as an int */
    doc = htmlReadMemory( document, (int)strlen( document ), NULL,
        NULL, PARSER_OPTIONS );
    (*env)->ReleaseStringUTFChars( env, Document, document );
    if( doc != NULL )
        return POINTER_TO_JLONG( doc );

    /* Throw a runtime exception; if the class lookup fails its own exception is pending */
    exc = (*env)->FindClass( env, "java/lang/RuntimeException" );
    if( exc != NULL )
        (*env)->ThrowNew( env, exc, "Error opening HTML document" );
    return 0;
}
/*
 * Helper for Java_GrokHtml_SearchDocument: when p points at a back-reference
 * ("\1" through "\9") return the zero-based index of the match it names,
 * otherwise return -1.
 *
 * "\0" is deliberately not a back-reference: references are 1-based, so it
 * names no match and is copied to the output as ordinary text. The digit is
 * range-checked directly so that bytes with the high bit set are never passed
 * to a <ctype.h> function.
 */
static int grokhtml_backref_index( const char *p )
{
    if( p[0] == '\\' && p[1] >= '1' && p[1] <= '9' )
        return p[1] - '1';
    return -1;
}

/*
 * Class:     GrokHtml
 * Method:    SearchDocument
 * Signature: (JLjava/lang/String;J)Ljava/lang/String;
 *
 * Runs the machine behind the handle Machine over the document behind the
 * handle Document with document_process(), then builds the result string by
 * copying Pattern and replacing every "\N" (N = 1..9) with the text of the
 * N-th entry of the match's regex list (z->re, followed through ->next).
 *
 * Returns the expanded string, or NULL with a Java exception pending:
 *   - java.lang.RuntimeException ("No match found") when document_process()
 *     returns NULL;
 *   - java.lang.IndexOutOfBoundsException when Pattern refers to more regex
 *     matches than the list holds;
 *   - java.lang.OutOfMemoryError when a native allocation fails;
 *   - whatever the JVM raised if a JNI call itself failed.
 *
 * Every exit goes through the single cleanup path at the end, so the pattern
 * characters and both native buffers are released on all of them.
 *
 * NOTE(review): the struct match returned by document_process() is not
 * released here; whether the caller owns it is not visible in this file.
 */
JNIEXPORT jstring JNICALL Java_GrokHtml_SearchDocument( JNIEnv *env, jobject obj, jlong Document,
    jstring Pattern, jlong Machine )
{
    const char *pattern = NULL;
    const char *exc_name = NULL, *exc_msg = NULL;
    struct match *z;
    struct regex_match **re = NULL, *cur;
    char *buf = NULL;
    jstring retval = NULL;
    int i, j, k, idx, sub, len, so, eo;

    /* Search the document */
    z = document_process((struct machine *)JLONG_TO_POINTER( Machine ),
        (htmlDocPtr)JLONG_TO_POINTER( Document ));
    if( z == NULL )
    {
        exc_name = "java/lang/RuntimeException";
        exc_msg = "No match found";
        goto done;
    }

    /* The third argument is an optional jboolean *isCopy out-parameter; we do not need it */
    pattern = (*env)->GetStringUTFChars( env, Pattern, NULL );
    if( pattern == NULL )
        goto done; /* the JVM could not provide the characters */

    /* Count subexpressions: sub becomes the highest N used by any "\N" */
    sub = 0;
    for( i = 0; pattern[i] != 0; i++ )
    {
        idx = grokhtml_backref_index( pattern + i );
        if( idx >= sub )
            sub = idx + 1;
    }

    /* Collect the first sub regex matches into an array for direct indexing.
       Skipped when the pattern has no back-references, so that a zero-sized
       allocation is never mistaken for an out-of-memory condition. */
    if( sub > 0 )
    {
        re = malloc( (size_t)sub * sizeof *re );
        if( re == NULL )
        {
            exc_name = "java/lang/OutOfMemoryError";
            exc_msg = "Unable to allocate memory";
            goto done;
        }
        for( i = 0, cur = z->re; cur != NULL && i < sub; cur = cur->next, i++ )
            re[i] = cur;
        if( i < sub )
        {
            exc_name = "java/lang/IndexOutOfBoundsException";
            exc_msg = "Not enough matches to satisfy pattern";
            goto done;
        }
    }

    /* Calculate size of output buffer, starting at 1 for the terminator */
    len = 1;
    for( i = 0; pattern[i] != 0; i++ )
    {
        idx = grokhtml_backref_index( pattern + i );
        if( idx < 0 )
        {
            len++;
            continue;
        }
        i++; /* step over the digit */
        so = re[idx]->match.rm_so;
        eo = re[idx]->match.rm_eo;
        if( eo > so )
            len += eo - so;
    }

    /* Allocate output buffer */
    buf = malloc( (size_t)len );
    if( buf == NULL )
    {
        exc_name = "java/lang/OutOfMemoryError";
        exc_msg = "Unable to allocate memory";
        goto done;
    }

    /* Fill in output buffer; this walk mirrors the sizing pass above exactly */
    for( i = j = 0; pattern[i] != 0; i++ )
    {
        idx = grokhtml_backref_index( pattern + i );
        if( idx < 0 )
        {
            buf[j++] = pattern[i];
            continue;
        }
        i++; /* step over the digit */
        so = re[idx]->match.rm_so;
        eo = re[idx]->match.rm_eo;
        for( k = so; k < eo; k++ )
            buf[j++] = re[idx]->str[k];
    }
    buf[j] = 0;

    retval = (*env)->NewStringUTF( env, buf );

done:
    /* Clean up; free( NULL ) is a no-op so unallocated buffers need no guard */
    free( buf );
    free( re );
    if( pattern != NULL )
        (*env)->ReleaseStringUTFChars( env, Pattern, pattern );
    if( exc_name != NULL )
    {
        /* If the class lookup fails, the exception it raised stays pending instead */
        jclass exc = (*env)->FindClass( env, exc_name );
        if( exc != NULL )
            (*env)->ThrowNew( env, exc, exc_msg );
    }
    return retval;
}