diff options
Diffstat (limited to 'xml-stream-reader.clj')
-rw-r--r-- | xml-stream-reader.clj | 282 |
1 files changed, 282 insertions, 0 deletions
diff --git a/xml-stream-reader.clj b/xml-stream-reader.clj new file mode 100644 index 00000000..51abb204 --- /dev/null +++ b/xml-stream-reader.clj @@ -0,0 +1,282 @@ +;;; xml-stream-reader.clj -- StAX XML reading for Clojure + +;; by Stuart Sierra +;; Version 1; April 21, 2008 + +;; THIS IS AN 'ALPHA' RELEASE AND THE API IS SUBJECT TO CHANGE. + + +;; Copyright (c) Stuart Sierra. All rights reserved. The use and +;; distribution terms for this software are covered by the Common +;; Public License 1.0 (http://opensource.org/licenses/cpl1.0.php) +;; which can be found in the file CPL.TXT at the root of the Clojure +;; distribution. By using this software in any fashion, you are +;; agreeing to be bound by the terms of this license. You must not +;; remove this notice, or any other, from this software. + + +;; This file defines a stream-based parser for XML, based on the StAX +;; API, included with the JDK since version 6. +;; +;; StAX documentation is available at +;; https://jaxp-sources.dev.java.net/nonav/docs/api/javax/xml/stream/XMLStreamReader.html +;; +;; The operation of the parser is similar to SAX event-based parsers, +;; but the API is simpler and (supposedly) faster than SAX. It also +;; provides XML namespace support. +;; +;; This is a lower-level interface than Clojure's xml.clj. It does +;; not build up a data structure representing the XML. It merely +;; calls a handler function, which you must provide, for each event in +;; the XML stream, and provides some convenient macros to query the +;; current event. +;; +;; See the "PUBLIC API", below, for more instructions. +;; +;; This parser will use the Apache XML Commons resolver, if it is +;; available, to avoid downloading DTDs from the web. See +;; http://xml.apache.org/commons/components/resolver/index.html +;; +;; If the XML Commons resolver jar is on your classpath, and your XML +;; catalogs are properly configured, this library will use local +;; versions of the XML DTDs. 
(clojure/in-ns 'xml-stream-reader)
(clojure/refer 'clojure)


;;; PRIVATE

(import '(javax.xml.stream XMLInputFactory XMLStreamConstants
                           XMLStreamReader))

;; Holds the XMLStreamReader for the parse in progress.  It is bound
;; by 'parse-xml-stream' and read by every accessor below.
(def #^XMLStreamReader *xml-input-stream*) ; dynamically bound during parsing

;; Use Apache's DTD Resolver if it is available.
;; Both vars are defined inside the try so that a failure to load the
;; Apache class falls through to the catch, which defines them as nil.
(try
 (def
  #^{:private true
     :doc "Instance of ResolvingXMLReader from the Apache XML Commons,
  or nil if that class cannot be found."}
  *dtd-resolver-impl*
  (.. Class (forName "org.apache.xml.resolver.tools.ResolvingXMLReader")
      (newInstance)))
 ;; XMLStreamReader uses a non-SAX interface, so we have to wrap
 ;; the entity resolver in a different interface.
 (def
  #^{:private true
     :doc "Proxy class to implement the StAX XMLResolver interface."}
  *dtd-resolver*
  (proxy [javax.xml.stream.XMLResolver] []
    ;; Uncomment (prn ... ) lines below to debug the resolver.
    (resolveEntity [publicID systemID baseURI namespace]
      ;;(prn "Resolving" publicID systemID baseURI namespace)
      (let [entity (. *dtd-resolver-impl* (resolveEntity publicID systemID))]
        (when entity
          (let [#^String systemid (. entity (getSystemId))]
            ;; Only entities resolved to a local "file:" system ID are
            ;; served; anything else returns nil.
            (when (and systemid (. systemid (startsWith "file:")))
              ;;(prn "Found local system ID" systemid)
              ;; (subs systemid 5) drops the leading "file:" prefix.
              (new java.io.FileInputStream (subs systemid 5)))))))))
 (catch Exception e ;; could not find ResolvingXMLReader
   (def *dtd-resolver-impl* nil)
   (def *dtd-resolver* nil)))

(def
 #^{:private true
    :doc "Global XMLInputFactory. Note: the API documentation does
  not specify if XMLInputFactory is thread-safe."}
 *xml-input-factory*
 (. XMLInputFactory (newInstance)))

;; Use the Apache DTD Resolver if it's available.
(when *dtd-resolver*
  (. *xml-input-factory* (setXMLResolver *dtd-resolver*)))

(def
 #^{:private true
    :doc "Pre-allocated Throwable used to quit XML stream parsing."}
 +stop-parsing+ (new Throwable "Finished parsing XML."))


;;; PUBLIC API

(defn xml-stream-reader
  "Creates an XMLStreamReader from the source (a Reader or
  InputStream).  You should call close() on the XMLStreamReader when
  you are finished with it.  Normally this is called automatically by
  'parse-xml-stream'."
  [input]
  (. *xml-input-factory* (createXMLStreamReader input)))


(defn parse-xml-stream
  "Parses 'source' (a Reader or InputStream) with XMLStreamReader.
  Calls 'handler-function' once for each event in the XML stream, with
  the event type (one of the XMLStreamConstants) as the argument.

  Use the accessor functions like (lname), (text), and (attr-value...)
  to get information about the current XML event.

  Use (stop-xml-parse) to quit parsing before the entire document has
  been read.

  This function ensures that the XMLStreamReader is closed when
  parsing is completed or stopped, but it does NOT close the provided
  input source."
  [handler-function source]
  (let [xml-stream (xml-stream-reader source)]
    (try
     (binding [*xml-input-stream* xml-stream]
       (loop [] ; while
         (when (. *xml-input-stream* (hasNext))
           (handler-function (. *xml-input-stream* (next)))
           (recur))))
     (catch Throwable t
       ;; The shared +stop-parsing+ instance is the early-exit signal
       ;; from (stop-xml-parse); swallow it and rethrow anything else.
       (when-not (identical? t +stop-parsing+)
         (throw t)))
     (finally (. xml-stream (close))))))


;;; XML EVENT TYPE CONSTANTS
;; Use these to dispatch based on event type in your handler function.

(def ATTRIBUTE (. XMLStreamConstants ATTRIBUTE))
(def CDATA (. XMLStreamConstants CDATA))
(def CHARACTERS (. XMLStreamConstants CHARACTERS))
(def COMMENT (. XMLStreamConstants COMMENT))
(def DTD (. XMLStreamConstants DTD))
(def END_DOCUMENT (. XMLStreamConstants END_DOCUMENT))
(def END_ELEMENT (. XMLStreamConstants END_ELEMENT))
(def ENTITY_DECLARATION (. XMLStreamConstants ENTITY_DECLARATION))
(def ENTITY_REFERENCE (. XMLStreamConstants ENTITY_REFERENCE))
(def NAMESPACE (. XMLStreamConstants NAMESPACE))
(def NOTATION_DECLARATION (. XMLStreamConstants NOTATION_DECLARATION))
(def PROCESSING_INSTRUCTION (. XMLStreamConstants PROCESSING_INSTRUCTION))
(def SPACE (. XMLStreamConstants SPACE))
(def START_DOCUMENT (. XMLStreamConstants START_DOCUMENT))
(def START_ELEMENT (. XMLStreamConstants START_ELEMENT))


;;; PUBLIC XML EVENT ACCESSORS
;; The accessors read *xml-input-stream*, so they are only meaningful
;; while 'parse-xml-stream' has it bound, i.e. from inside the handler
;; function.

(defn stop-xml-parse
  "Stop the XML stream parser and return from 'parse-xml-stream'."
  [] (throw +stop-parsing+))

(defmacro event-type
  "Returns the event type (a static in XMLStreamConstants) of the
  current XML event.  Normally not needed, because your handler
  function will receive the event type as its argument."
  []
  '(. *xml-input-stream* (getEventType)))

(defmacro pi-target
  "Returns the target of an XML processing instruction."
  []
  '(. *xml-input-stream* (getPITarget)))

(defmacro pi-data
  "Returns the data of an XML processing instruction."
  []
  '(. *xml-input-stream* (getPIData)))

(defmacro prefix
  "Returns the prefix of a namespace-qualified XML element as a
  keyword, or nil if the current event has no prefix."
  []
  ;; getPrefix is documented to return null when there is no prefix,
  ;; and may also return the empty string, so check for nil before
  ;; calling isEmpty to avoid a NullPointerException.
  '(let [p (. *xml-input-stream* (getPrefix))]
     (if (or (nil? p) (. p (isEmpty))) nil (keyword p))))

(defmacro lname
  "Returns the local name of an XML element."
  []
  '(. *xml-input-stream* (getLocalName)))

(defmacro xmlns
  "Returns the namespace URI of a namespace-qualified XML element."
  []
  '(. *xml-input-stream* (getNamespaceURI)))

(defmacro whitespace?
  "Returns true if the current XML character node contains only
  whitespace.  Implementation-dependent."
  []
  '(. *xml-input-stream* (isWhiteSpace)))

(defmacro text
  "Returns the text of the current XML character node."
  []
  '(. *xml-input-stream* (getText)))

(defmacro attr-count
  "Returns the number of attributes on the current XML element."
  []
  '(. *xml-input-stream* (getAttributeCount)))

(defn attr-value
  "Returns the value of the attribute named 'local-name' on the
  current XML element.  With one argument, nil is passed as the
  namespace URI; with two, 'xmlns' is the attribute's namespace URI."
  ([local-name]
     (. *xml-input-stream* (getAttributeValue nil local-name)))
  ([local-name xmlns]
     (. *xml-input-stream* (getAttributeValue xmlns local-name))))

(defstruct attribute :lname :xmlns :prefix :value)

(defn nth-attr
  "Returns a struct representing the nth attribute of the current XML
  element.  The struct has 4 parts, :lname (local name), :xmlns,
  :prefix, and :value."
  [n]
  (struct attribute
          (. *xml-input-stream* (getAttributeLocalName n))
          (. *xml-input-stream* (getAttributeNamespace n))
          (. *xml-input-stream* (getAttributePrefix n))
          (. *xml-input-stream* (getAttributeValue n))))

(defn attrs
  "Returns a seq of attribute structures for all attributes on the
  current XML element."
  []
  (doall ; have to get all attributes before next XML event
   (for [index (range (attr-count))]
     (nth-attr index))))



;; Valid methods for each state:
;; from https://jaxp-sources.dev.java.net/nonav/docs/api/javax/xml/stream/XMLStreamReader.html
;;
;; All States              getProperty(), hasNext(), require(), close(),
;;                         getNamespaceURI(), isStartElement(), isEndElement(),
;;                         isCharacters(), isWhiteSpace(), getNamespaceContext(),
;;                         getEventType(),getLocation(), hasText()
;;
;; START_ELEMENT           next(), getName(), getLocalName(), hasName(),
;;                         getPrefix(), getAttributeXXX(),
;;                         isAttributeSpecified(), getNamespaceXXX(),
;;                         getElementText(), nextTag()
;;
;; ATTRIBUTE               next(), nextTag() getAttributeXXX(),
;;                         isAttributeSpecified(),
;;
;; NAMESPACE               next(), nextTag() getNamespaceXXX()
;;
;; END_ELEMENT             next(), getName(), getLocalName(), hasName(),
;;                         getPrefix(), getNamespaceXXX(), nextTag()
;;
;; CHARACTERS              next(), getTextXXX(), nextTag()
;;
;; CDATA                   next(), getTextXXX(), nextTag()
;;
;; COMMENT                 next(), getTextXXX(), nextTag()
;;
;; SPACE                   next(), getTextXXX(), nextTag()
;;
;; START_DOCUMENT          next(), getEncoding(), next(), getPrefix(),
;;                         getVersion(), isStandalone(), standaloneSet(),
;;                         getCharacterEncodingScheme(), nextTag()
;;
;; END_DOCUMENT            close()
;;
;; PROCESSING_INSTRUCTION  next(), getPITarget(), getPIData(), nextTag()
;;
;; ENTITY_REFERENCE        next(), getLocalName(), getText(), nextTag()
;;
;; DTD                     next(), getText(), nextTag()