diff options
author | Stuart Sierra <mail@stuartsierra.com> | 2008-04-21 14:04:30 +0000 |
---|---|---|
committer | Stuart Sierra <mail@stuartsierra.com> | 2008-04-21 14:04:30 +0000 |
commit | 72d5e41479b462b5ce0115abbfbc50787780dafc (patch) | |
tree | 90e0bb38fe1015e370dbec55d11bcfcbfdd072d1 | |
parent | da17800b12d2c769adc24cc11f027257f7bd4345 (diff) |
Added ALPHA xml-stream-reader.clj and xml-stream-writer.clj
-rw-r--r-- | xml-stream-reader.clj | 282 | ||||
-rw-r--r-- | xml-stream-writer.clj | 225 |
2 files changed, 507 insertions, 0 deletions
diff --git a/xml-stream-reader.clj b/xml-stream-reader.clj new file mode 100644 index 00000000..51abb204 --- /dev/null +++ b/xml-stream-reader.clj @@ -0,0 +1,282 @@ +;;; xml-stream-reader.clj -- StAX XML reading for Clojure + +;; by Stuart Sierra +;; Version 1; April 21, 2008 + +;; THIS IS AN 'ALPHA' RELEASE AND THE API IS SUBJECT TO CHANGE. + + +;; Copyright (c) Stuart Sierra. All rights reserved. The use and +;; distribution terms for this software are covered by the Common +;; Public License 1.0 (http://opensource.org/licenses/cpl1.0.php) +;; which can be found in the file CPL.TXT at the root of the Clojure +;; distribution. By using this software in any fashion, you are +;; agreeing to be bound by the terms of this license. You must not +;; remove this notice, or any other, from this software. + + +;; This file defines a stream-based parser for XML, based on the StAX +;; API, included with the JDK since version 6. +;; +;; StAX documentation is available at +;; https://jaxp-sources.dev.java.net/nonav/docs/api/javax/xml/stream/XMLStreamReader.html +;; +;; The operation of the parser is similar to SAX event-based parsers, +;; but the API is simpler and (supposedly) faster than SAX. It also +;; provides XML namespace support. +;; +;; This is a lower-level interface than Clojure's xml.clj. It does +;; not build up a data structure representing the XML. It merely +;; calls a handler function, which you must provide, for each event in +;; the XML stream, and provides some convenient macros to query the +;; current event. +;; +;; See the "PUBLIC API", below, for more instructions. +;; +;; This parser will use the Apache XML Commons resolver, if it is +;; available, to avoid downloading DTDs from the web. See +;; http://xml.apache.org/commons/components/resolver/index.html +;; +;; If the XML Commons resolver jar is on your classpath, and your XML +;; catalogs are properly configured, this library will use local +;; versions of the XML DTDs. + + +(clojure/in-ns 'xml-stream-reader) +(clojure/refer 'clojure) + + +;;; PRIVATE + +(import '(javax.xml.stream XMLInputFactory XMLStreamConstants + XMLStreamReader)) + +(def #^XMLStreamReader *xml-input-stream*) ; dynamically bound during parsing + +;; Use Apache's DTD Resolver if it is available. +(try + (def + #^{:private true + :doc "Instance of ResolvingXMLReader from the Apach XML Commons, + or nil if that class cannot be found."} + *dtd-resolver-impl* + (.. Class (forName "org.apache.xml.resolver.tools.ResolvingXMLReader") + (newInstance))) + ;; XMLStreamReader uses a non-SAX interface, so we have to wrap + ;; the entity resolver in a different interface. + (def + #^{:private true + :doc "Proxy class to implement the StAX XMLResolver interface."} + *dtd-resolver* + (proxy [javax.xml.stream.XMLResolver] [] + ;; Uncomment (prn ... ) lines below to debug the resolver. + (resolveEntity [publicID systemID baseURI namespace] + ;;(prn "Resolving" publicID systemID baseURI namespace) + (let [entity (. *dtd-resolver-impl* (resolveEntity publicID systemID))] + (when entity + (let [#^String systemid (. entity (getSystemId))] + (when (and systemid (. systemid (startsWith "file:"))) + ;;(prn "Found local system ID" systemid) + (new java.io.FileInputStream (subs systemid 5))))))))) + (catch Exception e ;; could not find ResolvingXMLReader + (def *dtd-resolver-impl* nil) + (def *dtd-resolver* nil))) + +(def + #^{:private true + :doc "Global XMLInputFactory. Note: the API documentation does + not specify if XMLInputFactory is thread-safe."} + *xml-input-factory* + (. XMLInputFactory (newInstance))) + +;; Use the Apache DTD Resolver if it's available. +(when *dtd-resolver* (. *xml-input-factory* (setXMLResolver *dtd-resolver*))) + +(def + #^{:private true + :doc "Pre-allocated Throwable used to quit XML stream parsing."} + +stop-parsing+ (new Throwable "Finished parsing XML.")) + + +;;; PUBLIC API + +(defn xml-stream-reader + "Creates an XMLStreamReader from the source (a Reader or + InputStream). You should call close() on the XMLStreamReader when + you are finished with it. Normally this is called automatically by + 'parse-xml-stream'." + [input] + (. *xml-input-factory* (createXMLStreamReader input))) + + +(defn parse-xml-stream + "Parses 'source' (a Reader or InputStream) with XMLStreamReader. + Calls 'handler-function' once for each event in the XML stream, with + the event type (one of the XMLStreamConstants) as the argument. + + Use the accessor functions like (lname), (text), and (attr-value...) + to get information about the current XML event. + + Use (stop-xml-parse) to quit parsing before the entire document has + been read. + + This function ensures that the XMLStreamReader is closed when + parsing is completed or stopped, but it does NOT close the provided + input source." + [handler-function source] + (let [xml-stream (xml-stream-reader source)] + (try + (binding [*xml-input-stream* xml-stream] + (loop [] ; while + (when (. *xml-input-stream* (hasNext)) + (handler-function (. *xml-input-stream* (next))) + (recur)))) + (catch Throwable t + (when-not (identical? t +stop-parsing+) + (throw t))) + (finally (. xml-stream (close)))))) + + +;;; XML EVENT TYPE CONSTANTS +;; Use these to dispatch based on event type in your handler function. + +(def ATTRIBUTE (. XMLStreamConstants ATTRIBUTE)) +(def CDATA (. XMLStreamConstants CDATA)) +(def CHARACTERS (. XMLStreamConstants CHARACTERS)) +(def COMMENT (. XMLStreamConstants COMMENT)) +(def DTD (. XMLStreamConstants DTD)) +(def END_DOCUMENT (. XMLStreamConstants END_DOCUMENT)) +(def END_ELEMENT (. XMLStreamConstants END_ELEMENT)) +(def ENTITY_DECLARATION (. XMLStreamConstants ENTITY_DECLARATION)) +(def ENTITY_REFERENCE (. XMLStreamConstants ENTITY_REFERENCE)) +(def NAMESPACE (. XMLStreamConstants NAMESPACE)) +(def NOTATION_DECLARATION (. XMLStreamConstants NOTATION_DECLARATION)) +(def PROCESSING_INSTRUCTION (. XMLStreamConstants PROCESSING_INSTRUCTION)) +(def SPACE (. XMLStreamConstants SPACE)) +(def START_DOCUMENT (. XMLStreamConstants START_DOCUMENT)) +(def START_ELEMENT (. XMLStreamConstants START_ELEMENT)) + + +;;; PUBLIC XML EVENT ACCESSORS + +(defn stop-xml-parse + "Stop the XML stream parser and return from 'parse-xml-stream'." + [] (throw +stop-parsing+)) + +(defmacro event-type + "Returns the event type (a static in XMLStreamConstants) of the + current XML event. Normally not needed, because your handler + function will receive the event type as its argument." + [] + '(. *xml-input-stream* (getEventType))) + +(defmacro pi-target + "Returns the target of an XML processing instruction." + [] + '(. *xml-input-stream* (getPITarget))) + +(defmacro pi-data + "Returns the data of an XML processing instruction." + [] + '(. *xml-input-stream* (getPIData))) + +(defmacro prefix + "Returns the prefix of a namespace-qualified XML element." + [] + '(let [p (. *xml-input-stream* (getPrefix))] + (if (. p (isEmpty)) nil (keyword p)))) + +(defmacro lname + "Returns the local name of an XML element." + [] + '(. *xml-input-stream* (getLocalName))) + +(defmacro xmlns + "Returns the namespace URI of a namespace-qualified XML element." + [] + '(. *xml-input-stream* (getNamespaceURI))) + +(defmacro whitespace? + "Returns true if the current XML character node contains only + whitespace. Implementation-dependent." + [] + '(. *xml-input-stream* (isWhiteSpace))) + +(defmacro text + "Returns the text of the current XML character node." + [] + '(. *xml-input-stream* (getText))) + +(defmacro attr-count + "Returns the number of attributes on the current XML element." + [] + '(. *xml-input-stream* (getAttributeCount))) + +(defn attr-value + "Returns the value of the attribute on the current XML element." + ([local-name] (. *xml-input-stream* (getAttributeValue nil local-name))) + ([local-name xmlns] (. *xml-input-stream* (getAttributeValue xmlns local-name)))) + +(defstruct attribute :lname :xmlns :prefix :value) + +(defn nth-attr + "Returns a struct representing the nth attribute of the current XML + element. The struct has 4 parts, :lname (local name), :xmlns, + :prefix, and :value." + [n] + (struct attribute + (. *xml-input-stream* (getAttributeLocalName n)) + (. *xml-input-stream* (getAttributeNamespace n)) + (. *xml-input-stream* (getAttributePrefix n)) + (. *xml-input-stream* (getAttributeValue n)))) + +(defn attrs + "Returns a seq of attribute structures for all attributes on the + current XML element." + [] + (doall ; have to get all attributes before next XML event + (for [index (range (attr-count))] + (nth-attr index)))) + + + +;; Valid methods for each state: +;; from https://jaxp-sources.dev.java.net/nonav/docs/api/javax/xml/stream/XMLStreamReader.html +;; +;; All States getProperty(), hasNext(), require(), close(), +;; getNamespaceURI(), isStartElement(), isEndElement(), +;; isCharacters(), isWhiteSpace(), getNamespaceContext(), +;; getEventType(),getLocation(), hasText() +;; +;; START_ELEMENT next(), getName(), getLocalName(), hasName(), +;; getPrefix(), getAttributeXXX(), +;; isAttributeSpecified(), getNamespaceXXX(), +;; getElementText(), nextTag() +;; +;; ATTRIBUTE next(), nextTag() getAttributeXXX(), +;; isAttributeSpecified(), +;; +;; NAMESPACE next(), nextTag() getNamespaceXXX() +;; +;; END_ELEMENT next(), getName(), getLocalName(), hasName(), +;; getPrefix(), getNamespaceXXX(), nextTag() +;; +;; CHARACTERS next(), getTextXXX(), nextTag() +;; +;; CDATA next(), getTextXXX(), nextTag() +;; +;; COMMENT next(), getTextXXX(), nextTag() +;; +;; SPACE next(), getTextXXX(), nextTag() +;; +;; START_DOCUMENT next(), getEncoding(), next(), getPrefix(), +;; getVersion(), isStandalone(), standaloneSet(), +;; getCharacterEncodingScheme(), nextTag() +;; +;; END_DOCUMENT close() +;; +;; PROCESSING_INSTRUCTION next(), getPITarget(), getPIData(), nextTag() +;; +;; ENTITY_REFERENCE next(), getLocalName(), getText(), nextTag() +;; +;; DTD next(), getText(), nextTag() diff --git a/xml-stream-writer.clj b/xml-stream-writer.clj new file mode 100644 index 00000000..7566ac46 --- /dev/null +++ b/xml-stream-writer.clj @@ -0,0 +1,225 @@ +;;; xml-stream-writer.clj -- Clojure interface to XMLStreamWriter + +;; by Stuart Sierra <mail@stuartsierra.com> +;; Version 1; April 21, 2008 + +;; THIS IS AN 'ALPHA' RELEASE AND THE API IS SUBJECT TO CHANGE. + + +;; Copyright (c) 2008 Stuart Sierra. All rights reserved. The use and +;; distribution terms for this software are covered by the Common +;; Public License 1.0 (http://www.opensource.org/licenses/cpl1.0.php) +;; which can be found in the file CPL.TXT at the root of this +;; distribution. By using this software in any fashion, you are +;; agreeing to be bound by the terms of this license. You must not +;; remove this notice, or any other, from this software. + + +;; This file defines a Clojure interface to the StAX XML writer API, +;; defined in javax.xml.stream.XMLOutputStream, included with the JDK +;; since version 6. +;; +;; The main entry point is the 'with-xml-output' macro, which takes +;; one argument, an open java.io.Writer, and a body. This macro opens +;; a new XMLStreamWriter on the given output stream and ensures that +;; all document tags are closed. It does NOT close the underlying +;; Writer. +;; +;; Within the body of 'with-xml-output', certain forms will be +;; translated into XML: +;; +;; "This is a string." => This is a string. +;; [:foo "this & that"] => <foo>this & that</foo> +;; [:foo] => <foo/> +;; +;; Attributes are specified as maps: +;; +;; [:foo {:bar "this & that"}] => <foo bar="this & that"/> +;; +;; Declare XML namespace prefixes inside 'emit-xml' with +;; (xmlns prefix uri), where 'prefix' is a keyword. +;; +;; Once a namespace has been declared, create elements in that +;; namespace by using a 2-element vector [prefix localname] as the tag +;; name: +;; +;; (with-xml-output *out* +;; (xmlns :me "http://xmlns.com/mine") +;; [[:me :foo] "content"]) +;; +;; => <me:foo xmlns:me="http://xmlns.com/mine">content</me:foo> +;; +;; Same with attributes: +;; +;; (with-xml-output *out* +;; (xmlns :me "http://xmlns.com/mine") +;; [:foo {[:me bar] "value"}]) +;; +;; => <foo me:bar="value" xmlns:me="http://xmlns.com/mine"/> +;; +;; Undeclared namespaces will get an auto-generated prefix: +;; +;; (with-xml-output *out* [["http://myns.com/" :foo]]) +;; +;; => <zdef18:foo xmlns:zdef18="http://myns.com/"/> +;; +;; Normal function calls and symbols may be placed in the body of +;; 'with-xml-output'; they will be evaluated normally and their return +;; values will be included as text in the XML output. To avoid adding +;; text to the output, functions should return nil. +;; +;; To emit XML tags from a function without creating a new +;; XMLStreamWriter, use the 'emit-xml' macro: +;; +;; (defn foo [name] (emit-xml [:mytag {:id name}])) +;; +;; (with-xml-output *out* +;; [:root (foo 24) (foo 24)]) +;; +;; => <root><mytag id="24"/><mytag id="24"/></root> +;; +;; To create the default XML declaration (version 1.0 and UTF-8 +;; encoding), use the (xmldecl) macro: +;; +;; (with-xml-output *out* (xmldecl)) +;; +;; => <?xml version="1.0" encoding="UTF-8"?> +;; + + + +(clojure/in-ns 'xml-stream-writer) +(clojure/refer 'clojure) + +(import '(javax.xml.stream XMLOutputFactory)) + +(defn- to-name [x] + (cond + (keyword? x) (name x) + (string? x) x + (symbol? x) (name x) + true (str x))) + +(def + *xml-output-factory* + (let [factory (XMLOutputFactory.newInstance)] + (. factory setProperty + "javax.xml.stream.isRepairingNamespaces" true) + factory)) + +(def *xml-stream-writer*) + +(def *xmlns-prefixes*) + +(def *xml-indent* nil) + +(def *xml-tag-depth*) + +(defn apply-indent [] + (when *xml-indent* + (. *xml-stream-writer* writeCharacters "\n") + (dotimes i (* *xml-indent* *xml-tag-depth*) + (. *xml-stream-writer* writeCharacters " ")))) + +(defn start-tag + ([name] + (apply-indent) + (set! *xml-tag-depth* (inc *xml-tag-depth*)) + (. *xml-stream-writer* + writeStartElement (to-name name))) + ([namespace name] + (apply-indent) + (set! *xml-tag-depth* (inc *xml-tag-depth*)) + (. *xml-stream-writer* + writeStartElement namespace (to-name name)))) + +(defn empty-tag + ([name] + (. *xml-stream-writer* + writeEmptyElement (to-name name))) + ([namespace name] + (. *xml-stream-writer* + writeEmptyElement namespace (to-name name)))) + +(defn xml-attr + ([name value] + (. *xml-stream-writer* writeAttribute (to-name name) value)) + ([namespace name value] + (. *xml-stream-writer* writeAttribute namespace (to-name name) value))) + +(defn xml-text [t] + (when t + (apply-indent) + (. *xml-stream-writer* writeCharacters (str t)))) + +(defn end-tag [] + (set! *xml-tag-depth* (dec *xml-tag-depth*)) + (apply-indent) + (. *xml-stream-writer* writeEndElement)) + +(defn entity [name] + (. *xml-stream-writer* writeEntityRef (to-name name))) + +(def emit-form) + +(defn resolve-xmlns [x] + (if (keyword? x) + (*xmlns-prefixes* x) + x)) + +(defn- emit-tag [form] + (if (or (= 1 (count form)) + (and (= 2 (count form)) + (map? (second form)))) ; is the element empty? + (apply list 'do + (let [tag (first form)] + (if (vector? tag) ; is the element namespaced? + (list 'empty-tag (list 'resolve-xmlns (first tag)) (second tag)) + (list 'empty-tag tag))) + (map emit-form (rest form))) ; still have to handle attributes + (apply list 'do + (let [tag (first form)] ; non-empty element + (if (vector? tag) ; is the element namespaced? + (list 'start-tag (list 'resolve-xmlns (first tag)) (second tag)) + (list 'start-tag tag))) + (concat (map emit-form (rest form)) + (list '(end-tag)))))) + +(defn- emit-xml-attr [name-value] + (let [[name value] name-value] + (if (vector? name) + (list 'xml-attr (first name) (second name) value) + (list 'xml-attr name value)))) + +(defn- emit-xml-attrs [xml-attr-map] + (apply list 'do + (map emit-xml-attr xml-attr-map))) + +(defn- emit-form [form] + (cond + (vector? form) (emit-tag form) + (map? form) (emit-xml-attrs form) + true `(xml-text ~form))) + +(defmacro xmldecl [] + '(. *xml-stream-writer* writeStartDocument "UTF-8" "1.0")) + +(defmacro xmlns [prefix namespace] + `(sync nil + (set! *xmlns-prefixes* (assoc *xmlns-prefixes* ~prefix ~namespace)) + (. *xml-stream-writer* setPrefix ~(to-name prefix) ~namespace))) + +(defmacro emit-xml [& forms] + `(do ~@(map emit-form forms))) + +(defmacro with-xml-output [writer & body] + `(binding [*xml-stream-writer* + (. *xml-output-factory* createXMLStreamWriter ~writer) + *xmlns-prefixes* {} + *xml-tag-depth* 0] + (try + (emit-xml ~@body) + (finally + (. *xml-stream-writer* writeEndDocument) + (. *xml-stream-writer* close) + nil)))) |