diff options
author | Stuart Sierra <mail@stuartsierra.com> | 2009-06-05 20:55:45 +0000 |
---|---|---|
committer | Stuart Sierra <mail@stuartsierra.com> | 2009-06-05 20:55:45 +0000 |
commit | bea244b4d5e8512d4d659c0308a7781a606bccfd (patch) | |
tree | 23fcfaa173cf14a2d1310587fd3751e8d5734810 /src/clojure | |
parent | 66d0c223cf49a69deb93591f651f7aaeed0b991b (diff) |
str_utils2.clj: added codepoints and docodepoints
Diffstat (limited to 'src/clojure')
-rw-r--r-- | src/clojure/contrib/str_utils2.clj | 93 | ||||
-rw-r--r-- | src/clojure/contrib/test_contrib/str_utils2.clj | 5 |
2 files changed, 79 insertions, 19 deletions
diff --git a/src/clojure/contrib/str_utils2.clj b/src/clojure/contrib/str_utils2.clj index 186832b4..988d6a6d 100644 --- a/src/clojure/contrib/str_utils2.clj +++ b/src/clojure/contrib/str_utils2.clj @@ -33,18 +33,60 @@ (:require [clojure.contrib.java-utils :as j]) (:import (java.util.regex Pattern))) -(defmacro dochars + +(defmacro dochars "bindings => [name string] + Repeatedly executes body, with name bound to each character in - string." + string. Does NOT handle Unicode supplementary characters (above + U+FFFF)." [bindings & body] (assert (vector bindings)) (assert (= 2 (count bindings))) `(let [#^String s# ~(second bindings)] - (dotimes [i# (.length ~(second bindings))] + (dotimes [i# (.length s#)] (let [~(first bindings) (.charAt s# i#)] ~@body)))) + +(defmacro docodepoints + "bindings => [name string] + + Repeatedly executes body, with name bound to the integer code point + of each Unicode character in the string. Handles Unicode + supplementary characters (above U+FFFF) correctly." + [bindings & body] + (assert (vector bindings)) + (assert (= 2 (count bindings))) + ;; This seems to be the fastest way to iterate over characters. + (let [character (first bindings) + string (second bindings)] + `(let [#^String s# ~string + len# (.length s#)] + (loop [i# 0] + (when (< i# len#) + (let [~character (.charAt s# i#)] + (if (Character/isHighSurrogate ~character) + (let [~character (.codePointAt s# i#)] + ~@body + (recur (+ 2 i#))) + (let [~character (int ~character)] + ~@body + (recur (inc i#)))))))))) + +(defn codepoints + "Returns a sequence of integer Unicode code points in s. Handles + Unicode supplementary characters (above U+FFFF) correctly." + [#^String s] + (let [len (.length s) + f (fn thisfn [#^String s i] + (when (< i len) + (let [c (.charAt s i)] + (if (Character/isHighSurrogate c) + (cons (.codePointAt s i) (thisfn s (+ 2 i))) + (cons (int c) (thisfn s (inc i)))))))] + (lazy-seq (f s 0)))) + (defn escape "Escapes characters in string according to a cmap, a function or map from characters to their replacements." @@ -56,14 +98,10 @@ (.append buffer c))) (.toString buffer))) -(defn escape-pattern [#^String s] - (escape s (fn [c] (when (#{\\ \[ \] \. \^ \$ \? \* \+ \( \)} c) - (str \\ c))))) - (defn as-pattern [re] (if (instance? Pattern re) re - (Pattern/compile (escape-pattern (j/as-str re))))) + (Pattern/compile (Pattern/quote (j/as-str re))))) (defn blank? "True if s is nil, empty, or contains only whitespace." @@ -100,12 +138,20 @@ (.substring s (- (count s) n)))) (defmulti - #^{:doc "Replaces all instances of a in s with b. a and b may be - Characters, Strings, Pattern/String, or Pattern/Fn." - :arglists '([s a b])} + #^{:doc "Replaces all instances of pattern in string with replacement. + + Allowed argument types for pattern and replacement are: + 1. String and String + 2. Character and Character + 3. regex Pattern and String + (Uses java.util.regex.Matcher.replaceAll) + 4. regex Pattern and function + (Calls function with re-groups of each match, uses return + value as replacement.)" + :arglists '([string pattern replacement])} replace - (fn [#^String s a b] - [(class a) (class b)])) + (fn [#^String string pattern replacement] + [(class pattern) (class replacement)])) (defmethod replace [String String] [#^String s #^String a #^String b] (.replace s a b)) @@ -127,14 +173,23 @@ (.toString buffer))))))) (defmulti - #^{:doc "Replaces the first instance of a in s with b. a must be - Pattern, b may be String or Fn." - :arglists '([s a b])} + #^{:doc "Replaces the first instance of pattern in s with replacement. + + Allowed argument types for pattern and replacement are: + 1. String and String + 2. regex Pattern and String + (Uses java.util.regex.Matcher.replaceAll) + 3. regex Pattern and function +" + :arglists '([s pattern replacement])} replace-first - (fn [s a b] - [(class a) (class b)])) + (fn [s pattern replacement] + [(class pattern) (class replacement)])) + +(defmethod replace-first [String String] [#^String s pattern replacement] + (.replaceFirst (re-matcher (Pattern/quote pattern) s) replacement)) -(defmethod replace-first [Pattern String] [#^String s #^Pattern re replacement] +(defmethod replace-first [Pattern String] [#^String s re replacement] (.replaceFirst (re-matcher re s) replacement)) (defmethod replace-first [Pattern clojure.lang.IFn] [#^String s #^Pattern re f] diff --git a/src/clojure/contrib/test_contrib/str_utils2.clj b/src/clojure/contrib/test_contrib/str_utils2.clj index 981b5d31..d7d9b131 100644 --- a/src/clojure/contrib/test_contrib/str_utils2.clj +++ b/src/clojure/contrib/test_contrib/str_utils2.clj @@ -37,3 +37,8 @@ (deftest t-replace-first (is (= "barbarfoo" (s/replace-first "foobarfoo" #"foo" "bar"))) (is (= "FOObarfoo" (s/replace-first "foobarfoo" #"foo" s/upper-case)))) + +(deftest t-codepoints + (is (= (list 102 111 111 65536 98 97 114) + (s/codepoints "foo\uD800\uDC00bar")) + "Handles Unicode supplementary characters"))) |