aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStuart Sierra <mail@stuartsierra.com>2010-01-26 10:38:19 -0500
committerStuart Sierra <mail@stuartsierra.com>2010-01-26 10:38:19 -0500
commit18c6d2102a3f24024a8b45a6a137f24c6b21a91d (patch)
treeb236ddb9d219642668368d70f034d82299b45432
parent18584a46cdcb0651d6f025680c25ab8c18d0e639 (diff)
Add str-utils3 with tests
* most significant argument last, for ->> * 'contains?' renamed 'substring?' * removed 'partial'
-rw-r--r--src/main/clojure/clojure/contrib/str_utils3.clj350
-rw-r--r--src/test/clojure/clojure/contrib/test_contrib/str_utils3.clj115
2 files changed, 465 insertions, 0 deletions
diff --git a/src/main/clojure/clojure/contrib/str_utils3.clj b/src/main/clojure/clojure/contrib/str_utils3.clj
new file mode 100644
index 00000000..ffaa456d
--- /dev/null
+++ b/src/main/clojure/clojure/contrib/str_utils3.clj
@@ -0,0 +1,350 @@
+;;; str_utils3.clj -- functional string utilities for Clojure
+
+;; by Stuart Sierra, http://stuartsierra.com/
+;; January 26, 2010
+
+;; Copyright (c) Stuart Sierra, 2010. All rights reserved. The use
+;; and distribution terms for this software are covered by the Eclipse
+;; Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
+;; which can be found in the file epl-v10.html at the root of this
+;; distribution. By using this software in any fashion, you are
+;; agreeing to be bound by the terms of this license. You must not
+;; remove this notice, or any other, from this software.
+
+
+(ns #^{:author "Stuart Sierra"
+ :doc "This is a library of string manipulation functions. It
+ is intended as a replacement for clojure.contrib.str-utils.
+
+ You cannot (use 'clojure.contrib.str-utils3) because it defines
+ functions with the same names as functions in clojure.core.
+ Instead, do (require '[clojure.contrib.str-utils3 :as s])
+ or something similar.
+
+ Goals:
+ 1. Be functional
+ 2. Most significant argument LAST, to work with ->>
+ 3. At least O(n) performance for Strings of length n
+
+ Some ideas are borrowed from
+ http://github.com/francoisdevlin/devlinsf-clojure-utils/"}
+ clojure.contrib.str-utils3
+ (:refer-clojure :exclude (take replace drop butlast partition
+ contains? get repeat reverse partial))
+ (:import (java.util.regex Pattern)))
+
+
+(defmacro dochars
+  "bindings => [name string]
+
+  Repeatedly executes body, with name bound to each character in
+  string.  Does NOT handle Unicode supplementary characters (above
+  U+FFFF): every UTF-16 code unit is visited on its own."
+  [bindings & body]
+  ;; `vector?` is the predicate.  `(vector x)` builds a one-element
+  ;; vector, which is always truthy, so it could never reject anything.
+  (assert (vector? bindings))
+  (assert (= 2 (count bindings)))
+  ;; This seems to be the fastest way to iterate over characters.
+  `(let [#^String s# ~(second bindings)]
+     (dotimes [i# (.length s#)]
+       (let [~(first bindings) (.charAt s# i#)]
+         ~@body))))
+
+
+(defmacro docodepoints
+  "bindings => [name string]
+
+  Repeatedly executes body, with name bound to the integer code point
+  of each Unicode character in the string.  Handles Unicode
+  supplementary characters (above U+FFFF) correctly.  An unpaired
+  surrogate is handed to body as its own code point and does not
+  consume the character that follows it."
+  [bindings & body]
+  ;; `vector?` is the predicate; `(vector x)` is always truthy.
+  (assert (vector? bindings))
+  (assert (= 2 (count bindings)))
+  (let [character (first bindings)
+        string (second bindings)]
+    `(let [#^String s# ~string
+           len# (.length s#)]
+       (loop [i# 0]
+         (when (< i# len#)
+           ;; codePointAt combines a valid surrogate pair and returns a
+           ;; lone surrogate unchanged; charCount then says how many
+           ;; UTF-16 units (1 or 2) that code point actually occupied.
+           (let [cp# (.codePointAt s# i#)]
+             ;; body is spliced exactly once, inside its own binding scope
+             (let [~character cp#]
+               ~@body)
+             (recur (+ i# (Character/charCount cp#)))))))))
+
+(defn codepoints
+  "Returns a lazy sequence of integer Unicode code points in s.
+  Handles Unicode supplementary characters (above U+FFFF) correctly.
+  An unpaired surrogate is returned as its own code point."
+  [#^String s]
+  (let [len (.length s)
+        ;; lazy-seq sits INSIDE the recursive step so each element is
+        ;; realized on demand; wrapping only the outermost call would
+        ;; recurse once per character and overflow the stack on long
+        ;; strings.
+        step (fn step [i]
+               (lazy-seq
+                 (when (< i len)
+                   (let [cp (.codePointAt s (int i))]
+                     ;; charCount is 2 only for a real surrogate pair
+                     (cons cp (step (+ i (Character/charCount cp))))))))]
+    (step 0)))
+
+(defn #^String escape
+  "Builds a new String by looking up every character of s in cmap
+  (a function or a map).  A truthy result is appended in place of
+  the character; otherwise the character itself is appended."
+  [cmap #^String s]
+  (let [out (StringBuilder. (.length s))]
+    ;; index-based walk over the UTF-16 units of s
+    (dotimes [idx (.length s)]
+      (let [ch (.charAt s idx)
+            mapped (cmap ch)]
+        (if mapped
+          (.append out mapped)
+          (.append out ch))))
+    (.toString out)))
+
+(defn blank?
+  "Returns true when s is nil, the empty string, or made up solely of
+  whitespace characters."
+  [#^String s]
+  (not-any? (fn [#^Character ch] (not (Character/isWhitespace ch)))
+            s))
+
+(defn #^String take
+  "Returns the first n characters of s, or all of s when it is
+  shorter than n."
+  [n #^String s]
+  (let [len (count s)]
+    (if (>= len n)
+      (.substring s 0 n)
+      s)))
+
+(defn #^String drop
+  "Returns s without its first n characters.  When n exceeds the
+  length of s the result is the empty string."
+  [n #^String s]
+  (let [len (count s)]
+    (if (<= n len)
+      (.substring s n)
+      "")))
+
+(defn #^String butlast
+  "Returns s with its last n characters removed.  When n exceeds the
+  length of s the result is the empty string."
+  [n #^String s]
+  (let [len (count s)]
+    (if (<= n len)
+      (.substring s 0 (- len n))
+      "")))
+
+(defn #^String tail
+  "Returns the final n characters of s, or all of s when it is
+  shorter than n."
+  [n #^String s]
+  (let [len (count s)]
+    (if (<= n len)
+      (.substring s (- len n))
+      s)))
+
+(defn #^String repeat
+  "Concatenates n copies of s into a single new String."
+  [n #^String s]
+  (let [copies (clojure.core/repeat n s)]
+    (->> copies
+         (apply str))))
+
+(defn #^String reverse
+  "Returns a new String holding the characters of s in reverse
+  order (delegates to StringBuilder.reverse)."
+  [#^String s]
+  (let [sb (StringBuilder. s)]
+    (.reverse sb)
+    (.toString sb)))
+
+(defmulti
+  #^{:doc "Replaces all instances of pattern in string with replacement.
+
+  Allowed argument types for pattern and replacement are:
+  1. String and String
+  2. Character and Character
+  3. regex Pattern and String
+     (Uses java.util.regex.Matcher.replaceAll, so $1, $2, ... in the
+     replacement refer to capturing groups.)
+  4. regex Pattern and function
+     (Calls function with re-groups of each match; the return value,
+     converted with str, is inserted literally -- '$' and '\\' in it
+     are NOT treated as group references or escapes.)"
+     :arglists '([pattern replacement string])
+     :tag String}
+  replace
+  ;; Dispatch on the classes of the first two arguments only.
+  (fn [pattern replacement #^String string]
+    [(class pattern) (class replacement)]))
+
+(defmethod replace [String String] [#^String a #^String b #^String s]
+  (.replace s a b))
+
+(defmethod replace [Character Character] [#^Character a #^Character b #^String s]
+  (.replace s a b))
+
+(defmethod replace [Pattern String] [re replacement #^String s]
+  (.replaceAll (re-matcher re s) replacement))
+
+(defmethod replace [Pattern clojure.lang.IFn] [re replacement #^String s]
+  (let [m (re-matcher re s)
+        buffer (StringBuffer. (.length s))]
+    (loop []
+      (if (.find m)
+        ;; appendReplacement interprets '$' and '\' in its argument, so
+        ;; the function's result is quoted to be inserted verbatim.
+        (do (.appendReplacement
+              m buffer
+              (java.util.regex.Matcher/quoteReplacement
+                (str (replacement (re-groups m)))))
+            (recur))
+        (do (.appendTail m buffer)
+            (.toString buffer))))))
+
+(defmulti
+  #^{:doc "Replaces the first instance of pattern in s with replacement.
+  Returns s unchanged when pattern does not occur.
+
+  Allowed argument types for pattern and replacement are:
+  1. String and String
+     (Both are taken literally; no regex syntax is interpreted.)
+  2. regex Pattern and String
+     (Uses java.util.regex.Matcher.replaceFirst)
+  3. regex Pattern and function
+     (Calls function with re-groups of the first match; the return
+     value, converted with str, is inserted literally.)"
+     :arglists '([pattern replacement s])
+     :tag String}
+  replace-first
+  ;; Dispatch on the classes of the first two arguments only.
+  (fn [pattern replacement s]
+    [(class pattern) (class replacement)]))
+
+(defmethod replace-first [String String] [#^String pattern #^String replacement #^String s]
+  ;; Pattern/quote returns a String, which re-matcher cannot accept, so
+  ;; the literal case is handled with plain index arithmetic instead.
+  (let [i (.indexOf s pattern)]
+    (if (neg? i)
+      s
+      (str (.substring s 0 i)
+           replacement
+           (.substring s (+ i (.length pattern)))))))
+
+(defmethod replace-first [Pattern String] [re replacement #^String s]
+  (.replaceFirst (re-matcher re s) replacement))
+
+(defmethod replace-first [Pattern clojure.lang.IFn] [#^Pattern re f #^String s]
+  (let [m (re-matcher re s)]
+    (if (.find m)
+      (let [buffer (StringBuffer. (.length s))
+            ;; quoted so that '$' and '\' in the result stay literal
+            rep (java.util.regex.Matcher/quoteReplacement
+                  (str (f (re-groups m))))]
+        (.appendReplacement m buffer rep)
+        (.appendTail m buffer)
+        (str buffer))
+      ;; no match: hand back the input instead of nil
+      s)))
+
+(defn partition
+ "Splits the string into a lazy sequence of substrings, alternating
+ between substrings that match the pattern and the substrings
+ between the matches. The sequence always starts with the substring
+ before the first match, or an empty string if the beginning of the
+ string matches. Matches are reported with re-groups, so a pattern
+ containing capturing groups yields a vector for each match rather
+ than a plain string.
+
+ For example: (partition #\"[a-z]+\" \"abc123def\")
+ returns: (\"\" \"abc\" \"123\" \"def\")"
+ [#^Pattern re #^String s]
+ ;; One Matcher is shared by every step of the lazy sequence; each
+ ;; step advances it with .find.
+ (let [m (re-matcher re s)]
+ ((fn step [prevend]
+ (lazy-seq
+ (if (.find m)
+ ;; text between the previous match and this one, then the match
+ (cons (.subSequence s prevend (.start m))
+ (cons (re-groups m)
+ ;; start + length of the match = index just past the match
+ (step (+ (.start m) (count (.group m))))))
+ ;; no more matches: emit the remaining text, if any
+ (when (< prevend (.length s))
+ (list (.subSequence s prevend (.length s)))))))
+ 0)))
+
+(defn #^String join
+  "Concatenates the elements of coll into one String, placing
+  separator between consecutive elements (compare Perl's join)."
+  [#^String separator coll]
+  (->> coll
+       (interpose separator)
+       (apply str)))
+
+(defn #^String chop
+  "Returns s without its final character.  A zero-length string is
+  returned as-is."
+  [#^String s]
+  (let [len (count s)]
+    (if (pos? len)
+      (subs s 0 (dec len))
+      s)))
+
+(defn #^String chomp
+  "Strips every trailing newline \\n and return \\r character from
+  the end of the string. Note: String.trim() is similar and faster."
+  [#^String s]
+  (let [trailing-eol #"[\r\n]+$"]
+    (->> s
+         (replace trailing-eol ""))))
+
+(defn title-case
+  "Placeholder: title-casing is not implemented yet.  Every call
+  throws an Exception regardless of s."
+  [#^String s]
+  (throw (Exception. "title-case not implemented yet")))
+
+(defn #^String swap-case
+  "Returns s with lower-case characters turned into upper case and
+  every other character passed through lower-casing.  Works code
+  point by code point, so Unicode supplementary characters are
+  handled.  Case conversion goes through the locale-sensitive
+  String.toUpperCase() and String.toLowerCase() methods."
+  [#^String s]
+  (let [out (StringBuilder. (.length s))
+        ;; one-slot scratch array: lets us build a String from a
+        ;; single code point
+        #^"[I" cp-slot (make-array Integer/TYPE 1)]
+    (docodepoints [cp s]
+      (aset-int cp-slot 0 cp)
+      ;; Character.toUpperCase is not locale-sensitive but
+      ;; String.toUpperCase is, hence the detour through a String.
+      (let [single (String. cp-slot 0 1)]
+        (if (Character/isLowerCase cp)
+          (.append out (.toUpperCase single))
+          (.append out (.toLowerCase single)))))
+    (.toString out)))
+
+(defn #^String capitalize
+  "Upper-cases the first character of the string and lower-cases
+  everything after it."
+  [#^String s]
+  (let [len (count s)]
+    (if (>= len 2)
+      (let [#^String head (subs s 0 1)
+            #^String more (subs s 1)]
+        (str (.toUpperCase head) (.toLowerCase more)))
+      ;; zero or one character: nothing to lower-case
+      (.toUpperCase s))))
+
+(defn #^String ltrim
+  "Strips leading whitespace from the string."
+  [#^String s]
+  (let [leading-ws #"^\s+"]
+    (->> s
+         (replace leading-ws ""))))
+
+(defn #^String rtrim
+  "Strips trailing whitespace from the string."
+  [#^String s]
+  (let [trailing-ws #"\s+$"]
+    (->> s
+         (replace trailing-ws ""))))
+
+(defn split-lines
+  "Breaks s into a seq of lines, treating \\n and \\r\\n as line
+  terminators."
+  [#^String s]
+  (-> #"\r?\n"
+      (.split s)
+      seq))
+
+;; borrowed from compojure.str-utils, by James Reeves, EPL 1.0
+(defn #^String map-str
+  "Calls f on every element of coll and concatenates the results
+  into one String."
+  [f coll]
+  (->> coll
+       (map f)
+       (apply str)))
+
+;; borrowed from compojure.str-utils, by James Reeves, EPL 1.0
+(defn grep
+  "Keeps the elements of coll whose String form (via str) contains a
+  match for the regular expression re, as decided by re-find."
+  [re coll]
+  (let [matches? #(re-find re (str %))]
+    (filter matches? coll)))
+
+
+;;; WRAPPERS
+
+;; The following functions are simple wrappers around java.lang.String
+;; functions. They are included here for completeness, and for use
+;; when mapping over a collection of strings.
+
+(defn #^String upper-case
+  "Returns the string with every character upper-cased
+  (String.toUpperCase)."
+  [#^String s]
+  (. s toUpperCase))
+
+(defn #^String lower-case
+  "Returns the string with every character lower-cased
+  (String.toLowerCase)."
+  [#^String s]
+  (. s toLowerCase))
+
+(defn split
+  "Splits the string around matches of a regular expression and
+  returns the pieces as a seq. The optional limit argument is passed
+  straight through to Pattern.split."
+  ([#^Pattern re #^String s]
+     (-> re (.split s) seq))
+  ([#^Pattern re limit #^String s]
+     (-> re (.split s limit) seq)))
+
+(defn #^String trim
+  "Strips whitespace from the start and end of the string
+  (String.trim)."
+  [#^String s]
+  (. s trim))
+
+;; No #^String tag here: the function returns a boolean, and a String
+;; tag would mislead the compiler about the type of the call's result.
+(defn substring?
+  "True if s contains the substring, false otherwise."
+  [substring #^String s]
+  (.contains s substring))
+
+;; Tagged Character, not String: .charAt yields a single character.
+(defn #^Character get
+  "Gets the i'th character in string (zero-based).  Note the argument
+  order: the string comes first here."
+  [#^String s i]
+  (.charAt s i))
+
diff --git a/src/test/clojure/clojure/contrib/test_contrib/str_utils3.clj b/src/test/clojure/clojure/contrib/test_contrib/str_utils3.clj
new file mode 100644
index 00000000..2625c168
--- /dev/null
+++ b/src/test/clojure/clojure/contrib/test_contrib/str_utils3.clj
@@ -0,0 +1,115 @@
+(ns clojure.contrib.test-contrib.str-utils3
+ (:require [clojure.contrib.str-utils3 :as s])
+ (:use clojure.test))
+
+;; The surrogate pair \uD800\uDC00 must come back as the single code
+;; point 65536, not as two separate values.
+(deftest t-codepoints
+ (is (= (list 102 111 111 65536 98 97 114)
+ (s/codepoints "foo\uD800\uDC00bar"))
+ "Handles Unicode supplementary characters"))
+
+;; Table-driven: expected result, character map, input.
+(deftest t-escape
+  (are [expected cmap input] (= expected (s/escape cmap input))
+    "&lt;foo&amp;bar&gt;" {\& "&amp;" \< "&lt;" \> "&gt;"} "<foo&bar>"
+    " \\\"foo\\\" "       {\" "\\\""}                       " \"foo\" "
+    "faabor"              {\a \o, \o \a}                    "foobar"))
+
+;; nil, the empty string and whitespace-only strings count as blank;
+;; a string with any other character does not.
+(deftest t-blank
+ (is (s/blank? nil))
+ (is (s/blank? ""))
+ (is (s/blank? " "))
+ (is (s/blank? " \t \n \r "))
+ (is (not (s/blank? " foo "))))
+
+;; Table-driven: expected result, n, input.
+(deftest t-take
+  (are [expected n input] (= expected (s/take n input))
+    "foo"    3 "foobar"
+    "foobar" 7 "foobar"
+    ""       0 "foo"))
+
+;; Table-driven: expected result, n, input.
+(deftest t-drop
+  (are [expected n input] (= expected (s/drop n input))
+    "bar"    3 "foobar"
+    ""       9 "foobar"
+    "foobar" 0 "foobar"))
+
+;; Table-driven: expected result, n, input.
+(deftest t-butlast
+  (are [expected n input] (= expected (s/butlast n input))
+    "foob"   2 "foobar"
+    ""       9 "foobar"
+    "foobar" 0 "foobar"))
+
+;; Table-driven: expected result, n, input.
+(deftest t-tail
+  (are [expected n input] (= expected (s/tail n input))
+    "ar"     2 "foobar"
+    "foobar" 9 "foobar"
+    ""       0 "foobar"))
+
+;; Argument order is count first, string last.
+(deftest t-repeat
+ (is (= "foofoofoo" (s/repeat 3 "foo"))))
+
+;; Characters come back in reverse order.
+(deftest t-reverse
+ (is (= "tab" (s/reverse "bat"))))
+
+;; Table-driven, one row per dispatch case exercised here:
+;; char/char, string/string and regex/function.
+(deftest t-replace
+  (are [expected pattern replacement input]
+       (= expected (s/replace pattern replacement input))
+    "faabar"    \o     \a           "foobar"
+    "barbarbar" "foo"  "bar"        "foobarfoo"
+    "FOObarFOO" #"foo" s/upper-case "foobarfoo"))
+
+;; Only the first match is replaced. Covers the regex/string and
+;; regex/function dispatch cases; string/string is not exercised here.
+(deftest t-replace-first
+ (is (= "barbarfoo" (s/replace-first #"foo" "bar" "foobarfoo")))
+ (is (= "FOObarfoo" (s/replace-first #"foo" s/upper-case "foobarfoo"))))
+
+;; The input starts with a match, so the first element is the empty
+;; string that precedes it.
+(deftest t-partition
+ (is (= (list "" "abc" "123" "def")
+ (s/partition #"[a-z]+" "abc123def"))))
+
+;; Table-driven: expected result, separator, collection.
+(deftest t-join
+  (are [expected separator coll] (= expected (s/join separator coll))
+    "1,2,3"             \,        [1 2 3]
+    ""                  \,        []
+    "1 and-a 2 and-a 3" " and-a " [1 2 3]))
+
+;; The last two assertions were previously written as
+;; (is (= "") (s/chop ...)): (= "") is always true and the call was
+;; taken as the failure message, so s/chop was never actually checked.
+(deftest t-chop
+  (is (= "fo" (s/chop "foo")))
+  (is (= "" (s/chop "f")))
+  (is (= "" (s/chop ""))))
+
+;; Table-driven: expected result, input.
+(deftest t-chomp
+  (are [expected input] (= expected (s/chomp input))
+    "foo" "foo\n"
+    "foo" "foo\r\n"
+    "foo" "foo"
+    ""    ""))
+
+;; Letters flip case, the non-letter "!" is left alone, and the empty
+;; string stays empty.
+(deftest t-swap-case
+ (is (= "fOO!bAR" (s/swap-case "Foo!Bar")))
+ (is (= "" (s/swap-case ""))))
+
+;; First character upper-cased, the rest lower-cased, whatever the
+;; case of the input.
+(deftest t-capitalize
+ (is (= "Foobar" (s/capitalize "foobar")))
+ (is (= "Foobar" (s/capitalize "FOOBAR"))))
+
+;; Only leading whitespace is removed; an all-whitespace string
+;; becomes empty.
+(deftest t-ltrim
+ (is (= "foo " (s/ltrim " foo ")))
+ (is (= "" (s/ltrim " "))))
+
+;; Only trailing whitespace is removed; an all-whitespace string
+;; becomes empty.
+(deftest t-rtrim
+ (is (= " foo" (s/rtrim " foo ")))
+ (is (= "" (s/rtrim " "))))
+
+;; Both \n and \r\n act as line terminators; input without either
+;; yields a single line.
+(deftest t-split-lines
+ (is (= (list "one" "two" "three")
+ (s/split-lines "one\ntwo\r\nthree")))
+ (is (= (list "foo") (s/split-lines "foo"))))
+
+;; Mixed-case input becomes all upper-case.
+(deftest t-upper-case
+ (is (= "FOOBAR" (s/upper-case "Foobar"))))
+
+;; Mixed-case input becomes all lower-case.
+(deftest t-lower-case
+ (is (= "foobar" (s/lower-case "FooBar"))))
+
+;; Whitespace, including \r\n, is removed from both ends.
+(deftest t-trim
+ (is (= "foo" (s/trim " foo \r\n"))))
+
+;; Argument order is substring first, string to search last.
+(deftest t-substring
+ (is (s/substring? "foo" "foobar"))
+ (is (not (s/substring? "baz" "foobar"))))
+
+;; Index is zero-based; here the string is the FIRST argument.
+(deftest t-get
+ (is (= \o (s/get "foo" 1))))
+