/* * Copyright (c) 2006, Stephen Kelvin Friedrich, All rights reserved. * * This a BSD license. If you use or enhance the code, I'd be pleased if you sent a mail to s.friedrich@eekboom.com * * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the * following conditions are met: * * Redistributions of source code must retain the above copyright notice, this list of conditions and the * following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * * Neither the name of the "Stephen Kelvin Friedrich" nor the names of its contributors may be used to endorse * or promote products derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package org.openmrs.util; import java.text.Collator; import java.util.Comparator; /** * Utility class for natural String comparison operations * * @since 1.5 */ public final class NaturalStrings { /** * <p> * A string comparator that does case sensitive comparisons and handles embedded numbers * correctly. * </p> * <p> * <b>Do not use</b> if your app might ever run on any locale that uses more than 7-bit ascii * characters. * </p> */ private static final Comparator<String> NATURAL_COMPARATOR_ASCII = new Comparator<String>() { public int compare(String o1, String o2) { return compareNaturalAscii(o1, o2); } }; /** * <p> * A string comparator that does case insensitive comparisons and handles embedded numbers * correctly. * </p> * <p> * <b>Do not use</b> if your app might ever run on any locale that uses more than 7-bit ascii * characters. * </p> */ private static final Comparator<String> IGNORE_CASE_NATURAL_COMPARATOR_ASCII = new Comparator<String>() { public int compare(String o1, String o2) { return compareNaturalIgnoreCaseAscii(o1, o2); } }; /** * This is a utility class (static methods only), don't instantiate. */ private NaturalStrings() { } /** * Returns a comparator that compares contained numbers based on their numeric values and * compares other parts using the current locale's order rules. * <p> * For example in German locale this will be a comparator that handles umlauts correctly and * ignores upper/lower case differences. * </p> * * @return <p> * A string comparator that uses the current locale's order rules and handles embedded * numbers correctly. * </p> * @see #getNaturalComparator(java.text.Collator) */ public static Comparator<String> getNaturalComparator() { Collator collator = Collator.getInstance(); return getNaturalComparator(collator); } /** * Returns a comparator that compares contained numbers based on their numeric values and * compares other parts using the given collator. * * @param collator used for locale specific comparison of text (non-number) subwords - must not * be null * @return <p> * A string comparator that uses the given Collator to compare subwords and handles * embedded numbers correctly. * </p> * @see #getNaturalComparator() */ public static Comparator<String> getNaturalComparator(final Collator collator) { if (collator == null) { // it's important to explicitly handle this here - else the bug will manifest anytime later in possibly // unrelated code that tries to use the comparator throw new NullPointerException("collator must not be null"); } return new Comparator<String>() { public int compare(String o1, String o2) { return compareNatural(collator, o1, o2); } }; } /** * Returns a comparator that compares contained numbers based on their numeric values and * compares other parts based on each character's Unicode value. * * @return <p> * a string comparator that does case sensitive comparisons on pure ascii strings and * handles embedded numbers correctly. * </p> * <b>Do not use</b> if your app might ever run on any locale that uses more than 7-bit * ascii characters. * @see #getNaturalComparator() * @see #getNaturalComparator(java.text.Collator) */ public static Comparator<String> getNaturalComparatorAscii() { return NATURAL_COMPARATOR_ASCII; } /** * Returns a comparator that compares contained numbers based on their numeric values and * compares other parts based on each character's Unicode value while ignore upper/lower case * differences. <b>Do not use</b> if your app might ever run on any locale that uses more than * 7-bit ascii characters. * * @return <p> * a string comparator that does case insensitive comparisons on pure ascii strings and * handles embedded numbers correctly. * </p> * @see #getNaturalComparator() * @see #getNaturalComparator(java.text.Collator) */ public static Comparator<String> getNaturalComparatorIgnoreCaseAscii() { return IGNORE_CASE_NATURAL_COMPARATOR_ASCII; } /** * <p> * Compares two strings using the current locale's rules and comparing contained numbers based * on their numeric values. * </p> * <p> * This is probably the best default comparison to use. * </p> * <p> * If you know that the texts to be compared are in a certain language that differs from the * default locale's langage, then get a collator for the desired locale ( * {@link java.text.Collator#getInstance(java.util.Locale)}) and pass it to * {@link #compareNatural(java.text.Collator, String, String)} * </p> * * @param s first string * @param t second string * @return zero iff <code>s</code> and <code>t</code> are equal, a value less than zero iff * <code>s</code> lexicographically precedes <code>t</code> and a value larger than zero * iff <code>s</code> lexicographically follows <code>t</code> */ public static int compareNatural(String s, String t) { return compareNatural(s, t, false, Collator.getInstance()); } /** * <p> * Compares two strings using the given collator and comparing contained numbers based on their * numeric values. * </p> * * @param s first string * @param t second string * @return zero iff <code>s</code> and <code>t</code> are equal, a value less than zero iff * <code>s</code> lexicographically precedes <code>t</code> and a value larger than zero * iff <code>s</code> lexicographically follows <code>t</code> */ public static int compareNatural(Collator collator, String s, String t) { return compareNatural(s, t, true, collator); } /** * <p> * Compares two strings using each character's Unicode value for non-digit characters and the * numeric values off any contained numbers. * </p> * <p> * (This will probably make sense only for strings containing 7-bit ascii characters only.) * </p> * * @return zero iff <code>s</code> and <code>t</code> are equal, a value less than zero iff * <code>s</code> lexicographically precedes <code>t</code> and a value larger than zero * iff <code>s</code> lexicographically follows <code>t</code> */ public static int compareNaturalAscii(String s, String t) { return compareNatural(s, t, true, null); } /** * <p> * Compares two strings using each character's Unicode value - ignoring upper/lower case - for * non-digit characters and the numeric values of any contained numbers. * </p> * <p> * (This will probably make sense only for strings containing 7-bit ascii characters only.) * </p> * * @return zero iff <code>s</code> and <code>t</code> are equal, a value less than zero iff * <code>s</code> lexicographically precedes <code>t</code> and a value larger than zero * iff <code>s</code> lexicographically follows <code>t</code> */ public static int compareNaturalIgnoreCaseAscii(String s, String t) { return compareNatural(s, t, false, null); } /** * @param s first string * @param t second string * @param caseSensitive treat characters differing in case only as equal - will be ignored if a * collator is given * @param collator used to compare subwords that aren't numbers - if null, characters will be * compared individually based on their Unicode value * @return zero iff <code>s</code> and <code>t</code> are equal, a value less than zero iff * <code>s</code> lexicographically precedes <code>t</code> and a value larger than zero * iff <code>s</code> lexicographically follows <code>t</code> */ private static int compareNatural(String s, String t, boolean caseSensitive, Collator collator) { int sIndex = 0; int tIndex = 0; int sLength = s.length(); int tLength = t.length(); while (true) { // both character indices are after a subword (or at zero) // Check if one string is at end if (sIndex == sLength && tIndex == tLength) { return 0; } if (sIndex == sLength) { return -1; } if (tIndex == tLength) { return 1; } // Compare sub word char sChar = s.charAt(sIndex); char tChar = t.charAt(tIndex); boolean sCharIsDigit = Character.isDigit(sChar); boolean tCharIsDigit = Character.isDigit(tChar); if (sCharIsDigit && tCharIsDigit) { // Compare numbers // skip leading 0s int sLeadingZeroCount = 0; while (sChar == '0') { ++sLeadingZeroCount; ++sIndex; if (sIndex == sLength) { break; } sChar = s.charAt(sIndex); } int tLeadingZeroCount = 0; while (tChar == '0') { ++tLeadingZeroCount; ++tIndex; if (tIndex == tLength) { break; } tChar = t.charAt(tIndex); } boolean sAllZero = sIndex == sLength || !Character.isDigit(sChar); boolean tAllZero = tIndex == tLength || !Character.isDigit(tChar); if (sAllZero && tAllZero) { continue; } if (sAllZero && !tAllZero) { return -1; } if (tAllZero) { return 1; } int diff = 0; do { if (diff == 0) { diff = sChar - tChar; } ++sIndex; ++tIndex; if (sIndex == sLength && tIndex == tLength) { return diff != 0 ? diff : sLeadingZeroCount - tLeadingZeroCount; } if (sIndex == sLength) { if (diff == 0) { return -1; } return Character.isDigit(t.charAt(tIndex)) ? -1 : diff; } if (tIndex == tLength) { if (diff == 0) { return 1; } return Character.isDigit(s.charAt(sIndex)) ? 1 : diff; } sChar = s.charAt(sIndex); tChar = t.charAt(tIndex); sCharIsDigit = Character.isDigit(sChar); tCharIsDigit = Character.isDigit(tChar); if (!sCharIsDigit && !tCharIsDigit) { // both number sub words have the same length if (diff != 0) { return diff; } break; } if (!sCharIsDigit) { return -1; } if (!tCharIsDigit) { return 1; } } while (true); } else { // Compare words if (collator != null) { // To use the collator the whole subwords have to be compared - character-by-character comparision // is not possible. So find the two subwords first int aw = sIndex; int bw = tIndex; do { ++sIndex; } while (sIndex < sLength && !Character.isDigit(s.charAt(sIndex))); do { ++tIndex; } while (tIndex < tLength && !Character.isDigit(t.charAt(tIndex))); String as = s.substring(aw, sIndex); String bs = t.substring(bw, tIndex); int subwordResult = collator.compare(as, bs); if (subwordResult != 0) { return subwordResult; } } else { // No collator specified. All characters should be ascii only. Compare character-by-character. do { if (sChar != tChar) { if (caseSensitive) { return sChar - tChar; } sChar = Character.toUpperCase(sChar); tChar = Character.toUpperCase(tChar); if (sChar != tChar) { sChar = Character.toLowerCase(sChar); tChar = Character.toLowerCase(tChar); if (sChar != tChar) { return sChar - tChar; } } } ++sIndex; ++tIndex; if (sIndex == sLength && tIndex == tLength) { return 0; } if (sIndex == sLength) { return -1; } if (tIndex == tLength) { return 1; } sChar = s.charAt(sIndex); tChar = t.charAt(tIndex); sCharIsDigit = Character.isDigit(sChar); tCharIsDigit = Character.isDigit(tChar); } while (!sCharIsDigit && !tCharIsDigit); } } } } }