/**
 * Copyright (C) 2011 Brian Ferris <bdferris@onebusaway.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onebusaway.utility.text;

import java.text.Collator;

/**
 * Utility class for natural string order operations (a1 &lt; a2 &lt; a10).
 * Written by Stephen Friedrich
 * (http://weblogs.java.net/blog/skelvin/archive/2006/01/natural_string.html).
 * No license specified? Code found at
 * http://svn.atlassian.com/fisheye/browse/public/contrib/confluence/linking-plugin/trunk/src/java/com/eekboom/utils/Strings.java?r=1322&%40annotateMode=none
 *
 * <p>
 * Only the static {@code compareNatural*} methods are provided. The
 * {@code Comparator} factory methods of the upstream utility are not part of
 * this class; callers that need a {@code java.util.Comparator} can wrap one of
 * the static methods.
 * </p>
 *
 * <p>
 * None of the methods accept {@code null} strings: passing one results in a
 * {@link NullPointerException}.
 * </p>
 *
 * TODO: Determine license situation
 */
public class NaturalStringOrder {

  /** This is a utility class (static methods only), don't instantiate. */
  private NaturalStringOrder() {
  }

  /**
   * <p>
   * Compares two strings using the current locale's rules and comparing
   * contained numbers based on their numeric values.
   * </p>
   * <p>
   * This is probably the best default comparison to use.
   * </p>
   * <p>
   * If you know that the texts to be compared are in a certain language that
   * differs from the default locale's language, then get a collator for the
   * desired locale (
   * {@link java.text.Collator#getInstance(java.util.Locale)}) and pass it to
   * {@link #compareNatural(java.text.Collator, String, String)}
   * </p>
   *
   * @param s first string
   * @param t second string
   * @return a value less than zero if <code>s</code> precedes <code>t</code>,
   *         a value larger than zero if <code>s</code> follows <code>t</code>,
   *         and zero if neither the collator, the numeric values nor the
   *         leading zeros of the embedded numbers tell the two strings apart
   */
  public static int compareNatural(String s, String t) {
    // The case flag is irrelevant here: it is ignored whenever a collator is used.
    return compareNatural(s, t, false, Collator.getInstance());
  }

  /**
   * <p>
   * Compares two strings using the given collator and comparing contained
   * numbers based on their numeric values.
   * </p>
   *
   * @param collator used to compare the text (non-number) subwords; if
   *          <code>null</code>, this behaves like
   *          {@link #compareNaturalAscii(String, String)}
   * @param s first string
   * @param t second string
   * @return a value less than zero if <code>s</code> precedes <code>t</code>,
   *         a value larger than zero if <code>s</code> follows <code>t</code>,
   *         and zero if neither the collator, the numeric values nor the
   *         leading zeros of the embedded numbers tell the two strings apart
   */
  public static int compareNatural(Collator collator, String s, String t) {
    return compareNatural(s, t, true, collator);
  }

  /**
   * <p>
   * Compares two strings using each character's Unicode value for non-digit
   * characters and the numeric values of any contained numbers.
   * </p>
   * <p>
   * (This will probably make sense only for strings containing 7-bit ascii
   * characters only.)
   * </p>
   *
   * @param s first string
   * @param t second string
   * @return zero iff <code>s</code> and <code>t</code> are equal, a value less
   *         than zero iff <code>s</code> precedes <code>t</code> and a value
   *         larger than zero iff <code>s</code> follows <code>t</code>
   */
  public static int compareNaturalAscii(String s, String t) {
    return compareNatural(s, t, true, null);
  }

  /**
   * <p>
   * Compares two strings using each character's Unicode value - ignoring
   * upper/lower case - for non-digit characters and the numeric values of any
   * contained numbers.
   * </p>
   * <p>
   * (This will probably make sense only for strings containing 7-bit ascii
   * characters only.)
   * </p>
   *
   * @param s first string
   * @param t second string
   * @return a value less than zero if <code>s</code> precedes <code>t</code>,
   *         a value larger than zero if <code>s</code> follows <code>t</code>,
   *         and zero if the strings differ at most in upper/lower case
   */
  public static int compareNaturalIgnoreCaseAscii(String s, String t) {
    return compareNatural(s, t, false, null);
  }

  /**
   * Shared implementation behind all public comparison methods.
   *
   * <p>
   * The strings are walked in parallel, subword by subword. Runs of digits are
   * compared by numeric value (leading zeros skipped, then the longer run
   * wins, then the first differing digit decides). Everything else is compared
   * either through the collator or character by character.
   * </p>
   * <p>
   * Numbers that have the same value but a different amount of leading zeros
   * (<code>"01"</code> vs. <code>"1"</code>) do not decide the comparison on
   * their own. The difference is only used as a tie-breaker once everything
   * else compared equal, so <code>"a01b"</code> and <code>"a1b"</code> are
   * never reported as equal.
   * </p>
   *
   * @param s first string
   * @param t second string
   * @param caseSensitive if <code>true</code>, characters that differ only in
   *          case are treated as different; if <code>false</code>, they are
   *          treated as equal - will be ignored if a collator is given
   * @param collator used to compare subwords that aren't numbers - if null,
   *          characters will be compared individually based on their Unicode
   *          value
   * @return a value less than zero if <code>s</code> precedes <code>t</code>,
   *         a value larger than zero if <code>s</code> follows <code>t</code>,
   *         zero otherwise
   */
  private static int compareNatural(String s, String t, boolean caseSensitive,
      Collator collator) {
    final int sLength = s.length();
    final int tLength = t.length();
    int sIndex = 0;
    int tIndex = 0;
    // First leading-zero difference seen between two numbers of equal value;
    // returned instead of 0 when nothing else distinguishes the strings.
    int zeroTieBreak = 0;

    while (true) {
      // Both indices are positioned right after a subword (or at zero).
      if (sIndex == sLength && tIndex == tLength) {
        return zeroTieBreak;
      }
      if (sIndex == sLength) {
        return -1;
      }
      if (tIndex == tLength) {
        return 1;
      }

      char sChar = s.charAt(sIndex);
      char tChar = t.charAt(tIndex);
      boolean sCharIsDigit = Character.isDigit(sChar);
      boolean tCharIsDigit = Character.isDigit(tChar);

      if (sCharIsDigit && tCharIsDigit) {
        // ---- Compare two numbers ----
        // Skip leading zeros, remembering how many there were.
        int sLeadingZeroCount = 0;
        while (sChar == '0') {
          ++sLeadingZeroCount;
          ++sIndex;
          if (sIndex == sLength) {
            break;
          }
          sChar = s.charAt(sIndex);
        }
        int tLeadingZeroCount = 0;
        while (tChar == '0') {
          ++tLeadingZeroCount;
          ++tIndex;
          if (tIndex == tLength) {
            break;
          }
          tChar = t.charAt(tIndex);
        }
        final int zeroDiff = sLeadingZeroCount - tLeadingZeroCount;

        // A number consisting of zeros only has no digits left at this point.
        boolean sAllZero = sIndex == sLength || !Character.isDigit(sChar);
        boolean tAllZero = tIndex == tLength || !Character.isDigit(tChar);
        if (sAllZero && tAllZero) {
          if (zeroTieBreak == 0) {
            zeroTieBreak = zeroDiff;
          }
          continue;
        }
        if (sAllZero) {
          return -1;
        }
        if (tAllZero) {
          return 1;
        }

        // Both numbers have significant digits. Walk them in parallel: the
        // longer digit run is the larger number; for equal lengths the first
        // differing digit (kept in diff) decides.
        int diff = 0;
        do {
          if (diff == 0) {
            diff = sChar - tChar;
          }
          ++sIndex;
          ++tIndex;
          if (sIndex == sLength && tIndex == tLength) {
            if (diff != 0) {
              return diff;
            }
            // Same value at the very end of both strings: this number's own
            // leading zeros decide first, then any earlier difference.
            return zeroDiff != 0 ? zeroDiff : zeroTieBreak;
          }
          if (sIndex == sLength) {
            if (diff == 0) {
              return -1;
            }
            return Character.isDigit(t.charAt(tIndex)) ? -1 : diff;
          }
          if (tIndex == tLength) {
            if (diff == 0) {
              return 1;
            }
            return Character.isDigit(s.charAt(sIndex)) ? 1 : diff;
          }
          sChar = s.charAt(sIndex);
          tChar = t.charAt(tIndex);
          sCharIsDigit = Character.isDigit(sChar);
          tCharIsDigit = Character.isDigit(tChar);
          if (!sCharIsDigit && !tCharIsDigit) {
            // Both number subwords have the same length.
            if (diff != 0) {
              return diff;
            }
            if (zeroTieBreak == 0) {
              zeroTieBreak = zeroDiff;
            }
            break;
          }
          if (!sCharIsDigit) {
            return -1;
          }
          if (!tCharIsDigit) {
            return 1;
          }
        } while (true);
      } else if (collator != null) {
        // ---- Compare words via the collator ----
        // A collator cannot compare character by character, so cut out both
        // subwords first: the current character plus everything up to (but
        // excluding) the next digit.
        final int sStart = sIndex;
        final int tStart = tIndex;
        do {
          ++sIndex;
        } while (sIndex < sLength && !Character.isDigit(s.charAt(sIndex)));
        do {
          ++tIndex;
        } while (tIndex < tLength && !Character.isDigit(t.charAt(tIndex)));

        int subwordResult = collator.compare(s.substring(sStart, sIndex),
            t.substring(tStart, tIndex));
        if (subwordResult != 0) {
          return subwordResult;
        }
      } else {
        // ---- Compare words character by character ----
        // No collator specified. All characters should be ascii only.
        do {
          if (sChar != tChar) {
            if (caseSensitive) {
              return sChar - tChar;
            }
            int foldedDiff = compareCharsIgnoreCase(sChar, tChar);
            if (foldedDiff != 0) {
              return foldedDiff;
            }
          }
          ++sIndex;
          ++tIndex;
          if (sIndex == sLength && tIndex == tLength) {
            return zeroTieBreak;
          }
          if (sIndex == sLength) {
            return -1;
          }
          if (tIndex == tLength) {
            return 1;
          }
          sChar = s.charAt(sIndex);
          tChar = t.charAt(tIndex);
          sCharIsDigit = Character.isDigit(sChar);
          tCharIsDigit = Character.isDigit(tChar);
        } while (!sCharIsDigit && !tCharIsDigit);
      }
    }
  }

  /**
   * Compares two characters ignoring upper/lower case: both are converted to
   * upper case and, if still different, on to lower case before their values
   * are subtracted.
   *
   * @param a first character
   * @param b second character
   * @return zero if the characters are equal after case folding, otherwise the
   *         difference of the folded character values
   */
  private static int compareCharsIgnoreCase(char a, char b) {
    char upperA = Character.toUpperCase(a);
    char upperB = Character.toUpperCase(b);
    if (upperA == upperB) {
      return 0;
    }
    return Character.toLowerCase(upperA) - Character.toLowerCase(upperB);
  }
}