// NaturalStringOrder.java (from onebusaway-application-modules)
/**
 * Copyright (C) 2011 Brian Ferris <bdferris@onebusaway.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onebusaway.utility.text;

import java.text.Collator;

/**
 * Utility class for natural string order operations (a1 &lt; a2 &lt; a10).
 * <p>
 * Written by Stephen Friedrich
 * (http://weblogs.java.net/blog/skelvin/archive/2006/01/natural_string.html).
 * No license specified? Code found at
 * http://svn.atlassian.com/fisheye/browse/public/contrib/confluence/linking-plugin/trunk/src/java/com/eekboom/utils/Strings.java?r=1322&%40annotateMode=none
 * </p>
 * <p>
 * Only the static <code>compareNatural*</code> methods are provided by this
 * class; it does not expose any comparator factory methods.
 * </p>
 * <p>
 * Strings are processed as alternating runs of digit characters (as
 * classified by {@link Character#isDigit(char)}) and non-digit characters.
 * Digit runs are compared by numeric value, non-digit runs either with a
 * {@link Collator} or character by character.
 * </p>
 *
 * TODO: Determine license situation
 */
public class NaturalStringOrder {

  /** Radix used to obtain the numeric value of a digit character. */
  private static final int DECIMAL_RADIX = 10;

  /**
   * This is a utility class (static methods only), don't instantiate.
   */
  private NaturalStringOrder() {
  }

  /**
   * <p>
   * Compares two strings using the current locale's rules and comparing
   * contained numbers based on their numeric values.
   * </p>
   * <p>
   * This is probably the best default comparison to use.
   * </p>
   * <p>
   * If you know that the texts to be compared are in a certain language that
   * differs from the default locale's language, then get a collator for the
   * desired locale (
   * {@link java.text.Collator#getInstance(java.util.Locale)}) and pass it to
   * {@link #compareNatural(java.text.Collator, String, String)}
   * </p>
   *
   * @param s first string
   * @param t second string
   * @return zero if <code>s</code> and <code>t</code> are considered equal, a
   *         value less than zero if <code>s</code> precedes <code>t</code> and
   *         a value larger than zero if <code>s</code> follows <code>t</code>.
   *         Zero can be returned for strings that are not identical, for
   *         example when they differ only in leading zeros of a number that is
   *         followed by further text, or when the collator treats two
   *         subwords as equal.
   * @throws NullPointerException if <code>s</code> or <code>t</code> is null
   */
  public static int compareNatural(String s, String t) {
    // The case flag is irrelevant here because a collator is always supplied.
    return compareNatural(s, t, false, Collator.getInstance());
  }

  /**
   * <p>
   * Compares two strings using the given collator and comparing contained
   * numbers based on their numeric values.
   * </p>
   *
   * @param collator used to compare the non-number subwords; if it is null
   *          the non-number parts are compared character by character, case
   *          sensitively, exactly like
   *          {@link #compareNaturalAscii(String, String)}
   * @param s first string
   * @param t second string
   * @return zero if <code>s</code> and <code>t</code> are considered equal, a
   *         value less than zero if <code>s</code> precedes <code>t</code> and
   *         a value larger than zero if <code>s</code> follows <code>t</code>
   * @throws NullPointerException if <code>s</code> or <code>t</code> is null
   */
  public static int compareNatural(Collator collator, String s, String t) {
    return compareNatural(s, t, true, collator);
  }

  /**
   * <p>
   * Compares two strings using each character's Unicode value for non-digit
   * characters and the numeric values of any contained numbers.
   * </p>
   * <p>
   * (This will probably make sense only for strings containing 7-bit ascii
   * characters only.)
   * </p>
   *
   * @param s first string
   * @param t second string
   * @return zero if <code>s</code> and <code>t</code> are considered equal, a
   *         value less than zero if <code>s</code> precedes <code>t</code> and
   *         a value larger than zero if <code>s</code> follows <code>t</code>
   * @throws NullPointerException if <code>s</code> or <code>t</code> is null
   */
  public static int compareNaturalAscii(String s, String t) {
    return compareNatural(s, t, true, null);
  }

  /**
   * <p>
   * Compares two strings using each character's Unicode value - ignoring
   * upper/lower case - for non-digit characters and the numeric values of any
   * contained numbers.
   * </p>
   * <p>
   * (This will probably make sense only for strings containing 7-bit ascii
   * characters only.)
   * </p>
   *
   * @param s first string
   * @param t second string
   * @return zero if <code>s</code> and <code>t</code> are considered equal, a
   *         value less than zero if <code>s</code> precedes <code>t</code> and
   *         a value larger than zero if <code>s</code> follows <code>t</code>
   * @throws NullPointerException if <code>s</code> or <code>t</code> is null
   */
  public static int compareNaturalIgnoreCaseAscii(String s, String t) {
    return compareNatural(s, t, false, null);
  }

  /**
   * Shared implementation behind all public <code>compareNatural*</code>
   * methods.
   * <p>
   * Both strings are walked in parallel. Whenever both current characters are
   * digits, the two digit runs are compared as numbers: leading zeros are
   * skipped, then the longer run wins, and for runs of equal length the first
   * differing digit decides. Digit values are taken from
   * {@link Character#digit(char, int)}, so zeros and digits of any script
   * accepted by {@link Character#isDigit(char)} are compared by their numeric
   * value rather than by their code unit. The number of leading zeros is used
   * as a tie-breaker only when both strings end with the compared number.
   * </p>
   *
   * @param s first string
   * @param t second string
   * @param caseSensitive if true, characters that differ only in case are
   *          treated as different; if false they are treated as equal - will
   *          be ignored if a collator is given
   * @param collator used to compare subwords that aren't numbers - if null,
   *          characters will be compared individually based on their Unicode
   *          value
   * @return zero if <code>s</code> and <code>t</code> are considered equal, a
   *         value less than zero if <code>s</code> precedes <code>t</code> and
   *         a value larger than zero if <code>s</code> follows <code>t</code>
   */
  private static int compareNatural(String s, String t, boolean caseSensitive,
      Collator collator) {
    int sIndex = 0;
    int tIndex = 0;

    final int sLength = s.length();
    final int tLength = t.length();

    while (true) {
      // both character indices are after a subword (or at zero)

      // Check if one string is at end: the exhausted string sorts first
      if (sIndex == sLength && tIndex == tLength) {
        return 0;
      }
      if (sIndex == sLength) {
        return -1;
      }
      if (tIndex == tLength) {
        return 1;
      }

      // Compare sub word
      char sChar = s.charAt(sIndex);
      char tChar = t.charAt(tIndex);

      boolean sCharIsDigit = Character.isDigit(sChar);
      boolean tCharIsDigit = Character.isDigit(tChar);

      if (sCharIsDigit && tCharIsDigit) {
        // Compare numbers

        // skip leading 0s (of any script, not just the ASCII '0')
        int sLeadingZeroCount = 0;
        while (Character.digit(sChar, DECIMAL_RADIX) == 0) {
          ++sLeadingZeroCount;
          ++sIndex;
          if (sIndex == sLength) {
            break;
          }
          sChar = s.charAt(sIndex);
        }
        int tLeadingZeroCount = 0;
        while (Character.digit(tChar, DECIMAL_RADIX) == 0) {
          ++tLeadingZeroCount;
          ++tIndex;
          if (tIndex == tLength) {
            break;
          }
          tChar = t.charAt(tIndex);
        }

        // A run consisting only of zeros has numeric value 0
        boolean sAllZero = sIndex == sLength || !Character.isDigit(sChar);
        boolean tAllZero = tIndex == tLength || !Character.isDigit(tChar);
        if (sAllZero && tAllZero) {
          // both numbers are zero: carry on with whatever follows them
          continue;
        }
        if (sAllZero && !tAllZero) {
          return -1;
        }
        if (tAllZero) {
          return 1;
        }

        // Both runs now start with a non-zero digit. Remember the first digit
        // difference; it only decides if both runs turn out equally long.
        int diff = 0;
        do {
          if (diff == 0) {
            diff = Character.digit(sChar, DECIMAL_RADIX)
                - Character.digit(tChar, DECIMAL_RADIX);
          }
          ++sIndex;
          ++tIndex;
          if (sIndex == sLength && tIndex == tLength) {
            // same number of significant digits and nothing follows: fall
            // back to the leading zero count to break a tie
            return diff != 0 ? diff : sLeadingZeroCount - tLeadingZeroCount;
          }
          if (sIndex == sLength) {
            if (diff == 0) {
              return -1;
            }
            // t's number is longer (hence larger) if another digit follows
            return Character.isDigit(t.charAt(tIndex)) ? -1 : diff;
          }
          if (tIndex == tLength) {
            if (diff == 0) {
              return 1;
            }
            // s's number is longer (hence larger) if another digit follows
            return Character.isDigit(s.charAt(sIndex)) ? 1 : diff;
          }
          sChar = s.charAt(sIndex);
          tChar = t.charAt(tIndex);
          sCharIsDigit = Character.isDigit(sChar);
          tCharIsDigit = Character.isDigit(tChar);
          if (!sCharIsDigit && !tCharIsDigit) {
            // both number sub words have the same length
            if (diff != 0) {
              return diff;
            }
            break;
          }
          if (!sCharIsDigit) {
            // s's number ended first, so it is the smaller one
            return -1;
          }
          if (!tCharIsDigit) {
            // t's number ended first, so it is the smaller one
            return 1;
          }
        } while (true);
      } else {
        // Compare words
        if (collator != null) {
          // To use the collator the whole subwords have to be compared -
          // character-by-character comparison is not possible. So find the
          // two subwords first. Each subword takes the current character
          // plus all following non-digit characters.
          int aw = sIndex;
          int bw = tIndex;
          do {
            ++sIndex;
          } while (sIndex < sLength && !Character.isDigit(s.charAt(sIndex)));
          do {
            ++tIndex;
          } while (tIndex < tLength && !Character.isDigit(t.charAt(tIndex)));

          String as = s.substring(aw, sIndex);
          String bs = t.substring(bw, tIndex);
          int subwordResult = collator.compare(as, bs);
          if (subwordResult != 0) {
            return subwordResult;
          }
        } else {
          // No collator specified. All characters should be ascii only. Compare
          // character-by-character until a digit shows up in either string.
          do {
            int charResult = compareChars(sChar, tChar, caseSensitive);
            if (charResult != 0) {
              return charResult;
            }
            ++sIndex;
            ++tIndex;
            if (sIndex == sLength && tIndex == tLength) {
              return 0;
            }
            if (sIndex == sLength) {
              return -1;
            }
            if (tIndex == tLength) {
              return 1;
            }
            sChar = s.charAt(sIndex);
            tChar = t.charAt(tIndex);
            sCharIsDigit = Character.isDigit(sChar);
            tCharIsDigit = Character.isDigit(tChar);
          } while (!sCharIsDigit && !tCharIsDigit);
        }
      }
    }
  }

  /**
   * Compares two single characters by their Unicode value, optionally
   * ignoring case.
   *
   * @param a first character
   * @param b second character
   * @param caseSensitive if false, characters that become equal after upper
   *          casing (or after subsequently lower casing) count as equal
   * @return zero if the characters are considered equal, otherwise the
   *         difference of the (possibly case folded) character values
   */
  private static int compareChars(char a, char b, boolean caseSensitive) {
    if (a == b) {
      return 0;
    }
    if (caseSensitive) {
      return a - b;
    }
    a = Character.toUpperCase(a);
    b = Character.toUpperCase(b);
    if (a == b) {
      return 0;
    }
    // Folding twice mirrors String.compareToIgnoreCase: some characters only
    // become equal after upper casing followed by lower casing.
    a = Character.toLowerCase(a);
    b = Character.toLowerCase(b);
    return a - b;
  }
}