/*==========================================================================*\ | $Id: StringNormalizer.java,v 1.4 2010/02/23 19:19:30 stedwar2 Exp $ |*-------------------------------------------------------------------------*| | Copyright (C) 2007-2010 Virginia Tech | | This file is part of the Student-Library. | | The Student-Library is free software; you can redistribute it and/or | modify it under the terms of the GNU Lesser General Public License as | published by the Free Software Foundation; either version 3 of the | License, or (at your option) any later version. | | The Student-Library is distributed in the hope that it will be useful, | but WITHOUT ANY WARRANTY; without even the implied warranty of | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | GNU Lesser General Public License for more details. | | You should have received a copy of the GNU Lesser General Public License | along with the Student-Library; if not, see <http://www.gnu.org/licenses/>. \*==========================================================================*/ package student.testingsupport; import java.util.ArrayList; import java.util.Collection; import java.util.EnumMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; //------------------------------------------------------------------------- /** * This class represents a programmable string "normalizing" engine that * can be used to convert strings into a canonical form, say, before * comparing strings for equality or something. Basically, a normalizer * is a list of zero or more rules, or transformations. The * {@link #normalize(String)} method can be used to apply the entire * set of transformations to a given string. * <p> * For example, you can build a string normalizer that replaces all * sequences of one or more whitespace characters by a single space * character, trims any leading or trailing space, and converts a * string to lower case. 
/**
 * This class represents a programmable string "normalizing" engine that
 * can be used to convert strings into a canonical form, say, before
 * comparing strings for equality.  A normalizer is a list of zero or more
 * rules (transformations); {@link #normalize(String)} applies the entire
 * list, in order, to a given string.
 * <p>
 * For example, you can build a string normalizer that replaces all
 * sequences of one or more whitespace characters by a single space
 * character, trims any leading or trailing space, and converts a
 * string to lower case.  This class provides a number of predefined
 * transformations in the {@link StandardRule} enumeration.
 * Some examples:</p>
 * <pre>
 * // An "identity" transformation that does nothing:
 * StringNormalizer norm1 = new StringNormalizer();
 * // norm1.normalize(...) returns its argument unchanged
 *
 * // A "lower case" normalizer:
 * StringNormalizer norm2 = new StringNormalizer(
 *     StringNormalizer.StandardRule.IGNORE_CAPITALIZATION);
 * // norm2.normalize(...) returns a lower case version of its argument
 *
 * // self-explanatory:
 * StringNormalizer norm3 = new StringNormalizer(
 *     StringNormalizer.StandardRule.IGNORE_CAPITALIZATION,
 *     StringNormalizer.StandardRule.IGNORE_PUNCTUATION);
 *
 * // A "standard" normalizer:
 * StringNormalizer norm4 = new StringNormalizer(true);
 * // norm4.normalize(...) returns its contents with all punctuation
 * // characters removed, all letters converted to lower case, all
 * // whitespace sequences replaced by single spaces, all MS-DOS or
 * // Mac line terminators replaced by "\n"'s, and all leading and
 * // trailing whitespace removed.
 * </pre>
 * <p>
 * Note that string normalizers that contain multiple rules apply those
 * rules <b>in order</b> (i.e., in the order added, or the
 * {@link java.util.List} order of this class).  This may produce
 * inconsistent results if you are not careful when you add your rules.
 * </p>
 *
 * @author  Stephen Edwards
 * @author  Last changed by $Author: stedwar2 $
 * @version $Revision: 1.4 $, $Date: 2010/02/23 19:19:30 $
 */
public class StringNormalizer
    extends ArrayList<StringNormalizer.NormalizerRule> {

    //~ Instance/static variables .............................................

    private static final long serialVersionUID = -909915399977948511L;

    /** The shared rule object backing each {@link StandardRule} constant. */
    private static final Map<StandardRule, NormalizerRule> standardRules =
        new EnumMap<StandardRule, NormalizerRule>(StandardRule.class);

    // initialize the map
    static {
        // ---- IGNORE_PUNCTUATION ----
        standardRules.put(StandardRule.IGNORE_PUNCTUATION,
            new RegexNormalizerRule(
                "[^\\p{javaLetterOrDigit}"  // if not letters or digits
                + "\\p{Pc}"                 // or connector punctuation
                + "\\p{Nl}"                 // or LETTER_NUMBER
                + "\\p{javaWhitespace}]+",  // or white space
                ""                          // then remove it
            ));

        // ---- IGNORE_CAPITALIZATION ----
        standardRules.put(StandardRule.IGNORE_CAPITALIZATION,
            new NormalizerRule() {
                @Override
                public String normalize(String content) {
                    // Locale.ROOT keeps the result independent of the JVM's
                    // default locale (a Turkish default would otherwise map
                    // "I" to a dotless i).  Fully qualified because the
                    // file does not import java.util.Locale.
                    return content.toLowerCase(java.util.Locale.ROOT);
                }
            });

        // ---- IGNORE_NEWLINE_DIFFERENCES ----
        standardRules.put(StandardRule.IGNORE_NEWLINE_DIFFERENCES,
            new RegexNormalizerRule(
                "\\p{Zl}|\r(\n?)", "\n"
            ));

        // ---- IGNORE_SPACING_DIFFERENCES ----
        standardRules.put(StandardRule.IGNORE_SPACING_DIFFERENCES,
            new RegexNormalizerRule(
                // normalize line endings
                "\\p{Zl}|\r(\n?)", "\n",
                // trim leading space from every line
                "(?dm)^[\\p{javaWhitespace}&&[^\n]]+", "",
                // trim trailing space from every line
                "(?dm)[\\p{javaWhitespace}&&[^\n]]+$", "",
                // normalize other space
                "[\\p{javaWhitespace}&&[^\n]]+", " "
            ));

        // ---- IGNORE_TRAILING_NEWLINES ----
        standardRules.put(StandardRule.IGNORE_TRAILING_NEWLINES,
            new RegexNormalizerRule(
                "(\\p{Zl}|\r(\n?)|\n)+$", ""
            ));

        // ---- OPT_IGNORE_BLANK_LINES ----
        // A blank line is: a line terminator, zero or more whitespace
        // characters, and another line terminator; it is replaced by the
        // first terminator.  "*" (rather than "+") is what lets a single,
        // completely empty line match.  The possessive "\n?+" stops the
        // engine from splitting one CRLF into "\r" + "\n" and treating it
        // as a blank line.
        standardRules.put(StandardRule.OPT_IGNORE_BLANK_LINES,
            new RegexNormalizerRule(
                "(\\p{Zl}|\r\n?+|\n)(\\p{javaWhitespace}*)"
                + "(\\p{Zl}|\r\n?|\n)", "$1"
            ));

        // ---- OPT_IGNORE_ALL_WHITESPACE ----
        standardRules.put(StandardRule.OPT_IGNORE_ALL_WHITESPACE,
            new RegexNormalizerRule(
                "[\\p{javaWhitespace}&&[^\\p{Zl}\r\n]]+", ""
            ));

        // ---- OPT_IGNORE_ALL_WHITESPACE_AND_NEWLINES ----
        standardRules.put(StandardRule.OPT_IGNORE_ALL_WHITESPACE_AND_NEWLINES,
            new RegexNormalizerRule("\\p{javaWhitespace}+", ""));
    }


    //~ Constructor ...........................................................

    // ----------------------------------------------------------
    /**
     * Creates a new StringNormalizer object containing no rules (the
     * "identity" normalizer).
     */
    public StringNormalizer() {
        // Nothing to do
    }


    // ----------------------------------------------------------
    /**
     * Creates a new StringNormalizer object, optionally containing the
     * standard set of rules.  The standard set is all those in
     * {@link StandardRule} except the OPT_* rules.
     * @param useStandardRules If true, the set of standard (non-OPT_*)
     * rules will be used.  If false, an "identity" normalizer will be
     * produced instead.
     */
    public StringNormalizer(boolean useStandardRules) {
        if (useStandardRules) {
            addStandardRules();
        }
    }


    // ----------------------------------------------------------
    /**
     * Creates a new StringNormalizer object containing the given
     * set of rules.
     * @param rules a (variable-length) comma-separated sequence of
     * rules to add; a null array adds nothing
     */
    public StringNormalizer(StandardRule ... rules) {
        if (rules != null) {
            for (StandardRule rule : rules) {
                add(rule);
            }
        }
    }


    // ----------------------------------------------------------
    /**
     * Creates a new StringNormalizer object containing the given
     * set of rules.
     * @param rules a (variable-length) comma-separated sequence of
     * rules to add; a null array adds nothing
     */
    public StringNormalizer(NormalizerRule ... rules) {
        if (rules != null) {
            for (NormalizerRule rule : rules) {
                add(rule);
            }
        }
    }


    // ----------------------------------------------------------
    /**
     * Creates a new StringNormalizer object containing the given
     * set of rules.  The collection is copied as-is by the
     * {@link ArrayList} constructor, so duplicates in it are kept.
     * @param rules a collection of rules to add (could be another
     * StringNormalizer, or any other kind of collection)
     */
    public StringNormalizer(Collection<? extends NormalizerRule> rules) {
        super(rules);
    }


    //~ Methods ...............................................................

    // ----------------------------------------------------------
    /**
     * Normalize a string by applying a set of normalization rules
     * (transformations), in list order.
     * @param content The string to transform
     * @return The result after all rules have been applied, or null if
     * <code>content</code> is null
     */
    public String normalize(String content) {
        if (content == null) {
            return content;
        }
        for (NormalizerRule rule : this) {
            content = rule.normalize(content);
        }
        return content;
    }


    // ----------------------------------------------------------
    /**
     * Add the standard set of rules.  The standard set is all those in
     * {@link StandardRule} except the OPT_* rules.
     */
    public void addStandardRules() {
        for (StandardRule rule : StandardRule.values()) {
            if (!rule.name().startsWith("OPT_")) {
                add(rule);
            }
        }
    }


    // ----------------------------------------------------------
    /**
     * Add the specified standard rule, as defined in {@link StandardRule}.
     * Note that you can also use the inherited
     * {@link java.util.List#add(Object)} method to add custom NormalizerRule
     * objects.
     * @param rule The rule to add
     */
    public void add(StandardRule rule) {
        add(standardRule(rule));
    }


    // ----------------------------------------------------------
    /**
     * Add the specified rule.  For efficiency, only adds the rule if it
     * is not already present in this normalizer.
     * @param rule The rule to add
     * @return True if the rule was added, or false if it is already
     * present
     */
    @Override
    public boolean add(NormalizerRule rule) {
        return contains(rule) ? false : super.add(rule);
    }


    // ----------------------------------------------------------
    /**
     * Remove the specified standard rule, as defined in {@link StandardRule}.
     * Note that you can also use the inherited
     * {@link java.util.List#remove(Object)} method to remove other kinds
     * of NormalizerRule objects.
     * @param rule The rule to remove
     */
    public void remove(StandardRule rule) {
        remove(standardRule(rule));
    }


    // ----------------------------------------------------------
    /**
     * This interface defines what it means to be a normalizer rule: an
     * object having an appropriate {@link #normalize(String)} method.
     */
    public static interface NormalizerRule {
        /**
         * Apply this rule by normalizing the given string.
         * @param content The string to normalize
         * @return The normalized result
         */
        public String normalize(String content);
    }


    // ----------------------------------------------------------
    /**
     * A highly reusable concrete implementation of {@link NormalizerRule}
     * that applies a series of {@link Pattern regular expression}
     * substitutions.
     */
    public static class RegexNormalizerRule
        implements NormalizerRule {

        private final boolean   everywhere;
        private final Pattern[] patterns;
        private final String[]  replacements;


        // ----------------------------------------------------------
        /**
         * Create a new regular expression rule using a series of
         * pattern/replacement pairs.  Each pattern/replacement will
         * be applied globally (all matches that can be found).  If multiple
         * patterns are given, they will be applied in the order given.
         * Use this form:
         * <pre>
         * myRule = new RegexNormalizerRule(
         *     "pattern1", "replacement1",
         *     "pattern2", "replacement2",
         *     ...  // As many as you want
         *     );
         * </pre>
         * @param patterns a series of regular expression pattern/replacement
         * pairs (there <b>must</b> be an even number!)
         * @throws IllegalArgumentException if an odd number of strings is
         * given
         */
        public RegexNormalizerRule(String ... patterns) {
            this(true, patterns);
        }


        // ----------------------------------------------------------
        /**
         * Create a new regular expression rule using a series of
         * pattern/replacement pairs.
         * @param everywhere True if all pattern/replacements should be
         * applied globally (for every match), or false if the replacements
         * should only be applied to the first match for each pattern.
         * @param patterns a series of regular expression pattern/replacement
         * pairs (there <b>must</b> be an even number!)
         * @throws IllegalArgumentException if an odd number of strings is
         * given
         */
        public RegexNormalizerRule(boolean everywhere, String ... patterns) {
            // Checked unconditionally: an assert would vanish when
            // assertions are disabled and the loop below would then run
            // off the end of the array.
            if (patterns.length % 2 != 0) {
                throw new IllegalArgumentException(
                    "patterns/replacements must come in pairs");
            }
            int pairCount = patterns.length / 2;
            this.everywhere = everywhere;
            this.patterns = new Pattern[pairCount];
            this.replacements = new String[pairCount];
            for (int pair = 0; pair < pairCount; pair++) {
                this.patterns[pair] = Pattern.compile(patterns[2 * pair]);
                this.replacements[pair] = patterns[2 * pair + 1];
            }
        }


        // ----------------------------------------------------------
        /**
         * Normalize a string by applying all of this rule's regular
         * expression pattern/replacement pairs, in order.
         * @param content The string to transform
         * @return The result after all substitutions have been applied
         */
        @Override
        public String normalize(String content) {
            for (int i = 0; i < patterns.length; i++) {
                Matcher matcher = patterns[i].matcher(content);
                content = everywhere
                    ? matcher.replaceAll(replacements[i])
                    : matcher.replaceFirst(replacements[i]);
            }
            return content;
        }
    }


    // ----------------------------------------------------------
    /**
     * This enumeration defines the set of predefined transformation rules.
     */
    public static enum StandardRule {
        /**
         * Strips all punctuation characters (that is, characters that are
         * not letters, numbers, connecting punctuation like "_", or white
         * space).
         */
        IGNORE_PUNCTUATION,

        /**
         * Converts to all lower case.
         */
        IGNORE_CAPITALIZATION,

        /**
         * Convert all MS-DOS (CRLF, "\r\n") and Mac (CR, "\r") line
         * termination sequences to Unix-style (LF, "\n") termination
         * sequences.
         */
        IGNORE_NEWLINE_DIFFERENCES,

        /**
         * Same as IGNORE_NEWLINE_DIFFERENCES, but also converts all
         * non-empty sequences of white space characters (except newlines)
         * to single spaces, and trims any leading or trailing white space
         * from every line.
         */
        IGNORE_SPACING_DIFFERENCES,

        /**
         * Trims any sequence of trailing line termination sequences
         * (regardless of OS).
         */
        IGNORE_TRAILING_NEWLINES,

        /**
         * Removes any blank lines (lines that are empty or contain only
         * white space) that follow a line terminator.
         */
        OPT_IGNORE_BLANK_LINES,

        /**
         * Removes all white space except line termination sequences (from
         * any OS).
         */
        OPT_IGNORE_ALL_WHITESPACE,

        /**
         * Removes all white space of any kind, including newlines of any kind.
         */
        OPT_IGNORE_ALL_WHITESPACE_AND_NEWLINES
    }


    // ----------------------------------------------------------
    /**
     * Retrieve a standard rule by name.
     * @param rule the rule to retrieve
     * @return The corresponding {@link NormalizerRule}
     */
    public static NormalizerRule standardRule(StandardRule rule) {
        return standardRules.get(rule);
    }
}