// StringNormalizer.java example
//
// Explorer
// uml-auto-assessment-master
// - web-cat-src
/*==========================================================================*\
 |  $Id: StringNormalizer.java,v 1.4 2010/02/23 19:19:30 stedwar2 Exp $
 |*-------------------------------------------------------------------------*|
 |  Copyright (C) 2007-2010 Virginia Tech
 |
 |  This file is part of the Student-Library.
 |
 |  The Student-Library is free software; you can redistribute it and/or
 |  modify it under the terms of the GNU Lesser General Public License as
 |  published by the Free Software Foundation; either version 3 of the
 |  License, or (at your option) any later version.
 |
 |  The Student-Library is distributed in the hope that it will be useful,
 |  but WITHOUT ANY WARRANTY; without even the implied warranty of
 |  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 |  GNU Lesser General Public License for more details.
 |
 |  You should have received a copy of the GNU Lesser General Public License
 |  along with the Student-Library; if not, see <http://www.gnu.org/licenses/>.
\*==========================================================================*/

package student.testingsupport;

import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//-------------------------------------------------------------------------
/**
 *  This class represents a programmable string "normalizing" engine that
 *  can be used to convert strings into a canonical form, say, before
 *  comparing strings for equality or something.  Basically, a normalizer
 *  is a list of zero or more rules, or transformations.  The
 *  {@link #normalize(String)} method can be used to apply the entire
 *  set of transformations to a given string.
 *  <p>
 *  For example, you can build a string normalizer that replaces all
 *  sequences of one or more whitespace characters by a single space
 *  character, trims any leading or trailing space, and converts a
 *  string to lower case.  This class provides a number of predefined
 *  transformations in the {@link StandardRule} enumeration.
 *  Some examples:</p>
 *  <pre>
 *  // An "identity" transformation that does nothing:
 *  StringNormalizer norm1 = new StringNormalizer();
 *  // norm1.normalize(...) returns its argument unchanged
 *
 *  // A "lower case" normalizer:
 *  StringNormalizer norm2 = new StringNormalizer(
 *      StringNormalizer.StandardRule.IGNORE_CAPITALIZATION);
 *  // norm2.normalize(...) returns a lower case version of its argument
 *
 *  // self-explanatory:
 *  StringNormalizer norm3 = new StringNormalizer(
 *      StringNormalizer.StandardRule.IGNORE_CAPITALIZATION,
 *      StringNormalizer.StandardRule.IGNORE_PUNCTUATION);
 *
 *  // A "standard" normalizer:
 *  StringNormalizer norm4 = new StringNormalizer(true);
 *  // norm4.normalize(...) returns its contents with all punctuation
 *  // characters removed, all letters converted to lower case, all
 *  // whitespace sequences replaced by single spaces, all MS-DOS or
 *  // Mac line terminators replaced by "\n"'s, and all leading and
 *  // trailing whitespace removed.
 *  </pre>
 *  <p>
 *  Note that string normalizers that contain multiple rules apply those
 *  rules <b>in order</b> (i.e., in the order added, or the
 *  {@link java.util.List} order of this class).  This may produce
 *  inconsistent results if you are not careful when you add your rules.
 *  </p>
 *
 *  @author  Stephen Edwards
 *  @author Last changed by $Author: stedwar2 $
 *  @version $Revision: 1.4 $, $Date: 2010/02/23 19:19:30 $
 */
public class StringNormalizer
    extends ArrayList<StringNormalizer.NormalizerRule>
{
    //~ Instance/static variables .............................................

    private static final long serialVersionUID = -909915399977948511L;

    /**
     * Maps each {@link StandardRule} to the single shared rule object that
     * implements it.  Because the same object is handed out every time,
     * {@link #add(NormalizerRule)} can detect a standard rule that is
     * already present.
     */
    private static final Map<StandardRule, NormalizerRule> standardRules =
        new EnumMap<StandardRule, NormalizerRule>(StandardRule.class);
    // initialize the map
    static
    {
        // Converts Unicode line separators, CR, and CRLF to a single LF.
        final String anyNewlineToLf = "\\p{Zl}|\r(\n?)";

        // ---- IGNORE_PUNCTUATION ----
        standardRules.put(StandardRule.IGNORE_PUNCTUATION,
            new RegexNormalizerRule(
                "[^\\p{javaLetterOrDigit}" // if not letters or digits
                + "\\p{Pc}"                // or connector punctuation
                + "\\p{Nl}"                // or LETTER_NUMBER
                + "\\p{javaWhitespace}]+", // or white space
                ""                         // then remove it
            ));

        // ---- IGNORE_CAPITALIZATION ----
        standardRules.put(StandardRule.IGNORE_CAPITALIZATION,
            new NormalizerRule() {
                @Override
                public String normalize(String content)
                {
                    // Locale.ROOT keeps the result independent of the
                    // JVM's default locale (e.g., Turkish dotless i).
                    return content.toLowerCase(Locale.ROOT);
                }
            });

        // ---- IGNORE_NEWLINE_DIFFERENCES ----
        standardRules.put(StandardRule.IGNORE_NEWLINE_DIFFERENCES,
            new RegexNormalizerRule(anyNewlineToLf, "\n"));

        // ---- IGNORE_SPACING_DIFFERENCES ----
        standardRules.put(StandardRule.IGNORE_SPACING_DIFFERENCES,
            new RegexNormalizerRule(
                // normalize line endings
                anyNewlineToLf, "\n",

                // trim leading space from every line
                "(?dm)^[\\p{javaWhitespace}&&[^\n]]+", "",
                // trim trailing space from every line
                "(?dm)[\\p{javaWhitespace}&&[^\n]]+$", "",
                // normalize other space
                "[\\p{javaWhitespace}&&[^\n]]+",  " "
            ));

        // ---- IGNORE_TRAILING_NEWLINES ----
        // A character class accepts exactly the same runs of CR, LF and
        // Unicode line separators as an alternation of CR, CRLF and LF
        // would, but without the ambiguity (CRLF vs. CR then LF) that
        // invites heavy backtracking on long runs of CRLF.
        standardRules.put(StandardRule.IGNORE_TRAILING_NEWLINES,
            new RegexNormalizerRule(
                "[\r\n\\p{Zl}]+$", ""
            ));

        // ---- OPT_IGNORE_BLANK_LINES ----
        // One line terminator, followed by any number of empty or
        // whitespace-only lines, collapses to that first terminator.
        // The leading terminator is an atomic group so that a CRLF pair
        // is never re-read as a CR terminator plus an "empty line" ended
        // by the LF.  The whitespace run may be empty, so that a plain
        // empty line ("\n\n") is recognized as blank.
        standardRules.put(StandardRule.OPT_IGNORE_BLANK_LINES,
            new RegexNormalizerRule(
                "((?>\r\n|\r|\n|\\p{Zl}))\\p{javaWhitespace}*"
                + "(?:\r\n|\r|\n|\\p{Zl})", "$1"
            ));

        // ---- OPT_IGNORE_ALL_WHITESPACE ----
        standardRules.put(StandardRule.OPT_IGNORE_ALL_WHITESPACE,
            new RegexNormalizerRule(
                "[\\p{javaWhitespace}&&[^\\p{Zl}\r\n]]+", ""
            ));

        // ---- OPT_IGNORE_ALL_WHITESPACE_AND_NEWLINES ----
        standardRules.put(StandardRule.OPT_IGNORE_ALL_WHITESPACE_AND_NEWLINES,
            new RegexNormalizerRule("\\p{javaWhitespace}+", ""));
    }


    //~ Constructor ...........................................................

    // ----------------------------------------------------------
    /**
     * Creates a new StringNormalizer object containing no rules (the
     * "identity" normalizer).
     */
    public StringNormalizer()
    {
        // Nothing to do
    }


    // ----------------------------------------------------------
    /**
     * Creates a new StringNormalizer object, optionally containing the
     * standard set of rules.  The standard set is all those in
     * {@link StandardRule} except the OPT_* rules.
     * @param useStandardRules If true, the set of standard (non-OPT_*)
     * rules will be used.  If false, an "identity" normalizer will be
     * produced instead.
     */
    public StringNormalizer(boolean useStandardRules)
    {
        if (useStandardRules)
        {
            addStandardRules();
        }
    }


    // ----------------------------------------------------------
    /**
     * Creates a new StringNormalizer object containing the given
     * set of standard rules, added in the order given.
     * @param rules a (variable-length) comma-separated sequence of
     * rules to add; a null array is treated as "no rules"
     */
    public StringNormalizer(StandardRule ... rules)
    {
        if (rules != null)
        {
            for (StandardRule rule : rules)
            {
                add(rule);
            }
        }
    }


    // ----------------------------------------------------------
    /**
     * Creates a new StringNormalizer object containing the given
     * set of rules, added in the order given.
     * @param rules a (variable-length) comma-separated sequence of
     * rules to add; a null array is treated as "no rules"
     */
    public StringNormalizer(NormalizerRule ... rules)
    {
        if (rules != null)
        {
            for (NormalizerRule rule : rules)
            {
                add(rule);
            }
        }
    }


    // ----------------------------------------------------------
    /**
     * Creates a new StringNormalizer object containing the given
     * set of rules.  The collection's contents are copied as-is by the
     * {@link ArrayList} copy constructor, so, unlike
     * {@link #add(NormalizerRule)}, no duplicate filtering takes place.
     * @param rules a collection of rules to add (could be another
     * StringNormalizer, or any other kind of collection)
     */
    public StringNormalizer(Collection<? extends NormalizerRule> rules)
    {
        super(rules);
    }


    //~ Methods ...............................................................

    // ----------------------------------------------------------
    /**
     * Normalize a string by applying a set of normalization rules
     * (transformations), in list order.  Each rule receives the output
     * of the rule before it.
     * @param content The string to transform (may be null)
     * @return The result after all rules have been applied, or null if
     * the given content was null
     */
    public String normalize(String content)
    {
        if (content == null)
        {
            return null;
        }
        String result = content;
        for (NormalizerRule rule : this)
        {
            result = rule.normalize(result);
        }
        return result;
    }


    // ----------------------------------------------------------
    /**
     * Add the standard set of rules.  The standard set is all those in
     * {@link StandardRule} except the OPT_* rules, added in their
     * declaration order.
     */
    public void addStandardRules()
    {
        for (StandardRule rule : StandardRule.values())
        {
            // name() is the declared constant name and cannot be
            // overridden, unlike toString().
            if (!rule.name().startsWith("OPT_"))
            {
                add(rule);
            }
        }
    }


    // ----------------------------------------------------------
    /**
     * Add the specified standard rule, as defined in {@link StandardRule}.
     * Note that you can also use the inherited
     * {@link java.util.List#add(Object)} method to add custom NormalizerRule
     * objects.
     * @param rule The rule to add
     */
    public void add(StandardRule rule)
    {
        add(standardRule(rule));
    }


    // ----------------------------------------------------------
    /**
     * Add the specified rule.  For efficiency, only adds the rule if it
     * is not already present in this normalizer (as determined by
     * {@link #contains(Object)}).
     * @param rule The rule to add
     * @return True if the rule was added, or false if it is already
     * present
     */
    @Override
    public boolean add(NormalizerRule rule)
    {
        if (contains(rule))
        {
            return false;
        }
        return super.add(rule);
    }


    // ----------------------------------------------------------
    /**
     * Remove the specified standard rule, as defined in {@link StandardRule}.
     * Note that you can also use the inherited
     * {@link java.util.List#remove(Object)} method to remove other kinds
     * of NormalizerRule objects.
     * @param rule The rule to remove
     */
    public void remove(StandardRule rule)
    {
        remove(standardRule(rule));
    }


    // ----------------------------------------------------------
    /**
     * This interface defines what it means to be a normalizer rule: an
     * object having an appropriate {@link #normalize(String)} method.
     */
    public static interface NormalizerRule
    {
        /**
         * Apply this rule by normalizing the given string.
         * @param content The string to normalize
         * @return The normalized result
         */
        public String normalize(String content);
    }


    // ----------------------------------------------------------
    /**
     * A highly reusable concrete implementation of {@link NormalizerRule}
     * that applies a series of {@link Pattern regular expression}
     * substitutions.
     */
    public static class RegexNormalizerRule
        implements NormalizerRule
    {
        /** True to replace every match, false to replace only the first. */
        private final boolean   everywhere;
        /** Compiled patterns; patterns[i] pairs with replacements[i]. */
        private final Pattern[] patterns;
        /** Replacement strings, in {@link Matcher#replaceAll} syntax. */
        private final String[]  replacements;


        // ----------------------------------------------------------
        /**
         * Create a new regular expression rule using a series of
         * pattern/replacement pairs.  Each pattern/replacement will
         * be applied globally (all matches that can be found).  If multiple
         * patterns are given, they will be applied in the order given.
         * Use this form:
         * <pre>
         * myRule = new RegexNormalizerRule(
         *     "pattern1", "replacement1",
         *     "pattern2", "replacement2",
         *     ... // As many as you want
         * );
         * </pre>
         * @param patterns a series of regular expression pattern/replacement
         * pairs (there <b>must</b> be an even number!)
         * @throws IllegalArgumentException if an odd number of strings
         * is given
         */
        public RegexNormalizerRule(String ... patterns)
        {
            this(true, patterns);
        }


        // ----------------------------------------------------------
        /**
         * Create a new regular expression rule using a series of
         * pattern/replacement pairs.
         * @param everywhere True if all pattern/replacements should be
         * applied globally (for every match), or false if the replacements
         * should only be applied to the first match for each pattern.
         * @param patterns a series of regular expression pattern/replacement
         * pairs (there <b>must</b> be an even number!)
         * @throws IllegalArgumentException if an odd number of strings
         * is given
         */
        public RegexNormalizerRule(boolean everywhere, String ... patterns)
        {
            // Checked unconditionally (not with assert) so that a dangling
            // pattern is reported clearly even when assertions are off,
            // instead of surfacing as an array index error.
            if (patterns.length % 2 != 0)
            {
                throw new IllegalArgumentException(
                    "patterns/replacements must come in pairs");
            }
            final int pairCount = patterns.length / 2;
            this.everywhere   = everywhere;
            this.patterns     = new Pattern[pairCount];
            this.replacements = new String[pairCount];
            for (int pair = 0; pair < pairCount; pair++)
            {
                this.patterns[pair]     = Pattern.compile(patterns[2 * pair]);
                this.replacements[pair] = patterns[2 * pair + 1];
            }
        }


        // ----------------------------------------------------------
        /**
         * Normalize a string by applying all of this rule's regular
         * expression pattern/replacement pairs, in order.  Each pair
         * operates on the output of the pair before it.
         * @param content The string to transform
         * @return The result after all substitutions have been applied
         */
        @Override
        public String normalize(String content)
        {
            String result = content;
            for (int i = 0; i < patterns.length; i++)
            {
                Matcher matcher = patterns[i].matcher(result);
                result = everywhere
                    ? matcher.replaceAll(replacements[i])
                    : matcher.replaceFirst(replacements[i]);
            }
            return result;
        }
    }


    // ----------------------------------------------------------
    /**
     * This enumeration defines the set of predefined transformation rules.
     */
    public static enum StandardRule
    {
        /**
         * Strips all punctuation characters (that is, characters that are
         * not letters, numbers, connecting punctuation like "_", or white
         * space).
         */
        IGNORE_PUNCTUATION,

        /**
         * Converts to all lower case, independent of the default locale.
         */
        IGNORE_CAPITALIZATION,

        /**
         * Convert all MS-DOS (CRLF, "\r\n") and Mac (CR, "\r") line
         * termination sequences to Unix-style (LF, "\n") termination
         * sequences.
         */
        IGNORE_NEWLINE_DIFFERENCES,

        /**
         * Same as IGNORE_NEWLINE_DIFFERENCES, but also converts all
         * non-empty sequences of white space characters (except newlines)
         * to single spaces, and trims any leading or trailing white space
         * from every line.
         */
        IGNORE_SPACING_DIFFERENCES,

        /**
         * Trims any sequence of trailing line termination sequences
         * (regardless of OS).
         */
        IGNORE_TRAILING_NEWLINES,

        /**
         * Removes blank lines (empty lines, or lines containing only
         * white space) that follow another line.  The line terminator
         * that precedes a run of blank lines is kept unchanged.
         */
        OPT_IGNORE_BLANK_LINES,

        /**
         * Removes all white space except line termination sequences (from
         * any OS).
         */
        OPT_IGNORE_ALL_WHITESPACE,

        /**
         * Removes all white space of any kind, including newlines of any kind.
         */
        OPT_IGNORE_ALL_WHITESPACE_AND_NEWLINES
    }


    // ----------------------------------------------------------
    /**
     * Retrieve a standard rule by name.  The same rule object is returned
     * on every call for a given name.
     * @param rule the rule to retrieve
     * @return The corresponding {@link NormalizerRule}
     */
    public static NormalizerRule standardRule(StandardRule rule)
    {
        return standardRules.get(rule);
    }
}