package org.marketcetera.util.misc; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.util.HashMap; /** * A filter for Unicode code points. It also maintains a cache of * filters associated with {@link Charset} instances. * * <p>For charset-based filters, this class may perform slowly when * the cache is built, if the JVM is running with an active debugging * agent. This is because the JRE implements the acceptability test by * throwing and catching an exception, which is trapped by the agent; * if the charset can only encode a small subset of the Unicode code * points, then a lot of exceptions are thrown and caught, resulting * in a performance degradation as the agent intercepts repeatedly * (even if the debugger does not indicate an interest in * exceptions).</p> * * @author tlerios@marketcetera.com * @since 0.6.0 * @version $Id: UCPFilter.java 16154 2012-07-14 16:34:05Z colin $ */ /* $License$ */ @ClassVersion("$Id: UCPFilter.java 16154 2012-07-14 16:34:05Z colin $") public abstract class UCPFilter { // CLASS DATA. /** * A filter for Unicode characters that can be encoded by a * specific charset. */ @ClassVersion("$Id: UCPFilter.java 16154 2012-07-14 16:34:05Z colin $") private static final class UCPCharsetFilter extends UCPFilter { // INSTANCE DATA. private CharsetEncoder mEncoder; // CONSTRUCTORS. /** * Creates a filter for the given charset. * * @param cs The charset. */ public UCPCharsetFilter (Charset cs) { mEncoder=cs.newEncoder(); } // UCPFilter. @Override public boolean isAcceptable(int ucp) { return mEncoder.canEncode(StringUtils.fromUCP(ucp)); } } /** * A filter for Unicode characters deemed valid by {@link * StringUtils#isValid(int)}. */ public static final UCPFilter VALID=new UCPFilter() { @Override public boolean isAcceptable(int ucp) { return StringUtils.isValid(ucp); } }; /** * A filter for Unicode characters that can be represented by a * single char. */ public static final UCPFilter CHAR=new UCPFilter() { @Override public boolean isAcceptable(int ucp) { return ((0<=ucp) && (ucp<=0xFFFF)); } }; /** * A filter for Unicode code points that are digits. */ public static final UCPFilter DIGIT=new UCPFilter() { @Override public boolean isAcceptable(int ucp) { return Character.isDigit(ucp); } }; /** * A filter for Unicode code points that are letters. */ public static final UCPFilter LETTER=new UCPFilter() { @Override public boolean isAcceptable(int ucp) { return Character.isLetter(ucp); } }; /** * A filter for Unicode code points that are letters or digits. */ public static final UCPFilter ALNUM=new UCPFilter() { @Override public boolean isAcceptable(int ucp) { return Character.isLetterOrDigit(ucp); } }; private static final HashMap<Charset,UCPFilter> mMap= new HashMap<Charset,UCPFilter>(); // CLASS METHODS. /** * Returns a filter for Unicode code points that can be encoded by * the given charset. * * @param cs The charset. * * @return The filter. */ public static UCPFilter forCharset (Charset cs) { synchronized (mMap) { UCPFilter filter=mMap.get(cs); if (filter!=null) { return filter; } filter=new UCPCharsetFilter(cs); mMap.put(cs,filter); return filter; } } /** * Returns a filter for Unicode code points that can be encoded by * the default JVM charset. * * @return The filter. */ public static final UCPFilter getDefaultCharset() { return UCPFilter.forCharset(Charset.defaultCharset()); } /** * Returns a filter for Unicode code points that can be encoded by * the current system file encoding/charset (as specified in the * system property <code>file.encoding</code>). * * @return The filter. */ public static final UCPFilter getFileSystemCharset() { return UCPFilter.forCharset (Charset.forName(System.getProperty ("file.encoding"))); //$NON-NLS-1$ } // INSTANCE METHODS. /** * Checks whether the given Unicode code point is acceptable to * the receiver. * * @param ucp The code point. * * @return True if so. */ public abstract boolean isAcceptable(int ucp); }