/* * Copyright (C) 2013-2017 NTT DATA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, * either express or implied. See the License for the specific language * governing permissions and limitations under the License. */ package org.terasoluna.gfw.common.codepoints; import java.io.Serializable; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; /** * Represents the collection of code point. This class holds immutable code points as {@link java.util.Set} and provides * <ul> * <li>check method if the code points in the given string are included</li> * <li>set operations (union, subtract, intersect)</li> * </ul> * <h3>How to create an instance</h3> Use Factory method to create a cached instance * * <pre> * <code>CodePoints cp = CodePoints.of(ASCIIPrintableChars.class);</code> * </pre> * * The constructor can be also used. In this case, of course, the set of code points are not cached and created every time. * * <pre> * <code>CodePoints cp = new ASCIIPrintableChars();</code> * </pre> * * There are three types of constructor: * <ol> * <li>Pass {@code int} varargs * * <pre> * <code>CodePoints cp = new CodePoints(0x0061, 0x0062); // a b</code> * </pre> * * </li> * <li>Pass {@link java.util.Collection} of {@link java.lang.Integer} * * <pre> * <code>{@literal Set<Integer>} set = new {@literal HashSet<>}(); * set.add(0x0061); // a * set.add(0x0062); // b * CodePoints cp = new CodePoints(set);</code> * </pre> * * </li> * <li>Pass {@link java.lang.String} varargs including the target code points * * <pre> * <code>CodePoints cp = new CodePoints("ab"); * CodePoints cp = new CodePoints("a", "b"); // is same </code> * </pre> * * </li> * <li>Pass existing {@link CodePoints}. This type is intended to use for the definition of new code points. The set in the * {@link CodePoints} are shared. * * <pre> * <code>CodePoints cp = ...; * CodePoints newCp = new CodePoints(cp); </code> * </pre> * * </li> * </ol> * <h3>How to check strings</h3> {@link #containsAll(String)} returns {@code true} if all code points in the given string are * included in the target code points. Otherwise {@code false} is returned. * * <pre> * <code>CodePoints cp = new CodePoints(0x0061, 0x0062); // a b * cp.containsAll("a"); // true * cp.containsAll("b"); // true * cp.containsAll("ab"); // true * cp.containsAll("c"); // false * cp.containsAll("abc"); // false * </code> * </pre> * * {@link #firstExcludedCodePoint(String)} return the first code point in the given string which is not included in the target * code points. * * <pre> * <code>CodePoints cp = new CodePoints(0x0061, 0x0062); // a b * cp.firstExcludedContPoint("abc"); // 0x0063 (c) * cp.firstExcludedContPoint("abcad"); // 0x0063 (c) * cp.firstExcludedContPoint("ab"); // CodePoints#NOT_FOUND * </code> * </pre> * * {@link #allExcludedCodePoints(String)} returns set of code points in the given string which are not not included in the * target. * * <pre> * <code>CodePoints cp = new CodePoints(0x0061, 0x0062); // a b * cp.allExcludedCodePoints("abc"); // [0x0063 (c)] * cp.allExcludedCodePoints("abcad"); // [0x0063 (c), 0x0064 (d)] * cp.allExcludedCodePoints("ab"); // [] * </code> * </pre> * * <h3>How to compose code points</h3> * <p> * {@code CodePoints} provides composable APIs. Since a {@code CodePoints} instance is immutable. These API does not effect the * state of {@code CodePoints} instances. * </p> * <h4>Union</h4> * <p> * Use {@link #union(CodePoints)} * </p> * * <pre> * <code>CodePoints ab = new CodePoints(0x0061 , 0x0062); // a b * CodePoints cd = new CodePoints(0x0063, 0x0064); // c d * CodePoints abcd = ab.union(cd); // a b c d</code> * </pre> * * <h4>Subtract</h4> * <p> * Use {@link #subtract(CodePoints)} * </p> * * <pre> * <code>CodePoints abcd = new CodePoints(0x0061 , 0x0062, 0x0063, 0x0064); // a b c d * CodePoints cd = new CodePoints(0x0063, 0x0064); // c d * CodePoints ab = abcd.subtract(cd); // a b</code> * </pre> * * <h4>Intersect</h4> * <p> * Use {@link #intersect(CodePoints)} * </p> * * <pre> * <code>CodePoints abcd = new CodePoints(0x0061 , 0x0062, 0x0063, 0x0064); // a b c d * CodePoints cde = new CodePoints(0x0063, 0x0064, 0x0064 ); // c d e * CodePoints cd = abcd.intersect(cde); // c d</code> * </pre> * * <h3>How to define new code points</h3> * <p> * Extend {@link CodePoints} to define new code points. Following is a simple code points: * </p> * * <pre> * <code>public class ABCD extends CodePoints { * public ABCD() { * super(0x0061, 0x0062, 0x0063, 0x0064); // a b c d * } * }</code> * </pre> * <p> * New code points can be created using the combination of existing code points. * </p> * * <pre> * <code>public class X_JIS_0208_Hiragana_Katakana extends CodePoints { * public X_JIS_0208_Hiragana_Katakana() { * super(new X_JIS_0208_Hiragana().union(new X_JIS_0208_Hiragana_Katakana())); * } * }</code> * </pre> * <p> * Not that, <code>new</code> is used not to cache temporary code points. If {@code X_JIS_0208_Hiragana} and * {@code X_JIS_0208_Hiragana_Katakana} are also intended to be used, use {@link #of(Class)} instead of {@code new} so that * these are cached: * </p> * * <pre> * <code>public class X_JIS_0208_Hiragana_Katakana extends CodePoints { * public X_JIS_0208_Hiragana_Katakana() { * super(CodePoints.of(X_JIS_0208_Hiragana.class).union(CodePoints.of(X_JIS_0208_Hiragana_Katakana.class))); * } * }</code> * </pre> * @since 5.1.0 */ public class CodePoints implements Serializable { /** * shows no code point is found in the given string which is not included in the target code points. */ public static final int NOT_FOUND = Integer.MIN_VALUE; /** * {@code CodePoints} cache */ private static final ConcurrentMap<Class<? extends CodePoints>, CodePoints> cache = new ConcurrentHashMap<Class<? extends CodePoints>, CodePoints>(); /** * set for code points. */ private final Set<Integer> set; /** * Constructor with the given {@code java.lang.Integer} code points * @param codePoints array of actual code points */ public CodePoints(Integer... codePoints) { Set<Integer> s = new HashSet<Integer>(codePoints.length); Collections.addAll(s, codePoints); this.set = Collections.unmodifiableSet(s); } /** * Constructor with the given {@code java.lang.String} * @param strings array of strings which include target code points */ public CodePoints(String... strings) { Set<Integer> s = new HashSet<Integer>(); for (String str : strings) { int len = str.length(); int codePoint; for (int i = 0; i < len; i += Character.charCount(codePoint)) { codePoint = str.codePointAt(i); s.add(codePoint); } } this.set = Collections.unmodifiableSet(s); } /** * Constructor with the given {@code java.lang.Integer} code points * @param codePoints collection of actual code points */ public CodePoints(Collection<Integer> codePoints) { Set<Integer> s = new HashSet<Integer>(codePoints); this.set = Collections.unmodifiableSet(s); } /** * Constructor with the given {@code CodePoints}. The {@code java.util.Set} object inside {@code CodePoints} is shared. * @param codePoints actual code points */ public CodePoints(CodePoints codePoints) { this.set = codePoints.set; } /** * returns whether all code points in the given string are included in the target code points. * @param s target string * @return {@code true} if all code points in the given string are included in the target code points。Otherwise * {@code false} is returned. */ public boolean containsAll(String s) { return this.firstExcludedCodePoint(s) == NOT_FOUND; } /** * returns the first code point in the given string which is not included in the target code points. * @param s target string * @return first code point in the given string which is not included in the target code points. {@link #NOT_FOUND} is * returned if all code points in the given string are included in the target code points. */ public int firstExcludedCodePoint(String s) { if (s == null || s.isEmpty()) { return NOT_FOUND; } // http://www.ibm.com/developerworks/jp/ysl/library/java/j-unicode_surrogate/ int len = s.length(); int codePoint; for (int i = 0; i < len; i += Character.charCount(codePoint)) { codePoint = s.codePointAt(i); if (!set.contains(codePoint)) { return codePoint; } } return NOT_FOUND; } /** * returns set of code points in the given string which are not not included in the target. * @param s target string * @return set of code points in the given string which are not not included in the target. an empty set is returned if all * code points in the given string are included in the target code points. */ public Set<Integer> allExcludedCodePoints(String s) { if (s == null || s.isEmpty()) { return Collections.emptySet(); } Set<Integer> excludedCodePoints = new LinkedHashSet<Integer>(); // http://www.ibm.com/developerworks/jp/ysl/library/java/j-unicode_surrogate/ int len = s.length(); Integer codePoint; for (int i = 0; i < len; i += Character.charCount(codePoint)) { codePoint = s.codePointAt(i); if (!set.contains(codePoint)) { excludedCodePoints.add(codePoint); } } return excludedCodePoints; } /** * unite two set of code points * @param codePoints code points to unite * @return united code points */ public CodePoints union(CodePoints codePoints) { Set<Integer> setTmp = new HashSet<Integer>(this.set); setTmp.addAll(codePoints.set); return new CodePoints(setTmp); } /** * subtract two set of code points * @param codePoints code points to subtract * @return subtracted code points */ public CodePoints subtract(CodePoints codePoints) { Set<Integer> setTmp = new HashSet<Integer>(this.set); setTmp.removeAll(codePoints.set); return new CodePoints(setTmp); } /** * intersect two set of code points * @param codePoints code points to intersect * @return intersected code points */ public CodePoints intersect(CodePoints codePoints) { Set<Integer> setTmp = new HashSet<Integer>(this.set); setTmp.retainAll(codePoints.set); return new CodePoints(setTmp); } /** * Produces cached {@link CodePoints}. At first time, a new {@link CodePoints} is created. After second time, same instance * is returned. * @param clazz {@link CodePoints} class to create * @param <T> {@link CodePoints} class * @return cached instance */ @SuppressWarnings("unchecked") public static <T extends CodePoints> T of(Class<T> clazz) { if (cache.containsKey(clazz)) { return (T) cache.get(clazz); } try { T codePoints = clazz.newInstance(); cache.put(clazz, codePoints); return codePoints; } catch (InstantiationException e) { throw new IllegalArgumentException("exception occurred while initializing", e); } catch (IllegalAccessException e) { throw new IllegalArgumentException("public default constructor not found", e); } } /** * Helper method to check whether all code points in the given string are included in any of the code points list. * @param s target string * @param codePointsList * @return {@code true} if all code points in the given string are included in any of the code points list. Otherwise * {@code false} is returned. */ public static boolean containsAllInAnyCodePoints(String s, final CodePoints... codePointsList) { Map<Integer, Integer> excludedCounts = new HashMap<Integer, Integer>(); for (CodePoints codePoints : codePointsList) { Set<Integer> excluded = codePoints.allExcludedCodePoints(s); if (excluded.isEmpty()) { // return immediately if the given string consists of a code points. return true; } for (Integer codePoint : excluded) { // count the number of CodePoints in the given list which forbade the given code point Integer count = excludedCounts.get(codePoint); if (count != null) { excludedCounts.put(codePoint, count + 1); } else { excludedCounts.put(codePoint, 1); } } } for (Map.Entry<Integer, Integer> entry : excludedCounts.entrySet()) { if (entry.getValue() == codePointsList.length) { // All CodePoints forbade the given code point. // This means there are some code points which are not included in any given CodePoints' list return false; } } // OK if each code point is included in some CodePoints' list return true; } /** * equals method * @param o object to check * @return {@code true} if the given object equals to this instance. {@code false} otherwise. */ @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } CodePoints that = (CodePoints) o; return set.equals(that.set); } /** * hash code of the instance * @return hash code */ @Override public int hashCode() { return set.hashCode(); } }