/*
* aitools utilities
* Copyright (C) 2006 Noel Bush
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
package org.aitools.util;
import java.io.File;
import java.io.PrintWriter;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.aitools.util.resource.Filesystem;
import org.aitools.util.runtime.DeveloperError;
import org.exolab.javasource.JAnnotation;
import org.exolab.javasource.JAnnotationType;
import org.exolab.javasource.JClass;
import org.exolab.javasource.JComment;
import org.exolab.javasource.JMethod;
import org.exolab.javasource.JSourceWriter;
/**
 * Provides some useful Unicode functionality that doesn't seem to be available elsewhere.
 * <p>
 * The lookup methods cache their results in static, unsynchronized maps, and they return the cached list instances
 * themselves; callers should treat the returned lists as read-only.
 * </p>
 *
 * @author <a href="mailto:noel@aitools.org">Noel Bush</a>
 */
public class Unicode {

    /**
     * The longest string literal, in bytes, that will be produced in the generated test methods. Segment sizes are
     * measured in the modified UTF-8 form used for class-file string constants (see {@link #modifiedUtf8Length(char)}).
     */
    private static final int MAX_STRING_LEN = 65535;

    /** A cache of already-determined Unicode block contents. */
    private static final Map<String, List<char[]>> BLOCKS = new HashMap<String, List<char[]>>();

    /** A cache of already-determined letter contents of Unicode blocks. */
    private static final Map<String, List<char[]>> LETTERS = new HashMap<String, List<char[]>>();

    /** A cache of already-determined uppercase letter contents of Unicode blocks. */
    private static final Map<String, List<char[]>> UPPERCASE_LETTERS = new HashMap<String, List<char[]>>();

    /** A cache of already-determined lowercase letter contents of Unicode blocks. */
    private static final Map<String, List<char[]>> LOWERCASE_LETTERS = new HashMap<String, List<char[]>>();

    /** The set of all available Unicode blocks. */
    private static final Set<Character.UnicodeBlock> UNICODE_BLOCKS = findUnicodeBlocks();

    /** {@link Character#isUpperCase(char)}. */
    private static final Method CHAR_IS_UPPERCASE_METHOD = characterPredicate("isUpperCase", char.class);

    /** {@link Character#isUpperCase(int)}. */
    private static final Method INT_IS_UPPERCASE_METHOD = characterPredicate("isUpperCase", int.class);

    /** {@link Character#isLowerCase(char)}. */
    private static final Method CHAR_IS_LOWERCASE_METHOD = characterPredicate("isLowerCase", char.class);

    /** {@link Character#isLowerCase(int)}. */
    private static final Method INT_IS_LOWERCASE_METHOD = characterPredicate("isLowerCase", int.class);

    /** {@link Character#isLetter(char)}. */
    private static final Method CHAR_IS_LETTER_METHOD = characterPredicate("isLetter", char.class);

    /** {@link Character#isLetter(int)}. */
    private static final Method INT_IS_LETTER_METHOD = characterPredicate("isLetter", int.class);

    /** The annotation type placed on every generated test method. */
    private static final JAnnotationType TEST_ANNOTATION_TYPE = new JAnnotationType("Test");

    /** The annotation type used to silence the "static-method" warning on generated test methods. */
    private static final JAnnotationType SUPPRESS_WARNINGS_ANNOTATION_TYPE = new JAnnotationType("SuppressWarnings");

    /**
     * Determines the set of all Unicode blocks known to this JVM. The private registry inside
     * {@link Character.UnicodeBlock} is tried first; if it cannot be read, the set is rebuilt by asking for the block
     * of every code point, so that this class stays usable either way.
     *
     * @return the set of known blocks (never null)
     */
    private static Set<Character.UnicodeBlock> findUnicodeBlocks() {
        Set<Character.UnicodeBlock> blocks = readUnicodeBlockRegistry();
        if (blocks != null) {
            return blocks;
        }
        // Fallback: a block that no code point maps to cannot be discovered this way.
        blocks = new HashSet<Character.UnicodeBlock>();
        for (int codePoint = Character.MIN_CODE_POINT; codePoint <= Character.MAX_CODE_POINT; codePoint++) {
            Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
            if (block != null) {
                blocks.add(block);
            }
        }
        return blocks;
    }

    /**
     * Reads the values of the private static <code>map</code> field of {@link Character.UnicodeBlock} via reflection.
     *
     * @return the distinct blocks found in the registry, or null if the registry could not be read
     */
    @SuppressWarnings("unchecked")
    private static Set<Character.UnicodeBlock> readUnicodeBlockRegistry() {
        try {
            Field mapField = Character.UnicodeBlock.class.getDeclaredField("map");
            mapField.setAccessible(true);
            Object value = mapField.get(null);
            if (value instanceof Map) {
                return new HashSet<Character.UnicodeBlock>(((Map<String, Character.UnicodeBlock>) value).values());
            }
        }
        catch (NoSuchFieldException ignored) {
            // The field does not exist on this JVM; the caller falls back to scanning code points.
        }
        catch (IllegalAccessException ignored) {
            // Access was refused; the caller falls back to scanning code points.
        }
        catch (RuntimeException ignored) {
            // Covers SecurityException and any unchecked refusal thrown by setAccessible(); same fallback applies.
        }
        return null;
    }

    /**
     * Looks up a public one-argument method of {@link Character}.
     *
     * @param name the method name
     * @param parameterType the type of the single parameter
     * @return the method, or null if it could not be obtained (an assertion fails first when assertions are enabled)
     */
    private static Method characterPredicate(String name, Class<?> parameterType) {
        try {
            return Character.class.getMethod(name, parameterType);
        }
        catch (SecurityException e) {
            assert false : "Denied access to well-known method of Character.";
        }
        catch (NoSuchMethodException e) {
            assert false : "Well-known method of Character does not exist.";
        }
        return null;
    }

    /**
     * Marks a generated method as a test and suppresses the "static-method" warning on it.
     *
     * @param method the generated method to annotate
     */
    private static void addAnnotations(JMethod method) {
        method.addAnnotation(new JAnnotation(TEST_ANNOTATION_TYPE));
        JAnnotation suppressStaticMethodWarning = new JAnnotation(SUPPRESS_WARNINGS_ANNOTATION_TYPE);
        suppressStaticMethodWarning.setValue("\"static-method\"");
        method.addAnnotation(suppressStaticMethodWarning);
    }

    /**
     * Adds a test asserting that the named lookup returns exactly the given characters for the given block.
     *
     * @param clazz the class to which the test method is added
     * @param characterType the middle part of the lookup method name (e.g. "Letters")
     * @param blockName the name of the Unicode block
     * @param characters the characters the lookup is expected to return
     */
    private static void addTestAllMethod(JClass clazz, String characterType, String blockName, List<char[]> characters) {
        JMethod method = new JMethod(String.format("testAll%sIn%s", characterType, blockName));
        addAnnotations(method);
        // Avoid making quoted strings longer than MAX_STRING_LEN bytes.
        String quotedCharacters = quoteInSegments(Text.mergeCharArrays(characters), MAX_STRING_LEN);
        method.setSourceCode(String.format("assertEquals(Text.mergeCharArrays(Unicode.all%sIn(\"%s\")), %s);",
                characterType, blockName, quotedCharacters));
        clazz.addMember(method);
    }

    /**
     * Renders text as one or more double-quoted string literals joined by <code>+</code>, starting a new literal
     * whenever the current one would exceed <code>maxBytes</code> bytes of modified UTF-8. Splits only happen between
     * code points, so a surrogate pair is never separated and the characters are copied through unchanged.
     * <p>
     * NOTE(review): javac is expected to fold a concatenation of string literals back into a single constant, so the
     * generated source may still be rejected for very large blocks -- confirm against the largest generated test.
     * </p>
     *
     * @param text the text to quote (not escaped; the caller supplies text that needs no escaping)
     * @param maxBytes the largest encoded size allowed for a single literal
     * @return the quoted source expression
     */
    private static String quoteInSegments(String text, int maxBytes) {
        StringBuilder quoted = new StringBuilder(text.length() + 2);
        quoted.append('"');
        int segmentBytes = 0;
        int index = 0;
        while (index < text.length()) {
            int charCount = Character.charCount(text.codePointAt(index));
            int encodedLength = 0;
            for (int offset = 0; offset < charCount; offset++) {
                encodedLength += modifiedUtf8Length(text.charAt(index + offset));
            }
            // Never emit an empty segment: a single code point always fits, whatever the limit.
            if (segmentBytes > 0 && segmentBytes + encodedLength > maxBytes) {
                quoted.append("\" + \n\"");
                segmentBytes = 0;
            }
            quoted.append(text, index, index + charCount);
            segmentBytes += encodedLength;
            index += charCount;
        }
        quoted.append('"');
        return quoted.toString();
    }

    /**
     * Returns the number of bytes one UTF-16 code unit occupies in modified UTF-8, in which U+0000 takes two bytes
     * and each half of a surrogate pair is encoded separately as three bytes.
     *
     * @param unit the UTF-16 code unit
     * @return 1, 2 or 3
     */
    private static int modifiedUtf8Length(char unit) {
        if (unit >= 0x0001 && unit <= 0x007F) {
            return 1;
        }
        if (unit <= 0x07FF) {
            return 2;
        }
        return 3;
    }

    /**
     * Adds either an "all characters" test or a "zero characters" test, depending on whether the block has letters.
     *
     * @param clazz the class to which the test method is added
     * @param characterType the middle part of the lookup method name
     * @param blockName the name of the Unicode block
     * @param characters the characters the lookup is expected to return
     * @param letterCount the number of letters in the block
     */
    private static void addTestMethod(JClass clazz, String characterType, String blockName, List<char[]> characters,
            int letterCount) {
        if (letterCount > 0) {
            addTestAllMethod(clazz, characterType, blockName, characters);
        }
        else {
            addTestZeroMethod(clazz, characterType, blockName);
        }
    }

    /**
     * Adds a test asserting that the named lookup returns nothing for the given block.
     *
     * @param clazz the class to which the test method is added
     * @param characterType the middle part of the lookup method name
     * @param blockName the name of the Unicode block
     */
    private static void addTestZeroMethod(JClass clazz, String characterType, String blockName) {
        JMethod method = new JMethod(String.format("testZero%sIn%s", characterType, blockName));
        addAnnotations(method);
        method.setSourceCode(String.format("assertTrue(Unicode.all%sIn(\"%s\").size() == 0);", characterType,
                blockName));
        clazz.addMember(method);
    }

    /**
     * Returns a list containing every defined character that is a member of the named Unicode block. Each element
     * holds the UTF-16 form of one code point (one or two chars). Results are cached per block name.
     *
     * @param blockName the name of the block, as understood by {@link Character.UnicodeBlock#forName(String)}
     * @return a list of char arrays
     * @throws IllegalArgumentException if <code>blockName</code> is not a valid block name
     */
    public static List<char[]> allCharactersIn(String blockName) {
        if (BLOCKS.containsKey(blockName)) {
            return BLOCKS.get(blockName);
        }
        // Will throw IllegalArgumentException if blockName is not valid.
        Character.UnicodeBlock block = Character.UnicodeBlock.forName(blockName);
        List<char[]> characters = new ArrayList<char[]>();
        for (int codePoint = Character.MIN_CODE_POINT; codePoint <= Character.MAX_CODE_POINT; codePoint++) {
            // UnicodeBlock.of() may return null, so the known non-null block is the receiver of equals().
            if (Character.isDefined(codePoint) && block.equals(Character.UnicodeBlock.of(codePoint))) {
                characters.add(Character.toChars(codePoint));
            }
        }
        BLOCKS.put(blockName, characters);
        return characters;
    }

    /**
     * Returns a list containing every letter that is a member of the named Unicode block.
     *
     * @param blockName the name of the block
     * @return a list of char arrays
     */
    public static List<char[]> allLettersIn(String blockName) {
        return allQualifyingCharactersIn(blockName, LETTERS, CHAR_IS_LETTER_METHOD, INT_IS_LETTER_METHOD, false);
    }

    /**
     * Returns a list containing every lowercase character that is a member of the named Unicode block.
     * <b>However</b>, if the block does not contain <i>any</i> lowercase characters, then it is likely that this is a
     * block for which case folding is not an operative concept; in such cases, this will return all characters which
     * are letters.
     *
     * @param blockName the name of the block
     * @return a list of char arrays
     */
    public static List<char[]> allLowercaseCharactersIn(String blockName) {
        return allQualifyingCharactersIn(blockName, LOWERCASE_LETTERS, CHAR_IS_LOWERCASE_METHOD,
                INT_IS_LOWERCASE_METHOD, true);
    }

    /**
     * Returns a list containing every "qualifying" character that is a member of the named Unicode block.
     * "Qualifying" is determined by the supplied methods. The candidates are all characters of the block when the
     * letter-testing methods of this class are supplied, and all letters of the block otherwise. If
     * <code>includeAllIfNoneQualify</code> is true and no candidate qualifies, all candidates are returned instead.
     * The result is stored in <code>caseMap</code> and returned from there on later calls.
     *
     * @param blockName the name of the block of characters
     * @param caseMap the map of blockNames to lists of qualifying characters
     * @param charQualifies the method that will return a boolean indicating whether a character qualifies
     * @param intQualifies the method that will return a boolean indicating whether a codepoint (int) qualifies
     * @param includeAllIfNoneQualify whether to include all candidates in the result if none qualify
     * @return a list of char arrays
     */
    public static List<char[]> allQualifyingCharactersIn(String blockName, Map<String, List<char[]>> caseMap,
            Method charQualifies, Method intQualifies, boolean includeAllIfNoneQualify) {
        if (caseMap.containsKey(blockName)) {
            return caseMap.get(blockName);
        }
        List<char[]> wholeSet;
        if (charQualifies == CHAR_IS_LETTER_METHOD && intQualifies == INT_IS_LETTER_METHOD) {
            wholeSet = allCharactersIn(blockName);
        }
        else {
            wholeSet = allLettersIn(blockName);
        }
        List<char[]> qualifiers = new ArrayList<char[]>();
        for (char[] candidate : wholeSet) {
            if (qualifies(candidate, charQualifies, intQualifies)) {
                qualifiers.add(candidate);
            }
        }
        List<char[]> result = (qualifiers.isEmpty() && includeAllIfNoneQualify) ? wholeSet : qualifiers;
        // Cache empty results too, so that a block with no qualifying characters is not re-examined on every call.
        caseMap.put(blockName, result);
        return result;
    }

    /**
     * Applies the matching qualifier method to one candidate: the char version for a single UTF-16 unit, the int
     * version for a surrogate pair.
     *
     * @param candidate the UTF-16 form of one code point
     * @param charQualifies the qualifier taking a char
     * @param intQualifies the qualifier taking a code point
     * @return whether the candidate qualifies; false if the candidate has neither one nor two chars
     */
    @SuppressWarnings("boxing")
    private static boolean qualifies(char[] candidate, Method charQualifies, Method intQualifies) {
        try {
            if (candidate.length == 1) {
                return ((Boolean) charQualifies.invoke(Character.class, candidate[0])).booleanValue();
            }
            if (candidate.length == 2) {
                return ((Boolean) intQualifies.invoke(Character.class,
                        Character.toCodePoint(candidate[0], candidate[1]))).booleanValue();
            }
            return false;
        }
        catch (IllegalArgumentException e) {
            throw new DeveloperError(String.format(
                    "Did not provide meaningful arguments to qualifies method \"%s\" or \"%s\".",
                    charQualifies.getName(), intQualifies.getName()), e);
        }
        catch (IllegalAccessException e) {
            throw new DeveloperError(String.format("Qualifies method \"%s\" or \"%s\" is not accessible.",
                    charQualifies.getName(), intQualifies.getName()), e);
        }
        catch (InvocationTargetException e) {
            throw new DeveloperError(String.format("Qualifies method \"%s\" or \"%s\" threw an exception.",
                    charQualifies.getName(), intQualifies.getName()), e);
        }
    }

    /**
     * Returns a list containing every uppercase character that is a member of the named Unicode block.
     * <b>However</b>, if the block does not contain <i>any</i> uppercase characters, then it is likely that this is a
     * block for which case folding is not an operative concept; in such cases, this will return all characters which
     * are letters.
     *
     * @param blockName the name of the block
     * @return a list of char arrays
     */
    public static List<char[]> allUppercaseCharactersIn(String blockName) {
        return allQualifyingCharactersIn(blockName, UPPERCASE_LETTERS, CHAR_IS_UPPERCASE_METHOD,
                INT_IS_UPPERCASE_METHOD, true);
    }

    /**
     * This is a perhaps ridiculous set of tests. Obviously it is just going to test whether the Unicode blocks on the
     * testing machine's JVM are the same as on the compiling machine's.
     *
     * @param argv one argument, the directory in which to write the UnicodeTest.java file
     * @throws IllegalArgumentException if the directory argument is missing
     */
    public static void main(String[] argv) {
        if (argv == null || argv.length < 1) {
            throw new IllegalArgumentException(
                    "Expected one argument: the directory in which to write UnicodeTest.java.");
        }
        JClass clazz = new JClass("org.aitools.util.UnicodeTest");
        JComment header = new JComment(JComment.HEADER_STYLE);
        header.setComment(License.TEXT);
        clazz.setHeader(header);
        clazz.addImport("static org.junit.Assert.*");
        clazz.addImport("org.junit.Test");
        // Sort all the blocks alphabetically.
        Map<String, Character.UnicodeBlock> sortedBlocks = new TreeMap<String, Character.UnicodeBlock>();
        for (Character.UnicodeBlock block : UNICODE_BLOCKS) {
            sortedBlocks.put(block.toString(), block);
        }
        // Create tests for every block.
        for (Character.UnicodeBlock block : sortedBlocks.values()) {
            String blockName = block.toString();
            List<char[]> letters = allLettersIn(blockName);
            int letterCount = letters.size();
            addTestMethod(clazz, "Letters", blockName, letters, letterCount);
            addTestMethod(clazz, "UppercaseCharacters", blockName, allUppercaseCharactersIn(blockName), letterCount);
            addTestMethod(clazz, "LowercaseCharacters", blockName, allLowercaseCharactersIn(blockName), letterCount);
        }
        PrintWriter out = Filesystem.checkOrCreatePrintWriter(argv[0] + File.separator + "UnicodeTest.java",
                "Unicode test file");
        JSourceWriter writer = new JSourceWriter(out);
        try {
            clazz.print(writer);
        }
        finally {
            // Release the file even if printing fails part-way.
            writer.close();
            out.close();
        }
    }
}