/*
* Copyright (C) 2011 Laurent Caillette
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.novelang.build.unicode;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Reads Unicode character names from the semicolon-separated data file {@value #RESOURCE_NAME}.
 * The file may contain duplicate keys; the first non-empty name found for a character wins.
 * Only code points written with exactly four hexadecimal digits are retained, other lines
 * are skipped.
 *
 * @author Laurent Caillette
 */
/*package*/ class UnicodeNamesTextReader {

  private static final Logger LOGGER = LoggerFactory.getLogger( UnicodeNamesTextReader.class ) ;

  /**
   * <a href="http://www.unicode.org/Public/5.2.0/ucd/UnicodeData.txt" >Unicode 5.2</a>
   */
  private static final String RESOURCE_NAME = "UnicodeData.txt" ;

  /**
   * Charset used to decode {@link #RESOURCE_NAME}, made explicit so the result does not depend
   * on the platform default.
   */
  private static final String RESOURCE_ENCODING = "UTF-8" ;

  /**
   * Characters allowed inside one field. There is no line terminator in there, so a match
   * can never span several lines.
   */
  private static final String DESCRIPTOR_TEXT = "(?:\\w| |-|/|,|<|>|\\(|\\))*" ;

  /** A field (with its trailing semicolon) which is skipped. */
  private static final String IGNORED_DESCRIPTOR = "(?:" + DESCRIPTOR_TEXT + ";)" ;

  /** A field (with its trailing semicolon) which is captured. */
  private static final String USEFUL_DESCRIPTOR = "(" + DESCRIPTOR_TEXT + ");" ;

  /**
   * Matches one complete line. Group 1 is the code point, group 2 is the second field
   * (the character name) and group 3 is the eleventh field (the legacy name, used for
   * control characters).
   * <p>
   * The {@code (?<!\w)} lookbehind forces the code to be a whole token. Without it,
   * {@link Matcher#find()} would match {@code D400;...} inside a line starting with
   * {@code 1D400;...} and the name of a supplementary character would end up attached
   * to an unrelated character of the Basic Multilingual Plane.
   */
  private static final Pattern PROPERTY_LINE_PATTERN =
      Pattern.compile(
          "(?<!\\w)([0-9A-Fa-f]{4});" +
          USEFUL_DESCRIPTOR +
          IGNORED_DESCRIPTOR + "{8}" +
          USEFUL_DESCRIPTOR +
          IGNORED_DESCRIPTOR + "{3}" +
          "(?:\\w*)"
      ) ;

  static {
    LOGGER.debug( "Crafted regex: {}", PROPERTY_LINE_PATTERN.pattern() ) ;
  }

  /**
   * Loads the whole content of {@link #RESOURCE_NAME}, resolved relatively to this class.
   *
   * @return the text of the resource.
   * @throws IOException if the resource cannot be found or cannot be read.
   */
  private static String readProperties() throws IOException {
    final URL resource = UnicodeNamesTextReader.class.getResource( RESOURCE_NAME ) ;
    if( resource == null ) {
      throw new IOException(
          "Could not find resource '" + RESOURCE_NAME + "' relative to " +
          UnicodeNamesTextReader.class.getName()
      ) ;
    }
    LOGGER.info( "Reading {}", resource.toExternalForm() ) ;
    final InputStream inputStream = resource.openStream() ;
    try {
      return IOUtils.toString( inputStream, RESOURCE_ENCODING ) ;
    } finally {
      // Always release the stream, even if reading failed.
      inputStream.close() ;
    }
  }

  /**
   * Reads {@link #RESOURCE_NAME} and extracts the character names it contains.
   *
   * @return an immutable map from character to name.
   * @throws IOException if the resource cannot be found or cannot be read.
   */
  public Map< Character, String > loadNames() throws IOException {
    return extractNames( readProperties() ) ;
  }

  /**
   * Extracts character names from text made of lines matching {@link #PROPERTY_LINE_PATTERN}.
   * Lines which don't match (including those with a code of more or less than four
   * hexadecimal digits) are skipped. When the name field is {@code <control>} the eleventh
   * field is used instead. Empty names are dropped, and only the first non-empty name
   * found for a given character is kept.
   *
   * @param names a non-null text with one character definition per line.
   * @return an immutable map from character to name.
   * @throws IOException never thrown by this implementation, kept for compatibility
   *     with existing callers.
   */
  /*package*/ Map< Character, String > extractNames( final String names ) throws IOException {
    final Map< Character, String > characterToNameMap =
        Maps.newHashMapWithExpectedSize( 256 * 256 ) ;
    final Matcher matcher = PROPERTY_LINE_PATTERN.matcher( names ) ;
    while( matcher.find() ) {
      final String casualName = matcher.group( 2 ) ;
      final String name = "<control>".equals( casualName ) ? matcher.group( 3 ) : casualName ;
      // The pattern guarantees exactly four hexadecimal digits, so this fits in a char.
      final char character = ( char ) Integer.parseInt( matcher.group( 1 ), 16 ) ;
      // Retain first definition, seems that most interesting appear first.
      if( ! "".equals( name ) && ! characterToNameMap.containsKey( character ) ) {
        characterToNameMap.put( character, name ) ;
      }
    }
    return ImmutableMap.copyOf( characterToNameMap ) ;
  }
}