/* * Copyright (C) 2011 Laurent Caillette * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation, either * version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.novelang.build.unicode; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.charset.Charset; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.Set; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.commons.lang.ClassUtils; import org.apache.commons.lang.SystemUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.novelang.build.CodeGenerationTools; /** * Generates a file containing every Unicode name. * <ul> * <li>First 4 bytes: n, the number of character in the offset table. * <li>n * 4 bytes: the offsets of the names (from the start of the file). * Offsets are 32-bit, unsigned ints. * <li>8-bit characters for names, zero-terminated. * </ul> * * @author Laurent Caillette */ public class UnicodeNamesGenerator { private static final Logger LOGGER = LoggerFactory.getLogger( UnicodeNamesGenerator.class ) ; private final File targetFile ; private static final int UNSIGNED_MAX_16BIT = 256 * 256; public UnicodeNamesGenerator( final String packageName, final String namesFile, final File targetDirectory ) throws IOException { this.targetFile = CodeGenerationTools.resolveTargetFile( targetDirectory, packageName, namesFile ) ; if( targetFile.getParentFile().mkdirs() ) { LOGGER.info( "Created '" + targetDirectory.getAbsolutePath() + "'" ) ; } } public void generate() throws IOException { LOGGER.info( "About to generate into '" + targetFile.getAbsolutePath() + "'..." ) ; if( targetFile.exists() ) { if( targetFile.delete() ) { LOGGER.info( "Deleted '" + targetFile.getAbsolutePath() + "'" ) ; } } if( ! targetFile.createNewFile() ) { throw new IOException( "Could not create '" + targetFile.getAbsolutePath() + "'" ) ; } LOGGER.info( "Loading names..." ) ; final Map< Character, String > characters = new UnicodeNamesTextReader().loadNames() ; final OutputStream outputStream = new FileOutputStream( targetFile ) ; LOGGER.info( "Generating indexed file..." ) ; generate( new BufferedOutputStream( outputStream, 640 * 1024 ), characters ) ; outputStream.close() ; } /** * Generates the offset table and the names. * * @param outputStream not flushed. * @param characterNames a Map with characters having contiguous codes that start by 0. */ public static void generate( final OutputStream outputStream, final Map< Character, String > characterNames ) throws IOException { final Set< Character > characters = characterNames.keySet() ; final List< Character > characterList = Lists.newArrayList( characters ) ; Collections.sort( characterList, CHARACTER_COMPARATOR /* Needed? */ ) ; final int lastCharacterIndex = characterList.get( characterList.size() - 1 ) ; generate( outputStream, characterNames, lastCharacterIndex + 1 ) ; } /** * Generates the offset table and the names. * * @param outputStream not flushed. * @param characterNames a Map with characters having contiguous codes that start by 0. */ public static void generate( final OutputStream outputStream, final Map< Character, String > characterNames, final int totalCharacterCount ) throws IOException { Preconditions.checkArgument( totalCharacterCount <= UNSIGNED_MAX_16BIT ) ; final Map< Integer, Integer > offsetsFromFirstName = Maps.newHashMapWithExpectedSize( totalCharacterCount ) ; final Map< Character, byte[] > characterNamesAsBytes = calculateCharacterNamesAsBytes( characterNames ) ; // Find the offset of the name of each character. int writePositionFromFirstName = 0 ; int characterCount = 0 ; for( int characterIndex = 0 ; characterIndex < totalCharacterCount ; characterIndex ++ ) { final Character character = ( char ) characterIndex ; if( characterNames.containsKey( character ) ) { offsetsFromFirstName.put( characterIndex, writePositionFromFirstName ) ; writePositionFromFirstName += characterNamesAsBytes.get( character ).length + // Real length. 1 // Terminal zero. ; characterCount ++ ; } else { offsetsFromFirstName.put( characterIndex, null ) ; } } LOGGER.debug( "Found " + characterCount + " characters." ) ; // Write character count. outputStream.write( asBytes( totalCharacterCount ) ) ; // Write offsets. final int offsetTableSize = totalCharacterCount * 4 ; for( int characterIndex = 0 ; characterIndex < totalCharacterCount ; characterIndex ++ ) { final byte[] bytes ; final Integer value = offsetsFromFirstName.get( characterIndex ) ; if( value == null ) { bytes = ZERO_OFFSET ; } else { bytes = asBytes( 4 + offsetTableSize + value ) ; } outputStream.write( bytes ) ; } // Write names. for( int characterIndex = 0 ; characterIndex < totalCharacterCount ; characterIndex ++ ) { final byte[] nameBytes = characterNamesAsBytes.get( ( char ) characterIndex ) ; if( nameBytes != null ) { outputStream.write( nameBytes ) ; outputStream.write( TERMINAL_ZERO ) ; } } outputStream.flush() ; LOGGER.debug( "Generation complete." ) ; } /** * Getting bytes only once speeds generation up a lot. */ private static Map< Character, byte[] > calculateCharacterNamesAsBytes( final Map<Character, String> characterNames ) { final Map< Character, byte[] > map = Maps.newHashMapWithExpectedSize( characterNames.size() ) ; for( final Map.Entry< Character, String > entry : characterNames.entrySet() ) { map.put( entry.getKey(), entry.getValue().replace( ' ', '_' ).getBytes( CHARSET ) ) ; } return map ; } private static final Charset CHARSET = Charset.forName( "UTF-8" ) ; private static final byte[] TERMINAL_ZERO = { 0 } ; private static final byte[] ZERO_OFFSET = { 0, 0, 0, 0 } ; private static final Comparator< Character > CHARACTER_COMPARATOR = new Comparator< Character >() { @Override public int compare( final Character c1, final Character c2 ) { return ( ( int ) c1.charValue() ) - ( ( int ) c2.charValue() ) ; } } ; /*package*/ static byte[] asBytes( final int i ) { final byte[] bytes = new byte[ 4 ] ; bytes[ 0 ] = ( byte ) ( i >>> 24 ) ; bytes[ 1 ] = ( byte ) ( i >>> 16 ) ; bytes[ 2 ] = ( byte ) ( i >>> 8 ) ; bytes[ 3 ] = ( byte ) ( i & 0x000000FF ) ; return bytes ; } // ============================================== // Main, supports no arg for interactive testing. // ============================================== public static void main( final String[] args ) throws IOException { final File targetDirectory ; if( args.length == 0 ) { final File projectDirectory = SystemUtils.USER_DIR.endsWith( "idea" ) ? new File( SystemUtils.USER_DIR ).getParentFile() : new File( SystemUtils.USER_DIR ) ; targetDirectory = new File( projectDirectory, "idea/generated/antlr" ) ; } else if( args.length == 1 ) { targetDirectory = new File( args[ 0 ] ) ; } else { throw new IllegalArgumentException( "Usage: " + ClassUtils.getShortClassName( UnicodeNamesGenerator.class ) + "[target-directory]" ) ; } new UnicodeNamesGenerator( "org.novelang.parser.unicode", "names.bin", targetDirectory ). generate() ; } }