/*
* Copyright (C) 2011 Laurent Caillette
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.novelang.build.unicode;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.lang.ClassUtils;
import org.apache.commons.lang.SystemUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.novelang.build.CodeGenerationTools;
/**
* Generates a file containing every Unicode name.
* <ul>
* <li>First 4 bytes: n, the number of character in the offset table.
* <li>n * 4 bytes: the offsets of the names (from the start of the file).
* Offsets are 32-bit, unsigned ints.
* <li>8-bit characters for names, zero-terminated.
* </ul>
*
* @author Laurent Caillette
*/
public class UnicodeNamesGenerator {
private static final Logger LOGGER = LoggerFactory.getLogger( UnicodeNamesGenerator.class ) ;
private final File targetFile ;
private static final int UNSIGNED_MAX_16BIT = 256 * 256;
public UnicodeNamesGenerator(
final String packageName,
final String namesFile,
final File targetDirectory
) throws IOException {
this.targetFile =
CodeGenerationTools.resolveTargetFile( targetDirectory, packageName, namesFile ) ;
if( targetFile.getParentFile().mkdirs() ) {
LOGGER.info( "Created '" + targetDirectory.getAbsolutePath() + "'" ) ;
}
}
public void generate() throws IOException {
LOGGER.info( "About to generate into '" + targetFile.getAbsolutePath() + "'..." ) ;
if( targetFile.exists() ) {
if( targetFile.delete() ) {
LOGGER.info( "Deleted '" + targetFile.getAbsolutePath() + "'" ) ;
}
}
if( ! targetFile.createNewFile() ) {
throw new IOException( "Could not create '" + targetFile.getAbsolutePath() + "'" ) ;
}
LOGGER.info( "Loading names..." ) ;
final Map< Character, String > characters = new UnicodeNamesTextReader().loadNames() ;
final OutputStream outputStream = new FileOutputStream( targetFile ) ;
LOGGER.info( "Generating indexed file..." ) ;
generate( new BufferedOutputStream( outputStream, 640 * 1024 ), characters ) ;
outputStream.close() ;
}
/**
* Generates the offset table and the names.
*
* @param outputStream not flushed.
* @param characterNames a Map with characters having contiguous codes that start by 0.
*/
public static void generate(
final OutputStream outputStream,
final Map< Character, String > characterNames
) throws IOException {
final Set< Character > characters = characterNames.keySet() ;
final List< Character > characterList = Lists.newArrayList( characters ) ;
Collections.sort( characterList, CHARACTER_COMPARATOR /* Needed? */ ) ;
final int lastCharacterIndex = characterList.get( characterList.size() - 1 ) ;
generate( outputStream, characterNames, lastCharacterIndex + 1 ) ;
}
/**
* Generates the offset table and the names.
*
* @param outputStream not flushed.
* @param characterNames a Map with characters having contiguous codes that start by 0.
*/
public static void generate(
final OutputStream outputStream,
final Map< Character, String > characterNames,
final int totalCharacterCount
) throws IOException {
Preconditions.checkArgument( totalCharacterCount <= UNSIGNED_MAX_16BIT ) ;
final Map< Integer, Integer > offsetsFromFirstName =
Maps.newHashMapWithExpectedSize( totalCharacterCount ) ;
final Map< Character, byte[] > characterNamesAsBytes =
calculateCharacterNamesAsBytes( characterNames ) ;
// Find the offset of the name of each character.
int writePositionFromFirstName = 0 ;
int characterCount = 0 ;
for( int characterIndex = 0 ; characterIndex < totalCharacterCount ; characterIndex ++ ) {
final Character character = ( char ) characterIndex ;
if( characterNames.containsKey( character ) ) {
offsetsFromFirstName.put( characterIndex, writePositionFromFirstName ) ;
writePositionFromFirstName +=
characterNamesAsBytes.get( character ).length + // Real length.
1 // Terminal zero.
;
characterCount ++ ;
} else {
offsetsFromFirstName.put( characterIndex, null ) ;
}
}
LOGGER.debug( "Found " + characterCount + " characters." ) ;
// Write character count.
outputStream.write( asBytes( totalCharacterCount ) ) ;
// Write offsets.
final int offsetTableSize = totalCharacterCount * 4 ;
for( int characterIndex = 0 ; characterIndex < totalCharacterCount ; characterIndex ++ ) {
final byte[] bytes ;
final Integer value = offsetsFromFirstName.get( characterIndex ) ;
if( value == null ) {
bytes = ZERO_OFFSET ;
} else {
bytes = asBytes( 4 + offsetTableSize + value ) ;
}
outputStream.write( bytes ) ;
}
// Write names.
for( int characterIndex = 0 ; characterIndex < totalCharacterCount ; characterIndex ++ ) {
final byte[] nameBytes = characterNamesAsBytes.get( ( char ) characterIndex ) ;
if( nameBytes != null ) {
outputStream.write( nameBytes ) ;
outputStream.write( TERMINAL_ZERO ) ;
}
}
outputStream.flush() ;
LOGGER.debug( "Generation complete." ) ;
}
/**
* Getting bytes only once speeds generation up a lot.
*/
private static Map< Character, byte[] > calculateCharacterNamesAsBytes(
final Map<Character, String> characterNames
) {
final Map< Character, byte[] > map = Maps.newHashMapWithExpectedSize( characterNames.size() ) ;
for( final Map.Entry< Character, String > entry : characterNames.entrySet() ) {
map.put( entry.getKey(), entry.getValue().replace( ' ', '_' ).getBytes( CHARSET ) ) ;
}
return map ;
}
private static final Charset CHARSET = Charset.forName( "UTF-8" ) ;
private static final byte[] TERMINAL_ZERO = { 0 } ;
private static final byte[] ZERO_OFFSET = { 0, 0, 0, 0 } ;
private static final Comparator< Character > CHARACTER_COMPARATOR =
new Comparator< Character >() {
@Override
public int compare( final Character c1, final Character c2 ) {
return ( ( int ) c1.charValue() ) - ( ( int ) c2.charValue() ) ;
}
}
;
/*package*/ static byte[] asBytes( final int i ) {
final byte[] bytes = new byte[ 4 ] ;
bytes[ 0 ] = ( byte ) ( i >>> 24 ) ;
bytes[ 1 ] = ( byte ) ( i >>> 16 ) ;
bytes[ 2 ] = ( byte ) ( i >>> 8 ) ;
bytes[ 3 ] = ( byte ) ( i & 0x000000FF ) ;
return bytes ;
}
// ==============================================
// Main, supports no arg for interactive testing.
// ==============================================
public static void main( final String[] args ) throws IOException {
final File targetDirectory ;
if( args.length == 0 ) {
final File projectDirectory = SystemUtils.USER_DIR.endsWith( "idea" ) ?
new File( SystemUtils.USER_DIR ).getParentFile() :
new File( SystemUtils.USER_DIR )
;
targetDirectory = new File( projectDirectory, "idea/generated/antlr" ) ;
} else if( args.length == 1 ) {
targetDirectory = new File( args[ 0 ] ) ;
} else {
throw new IllegalArgumentException(
"Usage: " + ClassUtils.getShortClassName( UnicodeNamesGenerator.class ) +
"[target-directory]"
) ;
}
new UnicodeNamesGenerator( "org.novelang.parser.unicode", "names.bin", targetDirectory ).
generate() ;
}
}