/**
* Copyright (c) 2002-2013 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.kernel.impl.nioneo.store;
import java.io.UnsupportedEncodingException;
import java.util.EnumSet;
/**
* Supports encoding alphanumerical characters and the punctuation <code>SP . - + , ' : / _</code>
*
* @author Tobias Ivarsson <tobias.ivarsson@neotechnology.com>
*/
public enum ShortString
{
/**
 * Binary coded decimal with punctuation.
 *
 * <pre>
 * HEADER (binary): 0000 LENG DATA... (0-14 chars) [4bit data]
 * HEADER (binary): 0001 DATA...      (15 chars)   [4bit data]
 *
 *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
 * 0-  0  1  2  3  4  5  6  7  8  9 SP  .  -  +  ,  '
 * </pre>
 */
NUMERICAL( 15, 0x0F, 4 )
{
    @Override
    int encTranslate( byte b )
    {
        // ASCII digits map onto their own numerical value (0x0-0x9)
        if ( b >= '0' && b <= '9' )
        {
            return b - '0';
        }
        // Anything else has to be one of the supported intermediate punctuation codes
        if ( b == 0 ) return 0xA; // SPACE
        if ( b == 2 ) return 0xB; // DOT
        if ( b == 3 ) return 0xC; // DASH
        if ( b == 6 ) return 0xD; // PLUS
        if ( b == 7 ) return 0xE; // COMMA
        if ( b == 8 ) return 0xF; // APOSTROPHE
        throw cannotEncode( b );
    }

    @Override
    int encPunctuation( byte b )
    {
        // Punctuation is handled directly by encTranslate for this encoding
        throw cannotEncode( b );
    }

    @Override
    char decTranslate( byte codePoint )
    {
        if ( codePoint >= 10 )
        {
            // 0xA-0xF are found at index 6-11 of the punctuation table
            return decPunctuation( codePoint - 4 );
        }
        return (char) ( codePoint + '0' );
    }

    @Override
    long header( int length )
    {
        // A full-length string has no length field, only the 0001 marker
        return length == max ? 0x10 : length;
    }
},
/**
 * Upper-case characters with punctuation.
 *
 * <pre>
 * HEADER (binary): 0010 LENG DATA... (0-11 chars) [5bit data]
 * HEADER (binary): 0011 DATA...      (12 chars)   [5bit data]
 *
 *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
 * 0- SP  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
 * 1-  P  Q  R  S  T  U  V  W  X  Y  Z  _  .  -  :  /
 * </pre>
 */
UPPER( 12, 0x1F, 5 )
{
    @Override
    int encTranslate( byte b )
    {
        // Rebase the result of the shared translation so that SPACE becomes 0 and 'A' becomes 1
        final int shared = super.encTranslate( b );
        return shared - 0x40;
    }

    @Override
    int encPunctuation( byte b )
    {
        if ( b == 0 )
        {
            return 0x40; // SPACE goes right before 'A'
        }
        return b + 0x5a; // the other codes follow directly after 'Z'
    }

    @Override
    char decTranslate( byte codePoint )
    {
        if ( codePoint == 0 )
        {
            return ' ';
        }
        if ( codePoint > 0x1A )
        {
            return decPunctuation( codePoint - 0x1A );
        }
        return (char) ( 'A' + codePoint - 1 );
    }

    @Override
    long header( int length )
    {
        // A full-length string only carries the 0011 marker, shorter ones 0010 + length
        final int bits = length == max ? 0x30 : ( 0x20 | length );
        // shift to get padding
        return bits << 1;
    }
},
/**
 * Lower-case characters with punctuation.
 *
 * <pre>
 * HEADER (binary): 0100 LENG DATA... (0-11 chars) [5bit data]
 * HEADER (binary): 0101 DATA...      (12 chars)   [5bit data]
 *
 *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
 * 0- SP  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
 * 1-  p  q  r  s  t  u  v  w  x  y  z  _  .  -  :  /
 * </pre>
 */
LOWER( 12, 0x1F, 5 )
{
    @Override
    int encTranslate( byte b )
    {
        // Rebase the result of the shared translation so that SPACE becomes 0 and 'a' becomes 1
        final int shared = super.encTranslate( b );
        return shared - 0x60;
    }

    @Override
    int encPunctuation( byte b )
    {
        if ( b == 0 )
        {
            return 0x60; // SPACE goes right before 'a'
        }
        return b + 0x7a; // the other codes follow directly after 'z'
    }

    @Override
    char decTranslate( byte codePoint )
    {
        if ( codePoint == 0 )
        {
            return ' ';
        }
        if ( codePoint > 0x1A )
        {
            return decPunctuation( codePoint - 0x1A );
        }
        return (char) ( 'a' + codePoint - 1 );
    }

    @Override
    long header( int length )
    {
        // A full-length string only carries the 0101 marker, shorter ones 0100 + length
        final int bits = length == max ? 0x50 : ( 0x40 | length );
        // shift to get padding
        return bits << 1;
    }
},
/**
 * Alpha-numerical characters, space and underscore.
 *
 * <pre>
 * HEADER (binary): 0110 DATA... (10 chars) [6bit data]
 *
 *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
 * 0- SP  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
 * 1-  P  Q  R  S  T  U  V  W  X  Y  Z  0  1  2  3  4
 * 2-  _  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
 * 3-  p  q  r  s  t  u  v  w  x  y  z  5  6  7  8  9
 * </pre>
 */
ALPHANUM( 10, 0x3F, 6 )
{
    @Override
    char decTranslate( byte codePoint )
    {
        // This table equals the upper half (0x40-0x7F) of the European one
        final byte european = (byte) ( codePoint + 0x40 );
        return EUROPEAN.decTranslate( european );
    }

    @Override
    int encTranslate( byte b )
    {
        if ( b >= 0x20 )
        {
            // Letters and digits are in the same places as European, transposed by 0x40
            return EUROPEAN.encTranslate( b ) - 0x40;
        }
        return encPunctuation( b ); // Punctuation
    }

    @Override
    int encPunctuation( byte b )
    {
        if ( b == 0 )
        {
            return 0x00; // SPACE
        }
        if ( b == 1 )
        {
            return 0x20; // UNDERSCORE
        }
        throw cannotEncode( b );
    }

    @Override
    long header( int length )
    {
        // The length argument is not used; the header is always the 0110 marker
        return 0x60 << 2;
    }
},
/**
 * The most common European characters (latin-1 but with less punctuation).
 *
 * <pre>
 * HEADER (binary): 0111 0LEN DATA... (1-8 chars) [7bit data]
 * HEADER (binary): 1DATA...          (9 chars)   [7bit data]
 *
 *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
 * 0-  À  Á  Â  Ã  Ä  Å  Æ  Ç  È  É  Ê  Ë  Ì  Í  Î  Ï
 * 1-  Ð  Ñ  Ò  Ó  Ô  Õ  Ö  .  Ø  Ù  Ú  Û  Ü  Ý  Þ  ß
 * 2-  à  á  â  ã  ä  å  æ  ç  è  é  ê  ë  ì  í  î  ï
 * 3-  ð  ñ  ò  ó  ô  õ  ö  -  ø  ù  ú  û  ü  ý  þ  ÿ
 * 4- SP  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
 * 5-  P  Q  R  S  T  U  V  W  X  Y  Z  0  1  2  3  4
 * 6-  _  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
 * 7-  p  q  r  s  t  u  v  w  x  y  z  5  6  7  8  9
 * </pre>
 */
EUROPEAN( 9, 0x7F, 7 )
{
    @Override
    char decTranslate( byte codePoint )
    {
        // The four punctuation marks occupy fixed slots in the table
        switch ( codePoint )
        {
        case 0x17:
            return '.';
        case 0x37:
            return '-';
        case 0x40:
            return ' ';
        case 0x60:
            return '_';
        default:
            break;
        }
        if ( codePoint < 0x40 )
        {
            // Lower half: accented characters, offset from 0xC0
            return (char) ( codePoint + 0xC0 );
        }
        if ( codePoint >= 0x5B && codePoint < 0x60 )
        {
            return (char) ( '0' + codePoint - 0x5B );
        }
        if ( codePoint >= 0x7B && codePoint < 0x80 )
        {
            return (char) ( '5' + codePoint - 0x7B );
        }
        // What remains are the ASCII letters, stored as themselves
        return (char) codePoint;
    }

    @Override
    int encPunctuation( byte b )
    {
        if ( b == 0x00 ) return 0x40; // SPACE
        if ( b == 0x01 ) return 0x60; // UNDERSCORE
        if ( b == 0x02 ) return 0x17; // DOT
        if ( b == 0x03 ) return 0x37; // DASH
        throw cannotEncode( b );
    }

    @Override
    long header( int length )
    {
        // Full length: a single marker bit. Otherwise 0111 0LEN, with LEN = length - 1
        return length == max ? 0x80 : ( 0x70 | ( length - 1 ) );
    }
};
/** The highest number of characters this encoding accepts; longer data is rejected when encoding. */
final int max;
/** Bit mask that isolates a single encoded character from the packed value when decoding. */
final short mask;
/** Number of bits the packed value is shifted for each encoded character. */
final short step;
/**
 * @param max the maximum number of characters of the encoding
 * @param mask the bit mask for one encoded character, stored as a short
 * @param step the number of bits per encoded character, stored as a short
 */
private ShortString( int max, int mask, int step )
{
this.max = max;
this.mask = (short) mask;
this.step = (short) step;
}
/**
 * Creates (but does not throw) the exception used to signal that a value
 * has no representation in this encoding.
 *
 * @param b the value that could not be translated
 * @return an exception naming this encoding and the offending value
 */
final IllegalArgumentException cannotEncode( byte b )
{
    final String message = "Cannot encode as " + name() + ": " + b;
    return new IllegalArgumentException( message );
}
/**
 * Lookup table for decoding punctuation. UPPER and LOWER look up index 1-5,
 * NUMERICAL looks up index 6-11.
 */
private static final char[] PUNCTUATION = {
    ' ', '_', '.', '-', ':', '/', // index 0-5
    ' ', '.', '-', '+', ',', '\'', // index 6-11
};

/**
 * Looks up a punctuation character.
 *
 * @param code the index into the punctuation table
 * @return the character stored at that index
 */
final char decPunctuation( int code )
{
    final char symbol = PUNCTUATION[code];
    return symbol;
}
/**
 * Prints, as hex, the byte value of 'À' and 'ÿ' followed by the code that
 * the EUROPEAN encoding translates each of them to.
 *
 * @param args not used
 */
public static void main( String[] args )
{
    final byte first = (byte) 'À';
    final byte last = (byte) 'ÿ';
    // The raw (sign extended) byte values
    System.out.println( Long.toHexString( first ) );
    System.out.println( Long.toHexString( last ) );
    // The corresponding EUROPEAN codes
    System.out.println( Long.toHexString( EUROPEAN.encTranslate( first ) ) );
    System.out.println( Long.toHexString( EUROPEAN.encTranslate( last ) ) );
}
/**
 * Translates one intermediate code into the layout of the EUROPEAN table.
 * Encodings with a different layout override this method.
 *
 * @param b the intermediate code to translate
 * @return the translated code
 */
int encTranslate( byte b )
{
    if ( b < 0 )
    {
        // European chars: read the byte as unsigned and rebase from 0xC0
        return ( b & 0xFF ) - 0xC0;
    }
    if ( b < 0x20 )
    {
        return encPunctuation( b ); // Punctuation
    }
    if ( b >= '0' && b <= '9' )
    {
        // Numbers are split in two groups of five, placed after 'Z' and after 'z'
        return b <= '4' ? 0x5B + b - '0' : 0x7B + b - '5';
    }
    return b; // Alphabetical
}
/**
 * Translates an intermediate punctuation code into the code this encoding
 * uses for it. The shared {@link #encTranslate(byte)} calls this for
 * non-negative values below 0x20.
 *
 * @param b the intermediate punctuation code
 * @return the code for the punctuation character in this encoding
 */
abstract int encPunctuation( byte b );
/**
 * Translates one code of this encoding back into the character it represents.
 *
 * @param codePoint a single code, as extracted from the packed value using the mask of this encoding
 * @return the decoded character
 */
abstract char decTranslate( byte codePoint );
/**
 * Produces the header bits for a string of the given length. The value is
 * shifted into its final position when the characters are packed.
 *
 * @param length the number of characters in the string to encode
 * @return the header bits
 */
abstract long header( int length );
/*
 * Intermediate code table
 *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
 * 0- SP  _  .  -  :  /  +  ,  '
 * 1-
 * 2-
 * 3-  0  1  2  3  4  5  6  7  8  9
 * 4-     A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
 * 5-  P  Q  R  S  T  U  V  W  X  Y  Z
 * 6-     a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
 * 7-  p  q  r  s  t  u  v  w  x  y  z
 * 8-
 * 9-
 * A-
 * B-
 * C-  À  Á  Â  Ã  Ä  Å  Æ  Ç  È  É  Ê  Ë  Ì  Í  Î  Ï
 * D-  Ð  Ñ  Ò  Ó  Ô  Õ  Ö     Ø  Ù  Ú  Û  Ü  Ý  Þ  ß
 * E-  à  á  â  ã  ä  å  æ  ç  è  é  ê  ë  ì  í  î  ï
 * F-  ð  ñ  ò  ó  ô  õ  ö     ø  ù  ú  û  ü  ý  þ  ÿ
 */
/**
 * Encodes a short string.
 * <p>
 * Strings shorter than 8 characters are tried as Latin-1 and, if that
 * fails and they are at most 6 characters long, as UTF-8. Strings of 8 to
 * 15 characters are tried with each of the table based encodings that can
 * represent all of their characters.
 *
 * @param keyId the property key id, handed on together with the encoded value
 * @param string the string to encode.
 * @param target the property record to store the encoded string in
 * @return <code>true</code> if the string could be encoded as a short
 *         string, <code>false</code> if it couldn't.
 */
public static boolean encode( int keyId, String string, PropertyRecord target )
{
    final int length = string.length();
    if ( length > 15 ) return false; // Not handled by any encoding
    if ( length == 0 )
    {
        applyInRecord( target, keyId, 0 );
        return true;
    }
    if ( length < 8 )
    {
        // First try encoding using Latin-1
        if ( encodeLatin1( keyId, string, target ) ) return true;
        // If the string was short enough, but still didn't fit in latin-1
        // we know that no other encoding will work either, so go straight
        // for UTF-8 at the end of this method
    }
    else
    {
        // Keep track of the possible encodings that can be used for the string
        EnumSet<ShortString> possible = EnumSet.allOf( ShortString.class );
        // ALPHANUM can only store len == 10
        if ( length != 10 ) possible.remove( ALPHANUM );
        if ( length > 9 ) possible.remove( EUROPEAN );
        if ( length > 12 )
        {
            possible.remove( UPPER );
            possible.remove( LOWER );
        }
        // Allocate space for the intermediate representation
        // (using the intermediate representation table above)
        byte[] data = new byte[length];
        toIntermediate( string, data, possible );
        for ( ShortString encoding : possible )
        {
            // Will return false if the data is too long for the encoding
            if ( encoding.doEncode( keyId, data, target ) ) return true;
        }
    }
    if ( length <= 6 )
    { // We might have a chance with UTF-8 - try it!
        try
        {
            return encodeUTF8( keyId, string.getBytes( "UTF-8" ), target );
        }
        catch ( UnsupportedEncodingException e )
        {
            throw new IllegalStateException( "All JVMs must support UTF-8", e );
        }
    }
    return false;
}

/**
 * Translates the characters of the string into intermediate codes, removing
 * every encoding that cannot represent a character from the set of
 * candidates. Stops as soon as no candidate remains, in which case the
 * content of the data array is incomplete.
 */
private static void toIntermediate( String string, byte[] data, EnumSet<ShortString> possible )
{
    for ( int i = 0; i < data.length && !possible.isEmpty(); i++ )
    {
        char c = string.charAt( i );
        switch ( c )
        {
        case ' ':
            data[i] = 0;
            break;
        case '_':
            data[i] = 1;
            possible.remove( NUMERICAL );
            break;
        case '.':
            data[i] = 2;
            possible.remove( ALPHANUM );
            break;
        case '-':
            data[i] = 3;
            possible.remove( ALPHANUM );
            break;
        case ':':
            data[i] = 4;
            possible.remove( ALPHANUM );
            possible.remove( NUMERICAL );
            possible.remove( EUROPEAN );
            break;
        case '/':
            data[i] = 5;
            possible.remove( ALPHANUM );
            possible.remove( NUMERICAL );
            possible.remove( EUROPEAN );
            break;
        case '+':
            data[i] = 6;
            retainOnly( possible, NUMERICAL );
            break;
        case ',':
            data[i] = 7;
            retainOnly( possible, NUMERICAL );
            break;
        case '\'':
            data[i] = 8;
            retainOnly( possible, NUMERICAL );
            break;
        default:
            if ( c >= 'A' && c <= 'Z' )
            {
                possible.remove( NUMERICAL );
                possible.remove( LOWER );
            }
            else if ( c >= 'a' && c <= 'z' )
            {
                possible.remove( NUMERICAL );
                possible.remove( UPPER );
            }
            else if ( c >= '0' && c <= '9' )
            {
                possible.remove( UPPER );
                possible.remove( LOWER );
            }
            // Numeric bounds (U+00C0 - U+00FF) keep this check independent
            // of the encoding the source file is compiled with. 0xD7 and
            // 0xF7 are excluded since their slots hold '.' and '-'.
            else if ( c >= 0xC0 && c <= 0xFF && c != 0xD7 && c != 0xF7 )
            {
                retainOnly( possible, EUROPEAN );
            }
            else
            {
                possible.clear();
                return; // fall back to UTF-8
            }
            data[i] = (byte) c;
        }
    }
}

/**
 * Removes every encoding except the given one from the set, without
 * allocating a temporary set.
 */
private static void retainOnly( EnumSet<ShortString> possible, ShortString encoding )
{
    boolean wasPossible = possible.contains( encoding );
    possible.clear();
    if ( wasPossible ) possible.add( encoding );
}
/**
 * Intended to store an encoded value in a property record.
 * <p>
 * NOTE(review): the body currently consists solely of a commented-out draft,
 * so this method does nothing and none of its parameters are used. The
 * encode methods still report success after calling it.
 *
 * @param target the property record that should receive the value
 * @param keyId the property key id
 * @param propBlock the encoded value
 */
private static void applyInRecord( PropertyRecord target, int keyId,
long propBlock )
{
// Draft of the implementation, kept for reference:
// long data = 0;
// data |= ( (long) keyId << 40 );
// data |= ( (long) PropertyType.SHORT_STRING.intValue() << 36 );
// data |= ( (long) encoding << 32 );
// data |= ( (long) stringLength << 28 );
//
// target.setSinglePropBlock( data );
}
/**
 * Decodes a short string from its packed representation. The topmost byte
 * of the value selects the encoding and, for most encodings, the length.
 *
 * @param data the value to decode to a short string.
 * @return the decoded short string
 */
public static String decode( long data )
{
    if ( data == 0 )
    {
        return "";
    }
    int topByte = (int) ( data >>> 56 );
    int lowNibble = topByte & 0x0F;
    ShortString table;
    int length;
    switch ( topByte >>> 4 )
    {
    case 0: // 0b0000 - UTF-8 (no length bits) or NUMERICAL 4bit with length
        if ( lowNibble == 0 )
        {
            return decodeUTF8( data );
        }
        table = NUMERICAL;
        length = lowNibble;
        break;
    case 1: // 0b0001 - NUMERICAL 4bit (15 chars)
        table = NUMERICAL;
        length = NUMERICAL.max;
        break;
    case 2: // 0b0010 - UPPER 5bit (0-11 chars)
        table = UPPER;
        length = lowNibble;
        break;
    case 3: // 0b0011 - UPPER 5bit (12 chars)
        table = UPPER;
        length = UPPER.max;
        break;
    case 4: // 0b0100 - LOWER 5bit (0-11 chars)
        table = LOWER;
        length = lowNibble;
        break;
    case 5: // 0b0101 - LOWER 5bit (12 chars)
        table = LOWER;
        length = LOWER.max;
        break;
    case 6: // 0b0110 - ALPHANUM 6bit (10 chars)
        table = ALPHANUM;
        length = ALPHANUM.max;
        break;
    case 7: // 0b0111 - EUROPEAN 7bit (1-8 chars) or LATIN1 8bit (1-7 chars)
        if ( ( lowNibble & 0x08 ) != 0 )
        { // 0b0111 1 - LATIN1 8bit
            return decodeLatin1( data, ( lowNibble & 0x07 ) + 1 );
        }
        // 0b0111 0 - EUROPEAN 7bit, the stored char count is offset by one
        table = EUROPEAN;
        length = lowNibble + 1;
        break;
    default: // 0b1XXX - EUROPEAN 7bit (9 chars)
        table = EUROPEAN;
        length = EUROPEAN.max;
        break;
    }
    char[] chars = new char[length];
    long remaining = data;
    // encode shifts in the chars with the first char at the MSB, therefore
    // the last char is found in the lowest bits and we fill from the end
    for ( int pos = length - 1; pos >= 0; pos-- )
    {
        chars[pos] = table.decTranslate( (byte) ( remaining & table.mask ) );
        remaining >>>= table.step;
    }
    return new String( chars );
}
/**
 * Encodes a string of 1-7 characters with one byte per character, provided
 * that every character is below 256.
 * <p>
 * The top byte of the result is <code>0111 1LEN</code>, where LEN holds
 * the length - 1, followed by padding and then the characters.
 *
 * @param keyId the property key id, handed on together with the encoded value
 * @param string the string to encode
 * @param target the property record to store the encoded string in
 * @return <code>true</code> if the string was encoded, <code>false</code>
 *         if its length is outside 1-7 or it contains a character that
 *         does not fit in one byte
 */
private static boolean encodeLatin1( int keyId, String string, PropertyRecord target )
{ // see doEncode
    final int length = string.length();
    // Only three bits are available for (length - 1), and other lengths
    // would make the shift below produce a corrupt header
    if ( length < 1 || length > 7 ) return false;
    long result = 0x78 | ( length - 1 );
    result <<= ( 7 - length ) * 8; // move the header to its place
    for ( int i = 0; i < length; i++ )
    {
        char c = string.charAt( i );
        // char is unsigned, so only the upper bound needs checking
        if ( c >= 256 ) return false;
        result = ( result << 8 ) | c;
    }
    applyInRecord( target, keyId, result );
    return true;
}
/**
 * Encodes up to 7 bytes of UTF-8 data, padded with null bytes at the most
 * significant end. No length is stored.
 *
 * @param keyId the property key id, handed on together with the encoded value
 * @param bytes the UTF-8 representation of the string to encode
 * @param target the property record to store the encoded string in
 * @return <code>true</code> if the bytes were encoded, <code>false</code>
 *         if there are more than 7 bytes or the first byte is a null byte
 */
private static boolean encodeUTF8( int keyId, byte[] bytes, PropertyRecord target )
{ // UTF-8 padded with null bytes
    if ( bytes.length > 7 ) return false;
    // Since no length is stored, decodeUTF8 treats all leading null bytes
    // as padding. A string starting with a null byte would lose it when
    // decoded, so it cannot be stored in this format.
    if ( bytes.length > 0 && bytes[0] == 0 ) return false;
    long result = 0;
    for ( byte b : bytes )
    {
        result = ( result << 8 ) | ( 0xFF & b );
    }
    applyInRecord( target, keyId, result );
    return true;
}
/**
 * Packs a string, given as intermediate codes, using this encoding.
 *
 * @param keyId the property key id, handed on together with the encoded value
 * @param data the intermediate codes of the string
 * @param target the property record to store the encoded string in
 * @return <code>false</code> if the data is too long for this encoding,
 *         otherwise <code>true</code>
 */
private boolean doEncode( int keyId, byte[] data, PropertyRecord target )
{
    final int count = data.length;
    if ( count > max )
    {
        return false;
    }
    // Shorter strings get extra padding, which moves the header to its place
    long packed = header( count ) << ( ( max - count ) * step );
    for ( int i = 0; i < count; i++ )
    {
        // The first code is masked into the low bits of the header as is,
        // every following code first shifts the data along
        if ( i > 0 )
        {
            packed <<= step;
        }
        packed |= encTranslate( data[i] );
    }
    applyInRecord( target, keyId, packed );
    return true;
}
/**
 * Decodes a string stored with one byte per character (see decode).
 *
 * @param data the packed value, with the last character in the lowest byte
 * @param length the number of characters to extract
 * @return the decoded string
 */
private static String decodeLatin1( long data, int length )
{
    final char[] chars = new char[length];
    long remaining = data;
    int index = length;
    // Fill from the end, consuming one byte at a time from the low end
    while ( index > 0 )
    {
        index--;
        chars[index] = (char) ( remaining & 0xFF );
        remaining >>>= 8;
    }
    return new String( chars );
}
/**
 * Decodes a string stored as up to 7 bytes of UTF-8, padded with null
 * bytes at the most significant end.
 *
 * @param data the packed value, expected to have a zero topmost byte
 * @return the decoded string
 */
private static String decodeUTF8( long data )
{
    final byte[] buffer = new byte[7];
    int start = buffer.length;
    while ( data != 0 ) // since we pad with null bytes
    {
        buffer[--start] = (byte) ( data & 0xFF );
        data >>>= 8;
    }
    try
    {
        // we didn't know the length up front, so decode only the part of
        // the buffer that was filled
        return new String( buffer, start, buffer.length - start, "UTF-8" );
    }
    catch ( UnsupportedEncodingException e )
    {
        throw new IllegalStateException( "All JVMs must support UTF-8", e );
    }
}
}