/** * Copyright (c) 2002-2013 "Neo Technology," * Network Engine for Objects in Lund AB [http://neotechnology.com] * * This file is part of Neo4j. * * Neo4j is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.neo4j.kernel.impl.nioneo.store; import java.io.UnsupportedEncodingException; import java.util.EnumSet; /** * Supports encoding alphanumerical and <code>SP . - + , ' : / _</code> * * @author Tobias Ivarsson <tobias.ivarsson@neotechnology.com> */ public enum ShortString { /** * Binary coded decimal with punctuation. * * <pre> * HEADER (binary): 0000 LENG DATA... (0-14 chars) [4bit data] * HEADER (binary): 0001 DATA... (15 chars) [4bit data] * * -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F * 0- 0 1 2 3 4 5 6 7 8 9 + , ' SP . - * </pre> */ NUMERICAL( 15, 0x0F, 4 ) { @Override int encTranslate( byte b ) { if ( b >= '0' && b <= '9' ) return b - '0'; switch ( b ) { case 0: return 0xA; case 2: return 0xB; case 3: return 0xC; case 6: return 0xD; case 7: return 0xE; case 8: return 0xF; default: throw cannotEncode( b ); } } @Override int encPunctuation( byte b ) { throw cannotEncode( b ); } @Override char decTranslate( byte codePoint ) { if ( codePoint < 10 ) return (char) ( codePoint + '0' ); return decPunctuation( ( codePoint - 10 + 6 ) ); } @Override long header( int length ) { if ( length == max ) return 0x10; return length; } }, /** * Upper-case characters with punctuation. * * <pre> * HEADER (binary): 0010 LENG DATA... (0-11 chars) [5bit data] * HEADER (binary): 0011 DATA... (12 chars) [5bit data] * * -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F * 0- SP A B C D E F G H I J K L M N O * 1- P Q R S T U V W X Y Z _ . - : / * </pre> */ UPPER( 12, 0x1F, 5 ) { @Override int encTranslate( byte b ) { return super.encTranslate( b ) - 0x40; } @Override int encPunctuation( byte b ) { return b == 0 ? 0x40 : b + 0x5a; } @Override char decTranslate( byte codePoint ) { if ( codePoint == 0 ) return ' '; if ( codePoint <= 0x1A ) return (char) ( codePoint + 'A' - 1 ); return decPunctuation( codePoint - 0x1A ); } @Override long header( int length ) { // shift to get padding if ( length == max ) return 0x30 << 1; return ( 0x20 | length ) << 1; } }, /** * Lower-case characters with punctuation. * * <pre> * HEADER (binary): 0100 LENG DATA... (0-11 chars) [5bit data] * HEADER (binary): 0101 DATA... (12 chars) [5bit data] * * -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F * 0- SP a b c d e f g h i j k l m n o * 1- p q r s t u v w x y z _ . - : / * </pre> */ LOWER( 12, 0x1F, 5 ) { @Override int encTranslate( byte b ) { return super.encTranslate( b ) - 0x60; } @Override int encPunctuation( byte b ) { return b == 0 ? 0x60 : b + 0x7a; } @Override char decTranslate( byte codePoint ) { if ( codePoint == 0 ) return ' '; if ( codePoint <= 0x1A ) return (char) ( codePoint + 'a' - 1 ); return decPunctuation( codePoint - 0x1A ); } @Override long header( int length ) { // shift to get padding if ( length == max ) return 0x50 << 1; return ( 0x40 | length ) << 1; } }, /** * Alpha-numerical characters space and underscore. * * HEADER (binary): 0110 LENG DATA... (10 chars) [6bit data] * * <pre> * -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F * 0- SP A B C D E F G H I J K L M N O * 1- P Q R S T U V W X Y Z 0 1 2 3 4 * 2- _ a b c d e f g h i j k l m n o * 3- p q r s t u v w x y z 5 6 7 8 9 * </pre> */ ALPHANUM( 10, 0x3F, 6 ) { @Override char decTranslate( byte codePoint ) { return EUROPEAN.decTranslate( (byte) ( codePoint + 0x40 ) ); } @Override int encTranslate( byte b ) { // Punctuation is in the same places as European if ( b < 0x20 ) return encPunctuation( b ); // Punctuation // But the rest is transposed by 0x40 return EUROPEAN.encTranslate( b ) - 0x40; } @Override int encPunctuation( byte b ) { switch ( b ) { case 0: return 0x00; // SPACE case 1: return 0x20; // UNDERSCORE default: throw cannotEncode( b ); } } @Override long header( int length ) { return 0x60 << 2; } }, /** * The most common European characters (latin-1 but with less punctuation). * * <pre> * HEADER (binary): 0111 0LEN DATA... (1-8 chars) [7bit data] * HEADER (binary): 1DATA... (9 chars) [7bit data] * * -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F * 0- À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï * 1- Ð Ñ Ò Ó Ô Õ Ö . Ø Ù Ú Û Ü Ý Þ ß * 2- à á â ã ä å æ ç è é ê ë ì í î ï * 3- ð ñ ò ó ô õ ö - ø ù ú û ü ý þ ÿ * 4- SP A B C D E F G H I J K L M N O * 5- P Q R S T U V W X Y Z 0 1 2 3 4 * 6- _ a b c d e f g h i j k l m n o * 7- p q r s t u v w x y z 5 6 7 8 9 * </pre> */ EUROPEAN( 9, 0x7F, 7 ) { @Override char decTranslate( byte codePoint ) { if ( codePoint < 0x40 ) { if ( codePoint == 0x17 ) return '.'; if ( codePoint == 0x37 ) return '-'; return (char) ( codePoint + 0xC0 ); } else { if ( codePoint == 0x40 ) return ' '; if ( codePoint == 0x60 ) return '_'; if ( codePoint >= 0x5B && codePoint < 0x60 ) return (char) ( '0' + codePoint - 0x5B ); if ( codePoint >= 0x7B && codePoint < 0x80 ) return (char) ( '5' + codePoint - 0x7B ); return (char) codePoint; } } @Override int encPunctuation( byte b ) { switch ( b ) { case 0x00: return 0x40; // SPACE case 0x01: return 0x60; // UNDERSCORE case 0x02: return 0x17; // DOT case 0x03: return 0x37; // DASH default: throw cannotEncode( b ); } } @Override long header( int length ) { if ( length == max ) return 0x80; return 0x70 | ( length - 1 ); } }; final int max; final short mask; final short step; private ShortString( int max, int mask, int step ) { this.max = max; this.mask = (short) mask; this.step = (short) step; } final IllegalArgumentException cannotEncode( byte b ) { return new IllegalArgumentException( "Cannot encode as " + this.name() + ": " + b ); } /** Lookup table for decoding punctuation */ private static final char[] PUNCTUATION = { ' ', '_', '.', '-', ':', '/', ' ', '.', '-', '+', ',', '\'', }; final char decPunctuation( int code ) { return PUNCTUATION[code]; } public static void main( String[] args ) { System.out.println( Long.toHexString( (byte) 'À' ) ); System.out.println( Long.toHexString( (byte) 'ÿ' ) ); System.out.println( Long.toHexString( EUROPEAN.encTranslate( (byte) 'À' ) ) ); System.out.println( Long.toHexString( EUROPEAN.encTranslate( (byte) 'ÿ' ) ) ); } int encTranslate( byte b ) { if ( b < 0 ) return ( 0xFF & b ) - 0xC0; // European chars if ( b < 0x20 ) return encPunctuation( b ); // Punctuation if ( b >= '0' && b <= '4' ) return 0x5B + b - '0'; // Numbers if ( b >= '5' && b <= '9' ) return 0x7B + b - '5'; // Numbers return b; // Alphabetical } abstract int encPunctuation( byte b ); abstract char decTranslate( byte codePoint ); abstract long header( int length ); /** * Encodes a short string. * * @param string the string to encode. * @param target the property record to store the encoded string in * @return <code>true</code> if the string could be encoded as a short * string, <code>false</code> if it couldn't. */ /* * Intermediate code table * -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F * 0- SP _ . - : / + , ' * 1- * 2- * 3- 0 1 2 3 4 5 6 7 8 9 * 4- A B C D E F G H I J K L M N O * 5- P Q R S T U V W X Y Z * 6- a b c d e f g h i j k l m n o * 7- p q r s t u v w x y z * 8- * 9- * A- * B- * C- À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï * D- Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß * E- à á â ã ä å æ ç è é ê ë ì í î ï * F- ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ */ public static boolean encode( int keyId, String string, PropertyRecord target ) { if ( string.length() > 15 ) return false; // Not handled by any encoding if ( string.equals( "" ) ) { applyInRecord( target, keyId, 0 ); return true; } // Keep track of the possible encodings that can be used for the string EnumSet<ShortString> possible = null; // First try encoding using Latin-1 if ( string.length() < 8 ) { if ( encodeLatin1( keyId, string, target ) ) return true; // If the string was short enough, but still didn't fit in latin-1 // we know that no other encoding will work either, remember that // so that we can try UTF-8 at the end of this method possible = EnumSet.noneOf( ShortString.class ); } // Allocate space for the intermediate representation // (using the intermediate representation table above) byte[] data = new byte[string.length()]; if ( possible == null ) { possible = EnumSet.allOf( ShortString.class ); // ALPHANUM can only store len == 10 if ( data.length != 10 ) possible.remove( ALPHANUM ); if ( data.length > 9 ) possible.remove( EUROPEAN ); if ( data.length > 12 ) possible.removeAll( EnumSet.of( UPPER, LOWER ) ); } LOOP: for ( int i = 0; i < data.length && !possible.isEmpty(); i++ ) { char c = string.charAt( i ); switch ( c ) { case ' ': data[i] = 0; break; case '_': data[i] = 1; possible.remove( NUMERICAL ); break; case '.': data[i] = 2; possible.remove( ALPHANUM ); break; case '-': data[i] = 3; possible.remove( ALPHANUM ); break; case ':': data[i] = 4; possible.removeAll( EnumSet.of( ALPHANUM, NUMERICAL, EUROPEAN ) ); break; case '/': data[i] = 5; possible.removeAll( EnumSet.of( ALPHANUM, NUMERICAL, EUROPEAN ) ); break; case '+': data[i] = 6; possible.retainAll( EnumSet.of( NUMERICAL ) ); break; case ',': data[i] = 7; possible.retainAll( EnumSet.of( NUMERICAL ) ); break; case '\'': data[i] = 8; possible.retainAll( EnumSet.of( NUMERICAL ) ); break; default: if ( ( c >= 'A' && c <= 'Z' ) ) { possible.remove( NUMERICAL ); possible.remove( LOWER ); } else if ( ( c >= 'a' && c <= 'z' ) ) { possible.remove( NUMERICAL ); possible.remove( UPPER ); } else if ( ( c >= '0' && c <= '9' ) ) { possible.remove( UPPER ); possible.remove( LOWER ); } else if ( c >= 'À' && c <= 'ÿ' && c != 0xD7 && c != 0xF7 ) { possible.retainAll( EnumSet.of( EUROPEAN ) ); } else { possible.clear(); break LOOP; // fall back to UTF-8 } data[i] = (byte) c; } } for ( ShortString encoding : possible ) { // Will return false if the data is too long for the encoding if ( encoding.doEncode( keyId, data, target ) ) return true; } if ( string.length() <= 6 ) { // We might have a chance with UTF-8 - try it! try { return encodeUTF8( keyId, string.getBytes( "UTF-8" ), target ); } catch ( UnsupportedEncodingException e ) { throw new IllegalStateException( "All JVMs must support UTF-8", e ); } } return false; } private static void applyInRecord( PropertyRecord target, int keyId, long propBlock ) { // long data = 0; // data |= ( (long) keyId << 40 ); // data |= ( (long) PropertyType.SHORT_STRING.intValue() << 36 ); // data |= ( (long) encoding << 32 ); // data |= ( (long) stringLength << 28 ); // // target.setSinglePropBlock( data ); } /** * Decode a short string represented as a long * * @param data the value to decode to a short string. * @return the decoded short string */ public static String decode( long data ) { if ( data == 0 ) return ""; int header = (int) ( data >>> 56 ); ShortString table; switch ( header >>> 4 ) { case 0: // 0b0000 - NUMERICAL 4bit (0-14 chars) if ( ( header &= 0x0F ) == 0 ) return decodeUTF8( data ); //$FALL-THROUGH$ case 1: // 0b0001 - NUMERICAL 4bit (15 chars) table = NUMERICAL; break; case 2: // 0b0010 - UPPER 5bit (0-11 chars) header &= 0x0F; //$FALL-THROUGH$ case 3: // 0b0011 - UPPER 5bit (12 chars) table = UPPER; break; case 4: // 0b0100 - LOWER 5bit (0-11 chars) header &= 0x0F; //$FALL-THROUGH$ case 5: // 0b0101 - LOWER 5bit (12 chars) table = LOWER; break; case 6: // 0b0110 - ALPHANUM 6bit (10 chars) table = ALPHANUM; break; case 7: // 0b0111 - EUROPEAN 7bit (1-8 chars) or LATIN1 8bit (0-7 chars) header &= 0x0F; if ( ( header & 0x08 ) != 0 ) { // 0b0111 1 - LATIN1 8bit (0-7 chars) return decodeLatin1( data, ( header & 0x07 ) + 1 ); } else { // 0b0111 0 - EUROPEAN 7bit (1-8 chars) header += 1; // offset char count } //$FALL-THROUGH$ default: // 0b1XXX- EUROPEAN 7bit (9 chars) table = EUROPEAN; break; } if ( header > 15 ) header = table.max; // header is now length char[] result = new char[header]; // encode shifts in the bytes with the first char at the MSB, therefore // we must "unshift" in the reverse order for ( int i = result.length - 1; i >= 0; i-- ) { result[i] = table.decTranslate( (byte) ( data & table.mask ) ); data >>>= table.step; } return new String( result ); } private static boolean encodeLatin1( int keyId, String string, PropertyRecord target ) { // see doEncode long result = 0x78 | ( string.length() - 1 ); result <<= ( 7 - string.length() ) * 8; // move the header to its place for ( int i = 0; i < string.length(); i++ ) { char c = string.charAt( i ); if ( c < 0 || c >= 256 ) return false; result = ( result << 8 ) | c; } applyInRecord( target, keyId, result ); return true; } private static boolean encodeUTF8( int keyId, byte[] bytes, PropertyRecord target ) { // UTF-8 padded with null bytes if ( bytes.length > 7 ) return false; long result = 0; for ( byte b : bytes ) { result = ( result << 8 ) | ( 0xFF & b ); } applyInRecord( target, keyId, result ); return true; } private boolean doEncode( int keyId, byte[] data, PropertyRecord target ) { if ( data.length > max ) return false; long result = header( data.length ); result <<= ( max - data.length ) * step; // move the header to its place for ( int i = 0; i < data.length; i++ ) { // shift the data along and mask in each piece if ( i != 0 ) result <<= step; result |= encTranslate( data[i] ); } applyInRecord( target, keyId, result ); return true; } private static String decodeLatin1( long data, int length ) { // see decode char[] result = new char[length]; for ( int i = result.length - 1; i >= 0; i-- ) { result[i] = (char) ( data & 0xFF ); data >>>= 8; } return new String( result ); } private static String decodeUTF8( long data ) { byte[] temp = new byte[7]; int size = 7; while ( data != 0 ) // since we pad with null bytes { temp[--size] = (byte) ( data & 0xFF ); data >>>= 8; } // we didn't know the length up front, compensate for that byte[] result; if ( size == 0 ) { result = temp; } else { result = new byte[7 - size]; for ( int i = 0; i < result.length; i++ ) { result[i] = temp[size + i]; } } try { return new String( result, "UTF-8" ); } catch ( UnsupportedEncodingException e ) { throw new IllegalStateException( "All JVMs must support UTF-8", e ); } } }