/**
 * Copyright (c) 2002-2013 "Neo Technology,"
 * Network Engine for Objects in Lund AB [http://neotechnology.com]
 *
 * This file is part of Neo4j.
 *
 * Neo4j is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.neo4j.kernel.impl.nioneo.store;

import static java.util.Arrays.copyOf;

import java.io.UnsupportedEncodingException;
import java.util.EnumSet;

import org.neo4j.kernel.impl.util.Bits;

/**
 * Supports encoding alphanumerical and <code>SP . - + , ' : / _</code>
 *
 * (This version assumes 14bytes property block, instead of 8bytes)
 *
 * Every encoded value starts with a header of 24 bits key, 4 bits type,
 * 5 bits encoding and 6 bits length, followed by <code>length</code> code
 * points of {@link #step} bits each (or <code>length</code> raw bytes for the
 * Latin-1 and UTF-8 fallbacks). The declaration order of the constants is
 * significant: {@link #encode(int, String, PropertyBlock, int)} tries the
 * remaining candidates in that order.
 *
 * @author Tobias Ivarsson <tobias.ivarsson@neotechnology.com>
 */
public enum LongerShortString
{
    /**
     * Binary coded decimal with punctuation.
     *
     * <pre>
     *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
     * 0-  0  1  2  3  4  5  6  7  8  9 SP  .  -  +  ,  '
     * </pre>
     */
    NUMERICAL( 1, 4 )
    {
        @Override
        int encTranslate( byte b )
        {
            if ( b >= '0' && b <= '9' )
            {
                return b - '0';
            }
            switch ( b )
            {
            // Intermediate codes of the supported punctuation
            case 0:
                return 0xA; // SPACE
            case 2:
                return 0xB; // DOT
            case 3:
                return 0xC; // DASH
            case 6:
                return 0xD; // PLUS
            case 7:
                return 0xE; // COMMA
            case 8:
                return 0xF; // APOSTROPHE
            default:
                throw cannotEncode( b );
            }
        }

        @Override
        int encPunctuation( byte b )
        {
            // Punctuation is handled directly by encTranslate
            throw cannotEncode( b );
        }

        @Override
        char decTranslate( byte codePoint )
        {
            if ( codePoint < 10 )
            {
                return (char) ( codePoint + '0' );
            }
            // Code points 0xA-0xF map onto PUNCTUATION[6..11]
            return decPunctuation( ( codePoint - 10 + 6 ) );
        }
    },
    /**
     * Binary coded decimal with punctuation.
     *
     * <pre>
     *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
     * 0-  0  1  2  3  4  5  6  7  8  9 SP  -  :  /  +  ,
     * </pre>
     */
    DATE( 2, 4 )
    {
        @Override
        int encTranslate( byte b )
        {
            if ( b >= '0' && b <= '9' )
            {
                return b - '0';
            }
            switch ( b )
            {
            // Intermediate codes of the supported punctuation
            case 0:
                return 0xA; // SPACE
            case 3:
                return 0xB; // DASH
            case 4:
                return 0xC; // COLON
            case 5:
                return 0xD; // SLASH
            case 6:
                return 0xE; // PLUS
            case 7:
                return 0xF; // COMMA
            default:
                throw cannotEncode( b );
            }
        }

        @Override
        int encPunctuation( byte b )
        {
            // Punctuation is handled directly by encTranslate
            throw cannotEncode( b );
        }

        @Override
        char decTranslate( byte codePoint )
        {
            if ( codePoint < 0xA )
            {
                return (char) ( codePoint + '0' );
            }
            switch ( codePoint )
            {
            case 0xA:
                return ' ';
            case 0xB:
                return '-';
            case 0xC:
                return ':';
            case 0xD:
                return '/';
            case 0xE:
                return '+';
            default:
                return ',';
            }
        }
    },
    /**
     * Upper-case characters with punctuation.
     *
     * <pre>
     *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
     * 0- SP  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     * 1-  P  Q  R  S  T  U  V  W  X  Y  Z  _  .  -  :  /
     * </pre>
     */
    UPPER( 3, 5 )
    {
        @Override
        int encTranslate( byte b )
        {
            return super.encTranslate( b ) - 0x40;
        }

        @Override
        int encPunctuation( byte b )
        {
            // Expressed in the EUROPEAN code space; encTranslate subtracts 0x40 afterwards
            return b == 0 ? 0x40 : b + 0x5a;
        }

        @Override
        char decTranslate( byte codePoint )
        {
            if ( codePoint == 0 )
            {
                return ' ';
            }
            if ( codePoint <= 0x1A )
            {
                return (char) ( codePoint + 'A' - 1 );
            }
            return decPunctuation( codePoint - 0x1A );
        }
    },
    /**
     * Lower-case characters with punctuation.
     *
     * <pre>
     *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
     * 0- SP  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     * 1-  p  q  r  s  t  u  v  w  x  y  z  _  .  -  :  /
     * </pre>
     */
    LOWER( 4, 5 )
    {
        @Override
        int encTranslate( byte b )
        {
            return super.encTranslate( b ) - 0x60;
        }

        @Override
        int encPunctuation( byte b )
        {
            // Expressed in the EUROPEAN code space; encTranslate subtracts 0x60 afterwards
            return b == 0 ? 0x60 : b + 0x7a;
        }

        @Override
        char decTranslate( byte codePoint )
        {
            if ( codePoint == 0 )
            {
                return ' ';
            }
            if ( codePoint <= 0x1A )
            {
                return (char) ( codePoint + 'a' - 1 );
            }
            return decPunctuation( codePoint - 0x1A );
        }
    },
    /**
     * Lower-case characters with the punctuation commonly found in e-mail
     * addresses (note that code point 0 is a comma, not a space).
     *
     * <pre>
     *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
     * 0-  ,  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     * 1-  p  q  r  s  t  u  v  w  x  y  z  _  .  -  +  @
     * </pre>
     */
    EMAIL( 5, 5 )
    {
        @Override
        int encTranslate( byte b )
        {
            return super.encTranslate( b ) - 0x60;
        }

        @Override
        int encPunctuation( byte b )
        {
            // Results are in the EUROPEAN code space; encTranslate subtracts 0x60 afterwards
            int encOffset = 0x60;
            if ( b == 7 )
            {
                return encOffset; // COMMA
            }

            int offset = encOffset + 0x1B;
            switch ( b )
            {
            case 1:
                return 0 + offset; // UNDERSCORE
            case 2:
                return 1 + offset; // DOT
            case 3:
                return 2 + offset; // DASH
            case 6:
                return 3 + offset; // PLUS
            case 9:
                return 4 + offset; // AT
            default:
                throw cannotEncode( b );
            }
        }

        @Override
        char decTranslate( byte codePoint )
        {
            if ( codePoint == 0 )
            {
                return ',';
            }
            if ( codePoint <= 0x1A )
            {
                return (char) ( codePoint + 'a' - 1 );
            }
            switch ( codePoint )
            {
            case 0x1E:
                return '+';
            case 0x1F:
                return '@';
            default:
                return decPunctuation( codePoint - 0x1A );
            }
        }
    },
    /**
     * Lower-case characters, digits and punctuation and symbols.
     *
     * <pre>
     *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
     * 0- SP  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     * 1-  p  q  r  s  t  u  v  w  x  y  z
     * 2-  0  1  2  3  4  5  6  7  8  9  _  .  -  :  /  +
     * 3-  ,  '  @  |  ;  *  ?  &  %  #  (  )  $  <  >  =
     * </pre>
     */
    URI( 6, 6 )
    {
        @Override
        int encTranslate( byte b )
        {
            if ( b == 0 )
            {
                return 0; // space
            }
            if ( b >= 0x61 && b <= 0x7A )
            {
                return b - 0x60; // lower-case letters
            }
            if ( b >= 0x30 && b <= 0x39 )
            {
                return b - 0x10; // digits
            }
            if ( b >= 0x1 && b <= 0x16 )
            {
                return b + 0x29; // symbols
            }
            throw cannotEncode( b );
        }

        @Override
        int encPunctuation( byte b )
        {
            // Handled by encTranslate
            throw cannotEncode( b );
        }

        @Override
        char decTranslate( byte codePoint )
        {
            if ( codePoint == 0 )
            {
                return ' ';
            }
            if ( codePoint <= 0x1A )
            {
                return (char) ( codePoint + 'a' - 1 );
            }
            if ( codePoint <= 0x29 )
            {
                return (char) ( codePoint - 0x20 + '0' );
            }
            if ( codePoint <= 0x2E )
            {
                // 0x2A-0x2E map onto PUNCTUATION[1..5]
                return decPunctuation( codePoint - 0x29 );
            }
            // 0x2F-0x3F map onto PUNCTUATION[9..25]
            return decPunctuation( codePoint - 0x2F + 9 );
        }
    },
    /**
     * Alpha-numerical characters space and underscore.
     *
     * <pre>
     *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
     * 0- SP  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     * 1-  P  Q  R  S  T  U  V  W  X  Y  Z  0  1  2  3  4
     * 2-  _  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     * 3-  p  q  r  s  t  u  v  w  x  y  z  5  6  7  8  9
     * </pre>
     */
    ALPHANUM( 7, 6 )
    {
        @Override
        char decTranslate( byte codePoint )
        {
            // Same layout as the upper half (0x40-0x7F) of the EUROPEAN table
            return EUROPEAN.decTranslate( (byte) ( codePoint + 0x40 ) );
        }

        @Override
        int encTranslate( byte b )
        {
            // Punctuation is in the same places as European
            if ( b < 0x20 )
            {
                return encPunctuation( b ); // Punctuation
            }
            // But the rest is transposed by 0x40
            return EUROPEAN.encTranslate( b ) - 0x40;
        }

        @Override
        int encPunctuation( byte b )
        {
            switch ( b )
            {
            case 0:
                return 0x00; // SPACE
            case 1:
                return 0x20; // UNDERSCORE
            default:
                throw cannotEncode( b );
            }
        }
    },
    /**
     * Upper- and lower-case letters with punctuation and symbols (no digits).
     *
     * <pre>
     *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
     * 0- SP  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     * 1-  P  Q  R  S  T  U  V  W  X  Y  Z  _  .  -  :  /
     * 2-  ;  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     * 3-  p  q  r  s  t  u  v  w  x  y  z  +  ,  '  @  |
     * </pre>
     */
    ALPHASYM( 8, 6 )
    {
        @Override
        char decTranslate( byte codePoint )
        {
            if ( codePoint == 0x0 )
            {
                return ' ';
            }
            if ( codePoint <= 0x1A )
            {
                return (char) ( 'A' + codePoint - 0x1 );
            }
            if ( codePoint <= 0x1F )
            {
                // 0x1B-0x1F map onto PUNCTUATION[1..5]
                return decPunctuation( codePoint - 0x1B + 1 );
            }
            if ( codePoint == 0x20 )
            {
                return ';';
            }
            if ( codePoint <= 0x3A )
            {
                return (char) ( 'a' + codePoint - 0x21 );
            }
            // 0x3B-0x3F map onto PUNCTUATION[9..13]
            return decPunctuation( codePoint - 0x3B + 9 );
        }

        @Override
        int encTranslate( byte b )
        {
            if ( b < 0x20 )
            {
                return encPunctuation( b ); // Punctuation
            }
            // Letters keep their ASCII position, transposed by 0x40
            return b - 0x40;
        }

        @Override
        int encPunctuation( byte b )
        {
            switch ( b )
            {
            case 0x0:
                return 0x0; // SPACE
            case 0x1:
                return 0x1B; // UNDERSCORE
            case 0x2:
                return 0x1C; // DOT
            case 0x3:
                return 0x1D; // DASH
            case 0x4:
                return 0x1E; // COLON
            case 0x5:
                return 0x1F; // SLASH
            case 0x6:
                return 0x3B; // PLUS
            case 0x7:
                return 0x3C; // COMMA
            case 0x8:
                return 0x3D; // APOSTROPHE
            case 0x9:
                return 0x3E; // AT
            case 0xA:
                return 0x3F; // PIPE
            case 0xB:
                return 0x20; // SEMICOLON
            default:
                throw cannotEncode( b );
            }
        }
    },
    /**
     * The most common European characters (latin-1 but with less punctuation).
     *
     * <pre>
     *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
     * 0-  À  Á  Â  Ã  Ä  Å  Æ  Ç  È  É  Ê  Ë  Ì  Í  Î  Ï
     * 1-  Ð  Ñ  Ò  Ó  Ô  Õ  Ö  .  Ø  Ù  Ú  Û  Ü  Ý  Þ  ß
     * 2-  à  á  â  ã  ä  å  æ  ç  è  é  ê  ë  ì  í  î  ï
     * 3-  ð  ñ  ò  ó  ô  õ  ö  -  ø  ù  ú  û  ü  ý  þ  ÿ
     * 4- SP  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     * 5-  P  Q  R  S  T  U  V  W  X  Y  Z  0  1  2  3  4
     * 6-  _  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     * 7-  p  q  r  s  t  u  v  w  x  y  z  5  6  7  8  9
     * </pre>
     */
    EUROPEAN( 9, 7 )
    {
        @Override
        char decTranslate( byte codePoint )
        {
            if ( codePoint < 0x40 )
            {
                // The slots of the multiplication and division signs are reused for '.' and '-'
                if ( codePoint == 0x17 )
                {
                    return '.';
                }
                if ( codePoint == 0x37 )
                {
                    return '-';
                }
                return (char) ( codePoint + 0xC0 );
            }
            if ( codePoint == 0x40 )
            {
                return ' ';
            }
            if ( codePoint == 0x60 )
            {
                return '_';
            }
            if ( codePoint >= 0x5B && codePoint < 0x60 )
            {
                return (char) ( '0' + codePoint - 0x5B );
            }
            if ( codePoint >= 0x7B && codePoint < 0x80 )
            {
                return (char) ( '5' + codePoint - 0x7B );
            }
            return (char) codePoint;
        }

        @Override
        int encPunctuation( byte b )
        {
            switch ( b )
            {
            case 0x00:
                return 0x40; // SPACE
            case 0x01:
                return 0x60; // UNDERSCORE
            case 0x02:
                return 0x17; // DOT
            case 0x03:
                return 0x37; // DASH
            default:
                // This table has no slot for COMMA (0x07) or any other punctuation.
                // Mapping COMMA to 0 would silently decode as 'À', so it is rejected
                // like everything else; encode() never offers EUROPEAN for such strings.
                throw cannotEncode( b );
            }
        }
    };

    /** Number of header bits holding the property key id. */
    private static final int KEY_BITS = 24;
    /** Number of header bits holding the property type. */
    private static final int TYPE_BITS = 4;
    /** Number of header bits holding the encoding id. */
    private static final int ENCODING_BITS = 5;
    /** Number of header bits holding the string (or byte) length. */
    private static final int LENGTH_BITS = 6;
    /** Total size of the header written by {@link #writeHeader(Bits, int, int, int)}. */
    private static final int HEADER_BITS = KEY_BITS + TYPE_BITS + ENCODING_BITS + LENGTH_BITS;
    /** Header size rounded up to whole bytes, used by the byte-oriented fallbacks. */
    private static final int HEADER_BYTES = ( HEADER_BITS + 7 ) / 8;
    /** Largest length that fits in the length field of the header. */
    private static final int MAX_STRING_LENGTH = ( 1 << LENGTH_BITS ) - 1;

    /** Encoding id stored in the header for the UTF-8 fallback. */
    private static final int ENCODING_UTF8 = 0;
    /** Encoding id stored in the header for the Latin-1 fallback. */
    private static final int ENCODING_LATIN1 = 10;

    final int encodingHeader;
    final short mask;
    final short step;

    /**
     * @param encodingHeader the encoding id written to the header of encoded values
     * @param step number of bits used per character
     */
    private LongerShortString( int encodingHeader, int step )
    {
        this.encodingHeader = encodingHeader;
        this.mask = (short) Bits.rightOverflowMask( step );
        this.step = (short) step;
    }

    /**
     * Returns the maximum number of characters this encoding can fit in a
     * payload of the given size, after the header has been accounted for.
     *
     * @param payloadSize size of the payload in bytes
     * @return the maximum number of characters
     */
    int maxLength( int payloadSize )
    {
        // The header takes key(24) + type(4) + encoding(5) + length(6) bits, exactly
        // what writeHeader emits. Counting the encoding as 4 bits would overflow the
        // payload by one bit whenever the remaining bits divide evenly by step.
        return ( ( payloadSize << 3 ) - HEADER_BITS ) / step;
    }

    final IllegalArgumentException cannotEncode( byte b )
    {
        return new IllegalArgumentException( "Cannot encode as " + this.name() + ": " + b );
    }

    /** Lookup table for decoding punctuation */
    private static final char[] PUNCTUATION = {
            ' ', '_', '.', '-', ':', '/',
            ' ', '.', '-', '+', ',', '\'', '@', '|', ';', '*', '?', '&', '%', '#', '(', ')', '$', '<', '>', '=' };

    final char decPunctuation( int code )
    {
        return PUNCTUATION[code];
    }

    /**
     * Translates a byte of the intermediate representation into a code point
     * of this encoding. This default implementation produces code points of
     * the EUROPEAN table.
     *
     * @param b the intermediate code
     * @return the code point to store
     * @throws IllegalArgumentException if the character is not part of this encoding
     */
    int encTranslate( byte b )
    {
        if ( b < 0 )
        {
            return ( 0xFF & b ) - 0xC0; // European chars
        }
        if ( b < 0x20 )
        {
            return encPunctuation( b ); // Punctuation
        }
        if ( b >= '0' && b <= '4' )
        {
            return 0x5B + b - '0'; // Numbers
        }
        if ( b >= '5' && b <= '9' )
        {
            return 0x7B + b - '5'; // Numbers
        }
        return b; // Alphabetical
    }

    abstract int encPunctuation( byte b );

    abstract char decTranslate( byte codePoint );

    /*
     * Intermediate code table
     *    -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F
     * 0- SP  _  .  -  :  /  +  ,  '  @  |  ;  *  ?  &  %
     * 1-  #  (  )  $  <  >  =
     * 2-
     * 3-  0  1  2  3  4  5  6  7  8  9
     * 4-     A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     * 5-  P  Q  R  S  T  U  V  W  X  Y  Z
     * 6-     a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     * 7-  p  q  r  s  t  u  v  w  x  y  z
     * 8-
     * 9-
     * A-
     * B-
     * C-  À  Á  Â  Ã  Ä  Å  Æ  Ç  È  É  Ê  Ë  Ì  Í  Î  Ï
     * D-  Ð  Ñ  Ò  Ó  Ô  Õ  Ö     Ø  Ù  Ú  Û  Ü  Ý  Þ  ß
     * E-  à  á  â  ã  ä  å  æ  ç  è  é  ê  ë  ì  í  î  ï
     * F-  ð  ñ  ò  ó  ô  õ  ö     ø  ù  ú  û  ü  ý  þ  ÿ
     */

    /**
     * Encodes a short string.
     *
     * The string is first converted to the intermediate representation (see
     * the table above) while the set of encodings able to represent it is
     * narrowed down character by character. The remaining encodings are tried
     * in declaration order, then Latin-1 and finally UTF-8.
     *
     * @param keyId the property key id, written to the header
     * @param string the string to encode.
     * @param target the property record to store the encoded string in
     * @param payloadSize the number of bytes available for header and characters
     * @return <code>true</code> if the string could be encoded as a short
     *         string, <code>false</code> if it couldn't.
     */
    public static boolean encode( int keyId, String string, PropertyBlock target, int payloadSize )
    {
        // NUMERICAL can carry most characters, so compare to that
        int stringLength = string.length();
        // We only use 6 bits for storing the string length
        // TODO could be dealt with by having string length zero and go for null bytes,
        // at least for LATIN1 (that's what the ShortString implementation initially did)
        if ( stringLength > NUMERICAL.maxLength( payloadSize ) || stringLength > MAX_STRING_LENGTH )
        {
            return false; // Not handled by any encoding
        }

        // Keep track of the possible encodings that can be used for the string,
        // starting with all those that have room for this many characters
        EnumSet<LongerShortString> possible = EnumSet.allOf( LongerShortString.class );
        for ( LongerShortString possibility : LongerShortString.values() )
        {
            if ( stringLength > possibility.maxLength( payloadSize ) )
            {
                possible.remove( possibility );
            }
        }

        // Allocate space for the intermediate representation
        // (using the intermediate representation table above)
        byte[] data = new byte[stringLength];
        LOOP: for ( int i = 0; i < data.length && !possible.isEmpty(); i++ )
        {
            char c = string.charAt( i );
            switch ( c )
            {
            case ' ':
                data[i] = 0;
                possible.remove( EMAIL );
                break;
            case '_':
                data[i] = 1;
                possible.removeAll( EnumSet.of( NUMERICAL, DATE ) );
                break;
            case '.':
                data[i] = 2;
                possible.removeAll( EnumSet.of( ALPHANUM, DATE ) );
                break;
            case '-':
                data[i] = 3;
                possible.remove( ALPHANUM );
                break;
            case ':':
                data[i] = 4;
                possible.removeAll( EnumSet.of( ALPHANUM, NUMERICAL, EUROPEAN, EMAIL ) );
                break;
            case '/':
                data[i] = 5;
                possible.removeAll( EnumSet.of( ALPHANUM, NUMERICAL, EUROPEAN, EMAIL ) );
                break;
            case '+':
                data[i] = 6;
                possible.retainAll( EnumSet.of( NUMERICAL, DATE, EMAIL, URI, ALPHASYM ) );
                break;
            case ',':
                data[i] = 7;
                possible.retainAll( EnumSet.of( NUMERICAL, DATE, EMAIL, URI, ALPHASYM ) );
                break;
            case '\'':
                data[i] = 8;
                possible.retainAll( EnumSet.of( NUMERICAL, URI, ALPHASYM ) );
                break;
            case '@':
                data[i] = 9;
                possible.retainAll( EnumSet.of( EMAIL, URI, ALPHASYM ) );
                break;
            case '|':
                data[i] = 0xA;
                possible.retainAll( EnumSet.of( ALPHASYM ) );
                break;
            // These below are all for the URI encoding only (as of yet at least)
            case ';':
                data[i] = 0xB;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case '*':
                data[i] = 0xC;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case '?':
                data[i] = 0xD;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case '&':
                data[i] = 0xE;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case '%':
                data[i] = 0xF;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case '#':
                data[i] = 0x10;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case '(':
                data[i] = 0x11;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case ')':
                data[i] = 0x12;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case '$':
                data[i] = 0x13;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case '<':
                data[i] = 0x14;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case '>':
                data[i] = 0x15;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            case '=':
                data[i] = 0x16;
                possible.retainAll( EnumSet.of( URI ) );
                break;
            // These above are all for the URI encoding only (as of yet at least)
            default:
                if ( c >= 'A' && c <= 'Z' )
                {
                    possible.removeAll( EnumSet.of( NUMERICAL, DATE, LOWER, EMAIL, URI ) );
                }
                else if ( c >= 'a' && c <= 'z' )
                {
                    possible.removeAll( EnumSet.of( NUMERICAL, DATE, UPPER ) );
                }
                else if ( c >= '0' && c <= '9' )
                {
                    possible.removeAll( EnumSet.of( UPPER, LOWER, EMAIL, ALPHASYM ) );
                }
                // U+00C0 (A with grave) to U+00FF (y with diaeresis), except the multiplication
                // and division signs whose slots EUROPEAN reuses for '.' and '-'. Written as
                // escapes so the check does not depend on the source file encoding.
                else if ( c >= '\u00C0' && c <= '\u00FF' && c != 0xD7 && c != 0xF7 )
                {
                    possible.retainAll( EnumSet.of( EUROPEAN ) );
                }
                else
                {
                    possible.clear();
                    break LOOP; // fall back to UTF-8
                }
                data[i] = (byte) c;
            }
        }

        for ( LongerShortString encoding : possible )
        {
            // Will return false if the data is too long for the encoding
            if ( encoding.doEncode( keyId, data, target, payloadSize ) )
            {
                return true;
            }
        }

        // NOTE(review): the fallback is bounded by PropertyType.getPayloadSize() rather than
        // by the payloadSize argument used everywhere else - confirm that this is intended.
        int maxBytes = PropertyType.getPayloadSize();
        if ( stringLength <= maxBytes - HEADER_BYTES )
        {
            if ( encodeLatin1( keyId, string, target ) )
            {
                return true;
            }
            if ( encodeUTF8( keyId, string, target, payloadSize ) )
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Writes the header shared by all encodings: key id, property type,
     * encoding id and length, in that order.
     */
    private static void writeHeader( Bits bits, int keyId, int encoding, int stringLength )
    {
        // [][][][ lll,llle][eeee,tttt][kkkk,kkkk][kkkk,kkkk][kkkk,kkkk]
        bits.put( keyId, KEY_BITS )
                .put( PropertyType.SHORT_STRING.intValue(), TYPE_BITS )
                .put( encoding, ENCODING_BITS )
                .put( stringLength, LENGTH_BITS );
    }

    /**
     * Decode a short string stored in the value blocks of a property block.
     *
     * @param block the property block holding the value to decode to a short string.
     * @return the decoded short string
     * @throws IllegalArgumentException if the header holds an unknown encoding id
     */
    public static String decode( PropertyBlock block )
    {
        // Work on a copy of the value blocks rather than on the block's own array
        Bits bits = Bits.bitsFromLongs( copyOf( block.getValueBlocks(), block.getValueBlocks().length ) );
        long firstLong = bits.getLongs()[0];
        // NOTE(review): this mask ignores bits 36-39, which does not line up with any
        // field of the current header layout - presumably a leftover, confirm.
        if ( ( firstLong & 0xFFFFFF0FFFFFFFFFL ) == 0 )
        {
            return "";
        }

        bits.getInt( KEY_BITS ); // Get rid of the key
        bits.getByte( TYPE_BITS ); // Get rid of the type
        int encoding = bits.getByte( ENCODING_BITS );
        int stringLength = bits.getByte( LENGTH_BITS );
        if ( encoding == ENCODING_UTF8 )
        {
            return decodeUTF8( bits, stringLength );
        }
        if ( encoding == ENCODING_LATIN1 )
        {
            return decodeLatin1( bits, stringLength );
        }

        LongerShortString table = getEncodingTable( encoding );
        char[] result = new char[stringLength];
        // Code points are read back in the same order as doEncode wrote them
        for ( int i = 0; i < result.length; i++ )
        {
            byte codePoint = bits.getByte( table.step );
            result[i] = table.decTranslate( codePoint );
        }
        return new String( result );
    }

    /**
     * Looks up the table with the given encoding id.
     *
     * @throws IllegalArgumentException if no table has that id (this includes
     *         the ids of the Latin-1 and UTF-8 fallbacks, which have no table)
     */
    private static LongerShortString getEncodingTable( int encoding )
    {
        switch ( encoding )
        {
        case 1:
            return NUMERICAL;
        case 2:
            return DATE;
        case 3:
            return UPPER;
        case 4:
            return LOWER;
        case 5:
            return EMAIL;
        case 6:
            return URI;
        case 7:
            return ALPHANUM;
        case 8:
            return ALPHASYM;
        case 9:
            return EUROPEAN;
        default:
            throw new IllegalArgumentException( "Invalid encoding '" + encoding + "'" );
        }
    }

    /** Allocates a bit buffer sized in whole 8-byte blocks for the given encoding and length. */
    private static Bits newBits( int encoding, int length )
    {
        return Bits.bits( calculateNumberOfBlocksUsed( encoding, length ) * 8 );
    }

    /** Stores the string with 8 bits per character, if all characters are below 256. */
    private static boolean encodeLatin1( int keyId, String string, PropertyBlock target )
    {
        int length = string.length();
        Bits bits = newBits( ENCODING_LATIN1, length );
        writeHeader( bits, keyId, ENCODING_LATIN1, length );
        if ( !writeLatin1Characters( string, bits ) )
        {
            return false;
        }
        target.setValueBlocks( bits.getLongs() );
        return true;
    }

    /**
     * Appends the characters of the string to the bit buffer using 8 bits per
     * character.
     *
     * @param string the string to write
     * @param bits the buffer to append to
     * @return <code>false</code> as soon as a character outside of Latin-1 is
     *         found, in which case the characters preceding it have already
     *         been written to the buffer; <code>true</code> otherwise
     */
    public static boolean writeLatin1Characters( String string, Bits bits )
    {
        int length = string.length();
        for ( int i = 0; i < length; i++ )
        {
            char c = string.charAt( i );
            // A char is unsigned, so only the upper bound needs checking
            if ( c >= 256 )
            {
                return false;
            }
            bits.put( c, 8 ); // Just the lower byte
        }
        return true;
    }

    /** Stores the UTF-8 bytes of the string, if they fit the payload and the length field. */
    private static boolean encodeUTF8( int keyId, String string, PropertyBlock target, int payloadSize )
    {
        try
        {
            byte[] bytes = string.getBytes( "UTF-8" );
            // The byte count is what gets stored in the 6 bit length field, so it has to
            // fit there as well as in the payload
            if ( bytes.length > payloadSize - HEADER_BYTES || bytes.length > MAX_STRING_LENGTH )
            {
                return false;
            }
            Bits bits = newBits( ENCODING_UTF8, bytes.length );
            // In this case it isn't the string length, but the number of bytes
            writeHeader( bits, keyId, ENCODING_UTF8, bytes.length );
            for ( byte value : bytes )
            {
                bits.put( value );
            }
            target.setValueBlocks( bits.getLongs() );
            return true;
        }
        catch ( UnsupportedEncodingException e )
        {
            throw new IllegalStateException( "All JVMs must support UTF-8", e );
        }
    }

    /**
     * Encodes the intermediate representation using this table.
     *
     * @return <code>false</code> if the data is too long for this encoding
     * @throws IllegalArgumentException if the data holds a character that is
     *         not part of this encoding
     */
    private boolean doEncode( int keyId, byte[] data, PropertyBlock target, int payloadSize )
    {
        if ( data.length > maxLength( payloadSize ) )
        {
            return false;
        }
        Bits bits = newBits( encodingHeader, data.length );
        writeHeader( bits, keyId, encodingHeader, data.length );
        for ( int i = 0; i < data.length; i++ )
        {
            int encodedChar = encTranslate( data[i] );
            bits.put( encodedChar, step );
        }
        target.setValueBlocks( bits.getLongs() );
        return true;
    }

    /** Reads <code>stringLength</code> characters of 8 bits each. */
    private static String decodeLatin1( Bits bits, int stringLength )
    {
        char[] result = new char[stringLength];
        for ( int i = 0; i < result.length; i++ )
        {
            result[i] = (char) bits.getShort( 8 );
        }
        return new String( result );
    }

    /** Reads <code>stringLength</code> bytes and interprets them as UTF-8. */
    private static String decodeUTF8( Bits bits, int stringLength )
    {
        byte[] result = new byte[stringLength];
        for ( int i = 0; i < stringLength; i++ )
        {
            result[i] = bits.getByte();
        }
        try
        {
            return new String( result, "UTF-8" );
        }
        catch ( UnsupportedEncodingException e )
        {
            throw new IllegalStateException( "All JVMs must support UTF-8", e );
        }
    }

    /**
     * Calculates the number of 8-byte blocks occupied by the value whose
     * first block is given, by reading encoding and length from its header.
     *
     * @param firstBlock the first value block, containing the header
     * @return the number of blocks used
     * @throws IllegalArgumentException if the header holds an unknown encoding id
     */
    public static int calculateNumberOfBlocksUsed( long firstBlock )
    {
        /*
         * [ lll,llle][eeee,tttt][kkkk,kkkk][kkkk,kkkk][kkkk,kkkk]
         */
        // The encoding sits in bits 28-32 and the length in bits 33-38
        int encoding = (int) ( ( firstBlock & 0x1F0000000L ) >> 28 );
        int length = (int) ( ( firstBlock & 0x7E00000000L ) >> 33 );
        return calculateNumberOfBlocksUsed( encoding, length );
    }

    /**
     * Calculates the number of 8-byte blocks needed for the header plus
     * <code>length</code> characters in the given encoding.
     *
     * @param encoding the encoding id, as stored in the header
     * @param length the number of characters (bytes for Latin-1 and UTF-8)
     * @return the number of blocks used
     * @throws IllegalArgumentException if the encoding id is unknown
     */
    public static int calculateNumberOfBlocksUsed( int encoding, int length )
    {
        int bitsForCharacters;
        if ( encoding == ENCODING_UTF8 || encoding == ENCODING_LATIN1 )
        {
            bitsForCharacters = length * 8;
        }
        else
        {
            bitsForCharacters = getEncodingTable( encoding ).step * length;
        }

        int bitsInTotal = HEADER_BITS + bitsForCharacters;
        // Round up to whole 64 bit blocks
        return ( bitsInTotal - 1 ) / 64 + 1;
    }
}