// Copyright 2017 JanusGraph Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package org.janusgraph.graphdb.database.serialize.attribute; import com.google.common.base.Preconditions; import org.janusgraph.core.Namifiable; import org.janusgraph.diskstorage.ScanBuffer; import org.janusgraph.diskstorage.WriteBuffer; import org.janusgraph.graphdb.database.idhandling.VariableLong; import org.janusgraph.graphdb.database.serialize.OrderPreservingSerializer; import org.janusgraph.graphdb.database.serialize.SupportsNullSerializer; import org.janusgraph.util.encoding.StringEncoding; import java.io.*; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; /** * Serializes Strings by trying to find the most efficient serialization format: * 1) ASCII encoding (one byte per char) * 2) Full UTF encoding (for non-ASCII strings) * 3) Using compression algorithms for long strings * * @author Matthias Broecheler (me@matthiasb.com) */ public class StringSerializer implements OrderPreservingSerializer<String>, SupportsNullSerializer { public static final int MAX_LENGTH = 128 * 1024 * 1024; //128 MB public static final int LONG_COMPRESSION_THRESHOLD = 16000; public static final int TEXT_COMRPESSION_THRESHOLD = 48; private static final long COMPRESSOR_BIT_LEN = 3; private static final int MAX_NUM_COMPRESSORS = (1<<COMPRESSOR_BIT_LEN); private static final long COMPRESSOR_BIT_MASK = MAX_NUM_COMPRESSORS-1; private static final long NO_COMPRESSION_OFFSET = COMPRESSOR_BIT_LEN+1; private final CharacterSerializer cs = new CharacterSerializer(); @Override public String readByteOrder(ScanBuffer buffer) { byte prefix = buffer.getByte(); if (prefix==-1) return null; assert prefix==0; StringBuilder s = new StringBuilder(); while (true) { char c = cs.readByteOrder(buffer); if (((int) c) > 0) s.append(c); else break; } return s.toString(); } @Override public void writeByteOrder(WriteBuffer buffer, String attribute) { if (attribute==null) { buffer.putByte((byte)-1); return; } else { buffer.putByte((byte)0); } for (int i = 0; i < attribute.length(); i++) { char c = attribute.charAt(i); Preconditions.checkArgument(((int) c) > 0, "No null characters allowed in string @ position %s: %s", i, attribute); cs.writeByteOrder(buffer, c); } cs.writeByteOrder(buffer, (char) 0); } @Override public void verifyAttribute(String value) { Preconditions.checkArgument(value.length()<=MAX_LENGTH,"String is too long: %s",value.length()); } @Override public String convert(Object value) { Preconditions.checkNotNull(value); if (value instanceof String) return (String)value; else if (value instanceof Namifiable) return ((Namifiable)value).name(); else return value.toString(); } @Override public String read(ScanBuffer buffer) { long length = VariableLong.readPositive(buffer); if (length==0) return null; long compressionId = length & COMPRESSOR_BIT_MASK; assert compressionId<MAX_NUM_COMPRESSORS; CompressionType compression = CompressionType.getFromId((int)compressionId); length = (length>>>COMPRESSOR_BIT_LEN); String value; if (compression==CompressionType.NO_COMPRESSION) { if ( (length&1)==0) { //ASCII encoding length = length>>>1; if (length==1) value=""; else if (length==2) { StringBuilder sb = new StringBuilder(); while (true) { int c = 0xFF & buffer.getByte(); sb.append((char)(c & 0x7F)); if ((c & 0x80) > 0) break; } value = sb.toString(); } else throw new IllegalArgumentException("Invalid ASCII encoding offset: " + length); } else { //variable full UTF encoding length = length>>>1; assert length>0 && length<=Integer.MAX_VALUE; StringBuilder sb = new StringBuilder((int)length); for (int i = 0; i < length; i++) { int b = buffer.getByte() & 0xFF; switch (b >> 4) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: sb.append((char)b); break; case 12: case 13: sb.append((char)((b & 0x1F) << 6 | buffer.getByte() & 0x3F)); break; case 14: sb.append((char)((b & 0x0F) << 12 | (buffer.getByte() & 0x3F) << 6 | buffer.getByte() & 0x3F)); break; } } value = sb.toString(); } } else { assert length<=Integer.MAX_VALUE; value = compression.decompress(buffer,(int)length); } return value; } @Override public void write(WriteBuffer buffer, String attribute) { CompressionType compression; if (attribute==null) { VariableLong.writePositive(buffer,0); return; } else if (attribute.length()>LONG_COMPRESSION_THRESHOLD) { compression=CompressionType.GZIP; } else { compression=CompressionType.NO_COMPRESSION; } assert compression!=null; assert compression.getId()<MAX_NUM_COMPRESSORS; if (compression==CompressionType.NO_COMPRESSION) { assert compression.getId()==0; if (StringEncoding.isAsciiString(attribute)) { if (attribute.length()==0) VariableLong.writePositive(buffer,1l<<NO_COMPRESSION_OFFSET); else VariableLong.writePositive(buffer,2l<<NO_COMPRESSION_OFFSET); for (int i = 0; i < attribute.length(); i++) { int c = attribute.charAt(i); assert c <= 127; byte b = (byte)c; if (i+1==attribute.length()) b |= 0x80; //End marker buffer.putByte(b); } } else { assert attribute.length()>0; VariableLong.writePositive(buffer,(((long)attribute.length())<<NO_COMPRESSION_OFFSET) + (1l<<COMPRESSOR_BIT_LEN)); //Marker for full UTF encoding for (int i = 0; i < attribute.length(); i++) { //variable encoding of the characters int c = attribute.charAt(i); if (c <= 0x007F) { buffer.putByte((byte)c); } else if (c > 0x07FF) { buffer.putByte((byte)(0xE0 | c >> 12 & 0x0F)); buffer.putByte((byte)(0x80 | c >> 6 & 0x3F)); buffer.putByte((byte)(0x80 | c & 0x3F)); } else { buffer.putByte((byte)(0xC0 | c >> 6 & 0x1F)); buffer.putByte((byte)(0x80 | c & 0x3F)); } } } } else { byte[] compressed = compression.compress(attribute); int length = compressed.length; assert length>0; VariableLong.writePositive(buffer,(((long)length)<<COMPRESSOR_BIT_LEN) + compression.getId()); buffer.putBytes(compressed); } } private enum CompressionType { NO_COMPRESSION { @Override public byte[] compress(String text) { throw new UnsupportedOperationException(); } @Override public String decompress(ScanBuffer buffer, int numBytes) { throw new UnsupportedOperationException(); } }, GZIP { @Override public byte[] compress(String text) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); try { OutputStream out = new GZIPOutputStream(baos); out.write(text.getBytes("UTF-8")); out.close(); } catch (IOException e) { throw new RuntimeException(e); } return baos.toByteArray(); } @Override public String decompress(final ScanBuffer buffer, final int numBytes) { try { InputStream in = new GZIPInputStream(new InputStream() { int bytesRead = 0; @Override public int read() throws IOException { if (++bytesRead>numBytes) return -1; return 0xFF & buffer.getByte(); } }); ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] bytes = new byte[8192]; int len; while ((len = in.read(bytes)) > 0) baos.write(bytes, 0, len); return new String(baos.toByteArray(), "UTF-8"); } catch (IOException e) { throw new RuntimeException(e); } } }; public abstract byte[] compress(String text); public abstract String decompress(ScanBuffer buffer, int numBytes); public int getId() { return this.ordinal(); } public static CompressionType getFromId(int id) { for (CompressionType ct : values()) if (ct.getId()==id) return ct; throw new IllegalArgumentException("Unknown compressor type for id: "+id); } } }