package eu.fbk.knowledgestore.data; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.math.BigInteger; import java.util.GregorianCalendar; import java.util.List; import java.util.Set; import java.util.TimeZone; import java.util.zip.Deflater; import java.util.zip.DeflaterOutputStream; import java.util.zip.Inflater; import java.util.zip.InflaterInputStream; import javax.annotation.Nullable; import javax.xml.datatype.XMLGregorianCalendar; import com.google.common.base.MoreObjects; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.io.ByteStreams; import com.google.common.primitives.Ints; import com.google.common.primitives.Longs; import org.openrdf.model.BNode; import org.openrdf.model.Literal; import org.openrdf.model.Resource; import org.openrdf.model.Statement; import org.openrdf.model.URI; import org.openrdf.model.Value; import org.openrdf.model.ValueFactory; import org.openrdf.model.vocabulary.XMLSchema; // NOTE: supports only serialization and deserialization of Record, URI, BNode, Literal, // Statement objects. For records, it is possible to specify which properties to serialize / // deserialize. public final class Serializer { private static final Set<String> KB_PREFIXES = ImmutableSet.of("dbpedia", "yago", "gn", "geonames", "lgdo", "lgv"); private static final String LANG_NS = "lang:"; private static final int TYPE_NULL = 0x00; private static final int TYPE_LIST = 0x10; private static final int TYPE_RECORD = 0x20; private static final int TYPE_LIT_STRING = 0x40; private static final int TYPE_LIT_STRING_LANG = 0x80; private static final int TYPE_LIT_TRUE = 0x01; private static final int TYPE_LIT_FALSE = 0x02; private static final int TYPE_LIT_LONG = 0x03; private static final int TYPE_LIT_INT = 0x04; private static final int TYPE_LIT_SHORT = 0x05; private static final int TYPE_LIT_BYTE = 0x06; private static final int TYPE_LIT_DOUBLE = 0x07; private static final int TYPE_LIT_FLOAT = 0x08; private static final int TYPE_LIT_BIG_INTEGER = 0x09; private static final int TYPE_LIT_BIG_DECIMAL = 0x0A; private static final int TYPE_LIT_DATETIME = 0x0B; private static final int TYPE_BNODE = 0x30; private static final int TYPE_URI_PLAIN = 0xC0; private static final int TYPE_URI_COMPRESSED = 0x0C; private static final int TYPE_STATEMENT = 0x0D; // Number serialization // bits len hi mask layout // 07 01 0x00 0x7F 0 7 // 14 02 0x80 0x3F 10 6 8 // 21 03 0xC0 0x1F 110 5 8 8 // 28 04 0xE0 0x0F 1110 4 8 8 8 // 35 05 0xF0 0x07 11110 3 8 8 8 8 // 42 06 0xF8 0x03 111110 2 8 8 8 8 8 // 49 07 0xFC 0x01 1111110 1 8 8 8 8 8 8 // 56 08 0xFE 0x00 11111110 8 8 8 8 8 8 8 // 64 09 0xFF 0x00 11111111 8 8 8 8 8 8 8 8 private final boolean compress; @Nullable private final Dictionary<URI> dictionary; private final ValueFactory factory; public Serializer() { this(false, null, null); } public Serializer(final boolean compress, @Nullable final Dictionary<URI> dictionary, @Nullable final ValueFactory factory) { this.compress = compress; this.dictionary = dictionary; this.factory = MoreObjects.firstNonNull(factory, Data.getValueFactory()); } public byte[] toBytes(final Object object) { try { final ByteArrayOutputStream stream = new ByteArrayOutputStream(); toStream(stream, object); return stream.toByteArray(); } catch (final IOException ex) { throw new Error("Unexpected exception (!): " + ex.getMessage(), ex); } } public Object fromBytes(final byte[] bytes) { try { return fromStream(new ByteArrayInputStream(bytes)); } catch (final IOException ex) { throw new Error("Unexpected exception (!): " + ex.getMessage(), ex); } } public void toStream(final OutputStream stream, final Object object) throws IOException { if (this.compress) { final Deflater deflater = new Deflater(Deflater.BEST_COMPRESSION, true); final DeflaterOutputStream compressStream = new DeflaterOutputStream(stream, deflater); writeObject(compressStream, object); compressStream.finish(); } else { writeObject(stream, object); } } public Object fromStream(final InputStream stream) throws IOException { if (this.compress) { final Inflater inflater = new Inflater(true); final InflaterInputStream compressStream = new InflaterInputStream(stream, inflater); return readObject(compressStream); } else { return readObject(stream); } } private void writeObject(final OutputStream stream, final Object object) throws IOException { if (object == null) { writeHeader(stream, TYPE_NULL, 0); } else if (object instanceof Iterable<?>) { final Iterable<?> iterable = (Iterable<?>) object; final int size = Iterables.size(iterable); writeHeader(stream, TYPE_LIST, size); for (final Object element : iterable) { writeObject(stream, element); } } else if (object instanceof Record) { final Record record = (Record) object; writeHeader(stream, TYPE_RECORD, record.getProperties().size()); writeObject(stream, record.getID()); for (final URI property : record.getProperties()) { writeCompressedURI(stream, property); final List<? extends Object> nodes = record.get(property); writeObject(stream, nodes.size() == 1 ? nodes.get(0) : nodes); } } else if (object instanceof Literal) { final Literal literal = (Literal) object; final URI datatype = literal.getDatatype(); if (datatype == null || datatype.equals(XMLSchema.STRING)) { final String language = literal.getLanguage(); final byte[] label = encodeString(literal.getLabel()); if (language == null) { writeHeader(stream, TYPE_LIT_STRING, label.length); } else { writeHeader(stream, TYPE_LIT_STRING_LANG, label.length); final URI langURI = this.factory.createURI("lang:" + language); writeCompressedURI(stream, langURI); } stream.write(label); } else if (datatype.equals(XMLSchema.BOOLEAN)) { writeHeader(stream, literal.booleanValue() ? TYPE_LIT_TRUE : TYPE_LIT_FALSE, 0); } else if (datatype.equals(XMLSchema.LONG)) { writeHeader(stream, TYPE_LIT_LONG, 0); writeNumber(stream, literal.longValue()); } else if (datatype.equals(XMLSchema.INT)) { writeHeader(stream, TYPE_LIT_INT, 0); writeNumber(stream, literal.longValue()); } else if (datatype.equals(XMLSchema.DOUBLE)) { writeHeader(stream, TYPE_LIT_DOUBLE, 0); stream.write(Longs.toByteArray(Double.doubleToLongBits(literal.doubleValue()))); } else if (datatype.equals(XMLSchema.FLOAT)) { writeHeader(stream, TYPE_LIT_FLOAT, 0); stream.write(Ints.toByteArray(Float.floatToIntBits(literal.floatValue()))); } else if (datatype.equals(XMLSchema.SHORT)) { writeHeader(stream, TYPE_LIT_SHORT, 0); writeNumber(stream, literal.longValue()); } else if (datatype.equals(XMLSchema.BYTE)) { writeHeader(stream, TYPE_LIT_BYTE, 0); writeNumber(stream, literal.longValue()); } else if (datatype.equals(XMLSchema.INTEGER)) { writeHeader(stream, TYPE_LIT_BIG_INTEGER, 0); final byte[] bytes = literal.integerValue().toByteArray(); writeNumber(stream, bytes.length); stream.write(bytes); } else if (datatype.equals(XMLSchema.DECIMAL)) { writeHeader(stream, TYPE_LIT_BIG_DECIMAL, 0); final byte[] bytes = encodeString(literal.decimalValue().toString()); writeNumber(stream, bytes.length); stream.write(bytes); } else if (datatype.equals(XMLSchema.DATETIME)) { writeHeader(stream, TYPE_LIT_DATETIME, 0); final XMLGregorianCalendar calendar = literal.calendarValue(); writeNumber(stream, calendar.getTimezone()); writeNumber(stream, calendar.toGregorianCalendar().getTimeInMillis()); } else { throw new UnsupportedOperationException("Don't know how to serialize: " + literal); } } else if (object instanceof BNode) { final byte[] id = encodeString(((BNode) object).getID()); writeHeader(stream, TYPE_BNODE, id.length); stream.write(id); } else if (object instanceof URI) { final URI uri = (URI) object; if (isVocabTerm(uri)) { writeHeader(stream, TYPE_URI_COMPRESSED, 0); writeCompressedURI(stream, uri); } else { final byte[] string = encodeString(uri.stringValue()); writeHeader(stream, TYPE_URI_PLAIN, string.length); stream.write(string); } } else if (object instanceof Statement) { final Statement statement = (Statement) object; writeHeader(stream, TYPE_STATEMENT, 0); writeObject(stream, statement.getSubject()); writeObject(stream, statement.getPredicate()); writeObject(stream, statement.getObject()); writeObject(stream, statement.getContext()); } else { throw new UnsupportedOperationException("Don't know how to serialize " + object.getClass()); } } private void writeHeader(final OutputStream stream, final int type, final int number) throws IOException { if ((type & 0xC0) != 0 && number <= 62) { stream.write(type | number + 1); } else if ((type & 0x30) != 0 && number <= 14) { stream.write(type | number + 1); } else if ((type & 0xF0) != 0) { stream.write(type); writeNumber(stream, number); } else { stream.write(type); } } private void writeCompressedURI(final OutputStream stream, final URI uri) throws IOException { if (this.dictionary != null) { final int key = this.dictionary.keyFor(uri, true); writeNumber(stream, key); } else { final String ns = uri.getNamespace(); if (LANG_NS.equals(ns)) { final byte[] utf8 = encodeString(uri.getLocalName()); writeNumber(stream, utf8.length << 2 | 1); stream.write(utf8); } else { final String prefix = Data.namespaceToPrefix(uri.getNamespace(), Data.getNamespaceMap()); if (prefix != null) { final byte[] utf8 = encodeString(prefix + ":" + uri.getLocalName()); writeNumber(stream, utf8.length << 2 | 3); stream.write(utf8); } else { final byte[] utf8 = encodeString(uri.stringValue()); writeNumber(stream, utf8.length << 1); stream.write(utf8); } } } } private void writeNumber(final OutputStream stream, final long num) throws IOException { if (num < 0L || num > 0xFFFFFFFFFFFFFFL /* 56 bit */) { writeNumberHelper(stream, 9, 0xFF, num); } else if (num <= 0x7FL /* 7 bit */) { writeNumberHelper(stream, 1, 0x00, num); } else if (num <= 0x3FFFL /* 14 bit */) { writeNumberHelper(stream, 2, 0x80, num); } else if (num <= 0x1FFFFFL /* 21 bit */) { writeNumberHelper(stream, 3, 0xC0, num); } else if (num <= 0xFFFFFFFL /* 28 bit */) { writeNumberHelper(stream, 4, 0xE0, num); } else if (num <= 0x7FFFFFFFFL /* 35 bit */) { writeNumberHelper(stream, 5, 0xF0, num); } else if (num <= 0x3FFFFFFFFFFL /* 42 bit */) { writeNumberHelper(stream, 6, 0xF8, num); } else if (num <= 0x1FFFFFFFFFFFFL /* 49 bit */) { writeNumberHelper(stream, 7, 0xFC, num); } else { writeNumberHelper(stream, 8, 0xFE, num); } } private void writeNumberHelper(final OutputStream stream, final int len, final int mask, final long num) throws IOException { stream.write(mask | (int) (num >>> (len - 1) * 8)); for (int i = len - 2; i >= 0; --i) { stream.write((int) (num >>> i * 8 & 0xFF)); } } private Object readObject(final InputStream stream) throws IOException { // Read header: type and optional number used later for parsing int type = stream.read(); if (type < 0) { throw new EOFException(); } int num = 0; if ((type & 0xC0) != 0) { final int n = type & 0x3F; num = n > 0 ? n - 1 : (int) readNumber(stream); type = type & 0xC0; } else if ((type & 0x30) != 0) { final int n = type & 0x0F; num = n > 0 ? n - 1 : (int) readNumber(stream); type = type & 0x30; } // Read the remainder based on parsed type switch (type) { case TYPE_NULL: return null; case TYPE_LIST: final List<Object> list = Lists.newArrayListWithCapacity(num); for (int i = 0; i < num; ++i) { list.add(readObject(stream)); } return list; case TYPE_RECORD: final Record record = Record.create(); record.setID((URI) readObject(stream)); for (int i = 0; i < num; ++i) { final URI property = readCompressedURI(stream); final Object value = readObject(stream); record.set(property, value); } return record; case TYPE_BNODE: final String bnodeID = decodeString(readBytes(stream, num)); return this.factory.createBNode(bnodeID); case TYPE_URI_COMPRESSED: return readCompressedURI(stream); case TYPE_URI_PLAIN: final String uriString = decodeString(readBytes(stream, num)); return this.factory.createURI(uriString); case TYPE_LIT_STRING: final String plainLabel = decodeString(readBytes(stream, num)); return this.factory.createLiteral(plainLabel); case TYPE_LIT_STRING_LANG: final String lang = readCompressedURI(stream).getLocalName(); final String label = decodeString(readBytes(stream, num)); return this.factory.createLiteral(label, lang); case TYPE_LIT_TRUE: return this.factory.createLiteral(true); case TYPE_LIT_FALSE: return this.factory.createLiteral(false); case TYPE_LIT_LONG: final long longVal = readNumber(stream); return this.factory.createLiteral(longVal); case TYPE_LIT_INT: final int intVal = (int) readNumber(stream); return this.factory.createLiteral(intVal); case TYPE_LIT_SHORT: final short shortVal = (short) readNumber(stream); return this.factory.createLiteral(shortVal); case TYPE_LIT_BYTE: final byte byteVal = (byte) readNumber(stream); return this.factory.createLiteral(byteVal); case TYPE_LIT_DOUBLE: final byte[] doubleBytes = readBytes(stream, 8); final double doubleVal = Double.longBitsToDouble(Longs.fromByteArray(doubleBytes)); return this.factory.createLiteral(doubleVal); case TYPE_LIT_FLOAT: final byte[] floatBytes = readBytes(stream, 4); final float floatVal = Float.intBitsToFloat(Ints.fromByteArray(floatBytes)); return this.factory.createLiteral(floatVal); case TYPE_LIT_BIG_INTEGER: final int bigintLen = (int) readNumber(stream); final String bigintVal = new BigInteger(readBytes(stream, bigintLen)).toString(); return this.factory.createLiteral(bigintVal, XMLSchema.INTEGER); case TYPE_LIT_BIG_DECIMAL: final int bigdecLen = (int) readNumber(stream); final String bigdecVal = decodeString(readBytes(stream, bigdecLen)); return this.factory.createLiteral(bigdecVal, XMLSchema.DECIMAL); case TYPE_LIT_DATETIME: final int tz = (int) readNumber(stream); final long millis = readNumber(stream); final GregorianCalendar calendar = new GregorianCalendar(); calendar.setTimeInMillis(millis); calendar.setTimeZone(TimeZone.getTimeZone(String.format("GMT%s%02d:%02d", tz >= 0 ? "+" : "-", Math.abs(tz) / 60, Math.abs(tz) % 60))); return this.factory.createLiteral(Data.getDatatypeFactory().newXMLGregorianCalendar( calendar)); case TYPE_STATEMENT: final Resource subj = (Resource) readObject(stream); final URI pred = (URI) readObject(stream); final Value obj = (Value) readObject(stream); final Resource ctx = (Resource) readObject(stream); return ctx == null ? this.factory.createStatement(subj, pred, obj) : this.factory .createStatement(subj, pred, obj, ctx); default: throw new UnsupportedOperationException("Don't know how to deserialize type " + type); } } private byte[] readBytes(final InputStream stream, final int length) throws IOException { final byte[] bytes = new byte[length]; ByteStreams.readFully(stream, bytes); return bytes; } private URI readCompressedURI(final InputStream stream) throws IOException { if (this.dictionary != null) { final int key = (int) readNumber(stream); return this.dictionary.objectFor(key); } else { final int header = (int) readNumber(stream); if ((header & 0x1) == 0) { final String string = decodeString(readBytes(stream, header >> 1)); return this.factory.createURI(string); } else { final String string = decodeString(readBytes(stream, header >> 2)); return (header & 0x3) == 1 ? this.factory.createURI(LANG_NS, string) // : (URI) Data.parseValue(string, Data.getNamespaceMap()); } } } private long readNumber(final InputStream stream) throws IOException { final int b = stream.read(); if (b < 0) { throw new EOFException(); } if (b <= 0x00 + 0x7F) { return readNumberHelper(stream, 1, b & 0x7F); } else if (b <= 0x80 + 0x3F) { return readNumberHelper(stream, 2, b & 0x3F); } else if (b <= 0xC0 + 0x1F) { return readNumberHelper(stream, 3, b & 0x1F); } else if (b <= 0xE0 + 0x0F) { return readNumberHelper(stream, 4, b & 0x0F); } else if (b <= 0xF0 + 0x07) { return readNumberHelper(stream, 5, b & 0x07); } else if (b <= 0xF8 + 0x03) { return readNumberHelper(stream, 6, b & 0x03); } else if (b <= 0xFC + 0x01) { return readNumberHelper(stream, 7, b & 0x01); } else if (b <= 0xFE + 0x01) { return readNumberHelper(stream, 8, b & 0x00); } else { return readNumberHelper(stream, 9, b & 0x00); } } private long readNumberHelper(final InputStream stream, final int len, final int start) throws IOException { long num = start; for (int i = 1; i < len; ++i) { final int c = stream.read(); if (c < 0) { throw new EOFException(); } num = num << 8 | c & 0xFF; } return num; } private byte[] encodeString(final String string) { // return string.getBytes(Charsets.UTF_8); return Smaz.compress(string); } private String decodeString(final byte[] bytes) { // return new String(bytes, Charsets.UTF_8); return Smaz.decompress(bytes); } private static boolean isVocabTerm(final URI uri) { final String prefix = Data.namespaceToPrefix(uri.getNamespace(), Data.getNamespaceMap()); return prefix != null && !KB_PREFIXES.contains(prefix); } }