package eu.fbk.knowledgestore.runtime;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.GregorianCalendar;
import java.util.List;
import java.util.Set;
import java.util.TimeZone;
import javax.annotation.Nullable;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.XMLGregorianCalendar;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.EncoderFactory;
import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.XMLSchema;
import eu.fbk.knowledgestore.data.Data;
import eu.fbk.knowledgestore.data.Dictionary;
import eu.fbk.knowledgestore.data.Record;
// NOTE: supports only serialization and deserialization of Record, URI, BNode, Literal,
// Statement objects. For records, it is possible to specify which properties to serialize /
// deserialize.
// TODO: add ideas from smaz/jsmaz to dictionary-compress short strings / uris
// <https://github.com/icedrake/jsmaz> (30-50% string reduction achievable)
public final class SerializerAvro {
/** Dictionary mapping URIs to/from integer keys for compressed identifiers; may be null. */
private final Dictionary<URI> dictionary;
/** Factory used to create the URIs, BNodes, literals and statements produced by decoding. */
private final ValueFactory factory;
/** Factory used to build {@code XMLGregorianCalendar} objects when decoding calendar literals. */
private final DatatypeFactory datatypeFactory;
/**
 * Creates a serializer that is not backed by any URI dictionary.
 */
public SerializerAvro() {
    // The cast picks the Dictionary overload instead of the String (file name) one.
    this((Dictionary<URI>) null);
}
/**
 * Creates a serializer backed by the supplied URI dictionary.
 *
 * @param dictionary
 *            the dictionary used to compress and expand URIs, possibly null
 */
public SerializerAvro(@Nullable final Dictionary<URI> dictionary) {
    this.factory = Data.getValueFactory();
    this.datatypeFactory = Data.getDatatypeFactory();
    this.dictionary = dictionary;
}
/**
 * Creates a serializer backed by a Hadoop dictionary obtained for the given file name.
 *
 * @param fileName
 *            the name passed to {@code Dictionary.createHadoopDictionary}
 * @throws IOException
 *             if the dictionary cannot be created
 */
public SerializerAvro(final String fileName) throws IOException {
    this(Dictionary.createHadoopDictionary(URI.class, fileName));
}
/**
 * Returns the URI dictionary this serializer was configured with.
 *
 * @return the dictionary, or null if none was supplied at construction time
 */
public Dictionary<URI> getDictionary() {
    return dictionary;
}
/**
 * Serializes a URI using the compressed identifier schema, after asking the dictionary for a
 * key for it.
 *
 * @param uri
 *            the URI to compress, not null
 * @return the Avro binary encoding of the compressed identifier
 */
public byte[] compressURI(final URI uri) {
    Preconditions.checkNotNull(uri);
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    try {
        final Encoder binaryEncoder = EncoderFactory.get().directBinaryEncoder(buffer, null);
        final DatumWriter<Object> datumWriter = new GenericDatumWriter<Object>(
                Schemas.COMPRESSED_IDENTIFIER);
        // Ask the dictionary for a key first, so that a compressed form of the URI exists.
        this.dictionary.keyFor(uri);
        final Object encoded = encodeIdentifier(uri);
        datumWriter.write(encoded, binaryEncoder);
    } catch (final IOException ex) {
        throw new Error("Unexpected exception (!): " + ex.getMessage(), ex);
    }
    return buffer.toByteArray();
}
/**
 * Deserializes a URI from bytes written with the compressed identifier schema.
 *
 * @param bytes
 *            the Avro binary encoding of a compressed identifier, not null
 * @return the decoded URI
 */
public URI expandURI(final byte[] bytes) {
    Preconditions.checkNotNull(bytes);
    final Object decoded;
    try {
        final InputStream input = new ByteArrayInputStream(bytes);
        final Decoder binaryDecoder = DecoderFactory.get().directBinaryDecoder(input, null);
        final DatumReader<Object> datumReader = new GenericDatumReader<Object>(
                Schemas.COMPRESSED_IDENTIFIER);
        decoded = datumReader.read(null, binaryDecoder);
    } catch (final IOException ex) {
        throw new Error("Unexpected exception (!): " + ex.getMessage(), ex);
    }
    return (URI) decodeNode(decoded);
}
/**
 * Serializes a node to a byte array.
 *
 * @param object
 *            the node to serialize (Record, Literal, Resource or Statement)
 * @return the Avro binary encoding of the node
 */
public byte[] toBytes(final Object object) {
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    try {
        toStream(buffer, object);
    } catch (final IOException ex) {
        throw new Error("Unexpected exception (!): " + ex.getMessage(), ex);
    }
    return buffer.toByteArray();
}
/**
 * Serializes a record to a byte array, optionally restricting the properties written.
 *
 * @param object
 *            the record to serialize
 * @param propertiesToSerialize
 *            the properties to write, or null to write all of them
 * @return the Avro binary encoding of the record
 */
public byte[] toBytes(final Record object, @Nullable final Set<URI> propertiesToSerialize) {
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    try {
        toStream(buffer, object, propertiesToSerialize);
    } catch (final IOException ex) {
        throw new Error("Unexpected exception (!): " + ex.getMessage(), ex);
    }
    return buffer.toByteArray();
}
/**
 * Deserializes a node from a byte array.
 *
 * @param bytes
 *            the Avro binary encoding of a node
 * @return the decoded node
 */
public Object fromBytes(final byte[] bytes) {
    final InputStream input = new ByteArrayInputStream(bytes);
    try {
        return fromStream(input);
    } catch (final IOException ex) {
        throw new Error("Unexpected exception (!): " + ex.getMessage(), ex);
    }
}
/**
 * Deserializes a record from a byte array, optionally restricting the properties returned.
 *
 * @param bytes
 *            the Avro binary encoding of a record
 * @param propertiesToDeserialize
 *            the properties to keep, or null to keep all of them
 * @return the decoded record
 */
public Record fromBytes(final byte[] bytes, final @Nullable Set<URI> propertiesToDeserialize) {
    final InputStream input = new ByteArrayInputStream(bytes);
    try {
        return fromStream(input, propertiesToDeserialize);
    } catch (final IOException ex) {
        throw new Error("Unexpected exception (!): " + ex.getMessage(), ex);
    }
}
/**
 * Writes a node to a stream using the generic node schema, then flushes the encoder.
 *
 * @param stream
 *            the stream to write to
 * @param object
 *            the node to serialize (Record, Literal, Resource or Statement)
 * @throws IOException
 *             on write failure
 */
public void toStream(final OutputStream stream, final Object object) throws IOException {
    final Object encoded = encodeNode(object);
    final Encoder binaryEncoder = EncoderFactory.get().directBinaryEncoder(stream, null);
    new GenericDatumWriter<Object>(Schemas.NODE).write(encoded, binaryEncoder);
    binaryEncoder.flush();
}
/**
 * Writes a record to a stream using the generic node schema, optionally restricting the
 * properties written, then flushes the encoder.
 *
 * @param stream
 *            the stream to write to
 * @param object
 *            the record to serialize
 * @param propertiesToSerialize
 *            the properties to write, or null to write all of them
 * @throws IOException
 *             on write failure
 */
public void toStream(final OutputStream stream, final Record object,
        @Nullable final Set<URI> propertiesToSerialize) throws IOException {
    final Object encoded = encodeRecord(object, propertiesToSerialize);
    final Encoder binaryEncoder = EncoderFactory.get().directBinaryEncoder(stream, null);
    new GenericDatumWriter<Object>(Schemas.NODE).write(encoded, binaryEncoder);
    binaryEncoder.flush();
}
/**
 * Reads one node from a stream using the generic node schema.
 *
 * @param stream
 *            the stream to read from
 * @return the decoded node
 * @throws IOException
 *             on read failure
 */
public Object fromStream(final InputStream stream) throws IOException {
    final Decoder binaryDecoder = DecoderFactory.get().directBinaryDecoder(stream, null);
    final DatumReader<Object> datumReader = new GenericDatumReader<Object>(Schemas.NODE);
    return decodeNode(datumReader.read(null, binaryDecoder));
}
/**
 * Reads one record from a stream using the generic node schema, optionally restricting the
 * properties returned.
 *
 * @param stream
 *            the stream to read from
 * @param propertiesToDeserialize
 *            the properties to keep, or null to keep all of them
 * @return the decoded record
 * @throws IOException
 *             on read failure
 */
public Record fromStream(final InputStream stream,
        @Nullable final Set<URI> propertiesToDeserialize) throws IOException {
    final Decoder binaryDecoder = DecoderFactory.get().directBinaryDecoder(stream, null);
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(
            Schemas.NODE);
    final GenericRecord encoded = datumReader.read(null, binaryDecoder);
    return decodeRecord(encoded, propertiesToDeserialize);
}
/**
 * Decodes a property value that is either a single encoded node or an iterable of them.
 *
 * @param generic
 *            the encoded value, not null
 * @return the decoded nodes: a one-element immutable list for a single node, otherwise a list
 *         with one entry per element of the iterable
 */
private List<Object> decodeNodes(final Object generic) {
    if (!(generic instanceof Iterable<?>)) {
        Preconditions.checkNotNull(generic);
        return ImmutableList.of(decodeNode(generic));
    }
    final Iterable<?> elements = (Iterable<?>) generic;
    final List<Object> decoded = Lists.<Object>newArrayListWithCapacity(Iterables
            .size(elements));
    for (final Object encoded : elements) {
        decoded.add(decodeNode(encoded));
    }
    return decoded;
}
/**
 * Decodes a single encoded node, dispatching on the schema of Avro records.
 *
 * @param generic
 *            the encoded node
 * @return a Record, Resource or Statement for the matching record schemas; anything else is
 *         handed to {@code decodeLiteral}
 */
private Object decodeNode(final Object generic) {
    if (!(generic instanceof GenericRecord)) {
        return decodeLiteral(generic);
    }
    final GenericRecord encoded = (GenericRecord) generic;
    final Schema schema = encoded.getSchema();
    if (schema.equals(Schemas.RECORD)) {
        return decodeRecord(encoded, null);
    }
    if (schema.equals(Schemas.PLAIN_IDENTIFIER)
            || schema.equals(Schemas.COMPRESSED_IDENTIFIER)) {
        return decodeIdentifier(encoded);
    }
    if (schema.equals(Schemas.STATEMENT)) {
        return decodeStatement(encoded);
    }
    // Remaining record schemas are literal wrappers (or unsupported).
    return decodeLiteral(generic);
}
/**
 * Decodes an encoded record, keeping only the requested properties.
 *
 * Values of properties excluded by {@code propertiesToDecode} are not decoded at all, so a
 * filtered read neither pays for them nor fails on them.
 *
 * @param generic
 *            the encoded record: field 0 is the optional ID, field 1 the encoded properties
 * @param propertiesToDecode
 *            the properties to keep, or null to keep all of them
 * @return the decoded record
 */
@SuppressWarnings("unchecked")
private Record decodeRecord(final GenericRecord generic,
        @Nullable final Set<URI> propertiesToDecode) {
    final Record record = Record.create();
    final GenericRecord encodedID = (GenericRecord) generic.get(0);
    if (encodedID != null) {
        record.setID((URI) decodeIdentifier(encodedID));
    }
    for (final GenericRecord prop : (Iterable<GenericRecord>) generic.get(1)) {
        final URI property = (URI) decodeIdentifier((GenericRecord) prop.get(0));
        if (propertiesToDecode != null && !propertiesToDecode.contains(property)) {
            continue; // filtered out: skip decoding its values
        }
        final List<Object> values = decodeNodes(prop.get(1));
        record.set(property, values);
    }
    return record;
}
/**
 * Decodes an encoded RDF value: an identifier when the input is an Avro record with one of
 * the identifier schemas, otherwise a literal.
 *
 * @param generic
 *            the encoded value
 * @return the decoded value
 */
private Value decodeValue(final Object generic) {
    if (!(generic instanceof GenericRecord)) {
        return decodeLiteral(generic);
    }
    final GenericRecord encoded = (GenericRecord) generic;
    final Schema schema = encoded.getSchema();
    final boolean isIdentifier = schema.equals(Schemas.COMPRESSED_IDENTIFIER)
            || schema.equals(Schemas.PLAIN_IDENTIFIER);
    return isIdentifier ? decodeIdentifier(encoded) : decodeLiteral(generic);
}
/**
 * Decodes an encoded identifier into a URI or BNode.
 *
 * Compressed identifiers are resolved through the dictionary. Plain identifiers starting with
 * {@code _:} become BNodes, all others become URIs.
 *
 * @param record
 *            the encoded identifier, with the plain or compressed identifier schema
 * @return the decoded resource
 * @throws IllegalStateException
 *             if a compressed identifier is met but no dictionary is configured, or if the
 *             dictionary cannot be accessed
 * @throws IllegalArgumentException
 *             if the record has any other schema
 */
private Resource decodeIdentifier(final GenericRecord record) {
    final Schema schema = record.getSchema();
    if (schema.equals(Schemas.COMPRESSED_IDENTIFIER)) {
        // The dictionary is optional at construction time: fail with a clear message
        // instead of a bare NullPointerException when it is missing.
        Preconditions.checkState(this.dictionary != null,
                "Cannot decode compressed identifier: no dictionary configured");
        try {
            return this.dictionary.objectFor((Integer) record.get(0));
        } catch (final IOException ex) {
            throw new IllegalStateException("Cannot access dictionary: " + ex.getMessage(), ex);
        }
    } else if (schema.equals(Schemas.PLAIN_IDENTIFIER)) {
        final String string = record.get(0).toString();
        if (string.startsWith("_:")) {
            return this.factory.createBNode(string.substring(2));
        } else {
            return this.factory.createURI(string);
        }
    }
    throw new IllegalArgumentException("Unsupported encoded identifier: " + record);
}
/**
 * Decodes an encoded literal.
 *
 * Avro records are interpreted by schema (language string, short, byte, big integer, big
 * decimal, calendar). Bare values are mapped by Java type (character sequence, boolean, long,
 * int, double, float).
 *
 * Calendar records carry a timezone offset in minutes and a timestamp in milliseconds. An
 * offset equal to {@code DatatypeConstants.FIELD_UNDEFINED} means the original literal had no
 * timezone; the decoded literal then has none either.
 *
 * @param generic
 *            the encoded literal, not null
 * @return the decoded literal
 * @throws IllegalArgumentException
 *             if the encoded data is of an unsupported type or schema
 */
private Literal decodeLiteral(final Object generic) {
    if (generic instanceof GenericRecord) {
        final GenericRecord record = (GenericRecord) generic;
        final Schema schema = record.getSchema();
        if (schema.equals(Schemas.STRING_LANG)) {
            final String label = record.get(0).toString(); // Utf8 class used
            final Object language = record.get(1);
            return this.factory.createLiteral(label, language.toString());
        } else if (schema.equals(Schemas.SHORT)) {
            return this.factory.createLiteral(((Integer) record.get(0)).shortValue());
        } else if (schema.equals(Schemas.BYTE)) {
            return this.factory.createLiteral(((Integer) record.get(0)).byteValue());
        } else if (schema.equals(Schemas.BIGINTEGER)) {
            return this.factory.createLiteral(record.get(0).toString(), XMLSchema.INTEGER);
        } else if (schema.equals(Schemas.BIGDECIMAL)) {
            return this.factory.createLiteral(record.get(0).toString(), XMLSchema.DECIMAL);
        } else if (schema.equals(Schemas.CALENDAR)) {
            final int tz = (Integer) record.get(0);
            final GregorianCalendar calendar = new GregorianCalendar();
            calendar.setTimeInMillis((Long) record.get(1));
            if (tz == javax.xml.datatype.DatatypeConstants.FIELD_UNDEFINED) {
                // No timezone was stored: keep the calendar in the default zone (the one
                // XMLGregorianCalendar.toGregorianCalendar() falls back to when encoding)
                // and strip the zone from the result rather than inventing one.
                final XMLGregorianCalendar zoneless = this.datatypeFactory
                        .newXMLGregorianCalendar(calendar);
                zoneless.setTimezone(javax.xml.datatype.DatatypeConstants.FIELD_UNDEFINED);
                return this.factory.createLiteral(zoneless);
            }
            // Locale.ROOT keeps the digits ASCII, so the custom "GMT+hh:mm" id stays
            // parseable whatever the default locale is.
            calendar.setTimeZone(TimeZone.getTimeZone(String.format(java.util.Locale.ROOT,
                    "GMT%s%02d:%02d", tz >= 0 ? "+" : "-", Math.abs(tz) / 60,
                    Math.abs(tz) % 60)));
            return this.factory.createLiteral(this.datatypeFactory
                    .newXMLGregorianCalendar(calendar));
        }
    } else if (generic instanceof CharSequence) {
        return this.factory.createLiteral(generic.toString()); // Utf8 class used
    } else if (generic instanceof Boolean) {
        return this.factory.createLiteral((Boolean) generic);
    } else if (generic instanceof Long) {
        return this.factory.createLiteral((Long) generic);
    } else if (generic instanceof Integer) {
        return this.factory.createLiteral((Integer) generic);
    } else if (generic instanceof Double) {
        return this.factory.createLiteral((Double) generic);
    } else if (generic instanceof Float) {
        return this.factory.createLiteral((Float) generic);
    }
    Preconditions.checkNotNull(generic);
    throw new IllegalArgumentException("Unsupported generic data: " + generic);
}
/**
 * Decodes an encoded statement (fields: subject, predicate, object, context).
 *
 * A null context field yields a statement without context. The null check is done on the raw
 * field because {@code decodeIdentifier} dereferences its argument.
 *
 * @param record
 *            the encoded statement
 * @return the decoded statement
 */
private Statement decodeStatement(final GenericRecord record) {
    final Resource subj = decodeIdentifier((GenericRecord) record.get(0));
    final URI pred = (URI) decodeIdentifier((GenericRecord) record.get(1));
    final Value obj = decodeValue(record.get(2));
    final Object encodedCtx = record.get(3);
    if (encodedCtx == null) {
        return this.factory.createStatement(subj, pred, obj);
    }
    final Resource ctx = decodeIdentifier((GenericRecord) encodedCtx);
    return this.factory.createStatement(subj, pred, obj, ctx);
}
/**
 * Encodes the values of a property.
 *
 * @param nodes
 *            the nodes to encode
 * @return the encoded node itself when there is exactly one, otherwise a list of encoded
 *         nodes (empty when there are none)
 */
private Object encodeNodes(final Iterable<? extends Object> nodes) {
    final int count = Iterables.size(nodes);
    if (count != 1) {
        final List<Object> encoded = Lists.<Object>newArrayListWithCapacity(count);
        for (final Object node : nodes) {
            encoded.add(encodeNode(node));
        }
        return encoded;
    }
    // A single value is stored bare, without the list wrapper.
    return encodeNode(Iterables.get(nodes, 0));
}
/**
 * Encodes a single node, dispatching on its type (Record, Literal, Resource, Statement, in
 * this order).
 *
 * @param node
 *            the node to encode, not null
 * @return the encoded node
 * @throws IllegalArgumentException
 *             if the node is of none of the supported types
 */
private Object encodeNode(final Object node) {
    Preconditions.checkNotNull(node);
    if (node instanceof Record) {
        return encodeRecord((Record) node, null);
    }
    if (node instanceof Literal) {
        return encodeLiteral((Literal) node);
    }
    if (node instanceof Resource) {
        return encodeIdentifier((Resource) node);
    }
    if (node instanceof Statement) {
        return encodeStatement((Statement) node);
    }
    throw new IllegalArgumentException("Unsupported node: " + node);
}
/**
 * Encodes a record, optionally restricting the properties written.
 *
 * Every encoded property URI is registered in the dictionary first, as are the URI values of
 * {@code rdf:type}.
 *
 * @param record
 *            the record to encode
 * @param propertiesToEncode
 *            the properties to write, or null to write all of them
 * @return the encoded record
 */
private Object encodeRecord(final Record record, @Nullable final Set<URI> propertiesToEncode) {
    Object encodedID = null;
    final URI id = record.getID();
    if (id != null) {
        encodedID = encodeIdentifier(id);
    }
    final List<Object> encodedProps = Lists.newArrayList();
    for (final URI property : record.getProperties()) {
        if (propertiesToEncode != null && !propertiesToEncode.contains(property)) {
            continue;
        }
        ensureInDictionary(property);
        final List<? extends Object> values = record.get(property);
        if (property.equals(RDF.TYPE)) {
            // Type URIs are registered as well, so they too get a compressed form.
            for (final Object value : values) {
                if (value instanceof URI) {
                    ensureInDictionary((URI) value);
                }
            }
        }
        final GenericData.Record encodedProp = new GenericData.Record(Schemas.PROPERTY);
        encodedProp.put("propertyURI", encodeIdentifier(property));
        encodedProp.put("propertyValue", encodeNodes(values));
        encodedProps.add(encodedProp);
    }
    return SerializerAvro.newGenericRecord(Schemas.RECORD, encodedID, encodedProps);
}
/**
 * Encodes an RDF value, which must be a literal or a resource.
 *
 * @param value
 *            the value to encode
 * @return the encoded value
 * @throws IllegalArgumentException
 *             if the value is neither a Literal nor a Resource (null included)
 */
private Object encodeValue(final Value value) {
    if (value instanceof Literal) {
        return encodeLiteral((Literal) value);
    }
    if (value instanceof Resource) {
        return encodeIdentifier((Resource) value);
    }
    throw new IllegalArgumentException("Unsupported value: " + value);
}
/**
 * Encodes a URI or BNode identifier.
 *
 * A URI is written in compressed form when a dictionary is configured and already holds a key
 * for it. In every other case the identifier is written in plain form, with BNode IDs prefixed
 * by {@code _:}. Without a dictionary, URIs are therefore always written in plain form.
 *
 * @param identifier
 *            the identifier to encode, not null
 * @return the encoded identifier, with the compressed or plain identifier schema
 * @throws IllegalStateException
 *             if the dictionary cannot be accessed
 */
private Object encodeIdentifier(final Resource identifier) {
    Preconditions.checkNotNull(identifier);
    // The dictionary is optional (see the @Nullable constructor): skip the lookup without it.
    if (identifier instanceof URI && this.dictionary != null) {
        try {
            final Integer key = this.dictionary.keyFor((URI) identifier, false);
            if (key != null) {
                return SerializerAvro.newGenericRecord(Schemas.COMPRESSED_IDENTIFIER, key);
            }
        } catch (final IOException ex) {
            throw new IllegalStateException("Cannot access dictionary: " + ex.getMessage(), ex);
        }
    }
    final String id = identifier instanceof BNode ? "_:" + ((BNode) identifier).getID()
            : identifier.stringValue();
    return SerializerAvro.newGenericRecord(Schemas.PLAIN_IDENTIFIER, id);
}
/**
 * Encodes a literal.
 *
 * Language-tagged literals are recognised by their language tag, whatever datatype they
 * report, and are written with the language string schema. Plain and {@code xsd:string}
 * literals are written as bare strings. Boolean, long, int, double and float literals are
 * written as the corresponding bare values. Short, byte, integer, decimal and dateTime
 * literals are written with their dedicated record schemas.
 *
 * @param literal
 *            the literal to encode
 * @return the encoded literal
 * @throws IllegalArgumentException
 *             if the literal has an unsupported datatype
 */
private Object encodeLiteral(final Literal literal) {
    // Check the language first: a tagged literal may report a datatype other than
    // null / xsd:string and must not be rejected for that.
    final String language = literal.getLanguage();
    if (language != null) {
        return SerializerAvro.newGenericRecord(Schemas.STRING_LANG, literal.getLabel(),
                language);
    }
    final URI datatype = literal.getDatatype();
    if (datatype == null || datatype.equals(XMLSchema.STRING)) {
        return literal.getLabel();
    } else if (datatype.equals(XMLSchema.BOOLEAN)) {
        return literal.booleanValue();
    } else if (datatype.equals(XMLSchema.LONG)) {
        return literal.longValue();
    } else if (datatype.equals(XMLSchema.INT)) {
        return literal.intValue();
    } else if (datatype.equals(XMLSchema.DOUBLE)) {
        return literal.doubleValue();
    } else if (datatype.equals(XMLSchema.FLOAT)) {
        return literal.floatValue();
    } else if (datatype.equals(XMLSchema.SHORT)) {
        return SerializerAvro.newGenericRecord(Schemas.SHORT, literal.intValue());
    } else if (datatype.equals(XMLSchema.BYTE)) {
        return SerializerAvro.newGenericRecord(Schemas.BYTE, literal.intValue());
    } else if (datatype.equals(XMLSchema.INTEGER)) {
        return SerializerAvro.newGenericRecord(Schemas.BIGINTEGER, literal.stringValue());
    } else if (datatype.equals(XMLSchema.DECIMAL)) {
        return SerializerAvro.newGenericRecord(Schemas.BIGDECIMAL, literal.stringValue());
    } else if (datatype.equals(XMLSchema.DATETIME)) {
        // Stored as (timezone offset in minutes, epoch milliseconds).
        final XMLGregorianCalendar calendar = literal.calendarValue();
        return SerializerAvro.newGenericRecord(Schemas.CALENDAR, calendar.getTimezone(),
                calendar.toGregorianCalendar().getTimeInMillis());
    }
    throw new IllegalArgumentException("Unsupported literal: " + literal);
}
/**
 * Encodes a statement as a record holding its subject, predicate, object and context, in
 * this order.
 *
 * The context is passed to {@code encodeIdentifier} as is, so it is expected to be non-null.
 *
 * @param statement
 *            the statement to encode
 * @return the encoded statement
 */
private Object encodeStatement(final Statement statement) {
    final Object subject = encodeIdentifier(statement.getSubject());
    final Object predicate = encodeIdentifier(statement.getPredicate());
    final Object object = encodeValue(statement.getObject());
    final Object context = encodeIdentifier(statement.getContext());
    return SerializerAvro.newGenericRecord(Schemas.STATEMENT, subject, predicate, object,
            context);
}
/**
 * Asks the dictionary for a key for the given URI, so that a compressed form is available
 * afterwards.
 *
 * @param uri
 *            the URI to register
 * @return the same URI, for call chaining
 * @throws IllegalStateException
 *             if the dictionary cannot be accessed
 */
private URI ensureInDictionary(final URI uri) {
    try {
        this.dictionary.keyFor(uri);
    } catch (final IOException ex) {
        throw new IllegalStateException("Cannot access dictionary: " + ex.getMessage(), ex);
    }
    return uri;
}
/**
 * Creates an Avro record with the given schema and assigns the supplied values to its fields
 * by position.
 *
 * @param schema
 *            the record schema
 * @param fieldValues
 *            the field values, in field order
 * @return the populated record
 */
private static GenericData.Record newGenericRecord(final Schema schema,
        final Object... fieldValues) {
    final GenericData.Record result = new GenericData.Record(schema);
    int position = 0;
    for (final Object fieldValue : fieldValues) {
        result.put(position++, fieldValue);
    }
    return result;
}
/**
 * Holder of the Avro schemas used by the serializer.
 *
 * Record schemas are first created without fields; the static initializer at the bottom then
 * attaches the fields. This lets schemas refer to each other (a PROPERTY value may be a
 * RECORD, and a RECORD holds an array of PROPERTY).
 *
 * NOTE(review): Avro's binary encoding presumably identifies a union branch by its position,
 * so reordering the members of the unions below would change the serialized format -- confirm
 * before editing.
 */
private static final class Schemas {
/** The namespace for KS-specific AVRO schemas. */
public static final String NAMESPACE = "eu.fbk.knowledgestore";
/** AVRO schema for NULL. */
public static final Schema NULL = Schema.create(Schema.Type.NULL);
/** AVRO schema for boolean literals. */
public static final Schema BOOLEAN = Schema.create(Schema.Type.BOOLEAN);
/** AVRO schema for string literals. */
public static final Schema STRING = Schema.create(Schema.Type.STRING);
/** AVRO schema for string literals with a language. */
public static final Schema STRING_LANG = Schema.createRecord("stringlang", null,
Schemas.NAMESPACE, false);
/** AVRO schema for long literals. */
public static final Schema LONG = Schema.create(Schema.Type.LONG);
/** AVRO schema for int literals. */
public static final Schema INT = Schema.create(Schema.Type.INT);
/** AVRO schema for short literals. */
public static final Schema SHORT = Schema.createRecord("short", null, Schemas.NAMESPACE,
false);
/** AVRO schema for byte literals. */
public static final Schema BYTE = Schema.createRecord("byte", null, Schemas.NAMESPACE,
false);
/** AVRO schema for double literals. */
public static final Schema DOUBLE = Schema.create(Schema.Type.DOUBLE);
/** AVRO schema for float literals. */
public static final Schema FLOAT = Schema.create(Schema.Type.FLOAT);
/** AVRO schema for big integer literals. */
public static final Schema BIGINTEGER = Schema.createRecord("biginteger", null,
Schemas.NAMESPACE, false);
/** AVRO schema for big decimal literals. */
public static final Schema BIGDECIMAL = Schema.createRecord("bigdecimal", null,
Schemas.NAMESPACE, false);
/** AVRO schema for non-compressed IDs (URIs, BNodes). */
public static final Schema PLAIN_IDENTIFIER = Schema //
.createRecord("plainidentifier", null, Schemas.NAMESPACE, false);
/** AVRO schema for compressed ID (URIs, BNodes). */
public static final Schema COMPRESSED_IDENTIFIER = Schema //
.createRecord("compressedidentifier", null, Schemas.NAMESPACE, false);
/** AVRO schema for any ID (URIs, BNodes). */
// This union has no NULL member, so fields of this type cannot hold a null identifier.
public static final Schema IDENTIFIER = Schema.createUnion(ImmutableList.<Schema>of(
PLAIN_IDENTIFIER, COMPRESSED_IDENTIFIER));
/** AVRO schema for calendar literals. */
public static final Schema CALENDAR = Schema.createRecord("calendar", null,
Schemas.NAMESPACE, false);
/** AVRO schema for RDF statements. */
public static final Schema STATEMENT = Schema.createRecord("statement", null,
Schemas.NAMESPACE, false);
/** AVRO schema for record nodes ({@code Record}). */
public static final Schema RECORD = Schema.createRecord("struct", null, Schemas.NAMESPACE,
false);
/** AVRO schema for generic data model nodes. */
public static final Schema NODE = Schema.createUnion(ImmutableList.<Schema>of(
Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG, Schemas.INT,
Schemas.SHORT, Schemas.BYTE, Schemas.DOUBLE, Schemas.FLOAT, Schemas.BIGINTEGER,
Schemas.BIGDECIMAL, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER,
Schemas.CALENDAR, Schemas.STATEMENT, Schemas.RECORD));
/** AVRO schema for lists of nodes. */
public static final Schema LIST = Schema.createArray(Schemas.NODE);
/** AVRO schema for properties of a record node. */
public static final Schema PROPERTY = Schema.createRecord("property", null,
Schemas.NAMESPACE, false);
// Not instantiable: this class only holds constants.
private Schemas() {
}
// Attach the fields to the record schemas created (empty) above.
static {
// stringlang: (label, language), both strings
Schemas.STRING_LANG.setFields(ImmutableList.<Schema.Field>of(new Schema.Field("label",
Schemas.STRING, null, null), new Schema.Field("language", Schemas.STRING,
null, null)));
// short and byte literals are wrapped ints
Schemas.SHORT.setFields(ImmutableList.<Schema.Field>of(new Schema.Field("short",
Schemas.INT, null, null)));
Schemas.BYTE.setFields(ImmutableList.<Schema.Field>of(new Schema.Field("byte",
Schemas.INT, null, null)));
// big integer and big decimal literals are wrapped strings
Schemas.BIGINTEGER.setFields(ImmutableList.<Schema.Field>of(new Schema.Field(
"biginteger", Schemas.STRING, null, null)));
Schemas.BIGDECIMAL.setFields(ImmutableList.<Schema.Field>of(new Schema.Field(
"bigdecimal", Schemas.STRING, null, null)));
// plain identifiers carry a string, compressed identifiers an int
Schemas.PLAIN_IDENTIFIER.setFields(ImmutableList.<Schema.Field>of(new Schema.Field(
"identifier", Schemas.STRING, null, null)));
Schemas.COMPRESSED_IDENTIFIER.setFields(ImmutableList
.<Schema.Field>of(new Schema.Field("identifier", Schemas.INT, null, null)));
// calendar: (timezone as int, timestamp as long)
Schemas.CALENDAR.setFields(ImmutableList.<Schema.Field>of(new Schema.Field("timezone",
Schemas.INT, null, null), new Schema.Field("timestamp", Schemas.LONG, null,
null)));
// statement: subject, predicate and context are identifiers; object is a literal or an
// identifier
Schemas.STATEMENT.setFields(ImmutableList.<Schema.Field>of(
new Schema.Field("subject", Schemas.IDENTIFIER, null, null),
new Schema.Field("predicate", Schemas.IDENTIFIER, null, null),
new Schema.Field("object", Schema.createUnion(ImmutableList.<Schema>of(
Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG,
Schemas.INT, Schemas.SHORT, Schemas.BYTE, Schemas.DOUBLE,
Schemas.FLOAT, Schemas.BIGINTEGER, Schemas.BIGDECIMAL,
Schemas.CALENDAR, Schemas.PLAIN_IDENTIFIER,
Schemas.COMPRESSED_IDENTIFIER)), null, null), //
new Schema.Field("context", Schemas.IDENTIFIER, null, null)));
// property: the URI must be a compressed identifier; the value is a single node of any
// kind or a list of nodes
Schemas.PROPERTY.setFields(ImmutableList.<Schema.Field>of(
new Schema.Field("propertyURI", Schemas.COMPRESSED_IDENTIFIER, null, null),
new Schema.Field("propertyValue", Schema.createUnion(ImmutableList.<Schema>of(
Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG,
Schemas.INT, Schemas.SHORT, Schemas.BYTE, Schemas.DOUBLE,
Schemas.FLOAT, Schemas.BIGINTEGER, Schemas.BIGDECIMAL,
Schemas.CALENDAR, Schemas.PLAIN_IDENTIFIER,
Schemas.COMPRESSED_IDENTIFIER, Schemas.STATEMENT, Schemas.RECORD,
Schemas.LIST)), null, null)));
// record: an optional (nullable) id followed by the array of properties
Schemas.RECORD.setFields(ImmutableList.<Schema.Field>of(
new Schema.Field("id",
Schema.createUnion(ImmutableList.<Schema>of(Schemas.NULL,
Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER)),
null, null), //
new Schema.Field("properties", Schema.createArray(Schemas.PROPERTY), null,
null)));
}
}
}