package edu.isi.karma.kr2rml.writer;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.SchemaBuilder.FieldAssembler;
import org.apache.avro.SchemaBuilder.RecordBuilder;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.isi.karma.kr2rml.PredicateObjectMap;
import edu.isi.karma.kr2rml.RefObjectMap;
import edu.isi.karma.kr2rml.mapping.R2RMLMappingIdentifier;
import edu.isi.karma.kr2rml.planning.TriplesMap;
import edu.isi.karma.kr2rml.planning.TriplesMapGraph;
import edu.isi.karma.modeling.Uris;
import edu.isi.karma.rep.RepFactory;
/**
 * KR2RML writer that represents generated subjects as Avro {@link GenericRecord}s
 * and appends the root records to an Avro data file written to the supplied
 * {@link OutputStream}.
 *
 * <p>One record schema is built per triples map (see
 * {@link #getSchemaForTriplesMap(TriplesMapGraph, String)}). Every schema field is a
 * nullable union so that a predicate may hold a single value, an array of values, or
 * (for template-based predicates) a map keyed by the concrete predicate name.
 */
public class AvroKR2RMLRDFWriter extends SFKR2RMLRDFWriter<GenericRecord> {

    private static final Logger LOG = LoggerFactory.getLogger(AvroKR2RMLRDFWriter.class);

    /** Every character that is not a word character is replaced when deriving field names. */
    private static final Pattern NON_WORD_CHARACTER = Pattern.compile("[^\\w]");

    /** Name of the field that {@link #getNewObject(String, String)} fills with the subject URI. */
    private static final String ID_FIELD = "id";

    /** Name of the type field that is appended to every generated record schema. */
    private static final String RDF_TYPE_FIELD = "rdf_type";

    protected Map<String, Schema> triplesMapIdToSchema = new HashMap<>();
    protected RepFactory rep;
    protected Schema rootSchema;
    private OutputStream output;
    private DatumWriter<GenericRecord> datumWriter;
    private DataFileWriter<GenericRecord> dfw;
    //TODO come up with a good naming convention for records
    private int id = 1;

    /**
     * Creates a writer that emits Avro data to {@code output}.
     *
     * @param output stream that receives the Avro data file; it is closed by {@link #close()}
     */
    public AvroKR2RMLRDFWriter(OutputStream output)
    {
        super(new PrintWriter(output));
        this.output = output;
    }

    /**
     * Sets the factory that is used to resolve predicate templates while building schemas.
     *
     * @param rep the factory passed to {@code getR2rmlTemplateString}
     */
    public void setRepFactory(RepFactory rep)
    {
        this.rep = rep;
    }

    /**
     * Builds one Avro schema per triples map, in the given order, and opens the Avro
     * data file using the schema of the first root triples map id.
     *
     * <p>Schemas are generated in iteration order, so a triples map that references
     * another one only gets a typed reference if the referenced map was processed first.
     *
     * @param triplesMapProcessingOrder triples map ids to process, grouped by graph
     * @throws IOException if the Avro data file header cannot be written
     * @throws IllegalStateException if no root triples map is known or no schema was
     *         generated for it
     */
    public void setProcessingOrder(Map<TriplesMapGraph, List<String>> triplesMapProcessingOrder) throws IOException
    {
        for (Entry<TriplesMapGraph, List<String>> entry : triplesMapProcessingOrder.entrySet())
        {
            TriplesMapGraph graph = entry.getKey();
            for (String triplesMapId : entry.getValue())
            {
                triplesMapIdToSchema.put(triplesMapId, getSchemaForTriplesMap(graph, triplesMapId));
            }
        }
        if (!this.rootTriplesMapIds.iterator().hasNext())
        {
            throw new IllegalStateException("No root triples map id is available; cannot choose a root Avro schema");
        }
        String rootTriplesMapId = this.rootTriplesMapIds.iterator().next();
        rootSchema = triplesMapIdToSchema.get(rootTriplesMapId);
        if (rootSchema == null)
        {
            throw new IllegalStateException("No Avro schema was generated for root triples map " + rootTriplesMapId);
        }
        datumWriter = new GenericDatumWriter<>(rootSchema);
        dfw = new DataFileWriter<>(datumWriter);
        dfw.create(rootSchema, output);
    }

    /**
     * Builds the Avro record schema for one triples map.
     *
     * <p>Each predicate-object map contributes one field: predicates without column
     * references become "value or array of values" fields named after the short-hand
     * predicate, predicates with column references become map fields named after the
     * predicate id. The fields {@code id} and {@code rdf_type} are always appended.
     *
     * @param graph graph that contains the triples map
     * @param triplesMapId id of the triples map to describe
     * @return the generated record schema
     */
    protected Schema getSchemaForTriplesMap(TriplesMapGraph graph, String triplesMapId)
    {
        TriplesMap map = graph.getTriplesMap(triplesMapId);
        RecordBuilder<Schema> rb = SchemaBuilder.record("subjr"+(id++));
        // Track the names that are really emitted. The two built-in fields are reserved
        // up front so that a predicate with the same name cannot abort schema creation.
        Set<String> usedFieldNames = new HashSet<>();
        usedFieldNames.add(ID_FIELD);
        usedFieldNames.add(RDF_TYPE_FIELD);
        FieldAssembler<Schema> fieldAssembler = rb.fields();
        for (PredicateObjectMap pom : map.getPredicateObjectMaps())
        {
            // A predicate whose template references columns is only known per row,
            // so it is modelled as a map keyed by the concrete predicate.
            boolean isMap = !pom.getPredicate().getTemplate().getAllColumnNameTermElements().isEmpty();
            String fieldName;
            if (isMap)
            {
                String predicateId = pom.getPredicate().getId();
                fieldName = predicateId == null ? null : toFieldName(predicateId);
            }
            else
            {
                String predicate = pom.getPredicate().getTemplate().getR2rmlTemplateString(rep);
                fieldName = toFieldName(shortHandURIGenerator.getShortHand(predicate).toString());
            }
            if (fieldName == null)
            {
                LOG.error("Unable to add field: predicate without id in triples map " + triplesMapId);
                continue;
            }

            Schema targetSchema = null;
            if (pom.getObject() != null && pom.getObject().hasRefObjectMap())
            {
                RefObjectMap refObjectMap = pom.getObject().getRefObjectMap();
                String parentId = refObjectMap.getParentTriplesMap().getId();
                // Self references keep the plain string representation.
                if (!parentId.equalsIgnoreCase(triplesMapId))
                {
                    targetSchema = triplesMapIdToSchema.get(parentId);
                }
            }

            if (usedFieldNames.add(fieldName))
            {
                fieldAssembler = addField(fieldAssembler, pom, isMap, targetSchema, fieldName);
            }
            else
            {
                //TODO handle conflicting types
                LOG.warn("Duplicate predicate detected in schema: " + fieldName);
            }
        }
        fieldAssembler = addStringOrStringArrayField(fieldAssembler, ID_FIELD);
        fieldAssembler = addStringOrStringArrayField(fieldAssembler, RDF_TYPE_FIELD);
        return fieldAssembler.endRecord();
    }

    /**
     * Adds the field for one predicate-object map to the schema under construction.
     * Failures are logged and leave the assembler as it was passed in.
     *
     * @param fieldAssembler assembler of the record being built
     * @param pom predicate-object map the field belongs to (used for error reporting)
     * @param isMap whether the field is a map keyed by concrete predicate
     * @param targetSchema schema of referenced records, or {@code null} for string values
     * @param fieldName Avro field name to create
     * @return the assembler to continue with
     */
    private FieldAssembler<Schema> addField(
            FieldAssembler<Schema> fieldAssembler, PredicateObjectMap pom,
            boolean isMap, Schema targetSchema, String fieldName) {
        try {
            if (isMap)
            {
                if (targetSchema == null)
                {
                    return fieldAssembler.name(fieldName).type().unionOf()
                            .map().values().unionOf()
                                .map().values().stringType().and()
                                .stringType().and()
                                .nullType().endUnion().and()
                            .nullType().endUnion().noDefault();
                }
                return fieldAssembler.name(fieldName).type().unionOf()
                        .map().values().unionOf()
                            .map().values(targetSchema).and()
                            .type(targetSchema).and()
                            .nullType().endUnion().and()
                        .nullType().endUnion().noDefault();
            }
            if (targetSchema == null)
            {
                return addStringOrStringArrayField(fieldAssembler, fieldName);
            }
            return fieldAssembler.name(fieldName).type().unionOf()
                    .array().items(targetSchema).and()
                    .type(targetSchema).and()
                    .nullType().endUnion().noDefault();
        }
        catch (Exception e)
        {
            LOG.error("Unable to add field: " + fieldName + " for " + pom.getTriplesMap().getSubject().getTemplate().toString(), e);
        }
        return fieldAssembler;
    }

    /** Adds a field of type {@code union { array<string>, string, null }} without default. */
    private static FieldAssembler<Schema> addStringOrStringArrayField(
            FieldAssembler<Schema> fieldAssembler, String fieldName) {
        return fieldAssembler.name(fieldName).type().unionOf()
                .array().items().stringType().and()
                .stringType().and()
                .nullType().endUnion().noDefault();
    }

    /** Replaces every non-word character with an underscore to derive a field name. */
    private static String toFieldName(String raw) {
        return NON_WORD_CHARACTER.matcher(raw).replaceAll("_");
    }

    /**
     * Tells whether a field was declared as a map field, i.e. is a map itself or a
     * union whose first branch is a map (the layout produced by {@code addField}).
     */
    private static boolean isMapField(Field candidate) {
        if (candidate == null)
        {
            return false;
        }
        Schema fieldSchema = candidate.schema();
        if (fieldSchema.getType() == Schema.Type.MAP)
        {
            return true;
        }
        return fieldSchema.getType() == Schema.Type.UNION
                && !fieldSchema.getTypes().isEmpty()
                && fieldSchema.getTypes().get(0).getType() == Schema.Type.MAP;
    }

    @Override
    protected void initializeOutput() {
        // Nothing to do: the Avro data file is opened in setProcessingOrder.
    }

    /**
     * Stores {@code object} on {@code subject} for the given predicate.
     *
     * <p>If the record has a field named after the short-hand predicate, the value is
     * stored there: as a plain value when the field is still empty, otherwise (or for
     * rdf:type predicates) through {@link #addValueToArray}. If there is no such field
     * but the record has a map field named after the predicate id, the value is stored
     * in that map. Values that match neither are ignored.
     */
    @Override
    protected void addValue(PredicateObjectMap pom, GenericRecord subject, String predicateUri,
            Object object) {
        String shortHandPredicateURI = toFieldName(shortHandURIGenerator.getShortHand(predicateUri).toString());
        Schema schema = subject.getSchema();
        Field field = schema.getField(shortHandPredicateURI);
        if (field != null)
        {
            // Only query the record once the field is known to exist.
            boolean alreadyHasValue = subject.get(shortHandPredicateURI) != null;
            if (alreadyHasValue || predicateUri.contains(Uris.RDF_TYPE_URI))
            {
                addValueToArray(pom, subject, object, shortHandPredicateURI);
            }
            else
            {
                subject.put(shortHandPredicateURI, object);
            }
            return;
        }
        Field mapField = schema.getField(toFieldName(pom.getPredicate().getId()));
        if (isMapField(mapField))
        {
            addValueToMap(pom, subject, object, shortHandPredicateURI);
        }
    }

    /**
     * Puts {@code object} into the map field named after the predicate id of
     * {@code pom}, keyed by {@code shortHandPredicateURI}. The map is created on first
     * use. Only {@link String} and {@link GenericRecord} values are stored; any other
     * value is ignored.
     */
    @SuppressWarnings("unchecked")
    protected void addValueToMap(PredicateObjectMap pom, GenericRecord subject, Object object,
            String shortHandPredicateURI)
    {
        if (!(object instanceof String) && !(object instanceof GenericRecord))
        {
            return;
        }
        String mapPredicateName = toFieldName(pom.getPredicate().getId());
        Map<String, Object> values = (Map<String, Object>) subject.get(mapPredicateName);
        if (values == null)
        {
            values = new ConcurrentHashMap<String, Object>();
            subject.put(mapPredicateName, values);
        }
        values.put(shortHandPredicateURI, object);
    }

    /**
     * Adds {@code object} to the multi-valued field {@code shortHandPredicateURI}.
     *
     * <p>An existing array is extended in place, an existing single value of the same
     * kind is turned into an array (new value first), and an empty field receives a
     * one-element array. If the existing value cannot be combined with the new one
     * (for example a string next to a record) the existing value is kept and a warning
     * is logged. Values that are neither {@link GenericRecord} nor {@link String} are
     * ignored.
     */
    @SuppressWarnings("unchecked")
    @Override
    protected void addValueToArray(PredicateObjectMap pom, GenericRecord subject, Object object,
            String shortHandPredicateURI) {
        Object currentObj = subject.get(shortHandPredicateURI);
        if (object instanceof GenericRecord)
        {
            GenericRecord recordToAdd = (GenericRecord) object;
            GenericArray<GenericRecord> records;
            if (currentObj == null)
            {
                // GenericData.Array requires an array schema, not the element schema.
                records = new GenericData.Array<>(Schema.createArray(recordToAdd.getSchema()), new LinkedList<GenericRecord>());
                records.add(recordToAdd);
            }
            else if (currentObj instanceof GenericArray)
            {
                records = (GenericArray<GenericRecord>) currentObj;
                records.add(recordToAdd);
            }
            else if (currentObj instanceof GenericRecord)
            {
                // The first branch of the field's union is the array schema.
                Schema arraySchema = subject.getSchema().getField(shortHandPredicateURI).schema().getTypes().get(0);
                records = new GenericData.Array<>(arraySchema, new LinkedList<GenericRecord>());
                records.add(recordToAdd);
                records.add((GenericRecord) currentObj);
            }
            else
            {
                warnIncompatibleValue(shortHandPredicateURI, currentObj, object);
                return;
            }
            subject.put(shortHandPredicateURI, records);
        }
        else if (object instanceof String)
        {
            String stringToAdd = (String) object;
            GenericArray<String> strings;
            if (currentObj == null)
            {
                strings = new GenericData.Array<>(SchemaBuilder.array().items().stringType(), new LinkedList<String>());
                strings.add(stringToAdd);
            }
            else if (currentObj instanceof GenericArray)
            {
                strings = (GenericArray<String>) currentObj;
                strings.add(stringToAdd);
            }
            else if (currentObj instanceof String)
            {
                strings = new GenericData.Array<>(SchemaBuilder.array().items().stringType(), new LinkedList<String>());
                strings.add(stringToAdd);
                strings.add((String) currentObj);
            }
            else
            {
                warnIncompatibleValue(shortHandPredicateURI, currentObj, object);
                return;
            }
            subject.put(shortHandPredicateURI, strings);
        }
    }

    /** Logs that a new value was dropped because it cannot share a field with the existing one. */
    private static void warnIncompatibleValue(String fieldName, Object existing, Object added) {
        LOG.warn("Cannot combine existing value of type " + existing.getClass().getName()
                + " with new value of type " + added.getClass().getName()
                + " for field " + fieldName + "; keeping the existing value");
    }

    /** Returns the literal unchanged; the language tag is not represented. */
    @Override
    protected Object generateLanguageLiteral(Object literal, String language) {
        return literal;
    }

    /**
     * Appends every root record of the current row to the Avro data file and then
     * clears the per-row object caches. A record that cannot be appended is logged
     * and skipped.
     */
    @Override
    public void finishRow() {
        if (dfw == null)
        {
            LOG.error("Unable to append Avro record to writer! The Avro data file has not been opened.");
        }
        else
        {
            for (Map<String, GenericRecord> records : this.rootObjectsByTriplesMapId.values())
            {
                for (GenericRecord record : records.values())
                {
                    try {
                        collapseSameType(record);
                        dfw.append(record);
                    } catch (Exception e) {
                        LOG.error("Unable to append Avro record to writer!", e);
                    }
                }
            }
        }
        for (Entry<String, ConcurrentHashMap<String, GenericRecord>> entry : this.rootObjectsByTriplesMapId.entrySet())
        {
            entry.getValue().clear();
        }
        for (Entry<String, ConcurrentHashMap<String, GenericRecord>> entry : this.generatedObjectsByTriplesMapId.entrySet())
        {
            entry.getValue().clear();
        }
        this.generatedObjectsWithoutTriplesMap.clear();
    }

    /**
     * Flushes pending Avro data and closes the output stream. I/O failures are logged.
     * The stream is closed even if the Avro writer was never opened or fails to close.
     */
    @Override
    public void close() {
        try {
            if (dfw != null)
            {
                // Writes the pending block and closes the underlying stream.
                dfw.close();
            }
            else
            {
                output.flush();
            }
        } catch (IOException e) {
            LOG.error("Unable to flush and close output!", e);
        } finally {
            try {
                output.close();
            } catch (IOException e) {
                LOG.error("Unable to flush and close output!", e);
            }
        }
    }

    /**
     * Recursively removes duplicate entries from every array-valued field of
     * {@code obj}, including nested records reachable through record or array fields.
     * The first occurrence of each element keeps its position.
     */
    @Override
    protected void collapseSameType(GenericRecord obj) {
        for (Field f : obj.getSchema().getFields()) {
            Object value = obj.get(f.name());
            if (value instanceof GenericRecord)
            {
                collapseSameType((GenericRecord) value);
            }
            else if (value instanceof GenericArray)
            {
                GenericArray<?> array = (GenericArray<?>) value;
                // LinkedHashSet keeps the order in which values were first seen.
                Set<Object> distinctValues = new LinkedHashSet<>();
                boolean hasDuplicates = false;
                for (Object element : array) {
                    if (element instanceof GenericRecord)
                    {
                        // Collapse first so that equal records compare equal below.
                        collapseSameType((GenericRecord) element);
                    }
                    if (!distinctValues.add(element))
                    {
                        hasDuplicates = true;
                    }
                }
                if (hasDuplicates)
                {
                    GenericArray<Object> newValues = new GenericData.Array<>(array.getSchema(), distinctValues);
                    obj.put(f.name(), newValues);
                }
            }
        }
    }

    /**
     * Creates an empty record using the schema registered for {@code triplesMapId}
     * and stores {@code subjUri} in its {@code id} field.
     */
    @Override
    public GenericRecord getNewObject(String triplesMapId, String subjUri) {
        GenericRecord record = new GenericData.Record(this.triplesMapIdToSchema.get(triplesMapId));
        record.put(ID_FIELD, subjUri);
        return record;
    }

    /** Returns the literal value unchanged; type and language are not represented. */
    @Override
    protected Object convertLiteral(String value, String literalType, String language) {
        return value;
    }

    @Override
    public void setR2RMLMappingIdentifier(
            R2RMLMappingIdentifier mappingIdentifer) {
        // The mapping identifier is not used by this writer.
    }
}