package com.linkedin.data.schema;
import com.linkedin.data.ByteString;
import com.linkedin.data.DataList;
import com.linkedin.data.DataMap;
import com.linkedin.data.codec.JacksonDataCodec;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
/**
* Encodes {@link DataSchema} types to Pegasus data language (.pdl) source code.
*/
public class SchemaToPdlEncoder extends AbstractSchemaEncoder
{
/**
 * Reserved words of the .pdl language. Identifier parts equal to one of these are wrapped in '`' by
 * escapeIdentifier. The set is unmodifiable so the shared static state cannot be altered by accident.
 */
private static final Set<String> KEYWORDS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
    "array", "enum", "fixed", "import", "includes", "map", "namespace", "package",
    "record", "typeref", "union", "null", "true", "false"
)));

/** Codec used to serialize DataMap and DataList values to JSON text. */
private static final JacksonDataCodec CODEC = new JacksonDataCodec();

/** Destination that all generated .pdl source is written to. */
private final Writer _out;

/** Types that may be referenced by simple name, keyed by that simple name; assigned by encode(). */
private Map<String, Name> _importsByLocalName;

/** Current nesting depth; indent() writes one indentation unit per level. */
private int _indentDepth = 0;

/** Namespace of the top level schema being encoded, or null if no namespace has been set. */
private String _namespace = null;
/**
 * Creates an encoder that emits .pdl source code.
 *
 * @param out the writer that receives the encoded .pdl source.
 */
public SchemaToPdlEncoder(Writer out)
{
  this._out = out;
}
/**
 * Write the provided schema as the top level type in a .pdl file.
 *
 * For a named schema the namespace and package declarations are written first, then the import statements
 * (sorted by fully qualified name), then the schema itself. Any other schema is written without a preamble.
 *
 * @param schema provides the schema to encode to .pdl and emit to this instance's writer.
 * @throws IOException if a writer IO exception occurs.
 */
@Override
public void encode(DataSchema schema) throws IOException
{
  NamedDataSchema namedSchema = (schema instanceof NamedDataSchema) ? (NamedDataSchema) schema : null;
  boolean hasNamespace = namedSchema != null && StringUtils.isNotBlank(namedSchema.getNamespace());
  boolean hasPackage = namedSchema != null && StringUtils.isNotBlank(namedSchema.getPackage());

  // The namespace must be known BEFORE imports are computed: computeImports() compares the namespace of
  // every referenced type against it in order to prefer types from the current namespace. Assigning it
  // unconditionally also prevents a namespace from an earlier encode() call leaking into this one.
  _namespace = hasNamespace ? namedSchema.getNamespace() : null;

  if (_typeReferenceFormat != TypeReferenceFormat.DENORMALIZE)
  {
    _importsByLocalName = computeImports(schema);
  }
  else
  {
    _importsByLocalName = Collections.emptyMap();
  }

  if (namedSchema != null)
  {
    if (hasNamespace || hasPackage)
    {
      if (hasNamespace)
      {
        writeLine("namespace " + escapeIdentifier(_namespace));
      }
      if (hasPackage)
      {
        writeLine("package " + escapeIdentifier(namedSchema.getPackage()));
      }
      newline();
    }

    // Types from the current namespace stay in the import map (they block colliding simple names) but
    // need no import statement. Collecting the remaining full names in a TreeSet sorts them by fully
    // qualified name.
    Set<String> importedFullNames = new TreeSet<>();
    for (Name importName : _importsByLocalName.values())
    {
      if (!importName.getNamespace().equals(_namespace))
      {
        importedFullNames.add(importName.getFullName());
      }
    }

    // Only emit the separating blank line when at least one import statement was actually written.
    if (!importedFullNames.isEmpty())
    {
      for (String fullName : importedFullNames)
      {
        writeLine("import " + escapeIdentifier(fullName));
      }
      newline();
    }
  }

  writeInlineSchema(schema);
}
/**
 * Emits a schema as inline code, without any namespace, package or import preamble, by dispatching on the
 * schema's type to the matching writer method.
 *
 * @param schema provides the schema to write.
 * @throws IOException if a writer IO exception occurs.
 * @throws IllegalArgumentException if the schema's type is not one of the types handled here.
 */
private void writeInlineSchema(DataSchema schema) throws IOException
{
  switch (schema.getType())
  {
    // All primitive types share a single writer.
    case BOOLEAN:
    case INT:
    case LONG:
    case FLOAT:
    case DOUBLE:
    case STRING:
    case BYTES:
      writePrimitive((PrimitiveDataSchema) schema);
      return;

    // Named types.
    case RECORD:
      writeRecord((RecordDataSchema) schema);
      return;
    case ENUM:
      writeEnum((EnumDataSchema) schema);
      return;
    case FIXED:
      writeFixed((FixedDataSchema) schema);
      return;
    case TYPEREF:
      writeTyperef((TyperefDataSchema) schema);
      return;

    // Anonymous complex types.
    case ARRAY:
      writeArray((ArrayDataSchema) schema);
      return;
    case MAP:
      writeMap((MapDataSchema) schema);
      return;
    case UNION:
      writeUnion((UnionDataSchema) schema);
      return;

    default:
      break;
  }
  throw new IllegalArgumentException("Unrecognized schema type " + schema.getClass());
}
/**
 * Writes a record declaration: its doc and properties, the "record" keyword and type identifier, an
 * optional "includes" list, and a braced body holding the fields that belong to this record.
 *
 * @param schema provides the record schema to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeRecord(RecordDataSchema schema) throws IOException
{
  writeDoc(schema.getDoc());
  writeProperties(schema.getProperties());
  write("record ");
  write(toTypeIdentifier(schema));

  List<NamedDataSchema> includedSchemas = schema.getInclude();
  if (!includedSchemas.isEmpty())
  {
    write(" includes ");
    boolean needsSeparator = false;
    for (NamedDataSchema included : includedSchemas)
    {
      if (needsSeparator)
      {
        write(", ");
      }
      writeReferenceOrInline(included, schema.isIncludeDeclaredInline(included));
      needsSeparator = true;
    }
  }

  write(" {");
  newline();

  _indentDepth++;
  for (RecordDataSchema.Field field : schema.getFields())
  {
    // Only fields whose owning record is this schema are written in this declaration.
    if (!field.getRecord().equals(schema))
    {
      continue;
    }

    writeDoc(field.getDoc());
    writeProperties(field.getProperties());
    indent();
    write(escapeIdentifier(field.getName()));
    write(": ");
    writeReferenceOrInline(field.getType(), field.isDeclaredInline());
    if (field.getOptional())
    {
      write("?");
    }
    Object defaultValue = field.getDefault();
    if (defaultValue != null)
    {
      write(" = ");
      write(toJson(defaultValue));
    }
    newline();
  }
  _indentDepth--;

  indent();
  write("}");
}
/**
 * Writes an enum declaration: its doc and properties, the "enum" keyword and type identifier, and a
 * braced body with one line per symbol.
 *
 * The schema level properties "symbolProperties" and "deprecatedSymbols" are not written as properties of
 * the enum itself. Instead, their per-symbol entries are written as properties of the individual symbols,
 * with a deprecation entry appearing as the symbol property "deprecated".
 *
 * @param schema provides the enum schema to write.
 * @throws IOException if a writer IO exception occurs.
 * @throws IllegalArgumentException if "symbolProperties", one of its per-symbol entries, or
 *                                  "deprecatedSymbols" is present but is not a DataMap.
 */
private void writeEnum(EnumDataSchema schema) throws IOException
{
  writeDoc(schema.getDoc());

  // Work on a shallow copy so that the removals below do not touch the schema's own property map.
  DataMap properties = new DataMap(schema.getProperties());
  DataMap propertiesMap =
      coercePropertyToDataMapOrFail(schema, "symbolProperties", properties.remove("symbolProperties"));
  DataMap deprecatedMap =
      coercePropertyToDataMapOrFail(schema, "deprecatedSymbols", properties.remove("deprecatedSymbols"));
  writeProperties(properties);

  write("enum ");
  write(toTypeIdentifier(schema));
  write(" {");
  newline();

  _indentDepth++;
  Map<String, String> docs = schema.getSymbolDocs();
  for (String symbol : schema.getSymbols())
  {
    writeDoc(docs.get(symbol));

    // Copy before adding "deprecated": the coerced value may be the very DataMap instance nested inside
    // the schema's properties, and encoding must not modify the schema being encoded.
    DataMap symbolProperties = new DataMap(
        coercePropertyToDataMapOrFail(schema, "symbolProperties." + symbol, propertiesMap.get(symbol)));
    Object deprecated = deprecatedMap.get(symbol);
    if (deprecated != null)
    {
      symbolProperties.put("deprecated", deprecated);
    }
    writeProperties(symbolProperties);

    // Escape the symbol like every other identifier so a symbol equal to a .pdl keyword stays valid.
    writeLine(escapeIdentifier(symbol));
  }
  _indentDepth--;

  indent();
  write("}");
}
/**
 * Writes a fixed declaration: its doc and properties followed by "fixed", the type identifier and the size.
 *
 * @param schema provides the fixed schema to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeFixed(FixedDataSchema schema) throws IOException
{
  writeDoc(schema.getDoc());
  writeProperties(schema.getProperties());

  write("fixed ");
  String typeIdentifier = toTypeIdentifier(schema);
  write(typeIdentifier);
  write(" ");
  String size = String.valueOf(schema.getSize());
  write(size);
}
/**
 * Writes a typeref declaration: its doc and properties followed by "typeref", the type identifier, " = "
 * and the referenced type (as a by-name reference or an inline declaration).
 *
 * @param schema provides the typeref schema to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeTyperef(TyperefDataSchema schema) throws IOException
{
  writeDoc(schema.getDoc());
  writeProperties(schema.getProperties());

  write("typeref ");
  String typeIdentifier = toTypeIdentifier(schema);
  write(typeIdentifier);
  write(" = ");

  DataSchema referencedSchema = schema.getRef();
  boolean declaredInline = schema.isRefDeclaredInline();
  writeReferenceOrInline(referencedSchema, declaredInline);
}
/**
 * Writes a map type as "map[string, VALUES]", where VALUES is the map's value type.
 *
 * @param schema provides the map schema to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeMap(MapDataSchema schema) throws IOException
{
  write("map[string, ");
  DataSchema valueSchema = schema.getValues();
  boolean declaredInline = schema.isValuesDeclaredInline();
  writeReferenceOrInline(valueSchema, declaredInline);
  write("]");
}
/**
 * Writes an array type as "array[ITEMS]", where ITEMS is the array's item type.
 *
 * @param schema provides the array schema to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeArray(ArrayDataSchema schema) throws IOException
{
  write("array[");
  DataSchema itemSchema = schema.getItems();
  boolean declaredInline = schema.isItemsDeclaredInline();
  writeReferenceOrInline(itemSchema, declaredInline);
  write("]");
}
/**
 * Writes a union type as "union[A, B, ...]", listing the member types in the order the schema returns them.
 *
 * @param schema provides the union schema to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeUnion(UnionDataSchema schema) throws IOException
{
  write("union[");
  boolean needsSeparator = false;
  for (DataSchema memberSchema : schema.getTypes())
  {
    if (needsSeparator)
    {
      write(", ");
    }
    writeReferenceOrInline(memberSchema, schema.isTypeDeclaredInline(memberSchema));
    needsSeparator = true;
  }
  write("]");
}
/**
 * Writes a primitive type, using the schema's union member key as its .pdl spelling.
 *
 * @param schema provides the primitive schema to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void writePrimitive(PrimitiveDataSchema schema) throws IOException
{
  String typeName = schema.getUnionMemberKey();
  write(typeName);
}
/**
 * Returns a schema property value as a DataMap, failing if the value has any other type.
 * A DataMap value is returned as-is (the same instance); a null value yields a new, empty DataMap.
 *
 * @param schema provides the schema this property belongs to, for error reporting purposes.
 * @param name provides the schema's property path to this value as a string, for error reporting purposes.
 * @param value provides the property value to coerce.
 * @return the property value, coerced to a DataMap.
 * @throws IllegalArgumentException if the property value cannot be coerced to a DataMap.
 */
private DataMap coercePropertyToDataMapOrFail(NamedDataSchema schema, String name, Object value)
{
  if (value instanceof DataMap)
  {
    return (DataMap) value;
  }
  if (value == null)
  {
    return new DataMap();
  }
  throw new IllegalArgumentException("'" + name + "' in " + schema.getFullName() +
      " must be of type DataMap, but is: " + value.getClass());
}
/**
 * Writes a documentation string to .pdl code as a javadoc style comment.
 * Each line of the documentation is written on its own indented line behind a " * " margin, between
 * "/**" and " *&#47;" delimiter lines. Nothing is written when the documentation is null or blank.
 *
 * @param doc provides the documentation to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeDoc(String doc) throws IOException
{
  if (StringUtils.isBlank(doc))
  {
    return;
  }

  writeLine("/**");
  String[] docLines = doc.split("\n");
  for (int i = 0; i < docLines.length; i++)
  {
    writeLine(" * " + docLines[i]);
  }
  writeLine(" */");
}
/**
 * Serialize a pegasus Data binding value to JSON.
 * Supported types: DataMap, DataList, String, Number, Boolean and ByteString.
 *
 * @param value provides the value to serialize to JSON.
 * @return a JSON serialized string representation of the data value.
 * @throws IOException if the codec fails to serialize a DataMap or DataList.
 * @throws IllegalArgumentException if the value is of an unsupported type.
 */
private String toJson(Object value) throws IOException
{
  if (value instanceof DataMap)
  {
    return CODEC.mapToString((DataMap) value);
  }
  else if (value instanceof DataList)
  {
    return CODEC.listToString((DataList) value);
  }
  else if (value instanceof String)
  {
    return "\"" + StringEscapeUtils.escapeJson((String) value) + "\"";
  }
  else if (value instanceof Number || value instanceof Boolean)
  {
    return String.valueOf(value);
  }
  else if (value instanceof ByteString)
  {
    // The avro string form is a plain character sequence; to be valid JSON it has to be emitted as a
    // quoted and escaped string literal, exactly like the String case above.
    return "\"" + StringEscapeUtils.escapeJson(((ByteString) value).asAvroString()) + "\"";
  }
  else
  {
    throw new IllegalArgumentException("Unsupported data type: " + value.getClass());
  }
}
/**
 * Writes a data schema type to .pdl code, either as a by-name reference, or as an inlined declaration.
 *
 * This instance's TypeReferenceFormat is respected. If DENORMALIZE, the schema is inlined at its first lexical
 * appearance. If PRESERVE, it is inlined only if it was originally inlined.
 *
 * @param dataSchema provides the data schema to write.
 * @param originallyInlined if true, the original schema inlined this type declaration, otherwise it used a by-name
 *                          reference.
 * @throws IOException if a writer IO exception occurs.
 * @throws IllegalArgumentException if a schema that is not a named schema is to be written as a reference.
 */
private void writeReferenceOrInline(DataSchema dataSchema, boolean originallyInlined) throws IOException
{
  // The representation has to be selected before the schema is marked as encountered.
  TypeRepresentation representation = selectTypeRepresentation(dataSchema, originallyInlined);
  markEncountered(dataSchema);

  if (representation == TypeRepresentation.DECLARED_INLINE)
  {
    writeInlineSchema(dataSchema);
    return;
  }

  if (!(dataSchema instanceof NamedDataSchema))
  {
    throw new IllegalArgumentException("Unnamed not marked as inline: " + dataSchema);
  }
  write(toTypeIdentifier((NamedDataSchema) dataSchema));
}
/**
 * Writes a set of schema properties to .pdl, starting from an empty property path.
 *
 * @param properties provides the properties to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeProperties(Map<String, Object> properties) throws IOException
{
  List<String> noPrefix = Collections.emptyList();
  writeProperties(noPrefix, properties);
}
/**
 * Writes a set of schema properties that share a common prefix to .pdl.
 *
 * Non-empty DataMap values are flattened recursively into dotted paths. A boolean true value is written in
 * shorthand form (path only). Every other value, including an empty DataMap and a DataList, is written as
 * "@path = JSON".
 *
 * @param prefix provides the common prefix of all the properties.
 * @param properties provides the properties to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeProperties(List<String> prefix, Map<String, Object> properties) throws IOException
{
  for (Map.Entry<String, Object> entry : properties.entrySet())
  {
    Object value = entry.getValue();
    List<String> pathParts = new ArrayList<>(prefix);
    pathParts.add(entry.getKey());

    if (value instanceof DataMap && !((DataMap) value).isEmpty())
    {
      // Favor @x.y.z = "value" property encoding style over @x = { "y": { "z": "value" } }
      writeProperties(pathParts, (DataMap) value);
    }
    else if (Boolean.TRUE.equals(value))
    {
      // Use shorthand for boolean true. Instead of writing "@deprecated = true",
      // write "@deprecated".
      indent();
      write("@");
      write(pathToString(pathParts));
      newline();
    }
    else
    {
      // Pass the raw value through: writeProperty() serializes it to JSON itself, so serializing a
      // DataList here first would get it quoted a second time as a JSON string. An empty DataMap also
      // ends up here, because flattening it would write nothing and silently drop the property.
      writeProperty(pathParts, value);
    }
  }
}
/**
 * Writes a single property line of the form "@path = JSON" to this encoder's writer.
 *
 * @param path provides the property's full path.
 * @param value provides the property's value; it is serialized with toJson, so it must be of a type that
 *              toJson supports.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeProperty(List<String> path, Object value) throws IOException
{
  indent();
  write("@");
  String escapedPath = pathToString(path);
  write(escapedPath);
  write(" = ");
  String jsonValue = toJson(value);
  write(jsonValue);
  newline();
}
/**
 * Converts a property path list to an escaped .pdl path string by escaping every path part and joining
 * the results with '.'.
 *
 * @param path provides a property path list.
 * @return an escaped .pdl path string.
 */
private String pathToString(List<String> path)
{
  StringBuilder joined = new StringBuilder();
  int index = 0;
  for (String part : path)
  {
    if (index > 0)
    {
      joined.append(".");
    }
    joined.append(escapeIdentifier(part));
    index++;
  }
  return joined.toString();
}
/**
 * Calculates which types to import to minimize the need to fully qualify names in a .pdl source file.
 *
 * When multiple referenced types have the same unqualified name only one is imported using the following rules:
 * - Prefer types from the current namespace over types from other namespaces with colliding unqualified names.
 * - Prefer the first lexically encountered type.
 *
 * The resulting import map includes types from the current namespace. These should not be explicitly written
 * as import statements in the .pdl source, but are essential to keep in the import map to prevent collisions
 * with types from other namespaces.
 *
 * Any type that is not imported must be referenced by fully qualified name throughout the .pdl source.
 *
 * @param schema provides the top level schema to calculate imports for.
 * @return a map of schema type names to import, keyed by local name. The map itself is not sorted.
 */
private Map<String, Name> computeImports(DataSchema schema) throws IOException
{
  // The top level schema is always treated as an inline declaration.
  Map<String, Name> importsByLocalName = new HashMap<>();
  computeImports(schema, true, importsByLocalName);
  return importsByLocalName;
}
/**
 * Recursive worker for {@link #computeImports(DataSchema)}.
 *
 * A by-name reference to a named schema is recorded in the accumulator and not descended into. An inline
 * declaration is descended into, visiting the types it uses in the same order in which they are written.
 *
 * @param schema provides a schema to search for referenced types.
 * @param isDeclaredInline true if the schema should be treated as an inline declaration, false if it should be
 *                         considered a by-name reference.
 * @param importsAcc provides an imports result accumulator.
 */
private void computeImports(DataSchema schema, boolean isDeclaredInline, Map<String, Name> importsAcc) throws IOException
{
  if (!isDeclaredInline)
  {
    if (schema instanceof NamedDataSchema)
    {
      Name name = new Name(((NamedDataSchema) schema).getFullName());
      if (name.getNamespace().equals(_namespace))
      {
        // Prefer importing types in the current namespace over types from other namespaces with colliding
        // unqualified names.
        importsAcc.put(name.getName(), name);
      }
      else
      {
        importsAcc.putIfAbsent(name.getName(), name);
      }
    }
    return;
  }

  if (schema instanceof RecordDataSchema)
  {
    RecordDataSchema recordSchema = (RecordDataSchema) schema;

    // Includes are written before the fields (see writeRecord), so they are visited first to honor the
    // "first lexically encountered" rule. An include is only descended into when it was declared inline;
    // otherwise it is a by-name reference and is itself a candidate for import.
    for (NamedDataSchema include : recordSchema.getInclude())
    {
      computeImports(include, recordSchema.isIncludeDeclaredInline(include), importsAcc);
    }

    for (RecordDataSchema.Field field : recordSchema.getFields())
    {
      // Mirror writeRecord: only fields owned by this record are written in its declaration, so only
      // those can contribute type references.
      if (field.getRecord().equals(recordSchema))
      {
        computeImports(field.getType(), field.isDeclaredInline(), importsAcc);
      }
    }
  }
  else if (schema instanceof TyperefDataSchema)
  {
    TyperefDataSchema typerefSchema = (TyperefDataSchema) schema;
    computeImports(typerefSchema.getRef(), typerefSchema.isRefDeclaredInline(), importsAcc);
  }
  else if (schema instanceof UnionDataSchema)
  {
    UnionDataSchema unionSchema = (UnionDataSchema) schema;
    for (DataSchema member : unionSchema.getTypes())
    {
      computeImports(member, unionSchema.isTypeDeclaredInline(member), importsAcc);
    }
  }
  else if (schema instanceof MapDataSchema)
  {
    MapDataSchema mapSchema = (MapDataSchema) schema;
    computeImports(mapSchema.getValues(), mapSchema.isValuesDeclaredInline(), importsAcc);
  }
  else if (schema instanceof ArrayDataSchema)
  {
    ArrayDataSchema arraySchema = (ArrayDataSchema) schema;
    computeImports(arraySchema.getItems(), arraySchema.isItemsDeclaredInline(), importsAcc);
  }
}
/**
 * Get the .pdl escaped source identifier for the given named type.
 * The simple name is returned if the type is in the current namespace, or if the import registered under
 * its simple name has the same namespace as the type. Otherwise its fully qualified name is returned.
 *
 * @param schema provides the named schema to get a .pdl escaped source identifier for.
 * @return an escaped source identifier.
 */
private String toTypeIdentifier(NamedDataSchema schema)
{
  String simpleName = schema.getName();

  if (schema.getNamespace().equals(_namespace))
  {
    return escapeIdentifier(simpleName);
  }

  if (_importsByLocalName.containsKey(simpleName))
  {
    Name imported = _importsByLocalName.get(simpleName);
    if (imported.getNamespace().equals(schema.getNamespace()))
    {
      return escapeIdentifier(simpleName);
    }
  }

  return escapeIdentifier(schema.getFullName());
}
/**
 * Escape an identifier for use in .pdl source code, replacing all identifiers that would conflict with .pdl
 * keywords with a '`' escaped identifier. The identifier may be either qualified or unqualified.
 *
 * The identifier is split on '.', and each part is trimmed of surrounding whitespace before it is checked
 * against the keywords, so the check is made on exactly the text that is emitted.
 *
 * @param identifier provides the identifier to escape.
 * @return an escaped identifier for use in .pdl source code.
 */
private String escapeIdentifier(String identifier)
{
  return Arrays.stream(identifier.split("\\."))
      .map(String::trim)
      .map(part -> KEYWORDS.contains(part) ? '`' + part + '`' : part)
      .collect(Collectors.joining("."));
}
/**
 * Write an indented line of .pdl code.
 * The code will be prefixed by the current indentation and suffixed with a newline.
 *
 * @param code provides the line of .pdl code.
 * @throws IOException if a writer IO exception occurs.
 */
private void writeLine(String code) throws IOException
{
  indent();
  _out.write(code);
  _out.write(System.lineSeparator());
}
/**
 * Writes the current indentation as .pdl source: one indentation unit per level of the current depth.
 * Typically used in conjunction with write() and newline() to emit an entire line of .pdl source.
 *
 * @throws IOException if a writer IO exception occurs.
 */
private void indent() throws IOException
{
  int remainingLevels = _indentDepth;
  while (remainingLevels > 0)
  {
    _out.write(" ");
    remainingLevels--;
  }
}
/**
 * Write a fragment of .pdl code.
 * The fragment is passed to the writer verbatim: no indentation or newline is added.
 *
 * @param codeFragment provides the fragment to write.
 * @throws IOException if a writer IO exception occurs.
 */
private void write(String codeFragment) throws IOException
{
  this._out.write(codeFragment);
}
/**
 * Write a newline as .pdl source, using the platform's line separator.
 * Typically used in conjunction with indent() and write() to emit an entire line of .pdl source.
 *
 * @throws IOException if a writer IO exception occurs.
 */
private void newline() throws IOException
{
  final String lineSeparator = System.lineSeparator();
  _out.write(lineSeparator);
}
}