package eu.ehri.project.indexing.converter.impl;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.jayway.jsonpath.*;
import com.jayway.jsonpath.internal.spi.json.JacksonJsonProvider;
import eu.ehri.project.indexing.converter.Converter;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
/**
 * Convert from EHRI graph JSON to Solr documents.
 * <p>
 * Each input item yields one output document per description found under
 * {@code relationships.describes}, or a single document if the item has no
 * descriptions. Values are extracted using the configured JsonPath
 * expressions and then enhanced by {@link #postProcess(Map)}.
 */
public class JsonConverter implements Converter<JsonNode, JsonNode> {

    private static final Logger logger = LoggerFactory.getLogger(JsonConverter.class);

    /**
     * Set of key -> JsonPath extractors
     */
    private static final Map<String, List<JsonPath>> jsonPaths = Utils.loadPaths();

    /**
     * Json Parser
     */
    private final ParseContext parseContext;

    /**
     * Keys which have types that require special handling.
     */
    private static final Map<String, List<String>> types = Utils.loadTypeKeys();

    /**
     * Keys which need a default value
     */
    private static final Map<String, List<String>> defaults = Utils.loadDefaultKeys();

    // JSON mapper
    private static final ObjectMapper mapper = new ObjectMapper();

    /**
     * Default locale for language/country conversions...
     */
    private static final Locale defaultLocale = Locale.ENGLISH;

    /**
     * Format dates and times for Solr.
     */
    private static final DateTimeFormatter dateTimeFormatter
            = ISODateTimeFormat.dateTime().withZoneUTC();

    /**
     * Additional country/code mappings...
     */
    private static final Map<String, String> additionalCountries = ImmutableMap.of(
            // As of 2016 ISO haven't assigned Kosovo a code, so
            // we are using this "user-assigned" one instead.
            "xk", "Kosovo"
    );

    /**
     * Static lookup of country names, keyed by lower-case country code.
     */
    private static final ImmutableMap<String, String> countryLookup;

    static {
        Map<String, String> countries = Maps.newHashMap();
        for (String cc : Locale.getISOCountries()) {
            // Lower-case with Locale.ROOT so the keys do not depend on the JVM's
            // default locale, and ask explicitly for the name in the default
            // (English) locale: the no-arg getDisplayCountry() would use the
            // JVM's display locale instead.
            countries.put(cc.toLowerCase(Locale.ROOT),
                    new Locale(defaultLocale.getLanguage(), cc).getDisplayCountry(defaultLocale));
        }
        countries.putAll(additionalCountries);
        countryLookup = ImmutableMap.copyOf(countries);
    }

    public JsonConverter() {
        parseContext = JsonPath.using(new JacksonJsonProvider());
    }

    /**
     * Convert an individual item into one or more output items.
     *
     * @param node A JSON node representing a single item
     * @return The converted output nodes: one per description, or a single
     * node if the item has no descriptions
     * @throws ConverterException declared for conversion failures; nothing in
     *                            this method raises it directly
     */
    public Iterable<JsonNode> convert(JsonNode node) throws ConverterException {
        List<JsonNode> out = Lists.newArrayList();
        Iterator<JsonNode> descriptions = node.path("relationships")
                .path("describes").iterator();

        if (descriptions.hasNext()) {
            while (descriptions.hasNext()) {
                out.add(mapper.valueToTree(postProcess(getDescribedData(descriptions.next(), node))));
            }
        } else {
            out.add(mapper.valueToTree(postProcess(getData(node))));
        }

        return out;
    }

    /**
     * Get data for items where most of it resides in the description
     * nodes.
     *
     * @param description The description's JSON node
     * @param item        The item's JSON node
     * @return A map of the extracted data
     */
    private Map<String, Object> getDescribedData(JsonNode description, JsonNode item) {
        // Tricky code alert!
        // Matching paths in the 'item' node overrides that of the description,
        // though in practice there should almost never be any collisions. The
        // exceptions are the 'id', 'itemId' and 'type' fields. In these cases
        // we use the item's type (i.e. DocumentaryUnit instead of DocumentaryUnitDescription)
        // but the description's id. We therefore only have to prevent the description's
        // 'id' field being overwritten when we merge the item and description data.
        Map<String, Object> descriptionData = getData(description);

        // Merge the data, preventing overwriting of the id key - any other
        // keys should be overwritten.
        Map<String, Object> itemData = getData(item);
        for (Map.Entry<String, Object> itemDataValue : itemData.entrySet()) {
            if (!itemDataValue.getKey().equals("id") && itemDataValue.getValue() != null) {
                descriptionData.put(itemDataValue.getKey(), itemDataValue.getValue());
            }
        }
        return descriptionData;
    }

    /**
     * Determine whether a JsonPath read produced a usable value.
     *
     * @param value The value returned by the path read
     * @return false for null or an empty list, otherwise true
     */
    private static boolean successfulMatch(Object value) {
        if (value == null) {
            return false;
        } else if (value instanceof List) {
            return !((List<?>) value).isEmpty();
        }
        return true;
    }

    /**
     * Get data for non-described items.
     *
     * @param node The item's JSON node
     * @return A map of the extracted data
     */
    private Map<String, Object> getData(JsonNode node) {
        Map<String, Object> data = Maps.newHashMap();
        // The node is re-serialized to text so that JsonPath can parse it
        // with its own provider.
        ReadContext ctx = parseContext.parse(node.toString());

        // Extract specific properties
        for (Map.Entry<String, List<JsonPath>> attrPath : jsonPaths.entrySet()) {
            String attr = attrPath.getKey();
            // First successfully matched path wins...
            for (JsonPath path : attrPath.getValue()) {
                try {
                    Object value = ctx.read(path);
                    if (successfulMatch(value)) {
                        data.put(attr, value);
                        break;
                    }
                } catch (PathNotFoundException e) {
                    // Intentionally ignore missing paths.
                }
            }
        }

        // Any keys in the 'data' section are indexed in dynamic fields
        Iterator<Map.Entry<String, JsonNode>> dataFields = node.path("data").fields();
        while (dataFields.hasNext()) {
            Map.Entry<String, JsonNode> field = dataFields.next();
            String key = field.getKey();
            if (!jsonPaths.containsKey(key)) {
                JsonNode fieldValue = field.getValue();
                JsonToken token = fieldValue.asToken();
                switch (token) {
                    case VALUE_STRING:
                        data.put(key + "_t", fieldValue.asText());
                        break;
                    case VALUE_NUMBER_INT:
                        // NB: JsonToken.asString() is null for number tokens, so
                        // the value has to be read from the node itself.
                        data.put(key + "_i", fieldValue.numberValue());
                        break;
                    case VALUE_NUMBER_FLOAT:
                        data.put(key + "_f", fieldValue.numberValue());
                        break;
                    case VALUE_TRUE:
                        data.put(key + "_b", Boolean.TRUE);
                        break;
                    case VALUE_FALSE:
                        data.put(key + "_b", Boolean.FALSE);
                        break;
                    case START_ARRAY:
                        // NB: stored as a one-shot iterator over the array's elements
                        data.put(key + "_ss", fieldValue.iterator());
                        break;
                    case VALUE_NULL:
                        break;
                    default:
                        logger.warn("Unknown token {} for data field '{}'", token, key);
                }
            }
        }
        return data;
    }

    /**
     * Do various post-processing steps on index data. The given map is
     * modified in place and also returned.
     *
     * @param data The original data.
     * @return The enhanced data.
     */
    public static Map<String, Object> postProcess(Map<String, Object> data) {
        // Extract the ID, which should always be there...
        String id = (String) data.get("id");

        // Fix date format
        List<String> dateKeys = types.get("date");
        if (dateKeys != null) {
            for (String key : dateKeys) {
                if (data.containsKey(key)) {
                    // Hold on to the raw value so it can still be reported
                    // after the key has been removed.
                    Object rawDate = data.get(key);
                    try {
                        data.put(key, fixDates((String) rawDate));
                    } catch (IllegalArgumentException e) {
                        data.remove(key);
                        logger.error("Invalid date: {} (in {})", rawDate, id);
                    }
                }
            }
        }

        // HACK! Combine dateStart and dateEnd into dateRange,
        // ensuring there are no duplicates.
        Set<Object> dateRanges = collectDateRange(data);
        if (!dateRanges.isEmpty()) {
            logger.debug("Adding date range: {}", dateRanges);
            data.put("dateRange", dateRanges);
        }

        // HACK! Set restricted=true for items that have accessors.
        // NB: This should be done before setting field defaults, because
        // the default for items without accessibleTo is ["ALLUSERS"]
        data.put("restricted", data.containsKey("accessibleTo"));

        // Add defaults
        // NB: Simple defaults can be set directly in the schema, so this is
        // only necessary when more complex operations need to be performed,
        // i.e. defaults for multivalue fields with more than one entry.
        for (Map.Entry<String, List<String>> entry : defaults.entrySet()) {
            if (!data.containsKey(entry.getKey())) {
                data.put(entry.getKey(), entry.getValue());
            }
        }

        // HACK! Create a composite 'location' field from latitude and longitude
        Object latitude = data.get("latitude");
        Object longitude = data.get("longitude");
        if (latitude != null && longitude != null) {
            String location = latitude + "," + longitude;
            logger.debug("Adding location value: {} -> {}", id, location);
            data.put("location", location);
        }

        // HACK! Set isTopLevel attr for items where parentId is not defined
        data.put("isTopLevel", !data.containsKey("parentId"));

        // HACK! Set isParent when childCount > 0. The count is compared as a
        // generic Number so that non-Integer numeric values are handled too.
        Object childCount = data.get("childCount");
        data.put("isParent", childCount instanceof Number
                && ((Number) childCount).longValue() > 0);

        // HACK: if countryCode is set, translate it to a name in the default locale.
        // Unknown codes are skipped rather than indexed as a null name.
        Object countryCode = data.get("countryCode");
        if (countryCode != null) {
            String countryName = countryLookup.get(countryCode);
            if (countryName != null) {
                data.put("countryName", countryName);
            }
        }

        // HACK: Set country name to name field on country type, but only
        // when the lookup succeeds so an existing name is never replaced by null.
        if ("Country".equals(data.get("type")) && id != null) {
            String countryName = countryLookup.get(id);
            if (countryName != null) {
                data.put("name", countryName);
            }
        }

        // HACK: Set charCount field as sum of string field data...
        data.put("charCount", countCharacters(data));

        // HACK: Add a boolean for links with/without an external body
        if (data.containsKey("linkBodyName")) {
            data.put("hasBody", true);
        }

        if (data.containsKey("targetIds")) {
            Object targets = data.get("targetIds");
            if (targets instanceof List) {
                data.put("targetCount", ((List<?>) targets).size());
            }
        }

        return data;
    }

    /**
     * Gather the distinct values of any existing dateRange together with
     * the dateStart and dateEnd values.
     *
     * @param data The index data
     * @return A new set of date values, possibly empty
     */
    private static Set<Object> collectDateRange(Map<String, Object> data) {
        Set<Object> dateRanges = Sets.newHashSet();
        // The existing value may be any collection type (or a single value),
        // so copy it instead of casting it to a specific implementation.
        Object existing = data.get("dateRange");
        if (existing instanceof Collection) {
            dateRanges.addAll((Collection<?>) existing);
        } else if (existing != null) {
            dateRanges.add(existing);
        }
        for (String d : new String[]{"dateStart", "dateEnd"}) {
            Object date = data.get(d);
            if (date != null) {
                dateRanges.add(date);
            }
        }
        return dateRanges;
    }

    /**
     * Sum the lengths of all string values, plus the string forms of all
     * non-null list items, in the given data.
     *
     * @param data The index data
     * @return The total character count
     */
    private static int countCharacters(Map<String, Object> data) {
        int charCount = 0;
        for (Object dataItem : data.values()) {
            if (dataItem instanceof String) {
                charCount += ((String) dataItem).length();
            } else if (dataItem instanceof List) {
                for (Object e : (List<?>) dataItem) {
                    // Lists may contain nulls, which contribute nothing.
                    if (e != null) {
                        charCount += e.toString().length();
                    }
                }
            }
        }
        return charCount;
    }

    /**
     * Translate date strings
     *
     * @param date The date string to be fixed
     * @return A Solr-compliant version of input string
     * @throws IllegalArgumentException if the string cannot be parsed as a date
     */
    private static String fixDates(String date) {
        return dateTimeFormatter.print(new DateTime(date));
    }
}