package org.rakam.collection; import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.core.JsonTokenId; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.dataformat.csv.CsvParser; import com.google.common.collect.ImmutableList; import io.netty.handler.codec.http.HttpResponseStatus; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.rakam.analysis.ConfigManager; import org.rakam.analysis.metadata.Metastore; import org.rakam.analysis.metadata.SchemaChecker; import org.rakam.collection.FieldDependencyBuilder.FieldDependency; import org.rakam.config.ProjectConfig; import org.rakam.util.DateTimeUtils; import org.rakam.util.RakamException; import javax.inject.Inject; import java.io.IOException; import java.time.LocalTime; import java.time.temporal.ChronoField; import java.util.AbstractMap; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.IntStream; import static com.fasterxml.jackson.core.JsonToken.VALUE_STRING; import static java.lang.String.format; import static java.util.stream.IntStream.range; import static org.rakam.analysis.InternalConfig.USER_TYPE; import static org.rakam.collection.FieldType.STRING; import static org.rakam.collection.JsonEventDeserializer.getValueOfMagicField; import static org.rakam.util.AvroUtil.convertAvroSchema; import static org.rakam.util.AvroUtil.generateAvroSchema; public class CsvEventDeserializer extends JsonDeserializer<EventList> { private final Metastore metastore; private final Set<SchemaField> constantFields; private final ConfigManager configManager; private final Map<String, List<SchemaField>> dependentFields; private final JsonFactory jsonFactory = new JsonFactory(); private final SchemaChecker schemaChecker; private final ProjectConfig projectConfig; @Inject public CsvEventDeserializer( Metastore metastore, ProjectConfig projectConfig, ConfigManager configManager, SchemaChecker schemaChecker, FieldDependency fieldDependency) { this.metastore = metastore; this.configManager = configManager; this.projectConfig = projectConfig; this.schemaChecker = schemaChecker; this.constantFields = fieldDependency.constantFields; this.dependentFields = fieldDependency.dependentFields; } @Override public EventList deserialize(JsonParser jp, DeserializationContext ctxt) throws IOException { String project = (String) ctxt.getAttribute("project"); String collection = (String) ctxt.getAttribute("collection"); String apiKey = (String) ctxt.getAttribute("apiKey"); boolean useheader = Boolean.FALSE != ctxt.getAttribute("useHeader"); Map.Entry<List<SchemaField>, int[]> header; if (useheader) { header = readHeader((CsvParser) jp, project, collection); } else { List<SchemaField> vall = metastore.getCollection(project, collection); header = new AbstractMap.SimpleImmutableEntry<>(vall, IntStream.range(0, vall.size()).toArray()); } List<SchemaField> fields = header.getKey(); int[] indexes = header.getValue(); List<FieldType> types = Arrays.stream(indexes) .mapToObj(i -> header.getKey().get(i).getType()).collect(Collectors.toList()); Schema schema = convertAvroSchema(fields); GenericData.Record record = new GenericData.Record(schema); int idx = 0; List<Event> list = new ArrayList<>(); while (true) { JsonToken t = jp.nextToken(); if (t == null) { break; } switch (t.id()) { case JsonTokenId.ID_START_ARRAY: idx = 0; record = new GenericData.Record(schema); list.add(new Event(project, collection, null, fields, record)); break; case JsonTokenId.ID_END_ARRAY: continue; default: if (idx >= indexes.length) { throw new RakamException(String.format("Table has %d columns but csv file has more than %d columns", indexes.length, indexes.length), HttpResponseStatus.BAD_REQUEST); } record.put(indexes[idx], getValue(types.get(idx), jp)); idx += 1; break; } } return new EventList(Event.EventContext.apiKey(apiKey), project, list); } public Map.Entry<List<SchemaField>, int[]> readHeader(CsvParser jp, String project, String collection) throws IOException { List<SchemaField> fields = metastore.getCollection(project, collection); if (fields.isEmpty()) { fields = ImmutableList.copyOf(constantFields); } List<String> columns = new ArrayList<>(); Set<SchemaField> newFields = new HashSet<>(); while (jp.nextToken() == VALUE_STRING) { String name = SchemaField.stripName(jp.getValueAsString(), "header name"); Optional<SchemaField> existingField = fields.stream() .filter(f -> f.getName().equals(name)).findAny(); if (!existingField.isPresent()) { FieldType type = STRING; if (name.equals(projectConfig)) { type = configManager.setConfigOnce(project, USER_TYPE.name(), STRING); } SchemaField field = dependentFields.values().stream() .flatMap(e -> e.stream()) .filter(e -> e.getName().equals(name)) .findAny().orElse(new SchemaField(name, type)); newFields.add(field); } columns.add(name); } if (!newFields.isEmpty()) { fields = metastore.getOrCreateCollectionFieldList(project, collection, schemaChecker.checkNewFields(collection, newFields)); } final List<SchemaField> finalFields = fields; int[] indexes = columns.stream().mapToInt(colName -> range(0, finalFields.size()) .filter(i -> finalFields.get(i).getName().equals(colName)).findAny().getAsInt()) .toArray(); return new AbstractMap.SimpleImmutableEntry<>(fields, indexes); } public Object getValue(FieldType type, JsonParser jp) throws IOException { if (type == null) { return getValueOfMagicField(jp); } switch (type) { case STRING: String valueAsString = jp.getValueAsString(); if (valueAsString.length() > 100) { valueAsString = valueAsString.substring(0, 100); } return valueAsString; case BOOLEAN: return jp.getValueAsBoolean(); case LONG: return jp.getValueAsLong(); case INTEGER: return jp.getValueAsInt(); case DECIMAL: return jp.getValueAsDouble(); case TIME: return (long) LocalTime.parse(jp.getValueAsString()).get(ChronoField.MILLI_OF_DAY); case DOUBLE: return jp.getValueAsDouble(); case TIMESTAMP: if (jp.getCurrentToken() == JsonToken.VALUE_NUMBER_INT) { return jp.getValueAsLong(); } try { return DateTimeUtils.parseTimestamp(jp.getValueAsString()); } catch (Exception e) { return null; } case DATE: try { return DateTimeUtils.parseDate(jp.getValueAsString()); } catch (Exception e) { return null; } default: if (type.isMap()) { return getMap(type.getMapValueType(), jp.getValueAsString()); } if (type.isArray()) { return getArray(type.getArrayElementType(), jp.getValueAsString()); } throw new JsonMappingException(format("type is not supported.")); } } private GenericData.Array getArray(FieldType arrayElementType, String valueAsString) throws IOException { JsonParser parser = jsonFactory.createParser(valueAsString); List<Object> objects = new ArrayList<>(); JsonToken t = parser.getCurrentToken(); if (t != JsonToken.START_ARRAY) { return null; } else { t = parser.nextToken(); } for (; t != JsonToken.END_ARRAY; t = parser.nextToken()) { if (!t.isScalarValue()) { throw new JsonMappingException(String.format("Nested properties are not supported. ('%s' field)", arrayElementType.name())); } objects.add(getValue(arrayElementType, parser)); } return new GenericData.Array(generateAvroSchema(arrayElementType), objects); } private Map<String, Object> getMap(FieldType mapValueType, String valueAsString) throws IOException { Map<String, Object> map = new HashMap<>(); JsonParser parser = jsonFactory.createParser(valueAsString); JsonToken t = parser.getCurrentToken(); if (t != JsonToken.START_OBJECT) { return null; } else { t = parser.nextToken(); } for (; t == JsonToken.FIELD_NAME; t = parser.nextToken()) { String key = parser.getCurrentName(); if (!parser.nextToken().isScalarValue()) { throw new JsonMappingException(String.format("Nested properties are not supported. ('%s' field)", mapValueType.name())); } map.put(key, getValue(mapValueType, parser)); } return map; } }