/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gobblin.converter.csv;

import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonNull;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

import gobblin.configuration.WorkUnitState;
import gobblin.converter.Converter;
import gobblin.converter.DataConversionException;
import gobblin.converter.SchemaConversionException;
import gobblin.converter.SingleRecordIterable;

/**
 * CsvToJsonConverterV2 accepts an already deserialized (parsed) CSV row, String[], where you can use
 * @see CsvFileDownloader that conforms with RFC 4180 by leveraging Open CSV.
 *
 * Converts CSV to JSON. The CSV schema is represented as a JsonArray, the same interface used by
 * CsvToJsonConverter. Each CSV record is represented by an array of String.
 *
 * Example of CSV schema:
 * [
 *   { "columnName": "Day", "comment": "", "isNullable": "true", "dataType": { "type": "string" } },
 *   { "columnName": "Pageviews", "comment": "", "isNullable": "true", "dataType": { "type": "long" } }
 * ]
 */
public class CsvToJsonConverterV2 extends Converter<String, JsonArray, String[], JsonObject> {
  private static final Logger LOG = LoggerFactory.getLogger(CsvToJsonConverterV2.class);
  private static final JsonParser JSON_PARSER = new JsonParser();
  private static final String COLUMN_NAME_KEY = "columnName";
  private static final String JSON_NULL_VAL = "null";
  public static final String CUSTOM_ORDERING = "converter.csv_to_json.custom_order";

  /** Indices (as strings) mapping output-schema columns to CSV columns; empty means 1:1 in-order mapping. */
  private List<String> customOrder;

  @Override
  public Converter<String, JsonArray, String[], JsonObject> init(WorkUnitState workUnit) {
    super.init(workUnit);
    customOrder = workUnit.getPropAsList(CUSTOM_ORDERING, "");
    if (!customOrder.isEmpty()) {
      LOG.info("Will use custom order to generate JSON from CSV: " + customOrder);
    }
    return this;
  }

  /**
   * Parses the input schema string into a JsonArray describing the output columns.
   *
   * @param inputSchema JSON-array schema string (see class javadoc for the expected shape)
   * @param workUnit    current work unit state (unused here)
   * @return the schema parsed as a JsonArray
   * @throws SchemaConversionException never thrown directly; declared by the Converter contract
   */
  @Override
  public JsonArray convertSchema(String inputSchema, WorkUnitState workUnit) throws SchemaConversionException {
    Preconditions.checkNotNull(inputSchema, "inputSchema is required.");
    return JSON_PARSER.parse(inputSchema).getAsJsonArray();
  }

  /**
   * Converts a CSV record (array of String) to JSON.
   * By default, fields between CSV and JSON are mapped on an order basis and it validates that both
   * input and output have the same number of fields.
   *
   * Customization can be achieved by adding a custom order, where the user defines a list of indices
   * of CSV fields corresponding to the user-defined output schema.
   * Use case of customization (custom order):
   * In custom order, three input parameters generate the output:
   *   1. Output schema: exact copy of the input schema, passed by the user through a job property.
   *   2. Custom order indices: passed by the user through a job property.
   *   3. Input record: the CSV row, represented by an array of String.
   * The user usually does not control the input record, and a custom order is needed when the output
   * schema is not a 1:1 match with the input record.
   * Use cases:
   *   1. The order of the input record (CSV) does not match the output schema.
   *   2. The number of columns in the output schema is greater or smaller than in the input record.
   *
   * e.g:
   * 1. Different order
   *   - Input record (CSV)
   *     "2029", "94043", "Mountain view"
   *   - Output schema (derived from input schema):
   *     [{"columnName":"street_number","dataType":{"type":"string"}},{"columnName":"city","dataType":{"type":"string"}},{"columnName":"zip_code","dataType":{"type":"string"}}]
   *   - Custom order indices
   *     0,2,1
   *   - Output JSON (keys derived from output schema)
   *     {"street_number" : "2029", "city" : "Mountain view" , "zip_code" : "94043" }
   *
   * 2. # of columns in input record (CSV) > # of columns in output schema
   *   - Input record (CSV)
   *     "2029", "Mountain view" , "USA", "94043"
   *   - Custom order indices
   *     0,1,3
   *   - Output schema (derived from input schema):
   *     [{"columnName":"street_number","dataType":{"type":"string"}},{"columnName":"city","dataType":{"type":"string"}},{"columnName":"zip_code","dataType":{"type":"string"}}]
   *   - Output JSON (keys derived from output schema)
   *     {"street_number" : "2029", "city" : "Mountain view" , "zip_code" : "94043" }
   *
   * 3. # of columns in input record (CSV) < # of columns in output schema
   *   - Input record (CSV)
   *     "2029", "Mountain view", "94043"
   *   - Custom order (adding null when a negative index is defined)
   *     0,1,-1,2
   *   - Output schema (derived from input schema):
   *     [{"columnName":"street_number","dataType":{"type":"string"}},{"columnName":"city","dataType":{"type":"string"}},
   *      {"columnName":"Country","isNullable":"true","dataType":{"type":"string"}},{"columnName":"zip_code","dataType":{"type":"string"}}]
   *   - Output JSON
   *     {"street_number" : "2029", "city" : "Mountain view" , "Country" : null, "zip_code" : "94043" }
   *
   * {@inheritDoc}
   * @see gobblin.converter.Converter#convertRecord(java.lang.Object, java.lang.Object, gobblin.configuration.WorkUnitState)
   */
  @Override
  public Iterable<JsonObject> convertRecord(JsonArray outputSchema, String[] inputRecord, WorkUnitState workUnit)
      throws DataConversionException {
    // Dispatch on whether a custom ordering was configured in init().
    JsonObject outputRecord = customOrder.isEmpty()
        ? createOutput(outputSchema, inputRecord)
        : createOutput(outputSchema, inputRecord, customOrder);

    if (LOG.isDebugEnabled()) {
      LOG.debug("Converted into " + outputRecord);
    }
    return new SingleRecordIterable<JsonObject>(outputRecord);
  }

  /**
   * Builds a JsonObject from a CSV row using a strict 1:1, in-order mapping against the schema.
   * An empty string or the literal "null" (case-insensitive) in the CSV becomes a JSON null.
   *
   * @param outputSchema schema whose i-th element names the i-th output field
   * @param inputRecord  CSV row; must have exactly as many columns as the schema
   * @return JSON object keyed by the schema's column names
   * @throws IllegalArgumentException if the column counts differ
   */
  @VisibleForTesting
  JsonObject createOutput(JsonArray outputSchema, String[] inputRecord) {
    Preconditions.checkArgument(outputSchema.size() == inputRecord.length,
        "# of columns mismatch. Input " + inputRecord.length + " , output: " + outputSchema.size());
    JsonObject outputRecord = new JsonObject();

    for (int i = 0; i < outputSchema.size(); i++) {
      String key = outputSchema.get(i).getAsJsonObject().get(COLUMN_NAME_KEY).getAsString();
      if (StringUtils.isEmpty(inputRecord[i]) || JSON_NULL_VAL.equalsIgnoreCase(inputRecord[i])) {
        outputRecord.add(key, JsonNull.INSTANCE);
      } else {
        outputRecord.addProperty(key, inputRecord[i]);
      }
    }
    return outputRecord;
  }

  /**
   * Builds a JsonObject from a CSV row using an explicit index mapping ("custom order").
   * The i-th schema column takes its value from CSV column customOrder.get(i); a negative index,
   * a null CSV value, or the literal "null" (case-insensitive) yields a JSON null.
   *
   * NOTE(review): unlike the in-order overload above, this one does NOT map the empty string to
   * JSON null (only null/"null") — looks unintentional, but callers may rely on it; confirm before
   * unifying.
   *
   * @param outputSchema schema naming the output fields, one per custom-order index
   * @param inputRecord  CSV row; every non-negative index in customOrder must be within its bounds
   * @param customOrder  CSV column index (as a string) for each schema column, in schema order
   * @return JSON object keyed by the schema's column names
   * @throws IllegalArgumentException if the schema and custom-order sizes differ, or an index is out of bounds
   * @throws NumberFormatException    if a custom-order entry is not a parseable integer
   */
  @VisibleForTesting
  JsonObject createOutput(JsonArray outputSchema, String[] inputRecord, List<String> customOrder) {
    Preconditions.checkArgument(outputSchema.size() == customOrder.size(),
        "# of columns mismatch. Schema: " + outputSchema.size() + " , custom order: " + customOrder.size());
    JsonObject outputRecord = new JsonObject();
    Iterator<JsonElement> outputSchemaIterator = outputSchema.iterator();
    Iterator<String> customOrderIterator = customOrder.iterator();

    while (outputSchemaIterator.hasNext() && customOrderIterator.hasNext()) {
      String key = outputSchemaIterator.next().getAsJsonObject().get(COLUMN_NAME_KEY).getAsString();
      int i = Integer.parseInt(customOrderIterator.next());
      Preconditions.checkArgument(i < inputRecord.length,
          "Index out of bound detected in custom order. Index: " + i + " , # of CSV columns: " + inputRecord.length);
      // Negative index means "no corresponding CSV column" -> emit JSON null.
      if (i < 0 || null == inputRecord[i] || JSON_NULL_VAL.equalsIgnoreCase(inputRecord[i])) {
        outputRecord.add(key, JsonNull.INSTANCE);
        continue;
      }
      outputRecord.addProperty(key, inputRecord[i]);
    }
    return outputRecord;
  }
}