package com.thinkbiganalytics.discovery.parsers.hadoop; /*- * #%L * thinkbig-schema-discovery-default * %% * Copyright (C) 2017 ThinkBig Analytics * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.google.common.util.concurrent.Uninterruptibles; import com.thinkbiganalytics.discovery.model.DefaultField; import com.thinkbiganalytics.discovery.model.DefaultHiveSchema; import com.thinkbiganalytics.discovery.schema.Field; import com.thinkbiganalytics.discovery.schema.QueryResult; import com.thinkbiganalytics.discovery.schema.QueryResultColumn; import com.thinkbiganalytics.discovery.schema.Schema; import com.thinkbiganalytics.discovery.util.ParserHelper; import com.thinkbiganalytics.discovery.util.TableSchemaType; import com.thinkbiganalytics.spark.rest.model.TransformRequest; import com.thinkbiganalytics.spark.rest.model.TransformResponse; import com.thinkbiganalytics.spark.shell.SparkShellProcess; import com.thinkbiganalytics.spark.shell.SparkShellProcessManager; import com.thinkbiganalytics.spark.shell.SparkShellRestClient; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Component; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.inject.Inject; /** * Utilizes Spark's support to infer schema from a sample file */ @Component public class SparkFileSchemaParserService { private static final Logger log = LoggerFactory.getLogger(SparkFileSchemaParserService.class); @Inject private SparkShellProcessManager shellProcessManager; private static String DATATYPE_PRECISION_SCALE_REGEX = "(.*)((\\([0-9]+,[0-9]+\\))|(\\([0-9]+\\)))"; /** * Communicates with Spark Shell processes */ @Inject private SparkShellRestClient restClient; /** * Delegate to spark shell service to load the file into a temporary table and loading it */ public Schema doParse(InputStream inputStream, SparkFileType fileType, TableSchemaType tableSchemaType) throws IOException { File tempFile = toFile(inputStream); try { SparkShellProcess shellProcess = shellProcessManager.getSystemProcess(); TransformResponse response = restClient.transform(shellProcess, createTransformRequest(tempFile, fileType)); while (response.getStatus() != TransformResponse.Status.SUCCESS) { if (response.getStatus() == TransformResponse.Status.ERROR) { throw new IOException("Failed to process data [" + response.getMessage() + "]"); } else { Uninterruptibles.sleepUninterruptibly(100L, TimeUnit.MILLISECONDS); } final Optional<TransformResponse> optionalResponse = restClient.getTable(shellProcess, response.getTable()); if (optionalResponse.isPresent()) { response = optionalResponse.get(); } } return toSchema(response.getResults(), fileType, tableSchemaType); } catch (Exception e) { log.error("Error parsing file {}: {}", fileType, e.getMessage()); throw new IOException("Unexpected exception. Verify file is the proper format", e); } finally { tempFile.delete(); } } // Port: 8450 private TransformRequest createTransformRequest(File localFile, SparkFileType fileType) { TransformRequest transformRequest = new TransformRequest(); transformRequest.setScript(toScript(localFile, fileType)); return transformRequest; } private String toScript(File localFile, SparkFileType fileType) { String path = "file://" + localFile.getAbsolutePath(); // IDE testing: //path = "file:///var/sample/signups.orc"; //path = "file:///var/sample/HiveGroup.parquet"; StringBuffer sb = new StringBuffer(); sb.append("import sqlContext.implicits._\n"); sb.append("import org.apache.spark.sql._\n"); String method; switch (fileType) { case AVRO: method = "avro"; sb.append("import com.databricks.spark.avro._\n"); sb.append("sqlContext.sparkContext.hadoopConfiguration.set(\"avro.mapred.ignore.inputs.without.extension\", \"false\")\n"); break; case JSON: method = "json"; break; case PARQUET: method = "parquet"; break; case ORC: method = "orc"; break; default: throw new UnsupportedOperationException("Type not supported [" + fileType + "]"); } sb.append(String.format("sqlContext.read.%s(\"%s\").limit(10).toDF()", method, path)); return sb.toString(); } private Schema toSchema(QueryResult results, SparkFileType fileType, TableSchemaType tableSchemaType) throws IOException { switch (tableSchemaType) { case HIVE: return toHiveSchema(results, fileType); default: throw new IOException("Unsupported schema type [" + tableSchemaType + "]"); } } /** * Strip out the (precision,scale) from the datatype and assign it to the proper field.precisionScale property * @param field the field to inspect */ private void setPrecisionAndScale(DefaultField field) { String dataType = field.getDerivedDataType(); Pattern pattern = Pattern.compile(DATATYPE_PRECISION_SCALE_REGEX); Matcher matcher = pattern.matcher(dataType); if (matcher.find()) { //group 1 is the string datatype //group 2 is the precision and scale String newDataType = matcher.group(1); String precisionAndScale = matcher.group(2); //replace the () precisionAndScale = precisionAndScale.replaceAll("\\(|\\)", ""); field.setDerivedDataType(newDataType); field.setPrecisionScale(precisionAndScale); } } private DefaultHiveSchema toHiveSchema(QueryResult result, SparkFileType fileType) { DefaultHiveSchema schema = new DefaultHiveSchema(); schema.setHiveFormat("STORED AS " + fileType); schema.setStructured(true); ArrayList<Field> fields = new ArrayList<>(); List<? extends QueryResultColumn> columns = result.getColumns(); for (QueryResultColumn column : columns) { DefaultField field = new DefaultField(); field.setName(column.getDisplayName()); field.setNativeDataType(column.getDataType()); field.setDerivedDataType(column.getDataType()); field.setDataTypeDescriptor(ParserHelper.hiveTypeToDescriptor(column.getDataType())); //strip the precisionScale and assign to the field property setPrecisionAndScale(field); // Add sample values List<Map<String, Object>> values = result.getRows(); for (Map<String, Object> colMap : values) { Object oVal = colMap.get(column.getDisplayName()); if (oVal != null) { field.getSampleValues().add(oVal.toString()); } } fields.add(field); } schema.setFields(fields); return schema; } private File toFile(InputStream is) throws IOException { File tempFile = File.createTempFile("kylo-spark-parser", ".dat"); try (FileOutputStream fos = new FileOutputStream(tempFile)) { IOUtils.copyLarge(is, fos); } log.info("Created temporary file {} success? {}", tempFile.getAbsoluteFile().toURI(), tempFile.exists()); return tempFile; } public enum SparkFileType { PARQUET, AVRO, JSON, ORC } }