/**
 * Copyright 2011-2017 Asakusa Framework Team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.asakusafw.compiler.directio.hive;

import java.io.IOException;
import java.io.OutputStream;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.asakusafw.compiler.batch.AbstractWorkflowProcessor;
import com.asakusafw.compiler.batch.WorkDescriptionProcessor;
import com.asakusafw.compiler.batch.Workflow;
import com.asakusafw.compiler.batch.processor.JobFlowWorkDescriptionProcessor;
import com.asakusafw.compiler.flow.jobflow.JobflowModel;
import com.asakusafw.directio.hive.info.InputInfo;
import com.asakusafw.directio.hive.info.LocationInfo;
import com.asakusafw.directio.hive.info.OutputInfo;
import com.asakusafw.directio.hive.info.TableInfo;
import com.asakusafw.runtime.directio.DataFormat;
import com.asakusafw.vocabulary.directio.DirectFileInputDescription;
import com.asakusafw.vocabulary.directio.DirectFileOutputDescription;
import com.asakusafw.vocabulary.external.ExporterDescription;
import com.asakusafw.vocabulary.external.ImporterDescription;

/**
 * Collects Hive table definitions and puts their schema information into compilation results.
 * @since 0.8.1
 */
public class HiveSchemaCollectorProcessor extends AbstractWorkflowProcessor {

    static final Logger LOG = LoggerFactory.getLogger(HiveSchemaCollectorProcessor.class);

    /**
     * The schema output base path.
     */
    public static final String PATH_BASE = "etc/hive-schema";

    /**
     * The input schema file path.
     */
    public static final String PATH_INPUT = PATH_BASE + "/input.json";

    /**
     * The output schema file path.
     */
    public static final String PATH_OUTPUT = PATH_BASE + "/output.json";
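    // Requires the jobflow description processor so that each workflow unit
    // has already been processed into a JobflowModel when process(Workflow) runs.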
    @Override
    public Collection<Class<? extends WorkDescriptionProcessor<?>>> getDescriptionProcessors() {
        List<Class<? extends WorkDescriptionProcessor<?>>> results = new ArrayList<>();
        results.add(JobFlowWorkDescriptionProcessor.class);
        return results;
    }

    @Override
    public void process(Workflow workflow) throws IOException {
        LOG.debug("collecting Hive inputs/outputs");
        Context context = new Context();
        processBatch(context, workflow);
        List<InputInfo> inputs = normalize(context.inputs);
        LOG.debug("generating Hive input table schema: {} entries", inputs.size());
        try (OutputStream stream = getEnvironment().openResource(PATH_INPUT)) {
            Persistent.write(InputInfo.class, inputs, stream);
        }
        List<OutputInfo> outputs = normalize(context.outputs);
        LOG.debug("generating Hive output table schema: {} entries", outputs.size());
        try (OutputStream stream = getEnvironment().openResource(PATH_OUTPUT)) {
            Persistent.write(OutputInfo.class, outputs, stream);
        }
    }

    private void processBatch(Context context, Workflow workflow) {
        for (Workflow.Unit unit : workflow.getGraph().getNodeSet()) {
            JobflowModel jobflow = (JobflowModel) unit.getProcessed();
            processJobflow(context, jobflow);
        }
    }

    private void processJobflow(Context context, JobflowModel jobflow) {
        LOG.debug("collecting Hive inputs/outputs from jobflow: {}", jobflow.getFlowId());
        for (JobflowModel.Import node : jobflow.getImports()) {
            ImporterDescription description = node.getDescription().getImporterDescription();
            if (description instanceof DirectFileInputDescription) {
                processInput(context, (DirectFileInputDescription) description);
            }
        }
        for (JobflowModel.Export node : jobflow.getExports()) {
            ExporterDescription description = node.getDescription().getExporterDescription();
            if (description instanceof DirectFileOutputDescription) {
                processOutput(context, (DirectFileOutputDescription) description);
            }
        }
    }

    private void processInput(Context context, DirectFileInputDescription description) {
        TableInfo info = processDataFormat(description.getFormat());
        if (info == null) {
            return;
        }
        context.inputs.add(new InputInfo(
                new LocationInfo(description.getBasePath(), description.getResourcePattern()),
                info));
    }

    private void processOutput(Context context, DirectFileOutputDescription description) {
        TableInfo info = processDataFormat(description.getFormat());
        if (info == null) {
            return;
        }
        context.outputs.add(new OutputInfo(
                new LocationInfo(description.getBasePath(), description.getResourcePattern()),
                info));
    }
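    // Resolves the Hive table schema from a Direct I/O data format class,
    // or returns null if the format does not provide Hive table information.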
    private TableInfo processDataFormat(Class<? extends DataFormat<?>> format) {
        if (TableInfo.Provider.class.isAssignableFrom(format) == false) {
            LOG.debug("not Hive table: {}", format.getName());
            return null;
        }
        LOG.debug("found Hive table: {}", format.getName());
        TableInfo.Provider provider;
        try {
            provider = format.asSubclass(TableInfo.Provider.class)
                    .getConstructor()
                    .newInstance();
        } catch (ReflectiveOperationException e) {
            LOG.warn(MessageFormat.format(
                    "error occurred while extracting Hive table schema: {0}",
                    format.getName()), e);
            return null;
        }
        TableInfo schema = provider.getSchema();
        if (schema == null) {
            return null;
        }
        LOG.debug("extracted Hive table: {} ({})", schema, provider);
        return schema;
    }

    private <T extends TableInfo.Provider> List<T> normalize(List<T> elements) {
        if (elements.size() <= 1) {
            return elements;
        }
        Set<T> saw = new HashSet<>();
        List<T> normalized = new ArrayList<>();
        for (T element : elements) {
            if (saw.contains(element)) {
                continue;
            }
            saw.add(element);
            normalized.add(element);
        }
        Collections.sort(normalized, (o1, o2) -> o1.getSchema().getName().compareTo(o2.getSchema().getName()));
        return normalized;
    }

    private static class Context {

        final List<InputInfo> inputs = new ArrayList<>();

        final List<OutputInfo> outputs = new ArrayList<>();

        Context() {
            return;
        }
    }
}