/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.logicalLayer;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.pig.ExecType;
import org.apache.pig.LoadFunc;
import org.apache.pig.PigException;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.ProjectionMap;
import org.apache.pig.impl.plan.RequiredFields;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.MultiMap;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.impl.util.WrappedIOException;
import org.apache.pig.impl.logicalLayer.parser.ParseException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.SchemaMergeException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class LOLoad extends LogicalOperator {
    private static final long serialVersionUID = 2L;

    protected boolean splittable = true;

    private FileSpec mInputFileSpec;
    transient private LoadFunc mLoadFunc;
    private String mSchemaFile;
    private Schema mEnforcedSchema = null;
    transient private DataStorage mStorage;
    private ExecType mExecType;
    private static Log log = LogFactory.getLog(LOLoad.class);
    private Schema mDeterminedSchema = null;

    /**
     * @param plan
     *            LogicalPlan this operator is a part of.
     * @param key
     *            OperatorKey for this operator
     * @param inputFileSpec
     *            the file to be loaded
     * @param execType
     *            the execution mode @see org.apache.pig.ExecType
     * @param storage
     *            the underlying storage
     * @param splittable
     *            if the input file is splittable (.gz is not)
     */
    public LOLoad(LogicalPlan plan, OperatorKey key, FileSpec inputFileSpec,
            ExecType execType, DataStorage storage, boolean splittable)
            throws IOException {
        super(plan, key);
        mInputFileSpec = inputFileSpec;
        //mSchemaFile = schemaFile;
        // schemaFile is the input file since we are trying
        // to deduce the schema by looking at the input file
        mSchemaFile = inputFileSpec.getFileName();
        mStorage = storage;
        mExecType = execType;
        this.splittable = splittable;

        try {
            mLoadFunc = (LoadFunc)
                PigContext.instantiateFuncFromSpec(inputFileSpec.getFuncSpec());
        } catch (ClassCastException cce) {
            log.error(inputFileSpec.getFuncSpec() + " should implement the LoadFunc interface.");
            throw WrappedIOException.wrap(cce);
        } catch (Exception e) {
            throw WrappedIOException.wrap(e);
        }
    }

    public FileSpec getInputFile() {
        return mInputFileSpec;
    }

    public void setInputFile(FileSpec inputFileSpec) throws IOException {
        try {
            mLoadFunc = (LoadFunc)
                PigContext.instantiateFuncFromSpec(inputFileSpec.getFuncSpec());
        } catch (ClassCastException cce) {
            log.error(inputFileSpec.getFuncSpec() + " should implement the LoadFunc interface.");
            IOException ioe = new IOException(cce.getMessage());
            ioe.setStackTrace(cce.getStackTrace());
            throw ioe;
        } catch (Exception e) {
            IOException ioe = new IOException(e.getMessage());
            ioe.setStackTrace(e.getStackTrace());
            throw ioe;
        }
        mInputFileSpec = inputFileSpec;
    }

    public String getSchemaFile() {
        return mSchemaFile;
    }

    public LoadFunc getLoadFunc() {
        return mLoadFunc;
    }

    @Override
    public String name() {
        return "Load " + mKey.scope + "-" + mKey.id;
    }

    @Override
    public Schema getSchema() throws FrontendException {
        if (!mIsSchemaComputed) {
            // get the schema of the load function
            try {
                if (mEnforcedSchema != null) {
                    mSchema = mEnforcedSchema;
                    return mSchema;
                }

                if (null == mDeterminedSchema) {
                    mSchema = mLoadFunc.determineSchema(mSchemaFile, mExecType, mStorage);
                    mDeterminedSchema = mSchema;
                }
                mIsSchemaComputed = true;
            } catch (IOException ioe) {
                int errCode = 1018;
                String msg = "Problem determining schema during load";
                FrontendException fee = new FrontendException(msg, errCode,
                        PigException.INPUT, false, null, ioe);
                mIsSchemaComputed = false;
                mSchema = null;
                throw fee;
            }
        }
        return mSchema;
    }

    /* (non-Javadoc)
     * @see org.apache.pig.impl.logicalLayer.LogicalOperator#setSchema(org.apache.pig.impl.logicalLayer.schema.Schema)
     */
    @Override
    public void setSchema(Schema schema) throws FrontendException {
        // In general, operators don't generate their schema until they're
        // asked, so ask them to do it.
        try {
            getSchema();
        } catch (FrontendException ioe) {
            // It's fine, it just means we don't have a schema yet.
        }
        if (mSchema == null) {
            log.debug("Operator schema is null; Setting it to new schema");
            mSchema = schema;
        } else {
            log.debug("Reconciling schema");
            log.debug("mSchema: " + mSchema + " schema: " + schema);
            try {
                mSchema = mSchema.mergePrefixSchema(schema, true, true);
            } catch (SchemaMergeException e) {
                int errCode = 1019;
                String msg = "Unable to merge schemas";
                throw new FrontendException(msg, errCode,
                        PigException.INPUT, false, null, e);
            }
        }
    }

    @Override
    public boolean supportsMultipleInputs() {
        return false;
    }

    public void visit(LOVisitor v) throws VisitorException {
        v.visit(this);
    }

    public Schema getEnforcedSchema() {
        return mEnforcedSchema;
    }

    /***
     * Set this when user enforces schema
     * @param enforcedSchema
     */
    public void setEnforcedSchema(Schema enforcedSchema) {
        this.mEnforcedSchema = enforcedSchema;
    }

    public boolean isSplittable() {
        return splittable;
    }

    @Override
    public byte getType() {
        return DataType.BAG;
    }

    /**
     * @return the DeterminedSchema
     */
    public Schema getDeterminedSchema() {
        return mDeterminedSchema;
    }

    @Override
    public ProjectionMap getProjectionMap() {
        Schema outputSchema;
        try {
            outputSchema = getSchema();
        } catch (FrontendException fee) {
            return null;
        }

        if (outputSchema == null) {
            return null;
        }

        Schema inputSchema = null;

        List<LogicalOperator> predecessors =
            (ArrayList<LogicalOperator>) mPlan.getPredecessors(this);
        if (predecessors != null) {
            try {
                inputSchema = predecessors.get(0).getSchema();
            } catch (FrontendException fee) {
                return null;
            }
        } else {
            try {
                inputSchema = mLoadFunc.determineSchema(mSchemaFile, mExecType, mStorage);
            } catch (IOException ioe) {
                return null;
            }
        }

        if (inputSchema == null) {
            return null;
        }

        if (Schema.equals(inputSchema, outputSchema, false, true)) {
            // there is a one-to-one mapping between input and output schemas
            return new ProjectionMap(false);
        } else {
            MultiMap<Integer, Pair<Integer, Integer>> mapFields =
                new MultiMap<Integer, Pair<Integer, Integer>>();
            // compute the mapping assuming it's a prefix projection
            for (int i = 0; i < inputSchema.size(); ++i) {
                mapFields.put(i, new Pair<Integer, Integer>(0, i));
            }
            return new ProjectionMap(mapFields, null, null);
        }
    }

    @Override
    public List<RequiredFields> getRequiredFields() {
        List<RequiredFields> requiredFields = new ArrayList<RequiredFields>();
        requiredFields.add(new RequiredFields(false, true));
        return requiredFields;
    }

}
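
/*
 * Illustrative sketch of how a front end might wire an LOLoad into a logical
 * plan and query its schema. The scope/id values, the input path, and the
 * choice of PigStorage below are assumptions made for the example only, not
 * values taken from this class.
 *
 *     PigContext pigContext = new PigContext(ExecType.LOCAL, new Properties());
 *     LogicalPlan plan = new LogicalPlan();
 *
 *     FileSpec input = new FileSpec("/tmp/input.txt",                // assumed path
 *                                   new FuncSpec("PigStorage"));     // assumed load function
 *
 *     LOLoad load = new LOLoad(plan,
 *                              new OperatorKey("scope", 1L),         // assumed scope/id
 *                              input,
 *                              ExecType.LOCAL,
 *                              pigContext.getDfs(),                  // underlying DataStorage
 *                              true);                                // plain text is splittable
 *     plan.add(load);
 *
 *     // Delegates to LoadFunc.determineSchema() unless an enforced schema was set.
 *     Schema schema = load.getSchema();
 */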