/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.builtin;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.pig.FileInputLoadFunc;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadPushDown;
import org.apache.pig.PigException;
import org.apache.pig.ResourceSchema;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreFuncInterface;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextOutputFormat;
import org.apache.pig.bzip2r.Bzip2TextInputFormat;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.StorageUtil;
import org.apache.pig.impl.util.UDFContext;

/**
 * A load function that parses a line of input into fields using a
 * single-character delimiter to separate the fields. The delimiter defaults
 * to the tab character ('\t') and can be overridden through the constructor.
 * Fields are loaded as {@link DataByteArray} values; a zero-length field is
 * loaded as null. When used to store data, fields are written out separated
 * by the same delimiter.
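 * <p>
 * A brief Pig Latin sketch of typical use; the file, relation, and codec
 * names below are illustrative only:
 * <pre>
 * A = LOAD 'student_data' USING PigStorage(':');
 * STORE A INTO 'output' USING PigStorage();  -- tab-delimited output
 *
 * -- optionally compress stored output via job properties
 * SET output.compression.enabled true;
 * SET output.compression.codec org.apache.hadoop.io.compress.GzipCodec;
 * </pre>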
 */
@SuppressWarnings("unchecked")
public class PigStorage extends FileInputLoadFunc implements
        StoreFuncInterface, LoadPushDown {
    protected RecordReader in = null;
    protected RecordWriter writer = null;

    protected final Log mLog = LogFactory.getLog(getClass());
    private String signature;

    private byte fieldDel = '\t';
    private ArrayList<Object> mProtoTuple = null;
    private TupleFactory mTupleFactory = TupleFactory.getInstance();
    private String loadLocation;

    public PigStorage() {
    }

    private boolean[] mRequiredColumns = null;

    private boolean mRequiredColumnsInitialized = false;

    /**
     * Constructs a Pig loader that uses the specified character as a field
     * delimiter.
     *
     * @param delimiter
     *            the single byte character that is used to separate fields.
     *            ("\t" is the default.)
     */
    public PigStorage(String delimiter) {
        this();
        fieldDel = StorageUtil.parseFieldDel(delimiter);
    }

    @Override
    public Tuple getNext() throws IOException {
        mProtoTuple = new ArrayList<Object>();

        // Lazily pick up the columns requested through projection push-down;
        // they are passed from the frontend via the UDFContext properties.
        if (!mRequiredColumnsInitialized) {
            if (signature != null) {
                Properties p = UDFContext.getUDFContext()
                        .getUDFProperties(this.getClass());
                mRequiredColumns = (boolean[]) ObjectSerializer
                        .deserialize(p.getProperty(signature));
            }
            mRequiredColumnsInitialized = true;
        }
        try {
            boolean notDone = in.nextKeyValue();
            if (!notDone) {
                return null;
            }
            Text value = (Text) in.getCurrentValue();
            byte[] buf = value.getBytes();
            int len = value.getLength();
            int start = 0;
            int fieldID = 0;

            // Scan the line byte by byte; each delimiter ends a field.
            // Fields pruned by projection push-down are skipped.
            for (int i = 0; i < len; i++) {
                if (buf[i] == fieldDel) {
                    if (mRequiredColumns == null
                            || (mRequiredColumns.length > fieldID
                                    && mRequiredColumns[fieldID]))
                        readField(buf, start, i);
                    start = i + 1;
                    fieldID++;
                }
            }
            // pick up the last field
            if (start <= len
                    && (mRequiredColumns == null
                            || (mRequiredColumns.length > fieldID
                                    && mRequiredColumns[fieldID]))) {
                readField(buf, start, len);
            }
            Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
            return t;
        } catch (InterruptedException e) {
            int errCode = 6018;
            String errMsg = "Error while reading input";
            throw new ExecException(errMsg, errCode,
                    PigException.REMOTE_ENVIRONMENT, e);
        }
    }

    @Override
    public void putNext(Tuple f) throws IOException {
        try {
            writer.write(null, f);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    private void readField(byte[] buf, int start, int end) {
        if (start == end) {
            // NULL value
            mProtoTuple.add(null);
        } else {
            mProtoTuple.add(new DataByteArray(buf, start, end));
        }
    }

    @Override
    public RequiredFieldResponse pushProjection(
            RequiredFieldList requiredFieldList) throws FrontendException {
        if (requiredFieldList == null)
            return null;
        if (requiredFieldList.getFields() != null) {
            // Record the requested columns in a boolean mask sized to the
            // highest requested index.
            int lastColumn = -1;
            for (RequiredField rf : requiredFieldList.getFields()) {
                if (rf.getIndex() > lastColumn) {
                    lastColumn = rf.getIndex();
                }
            }
            mRequiredColumns = new boolean[lastColumn + 1];
            for (RequiredField rf : requiredFieldList.getFields()) {
                if (rf.getIndex() != -1)
                    mRequiredColumns[rf.getIndex()] = true;
            }
            Properties p = UDFContext.getUDFContext()
                    .getUDFProperties(this.getClass());
            try {
                // Serialize the mask so the backend instance of this loader
                // can retrieve it in getNext().
                p.setProperty(signature,
                        ObjectSerializer.serialize(mRequiredColumns));
            } catch (Exception e) {
                throw new RuntimeException("Cannot serialize mRequiredColumns", e);
            }
        }
        return new RequiredFieldResponse(true);
    }

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof PigStorage)
            return equals((PigStorage) obj);
        else
            return false;
    }

    public boolean equals(PigStorage other) {
        return this.fieldDel == other.fieldDel;
    }

    @Override
    public InputFormat getInputFormat() {
        // Use Pig's bzip2-aware input format for .bz/.bz2 input; everything
        // else goes through the standard Pig text input format.
        if (loadLocation.endsWith(".bz2") || loadLocation.endsWith(".bz")) {
            return new Bzip2TextInputFormat();
        } else {
            return new PigTextInputFormat();
        }
    }

    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) {
        in = reader;
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        loadLocation = location;
        FileInputFormat.setInputPaths(job, location);
    }

    @Override
    public OutputFormat getOutputFormat() {
        return new PigTextOutputFormat(fieldDel);
    }

    @Override
    public void prepareToWrite(RecordWriter writer) {
        this.writer = writer;
    }

    /**
     * Sets the output path and configures output compression. Compression is
     * enabled either explicitly, through the "output.compression.enabled" and
     * "output.compression.codec" job properties, or implicitly, from the
     * extension of the output location (.bz/.bz2 for bzip2, .gz for gzip).
     */
    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
        // The whole tuple is written as the value, so suppress
        // TextOutputFormat's default key/value separator.
        job.getConfiguration().set("mapred.textoutputformat.separator", "");
        FileOutputFormat.setOutputPath(job, new Path(location));
        if ("true".equals(job.getConfiguration().get(
                "output.compression.enabled"))) {
            FileOutputFormat.setCompressOutput(job, true);
            String codec = job.getConfiguration().get(
                    "output.compression.codec");
            try {
                FileOutputFormat.setOutputCompressorClass(job,
                        (Class<? extends CompressionCodec>) Class.forName(codec));
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Class not found: " + codec, e);
            }
        } else {
            // If compression is not requested explicitly, infer the codec
            // from the extension of the output location.
            if (location.endsWith(".bz2") || location.endsWith(".bz")) {
                FileOutputFormat.setCompressOutput(job, true);
                FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
            } else if (location.endsWith(".gz")) {
                FileOutputFormat.setCompressOutput(job, true);
                FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
            } else {
                // Default: no compression.
                FileOutputFormat.setCompressOutput(job, false);
            }
        }
    }

    @Override
    public void checkSchema(ResourceSchema s) throws IOException {
        // No-op: PigStorage can store tuples of any schema.
    }

    @Override
    public String relToAbsPathForStoreLocation(String location, Path curDir)
            throws IOException {
        return LoadFunc.getAbsolutePath(location, curDir);
    }

    @Override
    public int hashCode() {
        return (int) fieldDel;
    }

    @Override
    public void setUDFContextSignature(String signature) {
        this.signature = signature;
    }

    @Override
    public List<OperatorSet> getFeatures() {
        return Arrays.asList(LoadPushDown.OperatorSet.PROJECTION);
    }

    @Override
    public void setStoreFuncUDFContextSignature(String signature) {
        // No store-side state is kept in the UDFContext.
    }

    @Override
    public void cleanupOnFailure(String location, Job job) throws IOException {
        StoreFunc.cleanupOnFailureImpl(location, job);
    }
}