/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.builtin; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Map; import java.util.Iterator; import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.Log; import org.apache.pig.ExecType; import org.apache.pig.LoadFunc; import org.apache.pig.PigException; import org.apache.pig.StoreFunc; import org.apache.pig.ReversibleLoadStoreFunc; import org.apache.pig.backend.datastorage.DataStorage; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.io.BufferedPositionedInputStream; import org.apache.pig.impl.logicalLayer.schema.Schema; /** * A load and store function that parses a line of input into fields using a single-byte field * delimiter (tab by default) and writes tuples back out in the same delimited form. The delimiter * is matched as a literal byte, not as a regular expression. 
*/ public class PigStorage extends Utf8StorageConverter implements ReversibleLoadStoreFunc { protected BufferedPositionedInputStream in = null; protected final Log mLog = LogFactory.getLog(getClass()); long end = Long.MAX_VALUE; private byte recordDel = '\n'; private byte fieldDel = '\t'; private ByteArrayOutputStream mBuf = null; private ArrayList<Object> mProtoTuple = null; private int os; private static final int OS_UNIX = 0; private static final int OS_WINDOWS = 1; private static final String UTF8 = "UTF-8"; public PigStorage() { os = OS_UNIX; if (System.getProperty("os.name").toUpperCase().startsWith("WINDOWS")) os = OS_WINDOWS; } /** * Constructs a Pig loader that uses specified regex as a field delimiter. * * @param delimiter * the single byte character that is used to separate fields. * ("\t" is the default.) */ public PigStorage(String delimiter) { this(); if (delimiter.length() == 1) { this.fieldDel = (byte)delimiter.charAt(0); } else if (delimiter.length() > 1 && delimiter.charAt(0) == '\\') { switch (delimiter.charAt(1)) { case 't': this.fieldDel = (byte)'\t'; break; case 'x': case 'u': this.fieldDel = Integer.valueOf(delimiter.substring(2)).byteValue(); break; default: throw new RuntimeException("Unknown delimiter " + delimiter); } } else { throw new RuntimeException("PigStorage delimeter must be a single character"); } } public Tuple getNext() throws IOException { if (in == null || in.getPosition() > end) { return null; } if (mBuf == null) mBuf = new ByteArrayOutputStream(4096); mBuf.reset(); while (true) { // BufferedPositionedInputStream is buffered, so I don't need // to buffer. 
int b = in.read(); if (b == fieldDel) { readField(); } else if (b == recordDel) { readField(); //Tuple t = mTupleFactory.newTuple(mProtoTuple); Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple); mProtoTuple = null; return t; } else if (b == -1) { // hit end of file return null; } else { mBuf.write(b); } } } public void bindTo(String fileName, BufferedPositionedInputStream in, long offset, long end) throws IOException { this.in = in; this.end = end; // Since we are not block aligned we throw away the first // record and cound on a different instance to read it if (offset != 0) { getNext(); } } OutputStream mOut; public void bindTo(OutputStream os) throws IOException { mOut = os; } private void putField(Object field) throws IOException { //string constants for each delimiter String tupleBeginDelim = "("; String tupleEndDelim = ")"; String bagBeginDelim = "{"; String bagEndDelim = "}"; String mapBeginDelim = "["; String mapEndDelim = "]"; String fieldDelim = ","; String mapKeyValueDelim = "#"; switch (DataType.findType(field)) { case DataType.NULL: break; // just leave it empty case DataType.BOOLEAN: mOut.write(((Boolean)field).toString().getBytes()); break; case DataType.INTEGER: mOut.write(((Integer)field).toString().getBytes()); break; case DataType.LONG: mOut.write(((Long)field).toString().getBytes()); break; case DataType.FLOAT: mOut.write(((Float)field).toString().getBytes()); break; case DataType.DOUBLE: mOut.write(((Double)field).toString().getBytes()); break; case DataType.BYTEARRAY: { byte[] b = ((DataByteArray)field).get(); mOut.write(b, 0, b.length); break; } case DataType.CHARARRAY: // oddly enough, writeBytes writes a string mOut.write(((String)field).getBytes(UTF8)); break; case DataType.MAP: boolean mapHasNext = false; Map<Object, Object> m = (Map<Object, Object>)field; mOut.write(mapBeginDelim.getBytes(UTF8)); for(Object o: m.keySet()) { if(mapHasNext) { mOut.write(fieldDelim.getBytes(UTF8)); } else { mapHasNext = true; } putField(o); 
mOut.write(mapKeyValueDelim.getBytes(UTF8)); putField(m.get(o)); } mOut.write(mapEndDelim.getBytes(UTF8)); break; case DataType.TUPLE: boolean tupleHasNext = false; Tuple t = (Tuple)field; mOut.write(tupleBeginDelim.getBytes(UTF8)); for(int i = 0; i < t.size(); ++i) { if(tupleHasNext) { mOut.write(fieldDelim.getBytes(UTF8)); } else { tupleHasNext = true; } try { putField(t.get(i)); } catch (ExecException ee) { throw ee; } } mOut.write(tupleEndDelim.getBytes(UTF8)); break; case DataType.BAG: boolean bagHasNext = false; mOut.write(bagBeginDelim.getBytes(UTF8)); Iterator<Tuple> tupleIter = ((DataBag)field).iterator(); while(tupleIter.hasNext()) { if(bagHasNext) { mOut.write(fieldDelim.getBytes(UTF8)); } else { bagHasNext = true; } putField((Object)tupleIter.next()); } mOut.write(bagEndDelim.getBytes(UTF8)); break; default: { int errCode = 2108; String msg = "Could not determine data type of field: " + field; throw new ExecException(msg, errCode, PigException.BUG); } } } public void putNext(Tuple f) throws IOException { // I have to convert integer fields to string, and then to bytes. // If I use a DataOutputStream to convert directly from integer to // bytes, I don't get a string representation. int sz = f.size(); for (int i = 0; i < sz; i++) { Object field; try { field = f.get(i); } catch (ExecException ee) { throw ee; } putField(field); if (i == sz - 1) { // last field in tuple. mOut.write(recordDel); } else { mOut.write(fieldDel); } } } public void finish() throws IOException { } private void readField() { if (mProtoTuple == null) mProtoTuple = new ArrayList<Object>(); if (mBuf.size() == 0) { // NULL value mProtoTuple.add(null); } else { // TODO, once this can take schemas, we need to figure out // if the user requested this to be viewed as a certain // type, and if so, then construct it appropriately. byte[] array = mBuf.toByteArray(); if (array[array.length-1]=='\r' && os==OS_WINDOWS) { // This is a java 1.6 function. 
Until pig officially moves to // 1.6 we can't use this. // array = Arrays.copyOf(array, array.length-1); byte[] tmp = new byte[array.length - 1]; for (int i = 0; i < array.length - 1; i++) tmp[i] = array[i]; array = tmp; } if (array.length==0) mProtoTuple.add(null); else mProtoTuple.add(new DataByteArray(array)); } mBuf.reset(); } /* (non-Javadoc) * @see org.apache.pig.LoadFunc#determineSchema(java.lang.String, org.apache.pig.ExecType, org.apache.pig.backend.datastorage.DataStorage) */ public Schema determineSchema(String fileName, ExecType execType, DataStorage storage) throws IOException { // TODO Auto-generated method stub return null; } public void fieldsToRead(Schema schema) { // do nothing } public boolean equals(Object obj) { return equals((PigStorage)obj); } public boolean equals(PigStorage other) { return this.fieldDel == other.fieldDel; } /* (non-Javadoc) * @see org.apache.pig.StoreFunc#getStorePreparationClass() */ @Override public Class getStorePreparationClass() throws IOException { // TODO Auto-generated method stub return null; } }