/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.hadoop.pig;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.pig.StoreFunc;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.archive.hadoop.mapreduce.ZipNumAllOutputFormat;
/**
 * Very simple/minimal StoreFunc to write {key,value} pairs into IA's
 * "zip num" format, using the ZipNumAllOutputFormat class.
 *
 * Each stored tuple must contain one or two chararray fields; see
 * {@link #putNext(Tuple)} for how they are mapped onto the key and
 * value handed to the underlying record writer.
 */
public class ZipNumStorage extends StoreFunc
{
  /** Writer supplied by the framework through {@link #prepareToWrite(RecordWriter)}. */
  RecordWriter<Text, Text> writer;

  /**
   * Line count passed to the ZipNumAllOutputFormat constructor.
   * NOTE(review): presumably the number of lines per compressed
   * block -- confirm against ZipNumAllOutputFormat.
   */
  protected int numLines = 3000;

  // Reused for every record so that no new Text objects are allocated per tuple.
  Text key   = new Text();
  Text value = new Text();

  /**
   * Creates a storage function that uses the default line count (3000).
   */
  public ZipNumStorage()
  {
  }

  /**
   * Creates a storage function with an explicit line count.
   *
   * @param lines value handed to the ZipNumAllOutputFormat constructor
   */
  public ZipNumStorage( int lines )
  {
    numLines = lines;
  }

  /**
   * Returns a new ZipNumAllOutputFormat configured with {@link #numLines}.
   *
   * @return the output format used to write the stored records
   * @throws IOException declared by the StoreFunc contract
   */
  @Override
  public OutputFormat<Text, Text> getOutputFormat() throws IOException
  {
    return new ZipNumAllOutputFormat( numLines );
  }

  /**
   * Registers the store location as the job's file output path.
   *
   * @param location output path given in the Pig STORE statement
   * @param job      job whose output path is set
   * @throws IOException declared by the StoreFunc contract
   */
  @Override
  public void setStoreLocation( String location, Job job ) throws IOException
  {
    FileOutputFormat.setOutputPath( job, new Path( location ) );
  }

  /**
   * Remembers the record writer that {@link #putNext(Tuple)} writes to.
   *
   * @param writer record writer to use for all subsequent tuples
   * @throws IOException declared by the StoreFunc contract
   */
  @Override
  @SuppressWarnings( { "rawtypes", "unchecked" } ) // StoreFunc declares the parameter as a raw RecordWriter
  public void prepareToWrite( RecordWriter writer ) throws IOException
  {
    this.writer = writer;
  }

  /**
   * Writes one tuple as a (key,value) pair.
   *
   * Tuples can have either one or two chararray values.
   *
   * If there are 2, the first is written as the key and the second
   * as the value, both as-is.
   * If there is only 1, it is written whole as the value with an
   * empty key; it is NOT split.
   *
   * The underlying ZipNumOutputFormat requires that the data
   * be given as a (key,value) pair, with both the key and
   * value being Strings (Pig chararray).
   *
   * @param tuple tuple holding one or two chararray fields
   * @throws IOException if the tuple has any other number of fields,
   *         if a field is not a chararray, if the writer fails, or if
   *         the write is interrupted (the thread's interrupt status is
   *         restored before the exception is thrown)
   */
  @Override
  public void putNext( Tuple tuple ) throws IOException
  {
    int size = tuple.size();

    if ( size != 1 && size != 2 )
      {
        throw new IOException( "Invalid tuple size, must be 1 or 2: " + size );
      }

    requireChararray( tuple, 0 );

    if ( size == 2 )
      {
        requireChararray( tuple, 1 );

        this.key  .set( (String) tuple.get(0) );
        this.value.set( (String) tuple.get(1) );
      }
    else
      {
        // Single field: no key, the whole string is the value.
        this.key  .set( "" );
        this.value.set( (String) tuple.get(0) );
      }

    try
      {
        this.writer.write( this.key, this.value );
      }
    catch ( InterruptedException ie )
      {
        // Keep the interrupt visible to callers further up the stack.
        Thread.currentThread().interrupt();
        throw new IOException( ie );
      }
  }

  /**
   * Verifies that the field at the given index is a chararray.
   *
   * The failure message lists only the fields the tuple really has,
   * so reporting a bad one-field tuple never reads past its end.
   */
  private static void requireChararray( Tuple tuple, int index ) throws IOException
  {
    byte type = DataType.findType( tuple.get( index ) );

    if ( type == DataType.CHARARRAY )
      {
        return;
      }

    StringBuilder message = new StringBuilder();
    message.append( "Invalid type for tuple " ).append( index )
           .append( ", not CHARARRAY: " ).append( DataType.findTypeName( type ) );

    for ( int i = 0; i < tuple.size(); i++ )
      {
        message.append( ':' ).append( tuple.get( i ) );
      }

    throw new IOException( message.toString() );
  }
}