/*
* Copyright 2011 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You
* may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.archive.bacon.nutchwax;
import java.io.*;
import java.util.*;
import org.apache.hadoop.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.pig.*;
import org.apache.pig.data.*;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.nutch.parse.*;
import org.apache.nutch.metadata.Metadata;
/**
* Apache Pig UDF to load metadata records from a Nutch(WAX) segment.
*
 * This loader expects the load location to be the path of one or more
 * Nutch(WAX) segment directories (globs are allowed); the 'parse_data'
 * sub-directory of each matching segment is read.
 *
 * It returns a Tuple for each document, of the form:
* (url:chararray,
* digest:chararray,
* title:chararray,
* length:long,
* date:chararray,
* type:chararray,
* collection:chararray,
* boiled:chararray,
* links: { tuple(toUrl:chararray, anchor:chararray) }
* )
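 *
 * A Pig Latin invocation might look something like this (a sketch; the
 * segment path is hypothetical):
 *
 *   meta = LOAD '/nutchwax/segments/*'
 *          USING org.archive.bacon.nutchwax.MetadataLoader()
 *          AS (url:chararray, digest:chararray, title:chararray,
 *              length:long, date:chararray, type:chararray,
 *              collection:chararray, boiled:chararray,
 *              links:{t:(toUrl:chararray, anchor:chararray)});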
*/
public class MetadataLoader extends LoadFunc
{
private RecordReader<Text,Writable> reader;
private TupleFactory mTupleFactory = TupleFactory.getInstance();
private BagFactory mBagFactory = BagFactory.getInstance();
/**
* The Nutch(WAX) "parse_data" segment is just a SequenceFile
* with Text keys and Writable values.
*/
public InputFormat getInputFormat( )
throws IOException
{
return new SequenceFileInputFormat<Text,Writable>( );
}
/**
* Reads a Nutch(WAX) metadata record and returns a Tuple containing
 * the metadata values.  Any null String values are returned as "",
 * but since 'length' is a Long, that field is set to null when a
 * record has no parseable length value.
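 *
 * For instance, a record might come back as (values hypothetical):
 *
 *   (http://example.org/page, sha1:ABCDEF, Example Page, 1234,
 *    20110101000000, text/html, test-collection, Example boilerplate,
 *    {(http://example.org/other, click here)})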
*/
public Tuple getNext( )
    throws IOException
{
    try
    {
        // Advance to the next record, skipping any value that is not a ParseData.
        while ( this.reader.nextKeyValue( ) )
        {
            Writable value = this.reader.getCurrentValue( );

            if ( ! ( value instanceof ParseData ) ) continue;

            ParseData pd   = (ParseData) value;
            Metadata  meta = pd.getContentMeta( );

            Tuple tuple = mTupleFactory.newTuple( );
            tuple.append( get( meta, "url"    ) );
            tuple.append( get( meta, "digest" ) );
            tuple.append( pd.getTitle( ) );

            // 'length' is a Long; append null when the record has no parseable length.
            try { tuple.append( Long.valueOf( meta.get( "length" ) ) ); } catch ( NumberFormatException nfe ) { tuple.append( null ); }

            tuple.append( get( meta, "date"       ) );
            tuple.append( get( meta, "type"       ) );
            tuple.append( get( meta, "collection" ) );
            tuple.append( get( meta, "boiled"     ) );

            // Collect the outlinks into a bag of (toUrl,anchor) tuples.
            DataBag links = mBagFactory.newDefaultBag( );
            for ( Outlink link : pd.getOutlinks( ) )
            {
                Tuple lt = mTupleFactory.newTuple( 2 );
                lt.set( 0, link.getToUrl ( ) );
                lt.set( 1, link.getAnchor( ) );
                links.add( lt );
            }
            tuple.append( links );

            return tuple;
        }

        // No more records in this split.
        return null;
    }
    catch ( InterruptedException e )
    {
        // From the Pig example/howto code.
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException( errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e );
    }
}
/**
* Convenience function to ensure no nulls, only empty strings.
*/
private String get( Metadata meta, String key )
{
String value = meta.get( key );
if ( value == null ) return "";
return value;
}
/**
 * Just save the given reader.  The 'split' is not needed by this loader.
*/
public void prepareToRead( RecordReader reader, PigSplit split )
throws IOException
{
this.reader = reader;
}
/**
* The 'location' is a path string, which could contain wildcards.
 * Expand the wildcards and add the 'parse_data' sub-directory of each
 * matching path to the input.
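 *
 * For example, a location of '/nutchwax/segments/2011*' (a hypothetical
 * path) would add '/nutchwax/segments/20110102030405/parse_data', and so
 * on, one 'parse_data' path per matching segment directory.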
*/
public void setLocation( String location, Job job )
throws IOException
{
    // Expand any filename globs, and add each matching segment's
    // 'parse_data' directory to the input paths.
    FileStatus[] files = FileSystem.get( job.getConfiguration( ) ).globStatus( new Path( location ) );

    // globStatus() returns null when nothing matches a non-glob path,
    // so fail with a clear message rather than an NPE.
    if ( files == null )
    {
        throw new IOException( "No paths match location: " + location );
    }

    for ( FileStatus file : files )
    {
        FileInputFormat.addInputPath( job, new Path( file.getPath( ), "parse_data" ) );
    }
}
}