/*
 * Copyright 2011 Internet Archive
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.archive.bacon.nutchwax;

import java.io.*;
import java.util.*;

import org.apache.hadoop.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;

import org.apache.pig.*;
import org.apache.pig.data.*;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.executionengine.ExecException;

import org.apache.nutch.parse.*;
import org.apache.nutch.metadata.Metadata;

/**
 * Apache Pig UDF to load metadata records from a Nutch(WAX) segment.
 *
 * The load location is a path (wildcards allowed) to one or more
 * Nutch(WAX) segment directories; the loader reads the 'parse_data'
 * sub-directory of each matching segment.
 *
 * It returns a Tuple for each metadata record, of the form:
 *   (url:chararray,
 *    digest:chararray,
 *    title:chararray,
 *    length:long,
 *    date:chararray,
 *    type:chararray,
 *    collection:chararray,
 *    boiled:chararray,
 *    links: { tuple(toUrl:chararray, anchor:chararray) }
 *   )
 *
 * A sample Pig Latin invocation is sketched at the bottom of this file.
 */
public class MetadataLoader extends LoadFunc {

    private RecordReader<Text,Writable> reader;
    private TupleFactory mTupleFactory = TupleFactory.getInstance();
    private BagFactory   mBagFactory   = BagFactory.getInstance();

    /**
     * The Nutch(WAX) "parse_data" segment is just a SequenceFile
     * with Text keys and Writable values.
     */
    public InputFormat getInputFormat( ) throws IOException {
        return new SequenceFileInputFormat<Text,Writable>( );
    }

    /**
     * Reads the next Nutch(WAX) metadata record and returns a Tuple
     * containing the metadata values.  Any null String values are
     * returned as "", but since 'length' is a Long, we return null if
     * there is no parsable length value for a record.
     */
    public Tuple getNext( ) throws IOException {
        try {
            // Advance through the input until we find a ParseData
            // record or exhaust the split.
            while ( this.reader.nextKeyValue( ) ) {
                Writable value = this.reader.getCurrentValue( );

                // Not a ParseData record; skip it.
                if ( ! ( value instanceof ParseData ) ) continue;

                ParseData pd   = (ParseData) value;
                Metadata  meta = pd.getContentMeta( );

                Tuple tuple = mTupleFactory.newTuple( );
                tuple.append( get( meta, "url"    ) );
                tuple.append( get( meta, "digest" ) );
                tuple.append( pd.getTitle( ) );

                try {
                    tuple.append( Long.valueOf( meta.get( "length" ) ) );
                }
                catch ( NumberFormatException nfe ) {
                    tuple.append( null );
                }

                tuple.append( get( meta, "date"       ) );
                tuple.append( get( meta, "type"       ) );
                tuple.append( get( meta, "collection" ) );
                tuple.append( get( meta, "boiled"     ) );

                DataBag links = mBagFactory.newDefaultBag( );
                for ( Outlink link : pd.getOutlinks( ) ) {
                    Tuple lt = mTupleFactory.newTuple( 2 );
                    lt.set( 0, link.getToUrl ( ) );
                    lt.set( 1, link.getAnchor( ) );
                    links.add( lt );
                }
                tuple.append( links );

                return tuple;
            }

            // End of input.
            return null;
        }
        catch ( InterruptedException e ) {
            // From the Pig example/howto code.
            int    errCode = 6018;
            String errMsg  = "Error while reading input";
            throw new ExecException( errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e );
        }
    }

    /**
     * Convenience function to ensure no nulls, only empty strings.
     */
    private String get( Metadata meta, String key ) {
        String value = meta.get( key );
        if ( value == null ) return "";
        return value;
    }

    /**
     * Save the given reader for use by getNext().  The 'split' is not
     * needed by this loader.
     */
    public void prepareToRead( RecordReader reader, PigSplit split ) throws IOException {
        this.reader = reader;
    }

    /**
     * The 'location' is a path string, which may contain wildcards.
     * Expand the wildcards and add the 'parse_data' sub-directory of
     * each matching path to the job's input paths.
     */
    public void setLocation( String location, Job job ) throws IOException {
        // Expand any filename globs.  globStatus() returns null when
        // the path matches nothing at all.
        FileStatus[] files = FileSystem.get( job.getConfiguration( ) ).globStatus( new Path( location ) );

        if ( files == null ) {
            throw new IOException( "Input path does not exist: " + location );
        }

        for ( FileStatus file : files ) {
            FileInputFormat.addInputPath( job, new Path( file.getPath( ), "parse_data" ) );
        }
    }

}
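
/*
 * A minimal Pig Latin usage sketch (not part of the compiled class).
 * The jar name, segment path, and aliases below are hypothetical;
 * adjust them for your deployment.
 *
 *   REGISTER bacon.jar;
 *
 *   meta = LOAD '/search/nutchwax/segments/*'
 *          USING org.archive.bacon.nutchwax.MetadataLoader()
 *          AS (url:chararray, digest:chararray, title:chararray,
 *              length:long, date:chararray, type:chararray,
 *              collection:chararray, boiled:chararray,
 *              links:{ t:(toUrl:chararray, anchor:chararray) });
 *
 *   -- The glob above names segment directories; setLocation() itself
 *   -- descends into each segment's 'parse_data' sub-directory.
 *   titles = FOREACH meta GENERATE url, title;
 *   DUMP titles;
 */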