package uk.bl.wa.hadoop.outlinks;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import uk.bl.wa.hadoop.ArchiveFileInputFormat;
/**
 * Hadoop driver that configures and submits the outlink-extraction job.
 *
 * <p>Expects two arguments (after any generic Hadoop options have been
 * stripped by {@link ToolRunner}):
 * <ol>
 * <li>a local text file listing one input path per line;</li>
 * <li>the job output directory.</li>
 * </ol>
 */
@SuppressWarnings( "deprecation" )
public class OutlinkExtractor extends Configured implements Tool {

    /** Message printed when the wrong number of arguments is supplied. */
    private static final String USAGE = "Need input file list and output dir!";

    /**
     * Builds the job configuration and submits the job.
     *
     * @param args {@code args[0]} is the file listing input paths,
     *             {@code args[1]} is the output directory
     * @return 0 once the job has been submitted, 1 if the arguments are invalid
     * @throws IOException if the input list cannot be read or the job
     *                     cannot be submitted
     */
    public int run( String[] args ) throws IOException {
        // Validate here rather than in main(): ToolRunner removes generic
        // options (e.g. -D key=value) before calling run(), so only at this
        // point does the argument count reflect the tool's own arguments.
        if( args == null || args.length != 2 ) {
            System.out.println( USAGE );
            return 1;
        }

        JobConf conf = new JobConf( getConf(), OutlinkExtractor.class );
        addInputPaths( conf, args[ 0 ] );
        FileOutputFormat.setOutputPath( conf, new Path( args[ 1 ] ) );

        // The timestamp suffix distinguishes repeated runs over the same list.
        conf.setJobName( args[ 0 ] + "_" + System.currentTimeMillis() );
        conf.setInputFormat( ArchiveFileInputFormat.class );
        conf.setMapperClass( OutlinkExtractorMapper.class );
        conf.setReducerClass( FrequencyCountingReducer.class );
        conf.setOutputFormat( TextOutputFormat.class );
        conf.setOutputKeyClass( Text.class );
        conf.setOutputValueClass( Text.class );

        // submitJob() is used instead of JobClient.runJob(): the job is
        // handed to the cluster and this method returns without waiting for
        // it to finish, so the return value does not reflect job success.
        new JobClient( conf ).submitJob( conf );
        return 0;
    }

    /**
     * Registers every path listed in {@code listFile} as a job input.
     * Lines are trimmed and blank lines are skipped, because an empty string
     * is not a valid {@link Path}. The reader is always closed.
     */
    private static void addInputPaths( JobConf conf, String listFile ) throws IOException {
        BufferedReader reader = new BufferedReader( new FileReader( listFile ) );
        try {
            String line;
            while( ( line = reader.readLine() ) != null ) {
                String entry = line.trim();
                if( entry.length() == 0 ) {
                    continue;
                }
                FileInputFormat.addInputPath( conf, new Path( entry ) );
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Command-line entry point; exits with the status returned by
     * {@link #run(String[])}.
     *
     * @param args optional generic Hadoop options followed by the input
     *             list file and the output directory
     * @throws Exception if the tool fails to run
     */
    public static void main( String[] args ) throws Exception {
        int ret = ToolRunner.run( new OutlinkExtractor(), args );
        System.exit( ret );
    }
}