package uk.bl.wa.hadoop.entities; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Logger; import uk.bl.wa.hadoop.ArchiveFileInputFormat; import uk.bl.wa.hadoop.mapreduce.FrequencyCountingReducer; /** * EntityExtractor: * * Extracts known entities from web archives. * * @author Andrew.Jackson@bl.uk */ @SuppressWarnings( { "deprecation" } ) public class EntityExtractor extends Configured implements Tool { private static Logger log = Logger.getLogger(EntityExtractor.class.getName()); public static final String REGEX_PATTERN_PARAM = "regex.pattern"; public int run( String[] args ) throws IOException { JobConf conf = new JobConf( getConf(), EntityExtractor.class ); log.info("Loading paths..."); String line = null; List<Path> paths = new ArrayList<Path>(); BufferedReader br = new BufferedReader( new FileReader( args[ 0 ] ) ); while( ( line = br.readLine() ) != null ) { paths.add( new Path( line ) ); } br.close(); log.info("Setting paths..."); FileInputFormat.setInputPaths( conf, paths.toArray(new Path[] {}) ); log.info("Set "+paths.size()+" InputPaths"); FileOutputFormat.setOutputPath( conf, new Path( args[ 1 ] ) ); conf.set( REGEX_PATTERN_PARAM, args[ 2 ] ); log.info("Set regex pattern = "+conf.get(REGEX_PATTERN_PARAM)); conf.setJobName( args[ 0 ] + "_" + System.currentTimeMillis() ); conf.setInputFormat( ArchiveFileInputFormat.class ); conf.setMapperClass( EntityMapper.class ); conf.setReducerClass( FrequencyCountingReducer.class ); conf.setOutputFormat( TextOutputFormat.class ); conf.setOutputKeyClass( Text.class ); conf.setOutputValueClass( Text.class ); // Override the maxiumum JobConf size so very large lists of files can be processed: // Default mapred.user.jobconf.limit=5242880 (5M), bump to 100 megabytes = 104857600 bytes. conf.set("mapred.user.jobconf.limit", "104857600"); // Manually set a large number of reducers: conf.setNumReduceTasks(50); // Run it: JobClient.runJob( conf ); return 0; } public static void main( String[] args ) throws Exception { if( args.length != 3 ) { System.out.println( "Need <input file list>, <output dir> and <regular expression>!" ); System.exit( 1 ); } int ret = ToolRunner.run( new EntityExtractor(), args ); System.exit( ret ); } }