package uk.bl.wa.hadoop.indexer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.zookeeper.KeeperException;

import com.typesafe.config.Config;
import com.typesafe.config.ConfigException;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions;
import com.typesafe.config.ConfigValueFactory;

import uk.bl.wa.apache.solr.hadoop.Solate;
import uk.bl.wa.hadoop.ArchiveFileInputFormat;
import uk.bl.wa.hadoop.TextOutputFormat;
import uk.bl.wa.solr.SolrWebServer;
import uk.bl.wa.util.ConfigPrinter;

/**
 * WARCIndexerRunner
 * 
 * Extracts text/metadata from a series of Archive files.
 * 
 * @author rcoram
 */
@SuppressWarnings({ "deprecation" })
public class WARCIndexerRunner extends Configured implements Tool {
    private static final Log LOG = LogFactory.getLog(WARCIndexerRunner.class);
    private static final String CLI_USAGE = "[-i <input file>] [-o <output dir>] [-c <config file>] [-d] [Dump config.] [-w] [Wait for completion.] [-x] [output XML in OAI-PMH format]";
    private static final String CLI_HEADER = "WARCIndexerRunner - MapReduce method for extracting metadata/text from Archive Records";
    public static final String CONFIG_PROPERTIES = "warc_indexer_config";
    public static final String CONFIG_APPLY_ANNOTATIONS = "warc.applyAnnotations";

    protected static String solrHomeZipName = "solr_home.zip";

    private String inputPath;
    private String outputPath;
    private String configPath;
    private boolean wait;
    private boolean dumpConfig;
    private boolean exportXml;
    private boolean applyAnnotations;

    /**
     * Builds up the JobConf from the command-line arguments and the
     * configuration file.
     * 
     * @param conf the job configuration to populate
     * @param args the command-line arguments
     * @throws IOException
     * @throws ParseException
     * @throws InterruptedException
     * @throws KeeperException
     */
    protected void createJobConf(JobConf conf, String[] args)
            throws IOException, ParseException, KeeperException,
            InterruptedException {
        // Parse the command-line parameters.
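        // For reference, an illustrative HOCON snippet for the -c config
        // file, covering the "warc" keys read below (values here are
        // hypothetical examples, not project defaults):
        //
        //   warc {
        //     title: "Example collection"
        //     solr { use_hash_url_id: false, hdfs: false }
        //     hadoop { num_reducers: 10 }
        //   }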
        this.setup(args, conf);

        // Store application properties where the mappers/reducers can access
        // them:
        Config index_conf;
        if (this.configPath != null) {
            index_conf = ConfigFactory.parseFile(new File(this.configPath));
        } else {
            index_conf = ConfigFactory.load();
        }
        if (this.dumpConfig) {
            ConfigPrinter.print(index_conf);
            System.exit(0);
        }
        // Decide whether to apply annotations:
        index_conf = index_conf.withValue(CONFIG_APPLY_ANNOTATIONS,
                ConfigValueFactory.fromAnyRef(applyAnnotations));
        // Store the properties:
        conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root()
                .render(ConfigRenderOptions.concise()));
        LOG.info("Loaded warc config.");
        LOG.info(index_conf.getString("warc.title"));
        if (index_conf.getBoolean("warc.solr.use_hash_url_id")) {
            LOG.info("Using hash-based ID.");
        }
        if (index_conf.hasPath("warc.solr.zookeepers")) {
            LOG.info("Using Zookeepers.");
        } else {
            LOG.info("Using SolrServers.");
        }

        // Turn off reduce-side speculative execution, to avoid duplicate
        // submissions to Solr:
        conf.set("mapred.reduce.tasks.speculative.execution", "false");

        // The reducer count depends on the number of concurrent HTTP
        // connections the Solr server can handle:
        int numReducers = 1;
        try {
            numReducers = index_conf.getInt("warc.hadoop.num_reducers");
        } catch (ConfigException e) {
            // Config.getInt throws ConfigException (not
            // NumberFormatException) when the key is missing or malformed;
            // fall back to a default in that case:
            numReducers = 10;
        }

        // Add input paths, one per line of the input file list:
        LOG.info("Reading input files...");
        String line = null;
        BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
        try {
            while ((line = br.readLine()) != null) {
                FileInputFormat.addInputPath(conf, new Path(line));
            }
        } finally {
            br.close();
        }
        LOG.info("Read " + FileInputFormat.getInputPaths(conf).length
                + " input files.");

        FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

        conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
        conf.setInputFormat(ArchiveFileInputFormat.class);
        conf.setMapperClass(WARCIndexerMapper.class);
        conf.setReducerClass(WARCIndexerReducer.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.set("map.output.key.field.separator", "");

        // Compress the output from the maps, to cut down temp space
        // requirements between map and reduce. The "mapreduce.*" property may
        // be the wrong syntax for 0.20.x, so the older "mapred.*" form is set
        // as well:
        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.set("mapred.compress.map.output", "true");
        // conf.set("mapred.map.output.compression.codec",
        // "org.apache.hadoop.io.compress.GzipCodec");

        // Ensure the JARs we provide take precedence over ones from Hadoop:
        conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

        // If we are indexing into HDFS, we need a copy of the config:
        if (index_conf.getBoolean("warc.solr.hdfs")) {
            // Grab the Solr config from ZK and cache it for use during the
            // job:
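            // Two cases below: if a ZooKeeper quorum is configured, the Solr
            // home for the named collection is fetched and cached; otherwise
            // Solate.cacheSolrHome is called with nulls, presumably falling
            // back to a default/local Solr home (behaviour inferred from the
            // call signature, not verified here).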
            if (index_conf.hasPath(SolrWebServer.CONF_ZOOKEEPERS)) {
                Solate.cacheSolrHome(conf,
                        index_conf.getString(SolrWebServer.CONF_ZOOKEEPERS),
                        index_conf.getString(SolrWebServer.COLLECTION),
                        solrHomeZipName);
            } else {
                Solate.cacheSolrHome(conf, null, null, solrHomeZipName);
            }
            // TODO Check num_shards == num reducers.
            // Note that we need this to ensure FileSystem.get is thread-safe:
            // @see https://issues.apache.org/jira/browse/HDFS-925
            // @see https://mail-archives.apache.org/mod_mbox/hadoop-user/201208.mbox/%3CCA+4kjVt-QE2L83p85uELjWXiog25bYTKOZXdc1Ahun+oBSJYpQ@mail.gmail.com%3E
            conf.setBoolean("fs.hdfs.impl.disable.cache", true);
        }

        conf.setBoolean("mapred.output.oai-pmh", this.exportXml);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapOutputKeyClass(IntWritable.class);
        conf.setMapOutputValueClass(WritableSolrRecord.class);
        conf.setNumReduceTasks(numReducers);
    }

    /**
     * Run the job: either block until it completes (-w) or submit it and
     * return immediately.
     * 
     * @throws IOException
     * @throws ParseException
     * @throws KeeperException
     * @throws InterruptedException
     */
    public int run(String[] args) throws IOException, ParseException,
            KeeperException, InterruptedException {
        // Set up the base conf:
        JobConf conf = new JobConf(getConf(), WARCIndexerRunner.class);
        // Get the job configuration:
        this.createJobConf(conf, args);
        // Submit it:
        if (this.wait) {
            JobClient.runJob(conf);
        } else {
            JobClient client = new JobClient(conf);
            client.submitJob(conf);
        }
        return 0;
    }

    private void setup(String[] args, JobConf conf) throws ParseException {
        // Process Hadoop args first:
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();

        // Process the remaining args like this:
        Options options = new Options();
        options.addOption("i", true, "input file list");
        options.addOption("o", true, "output directory");
        options.addOption("c", true, "path to configuration");
        options.addOption("w", false, "wait for job to finish");
        options.addOption("d", false, "dump configuration");
        options.addOption("x", false, "output XML in OAI-PMH format");
        options.addOption("a", false,
                "apply annotations from fixed-name files, via '-files annotations.json,openAccessSurts.txt'");
        // TODO: Problematic with "hadoop jar"?
        // Starting with the GenericOptionsParser (above) should resolve this?
        // options.addOption( OptionBuilder.withArgName( "property=value"
        // ).hasArgs( 2 ).withValueSeparator().withDescription(
        // "use value for given property" ).create( "D" ) );

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = parser.parse(options, otherArgs);
        if (!cmd.hasOption("i") || !cmd.hasOption("o")) {
            HelpFormatter helpFormatter = new HelpFormatter();
            helpFormatter.setWidth(80);
            helpFormatter.printHelp(CLI_USAGE, CLI_HEADER, options, "");
            System.exit(1);
        }
        this.inputPath = cmd.getOptionValue("i");
        this.outputPath = cmd.getOptionValue("o");
        this.wait = cmd.hasOption("w");
        if (cmd.hasOption("c")) {
            this.configPath = cmd.getOptionValue("c");
        }
        this.dumpConfig = cmd.hasOption("d");
        this.exportXml = cmd.hasOption("x");
        this.applyAnnotations = cmd.hasOption("a");
    }

    /**
     * Command-line entry point: runs the tool via Hadoop's ToolRunner.
     * 
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new WARCIndexerRunner(), args);
        System.exit(ret);
    }
}
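// Example invocation (illustrative only; the jar name and paths are
// assumptions, the flags are as defined in setup() above):
//
//   hadoop jar warc-hadoop-indexer-job.jar \
//       uk.bl.wa.hadoop.indexer.WARCIndexerRunner \
//       -i warc-list.txt -o /user/example/warc-index-output -c indexer.conf -w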