WARCIndexerJob.java example

Explorer

webarchive-discovery-master
- digipres-tika
  - src
    - main
      - java
        org
        archive
        io
        arc
        UncompressedARCReader.java
        warc
        UncompressedWARCReader.java
        uk
        bl
        wa
        dpt
        DigiPresTika.java
        tika
        CallableTest.java
        NonRecursiveEmbeddedDocumentExtractor.java
        PreservationParser.java
        TikaDeepIdentifier.java
        TikaNanite.java
        detect
        HighlightJSDetector.java
        parser
        imagefeatures
        ColourExtractor.java
        ColourMatcher.java
        FaceDetectionParser.java
        iso9660
        ISO9660Extractor.java
        ISO9660Parser.java
        ole2
        OLE2Parser.java
        pdf
        PDFParser.java
        XMPSchemaPDFA.java
        itext
        PDFParser.java
        pdfbox
        PDF2XHTML.java
        PDFParser.java
        warc
        ARCParser.java
        WARCParser.java
        WebARCExtractor.java
    - test
      - java
        eu
        scape_project
        pc
        cc
        nanite
        tika
        PreservationParserTest.java
        uk
        bl
        wap
        tika
        TikaCustomMimeTypesTest.java
        parser
        pdf
        pdfbox
        PDFParserTest.java
- solr-shade
  - src
    - main
      - java
        org
        apache
        solr
        core
        HdfsDirectoryFactory.java
        store
        hdfs
        HdfsDirectory.java
        HdfsFileReader.java
        HdfsFileWriter.java
        HdfsLockFactory.java
        NullIndexOutput.java
        update
        HdfsTransactionLog.java
        HdfsUpdateLog.java
        util
        HdfsUtil.java
- warc-discovery-shine
  - shine
    - app
      - uk
        bl
        wa
        shine
        Query.java
        Rescued.java
        Shine.java
        Solr.java
        URIStatusLookup.java
- warc-hadoop-crawler-utils
  - src
    - main
      - java
        uk
        bl
        wa
        hadoop
        recrawl
        PersistLogBuilder.java
        PersistLogMapper.java
- warc-hadoop-indexer
  - src
    - main
      - java
        uk
        bl
        wa
        apache
        solr
        hadoop
        AlphaNumericComparator.java
        DataInputInputStream.java
        DataOutputOutputStream.java
        Solate.java
        SolrInputDocumentWritable.java
        UnbufferedDataInputInputStream.java
        WARCIndexerJob.java
        WebArchiveFileInputFormat.java
        WebArchiveFileRecordReader.java
        WebArchiveIndexerMapper.java
        Zipper.java
        ZooKeeperInspector.java
        hadoop
        entities
        EntityExtractor.java
        EntityMapper.java
        indexer
        MetadataBuilder.java
        WARCIndexerMapper.java
        WARCIndexerReducer.java
        WARCIndexerRunner.java
        WritableSolrRecord.java
        mdx
        MDXSeqSampleGenerator.java
        MDXSeqStatsGenerator.java
        WARCMDXGenerator.java
        WARCMDXMapper.java
        outlinks
        FrequencyCountingReducer.java
        OutlinkExtractor.java
        OutlinkExtractorMapper.java
        solr
        QueueingHttpSolrServer.java
    - test
      - java
        org
        apache
        solr
        hadoop
        SolateTest.java
        uk
        bl
        wa
        hadoop
        indexer
        WARCIndexerRunnerIntegrationTest.java
        mdx
        MDXSeqSampleGeneratorIntegrationTest.java
        MDXSeqStatsGeneratorIntegrationTest.java
        MDXSequenceIteratorTest.java
        WARCMDXGeneratorIntegrationTest.java
        WARCMDXMapperTest.java
- warc-hadoop-recordreaders
  - src
    - main
      - java
        org
        apache
        hadoop
        mapreduce
        lib
        partition
        TotalOrderPartitioner.java
        archive
        hadoop
        mapreduce
        AlphaPartitioner.java
        uk
        bl
        wa
        hadoop
        ArchiveFileInputFormat.java
        ArchiveFileRecordReader.java
        TextOutputFormat.java
        WARCRecordUtils.java
        WritableArchiveRecord.java
        hosts
        HostsReport.java
        HostsReportMapper.java
        HostsReportReducer.java
        mapred
        ByteBlockRecordReader.java
        UnsplittableInputFileFormat.java
        mapreduce
        FrequencyCountingReducer.java
        MutableInt.java
        cdx
        ArchiveCDXGenerator.java
        TinyCDXSender.java
        TinyCDXServerMapper.java
        TinyCDXServerReducer.java
        hash
        HdfsFileHasher.java
        ShaSumMapper.java
        io
        TextOutputFormat.java
        lib
        ArchiveToCDXFileInputFormat.java
        DereferencingArchiveToCDXRecordReader.java
        input
        ByteBlockRecordReader.java
        UnsplittableInputFileFormat.java
        mdx
        MDX.java
        MDXMerger.java
        MDXReduplicatingReducer.java
        MDXSeqIterator.java
        MDXSeqMerger.java
        warcstats
        WARCRawStatsMDXGenerator.java
        WARCRawStatsMapper.java
        WARCStatsMapper.java
        WARCStatsTool.java
        regex
        WARCRegexIndexer.java
        WARCRegexMapper.java
        WARCRegexReducer.java
    - test
      - java
        uk
        bl
        wa
        hadoop
        mapreduce
        MapReduceTestBaseClass.java
        hash
        HdsfFileHasherIntegrationTest.java
        lib
        DereferencingArchiveToCDXRecordReaderTest.java
        warcstats
        WARCStatsToolIntegrationTest.java
- warc-indexer
  - src
    - main
      - java
        uk
        bl
        wa
        analyser
        payload
        ARCNameAnalyser.java
        AbstractPayloadAnalyser.java
        HTMLAnalyser.java
        ImageAnalyser.java
        PDFAnalyser.java
        WARCPayloadAnalysers.java
        XMLAnalyser.java
        text
        AbstractTextAnalyser.java
        FuzzyHashAnalyser.java
        LanguageAnalyser.java
        PostcodeAnalyser.java
        SentimentJTextAnalyser.java
        TextAnalysers.java
        lang
        LanguageIdentifier.java
        LanguageProfile.java
        ProfilingWriter.java
        package-info.java
        annotation
        Annotations.java
        AnnotationsFromAct.java
        Annotator.java
        CollectionsUpdateTest.java
        DateRange.java
        UriCollection.java
        extract
        LanguageDetector.java
        LinkExtractor.java
        PostcodeGeomapper.java
        Postcodes.java
        Times.java
        indexer
        WARCIndexer.java
        WARCIndexerCommand.java
        parsers
        ApachePreflightParser.java
        HtmlFeatureParser.java
        XMLRootNamespaceParser.java
        solr
        SolrFields.java
        SolrRecord.java
        SolrWebServer.java
        TikaExtractor.java
        WctEnricher.java
        WctFields.java
        util
        ColourMatcher.java
        ConfigPrinter.java
        DebugCloseShieldInputStream.java
        HashedCachedInputStream.java
        InputStreamDataSource.java
        Instrument.java
    - test
      - java
        uk
        bl
        wa
        analyser
        payload
        DroidDetectorTest.java
        HTMLAnalyserTest.java
        WARCPayloadAnalysersTest.java
        annotation
        AnnotationsTest.java
        AnnotatorTest.java
        extract
        LanguageDetectorTest.java
        indexer
        WARCIndexerEmbeddedSolrTest.java
        WARCIndexerTest.java
        parsers
        HtmlFeatureParserTest.java
        solr
        TikaExtractorTest.java
        util
        ColourMatcherTest.java
        InstrumentTest.java
- warc-nlp
  - src
    - main
      - java
        uk
        bl
        wa
        hadoop
        mapreduce
        nlp
        WARCSentenceMapper.java
        WARCWord2VecGenerator.java
        Word2VecReducer.java
        nlp
        analysers
        GateTextAnalyser.java
        StanfordAnalyser.java
        parsers
        StanfordAnnotatorParser.java
        wordvec
        StanfordSentenceIterator.java
        WordvecProcessor.java
    - test
      - java
        uk
        bl
        wa
        nlp
        parsers
        StanfordAnnotatorParserTest.java

package uk.bl.wa.apache.solr.hadoop;
///*
// * Licensed to the Apache Software Foundation (ASF) under one or more
// * contributor license agreements.  See the NOTICE file distributed with
// * this work for additional information regarding copyright ownership.
// * The ASF licenses this file to You under the Apache License, Version 2.0
// * (the "License"); you may not use this file except in compliance with
// * the License.  You may obtain a copy of the License at
// *
// *     http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// */
//package org.apache.solr.hadoop;
//
//import java.io.BufferedInputStream;
//import java.io.BufferedReader;
//import java.io.BufferedWriter;
//import java.io.File;
//import java.io.FileInputStream;
//import java.io.FileNotFoundException;
//import java.io.IOException;
//import java.io.InputStream;
//import java.io.InputStreamReader;
//import java.io.OutputStreamWriter;
//import java.io.Writer;
//import java.text.NumberFormat;
//import java.util.ArrayList;
//import java.util.Arrays;
//import java.util.Collections;
//import java.util.Comparator;
//import java.util.List;
//import java.util.Locale;
//import java.util.Map;
//
//import net.sourceforge.argparse4j.ArgumentParsers;
//import net.sourceforge.argparse4j.impl.Arguments;
//import net.sourceforge.argparse4j.impl.action.HelpArgumentAction;
//import net.sourceforge.argparse4j.impl.choice.RangeArgumentChoice;
//import net.sourceforge.argparse4j.impl.type.FileArgumentType;
//import net.sourceforge.argparse4j.inf.Argument;
//import net.sourceforge.argparse4j.inf.ArgumentGroup;
//import net.sourceforge.argparse4j.inf.ArgumentParser;
//import net.sourceforge.argparse4j.inf.ArgumentParserException;
//import net.sourceforge.argparse4j.inf.FeatureControl;
//import net.sourceforge.argparse4j.inf.Namespace;
//
//import org.apache.hadoop.conf.Configuration;
//import org.apache.hadoop.conf.Configured;
//import org.apache.hadoop.fs.FSDataOutputStream;
//import org.apache.hadoop.fs.FileStatus;
//import org.apache.hadoop.fs.FileSystem;
//import org.apache.hadoop.fs.Path;
//import org.apache.hadoop.fs.PathFilter;
//import org.apache.hadoop.io.NullWritable;
//import org.apache.hadoop.io.Text;
//import org.apache.hadoop.mapred.JobClient;
//import org.apache.hadoop.mapreduce.Job;
//import org.apache.hadoop.mapreduce.JobContext;
//import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
//import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
//import org.apache.hadoop.util.GenericOptionsParser;
//import org.apache.hadoop.util.Tool;
//import org.apache.hadoop.util.ToolRunner;
//import org.apache.log4j.PropertyConfigurator;
//import org.apache.solr.common.cloud.SolrZkClient;
//import org.apache.solr.hadoop.dedup.NoChangeUpdateConflictResolver;
//import org.apache.solr.hadoop.morphline.MorphlineMapRunner;
//import org.kitesdk.morphline.base.Fields;
//import org.restlet.engine.util.AlphaNumericComparator;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import uk.bl.wa.hadoop.indexer.WARCIndexerRunner;
//import uk.bl.wa.util.ConfigPrinter;
//
//import com.google.common.base.Charsets;
//import com.google.common.base.Preconditions;
//import com.google.common.io.ByteStreams;
//import com.typesafe.config.Config;
//import com.typesafe.config.ConfigFactory;
//import com.typesafe.config.ConfigRenderOptions;
//
//
///**
// * This is based on MapReduceIndexerTool and should be more powerful than
// * hacking it together manually, but depends on Hadoop 2.x.x series code so has
// * problems running against our aging cluster.
// * 
// * Public API for a MapReduce batch job driver that creates a set of Solr index
// * shards from a set of input files and writes the indexes into HDFS, in a
// * flexible, scalable and fault-tolerant manner. Also supports merging the
// * output shards into a set of live customer facing Solr servers, typically a
// * SolrCloud.
// */
//public class WARCIndexerJob extends Configured implements Tool {
//
//	Job job; // visible for testing only
//
//	public static final String RESULTS_DIR = "results";
//
//	static final String MAIN_MEMORY_RANDOMIZATION_THRESHOLD = WARCIndexerJob.class
//			.getName() + ".mainMemoryRandomizationThreshold";
//
//	private static final String FULL_INPUT_LIST = "full-input-list.txt";
//
//	private static final Logger LOG = LoggerFactory
//			.getLogger(WARCIndexerJob.class);
//
//
//	/**
//	 * See http://argparse4j.sourceforge.net and for details see
//	 * http://argparse4j.sourceforge.net/usage.html
//	 */
//	static final class MyArgumentParser {
//
//		private static final String SHOW_NON_SOLR_CLOUD = "--show-non-solr-cloud";
//
//		private boolean showNonSolrCloud = false;
//
//		/**
//		 * Parses the given command line arguments.
//		 * 
//		 * @return exitCode null indicates the caller shall proceed with
//		 *         processing, non-null indicates the caller shall exit the
//		 *         program with the given exit status code.
//		 */
//		public Integer parseArgs(String[] args, Configuration conf, Options opts) {
//			assert args != null;
//			assert conf != null;
//			assert opts != null;
//
//			if (args.length == 0) {
//				args = new String[] { "--help" };
//			}
//
//			showNonSolrCloud = Arrays.asList(args)
//					.contains(SHOW_NON_SOLR_CLOUD); // intercept it first
//
//			ArgumentParser parser = ArgumentParsers
//					.newArgumentParser(
//							"hadoop [GenericOptions]... jar solr-map-reduce-*.jar ",
//							false)
//					.defaultHelp(true)
//					.description(
//							"MapReduce batch job driver that takes a morphline and creates a set of Solr index shards from a set of input files "
//									+ "and writes the indexes into HDFS, in a flexible, scalable and fault-tolerant manner. "
//									+ "It also supports merging the output shards into a set of live customer facing Solr servers, "
//									+ "typically a SolrCloud. The program proceeds in several consecutive MapReduce based phases, as follows:"
//									+ "\n\n"
//									+ "1) Randomization phase: This (parallel) phase randomizes the list of input files in order to spread "
//									+ "indexing load more evenly among the mappers of the subsequent phase."
//									+ "\n\n"
//									+ "2) Mapper phase: This (parallel) phase takes the input files, extracts the relevant content, transforms it "
//									+ "and hands SolrInputDocuments to a set of reducers. "
//									+ "The ETL functionality is flexible and "
//									+ "customizable using chains of arbitrary morphline commands that pipe records from one transformation command to another. "
//									+ "Commands to parse and transform a set of standard data formats such as Avro, CSV, Text, HTML, XML, "
//									+ "PDF, Word, Excel, etc. are provided out of the box, and additional custom commands and parsers for additional "
//									+ "file or data formats can be added as morphline plugins. "
//									+ "This is done by implementing a simple Java interface that consumes a record (e.g. a file in the form of an InputStream "
//									+ "plus some headers plus contextual metadata) and generates as output zero or more records. "
//									+ "Any kind of data format can be indexed and any Solr documents for any kind of Solr schema can be generated, "
//									+ "and any custom ETL logic can be registered and executed.\n"
//									+ "Record fields, including MIME types, can also explicitly be passed by force from the CLI to the morphline, for example: "
//									+ "hadoop ... -D "
//									+ MorphlineMapRunner.MORPHLINE_FIELD_PREFIX
//									+ Fields.ATTACHMENT_MIME_TYPE
//									+ "=text/csv"
//									+ "\n\n"
//									+ "3) Reducer phase: This (parallel) phase loads the mapper's SolrInputDocuments into one EmbeddedSolrServer per reducer. "
//									+ "Each such reducer and Solr server can be seen as a (micro) shard. The Solr servers store their "
//									+ "data in HDFS."
//									+ "\n\n"
//									+ "4) Mapper-only merge phase: This (parallel) phase merges the set of reducer shards into the number of solr "
//									+ "shards expected by the user, using a mapper-only job. This phase is omitted if the number "
//									+ "of shards is already equal to the number of shards expected by the user. "
//									+ "\n\n"
//									+ "5) Go-live phase: This optional (parallel) phase merges the output shards of the previous phase into a set of "
//									+ "live customer facing Solr servers, typically a SolrCloud. "
//									+ "If this phase is omitted you can explicitly point each Solr server to one of the HDFS output shard directories."
//									+ "\n\n"
//									+ "Fault Tolerance: Mapper and reducer task attempts are retried on failure per the standard MapReduce semantics. "
//									+ "On program startup all data in the --output-dir is deleted if that output directory already exists. "
//									+ "If the whole job fails you can retry simply by rerunning the program again using the same arguments.");
//
//			parser.addArgument("--help", "-help", "-h")
//					.help("Show this help message and exit")
//					.action(new HelpArgumentAction() {
//						@Override
//						public void run(ArgumentParser parser, Argument arg,
//								Map<String, Object> attrs, String flag,
//								Object value) throws ArgumentParserException {
//							parser.printHelp();
//							System.out.println();
//							System.out.print(ToolRunnerHelpFormatter
//									.getGenericCommandUsage());
//							// ToolRunner.printGenericCommandUsage(System.out);
//							System.out
//									.println("Examples: \n\n"
//											+
//
//              "# (Re)index an Avro based Twitter tweet file:\n" +
//              "sudo -u hdfs hadoop \\\n" + 
//              "  --config /etc/hadoop/conf.cloudera.mapreduce1 \\\n" +
//              "  jar target/solr-map-reduce-*.jar \\\n" +
//              "  -D 'mapred.child.java.opts=-Xmx500m' \\\n" + 
//											// "  -D 'mapreduce.child.java.opts=-Xmx500m' \\\n"
//											// +
//              "  --log4j src/test/resources/log4j.properties \\\n" + 
//              "  --morphline-file ../search-core/src/test/resources/test-morphlines/tutorialReadAvroContainer.conf \\\n" + 
//              "  --solr-home-dir src/test/resources/solr/minimr \\\n" +
//              "  --output-dir hdfs://c2202.mycompany.com/user/$USER/test \\\n" + 
//              "  --shards 1 \\\n" + 
//              "  hdfs:///user/$USER/test-documents/sample-statuses-20120906-141433.avro\n" +
//              "\n" +
//              "# Go live by merging resulting index shards into a live Solr cluster\n" +
//              "# (explicitly specify Solr URLs - for a SolrCloud cluster see next example):\n" +
//              "sudo -u hdfs hadoop \\\n" + 
//              "  --config /etc/hadoop/conf.cloudera.mapreduce1 \\\n" +
//              "  jar target/solr-map-reduce-*.jar \\\n" +
//              "  -D 'mapred.child.java.opts=-Xmx500m' \\\n" + 
//											// "  -D 'mapreduce.child.java.opts=-Xmx500m' \\\n"
//											// +
//              "  --log4j src/test/resources/log4j.properties \\\n" + 
//              "  --morphline-file ../search-core/src/test/resources/test-morphlines/tutorialReadAvroContainer.conf \\\n" + 
//              "  --solr-home-dir src/test/resources/solr/minimr \\\n" + 
//              "  --output-dir hdfs://c2202.mycompany.com/user/$USER/test \\\n" + 
//              "  --shard-url http://solr001.mycompany.com:8983/solr/collection1 \\\n" + 
//              "  --shard-url http://solr002.mycompany.com:8983/solr/collection1 \\\n" + 
//              "  --go-live \\\n" + 
//              "  hdfs:///user/foo/indir\n" +  
//              "\n" +
//              "# Go live by merging resulting index shards into a live SolrCloud cluster\n" +
//              "# (discover shards and Solr URLs through ZooKeeper):\n" +
//              "sudo -u hdfs hadoop \\\n" + 
//              "  --config /etc/hadoop/conf.cloudera.mapreduce1 \\\n" +
//              "  jar target/solr-map-reduce-*.jar \\\n" +
//              "  -D 'mapred.child.java.opts=-Xmx500m' \\\n" + 
//											// "  -D 'mapreduce.child.java.opts=-Xmx500m' \\\n"
//											// +
//              "  --log4j src/test/resources/log4j.properties \\\n" + 
//              "  --morphline-file ../search-core/src/test/resources/test-morphlines/tutorialReadAvroContainer.conf \\\n" + 
//              "  --output-dir hdfs://c2202.mycompany.com/user/$USER/test \\\n" + 
//              "  --zk-host zk01.mycompany.com:2181/solr \\\n" + 
//              "  --collection collection1 \\\n" + 
//              "  --go-live \\\n" + 
//              "  hdfs:///user/foo/indir\n"
//);
//							throw new FoundHelpArgument(); // Trick to prevent
//															// processing of any
//															// remaining
//															// arguments
//						}
//					});
//
//			ArgumentGroup requiredGroup = parser
//					.addArgumentGroup("Required arguments");
//
//			Argument outputDirArg = requiredGroup
//					.addArgument("--output-dir")
//					.metavar("HDFS_URI")
//					.type(new PathArgumentType(conf) {
//						@Override
//						public Path convert(ArgumentParser parser,
//								Argument arg, String value)
//								throws ArgumentParserException {
//							Path path = super.convert(parser, arg, value);
//							if ("hdfs".equals(path.toUri().getScheme())
//									&& path.toUri().getAuthority() == null) {
//								// TODO: consider defaulting to hadoop's
//								// fs.default.name here or in
//								// SolrRecordWriter.createEmbeddedSolrServer()
//								throw new ArgumentParserException(
//										"Missing authority in path URI: "
//												+ path, parser);
//							}
//							return path;
//						}
//					}.verifyHasScheme().verifyIsAbsolute()
//							.verifyCanWriteParent())
//					.required(true)
//					.help("HDFS directory to write Solr indexes to. Inside there one output directory per shard will be generated. "
//							+ "Example: hdfs://c2202.mycompany.com/user/$USER/test");
//
//			Argument inputListArg = parser
//					.addArgument("--input-list")
//					.action(Arguments.append())
//					.metavar("URI")
//					// .type(new
//					// PathArgumentType(fs).verifyExists().verifyCanRead())
//					.type(Path.class)
//					.help("Local URI or HDFS URI of a UTF-8 encoded file containing a list of HDFS URIs to index, "
//							+ "one URI per line in the file. If '-' is specified, URIs are read from the standard input. "
//							+ "Multiple --input-list arguments can be specified.");
//
//			Argument solrHomeDirArg = nonSolrCloud(parser
//					.addArgument("--solr-home-dir")
//					.metavar("DIR")
//					.type(new FileArgumentType() {
//						@Override
//						public File convert(ArgumentParser parser,
//								Argument arg, String value)
//								throws ArgumentParserException {
//							File solrHomeDir = super
//									.convert(parser, arg, value);
//							File solrConfigFile = new File(new File(
//									solrHomeDir, "conf"), "solrconfig.xml");
//							new FileArgumentType()
//									.verifyExists()
//									.verifyIsFile()
//									.verifyCanRead()
//									.convert(parser, arg,
//											solrConfigFile.getPath());
//							return solrHomeDir;
//						}
//					}.verifyIsDirectory().verifyCanRead())
//					.required(false)
//					.help("Relative or absolute path to a local dir containing Solr conf/ dir and in particular "
//							+ "conf/solrconfig.xml and optionally also lib/ dir. This directory will be uploaded to each MR task. "
//							+ "Example: src/test/resources/solr/minimr"));
//
//			Argument updateConflictResolverArg = parser
//					.addArgument("--update-conflict-resolver")
//					.metavar("FQCN")
//					.type(String.class)
//					.setDefault(NoChangeUpdateConflictResolver.class.getName())
//					.help("Fully qualified class name of a Java class that implements the UpdateConflictResolver interface. "
//							+ "This enables deduplication and ordering of a series of document updates for the same unique document "
//							+ "key. For example, a MapReduce batch job might index multiple files in the same job where some of the "
//							+ "files contain old and new versions of the very same document, using the same unique document key.\n"
//							+ "Typically, implementations of this interface forbid collisions by throwing an exception, or ignore all but "
//							+ "the most recent document version, or, in the general case, order colliding updates ascending from least "
//							+ "recent to most recent (partial) update. The caller of this interface (i.e. the Hadoop Reducer) will then "
//							+ "apply the updates to Solr in the order returned by the orderUpdates() method.\n"
//							+ "The default RetainMostRecentUpdateConflictResolver implementation ignores all but the most recent document "
//							+ "version, based on a configurable numeric Solr field, which defaults to the file_last_modified timestamp");
//
//			Argument mappersArg = parser
//					.addArgument("--mappers")
//					.metavar("INTEGER")
//					.type(Integer.class)
//					.choices(new RangeArgumentChoice(-1, Integer.MAX_VALUE))
//					// TODO: also support X% syntax where X is an integer
//					.setDefault(-1)
//					.help("Tuning knob that indicates the maximum number of MR mapper tasks to use. -1 indicates use all map slots "
//							+ "available on the cluster.");
//
//			Argument reducersArg = parser
//					.addArgument("--reducers")
//					.metavar("INTEGER")
//					.type(Integer.class)
//					.choices(new RangeArgumentChoice(-2, Integer.MAX_VALUE))
//					// TODO: also support X% syntax where X is an integer
//					.setDefault(-1)
//					.help("Tuning knob that indicates the number of reducers to index into. "
//							+ "0 is reserved for a mapper-only feature that may ship in a future release. "
//							+ "-1 indicates use all reduce slots available on the cluster. "
//							+ "-2 indicates use one reducer per output shard, which disables the mtree merge MR algorithm. "
//							+ "The mtree merge MR algorithm improves scalability by spreading load "
//							+ "(in particular CPU load) among a number of parallel reducers that can be much larger than the number "
//							+ "of solr shards expected by the user. It can be seen as an extension of concurrent lucene merges "
//							+ "and tiered lucene merges to the clustered case. The subsequent mapper-only phase "
//							+ "merges the output of said large number of reducers to the number of shards expected by the user, "
//							+ "again by utilizing more available parallelism on the cluster.");
//
//			Argument fanoutArg = parser.addArgument("--fanout")
//					.metavar("INTEGER").type(Integer.class)
//					.choices(new RangeArgumentChoice(2, Integer.MAX_VALUE))
//					.setDefault(Integer.MAX_VALUE)
//					.help(FeatureControl.SUPPRESS);
//
//			Argument maxSegmentsArg = parser
//					.addArgument("--max-segments")
//					.metavar("INTEGER")
//					.type(Integer.class)
//					.choices(new RangeArgumentChoice(1, Integer.MAX_VALUE))
//					.setDefault(1)
//					.help("Tuning knob that indicates the maximum number of segments to be contained on output in the index of "
//							+ "each reducer shard. After a reducer has built its output index it applies a merge policy to merge segments "
//							+ "until there are <= maxSegments lucene segments left in this index. "
//							+ "Merging segments involves reading and rewriting all data in all these segment files, "
//							+ "potentially multiple times, which is very I/O intensive and time consuming. "
//							+ "However, an index with fewer segments can later be merged faster, "
//							+ "and it can later be queried faster once deployed to a live Solr serving shard. "
//							+ "Set maxSegments to 1 to optimize the index for low query latency. "
//							+ "In a nutshell, a small maxSegments value trades indexing latency for subsequently improved query latency. "
//							+ "This can be a reasonable trade-off for batch indexing systems.");
//
//			Argument fairSchedulerPoolArg = parser
//					.addArgument("--fair-scheduler-pool")
//					.metavar("STRING")
//					.help("Optional tuning knob that indicates the name of the fair scheduler pool to submit jobs to. "
//							+ "The Fair Scheduler is a pluggable MapReduce scheduler that provides a way to share large clusters. "
//							+ "Fair scheduling is a method of assigning resources to jobs such that all jobs get, on average, an "
//							+ "equal share of resources over time. When there is a single job running, that job uses the entire "
//							+ "cluster. When other jobs are submitted, tasks slots that free up are assigned to the new jobs, so "
//							+ "that each job gets roughly the same amount of CPU time. Unlike the default Hadoop scheduler, which "
//							+ "forms a queue of jobs, this lets short jobs finish in reasonable time while not starving long jobs. "
//							+ "It is also an easy way to share a cluster between multiple of users. Fair sharing can also work with "
//							+ "job priorities - the priorities are used as weights to determine the fraction of total compute time "
//							+ "that each job gets.");
//
//			Argument dryRunArg = parser
//					.addArgument("--dry-run")
//					.action(Arguments.storeTrue())
//					.help("Run in local mode and print documents to stdout instead of loading them into Solr. This executes "
//							+ "the morphline in the client process (without submitting a job to MR) for quicker turnaround during "
//							+ "early trial & debug sessions.");
//
//			Argument log4jConfigFileArg = parser
//					.addArgument("--log4j")
//					.metavar("FILE")
//					.type(new FileArgumentType().verifyExists().verifyIsFile()
//							.verifyCanRead())
//					.help("Relative or absolute path to a log4j.properties config file on the local file system. This file "
//							+ "will be uploaded to each MR task. Example: /path/to/log4j.properties");
//
//			Argument verboseArg = parser.addArgument("--verbose", "-v")
//					.action(Arguments.storeTrue())
//					.help("Turn on verbose output.");
//
//			parser.addArgument(SHOW_NON_SOLR_CLOUD)
//					.action(Arguments.storeTrue())
//					.help("Also show options for Non-SolrCloud mode as part of --help.");
//
//			ArgumentGroup clusterInfoGroup = parser
//					.addArgumentGroup("Cluster arguments")
//					.description(
//							"Arguments that provide information about your Solr cluster. "
//									+ nonSolrCloud("If you are building shards for a SolrCloud cluster, pass the --zk-host argument. "
//											+ "If you are building shards for "
//											+ "a Non-SolrCloud cluster, pass the --shard-url argument one or more times. To build indexes for "
//											+ "a replicated Non-SolrCloud cluster with --shard-url, pass replica urls consecutively and also pass --shards. "
//											+ "Using --go-live requires either --zk-host or --shard-url."));
//
//			Argument zkHostArg = clusterInfoGroup
//					.addArgument("--zk-host")
//					.metavar("STRING")
//					.type(String.class)
//					.help("The address of a ZooKeeper ensemble being used by a SolrCloud cluster. "
//							+ "This ZooKeeper ensemble will be examined to determine the number of output "
//							+ "shards to create as well as the Solr URLs to merge the output shards into when using the --go-live option. "
//							+ "Requires that you also pass the --collection to merge the shards into.\n"
//							+ "\n"
//							+ "The --zk-host option implements the same partitioning semantics as the standard SolrCloud "
//							+ "Near-Real-Time (NRT) API. This enables to mix batch updates from MapReduce ingestion with "
//							+ "updates from standard Solr NRT ingestion on the same SolrCloud cluster, "
//							+ "using identical unique document keys.\n"
//							+ "\n"
//							+ "Format is: a list of comma separated host:port pairs, each corresponding to a zk "
//							+ "server. Example: '127.0.0.1:2181,127.0.0.1:2182,127.0.0.1:2183' If "
//							+ "the optional chroot suffix is used the example would look "
//							+ "like: '127.0.0.1:2181/solr,127.0.0.1:2182/solr,127.0.0.1:2183/solr' "
//							+ "where the client would be rooted at '/solr' and all paths "
//							+ "would be relative to this root - i.e. getting/setting/etc... "
//							+ "'/foo/bar' would result in operations being run on "
//							+ "'/solr/foo/bar' (from the server perspective).\n"
//							+ nonSolrCloud("\n"
//									+ "If --solr-home-dir is not specified, the Solr home directory for the collection "
//									+ "will be downloaded from this ZooKeeper ensemble."));
//
//			Argument shardsArg = nonSolrCloud(clusterInfoGroup
//					.addArgument("--shards").metavar("INTEGER")
//					.type(Integer.class)
//					.choices(new RangeArgumentChoice(1, Integer.MAX_VALUE))
//					.help("Number of output shards to generate."));
//
//			Argument collectionArg = parser
//					.addArgument("--collection")
//					.metavar("STRING")
//					.help("The SolrCloud collection to merge shards into when using --zk-host. Example: collection1");
//
//			// trailing positional arguments
//			Argument inputFilesArg = parser
//					.addArgument("input-files")
//					.metavar("HDFS_URI")
//					.type(new PathArgumentType(conf).verifyHasScheme()
//							.verifyExists().verifyCanRead()).nargs("*")
//					.setDefault()
//					.help("HDFS URI of file or directory tree to index.");
//
//			Argument configPathArg = parser.addArgument("--config-path")
//					.metavar("STRING").help("The indexer config file to load.");
//
//			Argument dumpConfigArg = parser.addArgument("--dump-config")
//					.action(Arguments.storeTrue()).help("Dump indexer config.");
//
//			Namespace ns;
//			try {
//				ns = parser.parseArgs(args);
//			} catch (FoundHelpArgument e) {
//				return 0;
//			} catch (ArgumentParserException e) {
//				parser.handleError(e);
//				return 1;
//			}
//
//			opts.log4jConfigFile = (File) ns.get(log4jConfigFileArg.getDest());
//			if (opts.log4jConfigFile != null) {
//				PropertyConfigurator.configure(opts.log4jConfigFile.getPath());
//			}
//			LOG.debug("Parsed command line args: {}", ns);
//
//			opts.inputLists = ns.getList(inputListArg.getDest());
//			if (opts.inputLists == null) {
//				opts.inputLists = Collections.EMPTY_LIST;
//			}
//			opts.inputFiles = ns.getList(inputFilesArg.getDest());
//			opts.outputDir = (Path) ns.get(outputDirArg.getDest());
//			opts.mappers = ns.getInt(mappersArg.getDest());
//			opts.reducers = ns.getInt(reducersArg.getDest());
//			opts.updateConflictResolver = ns
//					.getString(updateConflictResolverArg.getDest());
//			opts.fanout = ns.getInt(fanoutArg.getDest());
//			opts.maxSegments = ns.getInt(maxSegmentsArg.getDest());
//			opts.solrHomeDir = (File) ns.get(solrHomeDirArg.getDest());
//			opts.fairSchedulerPool = ns.getString(fairSchedulerPoolArg
//					.getDest());
//			opts.isDryRun = ns.getBoolean(dryRunArg.getDest());
//			opts.isVerbose = ns.getBoolean(verboseArg.getDest());
//			opts.zkHost = ns.getString(zkHostArg.getDest());
//			opts.shards = ns.getInt(shardsArg.getDest());
//			opts.collection = ns.getString(collectionArg.getDest());
//
//			opts.configPath = ns.getString(configPathArg.getDest());
//			opts.dumpConfig = ns.getBoolean(dumpConfigArg.getDest());
//
//			try {
//				if (opts.reducers == 0) {
//					throw new ArgumentParserException(
//							"--reducers must not be zero", parser);
//				}
//			} catch (ArgumentParserException e) {
//				parser.handleError(e);
//				return 1;
//			}
//
//			if (opts.inputLists.isEmpty() && opts.inputFiles.isEmpty()) {
//				LOG.info("No input files specified - nothing to process");
//				return 0; // nothing to process
//			}
//			return null;
//		}
//
//		// make it a "hidden" option, i.e. the option is functional and enabled
//		// but not shown in --help output
//		private Argument nonSolrCloud(Argument arg) {
//			return showNonSolrCloud ? arg : arg.help(FeatureControl.SUPPRESS);
//		}
//
//		private String nonSolrCloud(String msg) {
//			return showNonSolrCloud ? msg : "";
//		}
//
//		/**
//		 * Marker trick to prevent processing of any remaining arguments once
//		 * --help option has been parsed
//		 */
//		private static final class FoundHelpArgument extends RuntimeException {
//		}
//	}
//
//	// END OF INNER CLASS
//
//	static List<List<String>> buildShardUrls(List<Object> urls,
//			Integer numShards) {
//		if (urls == null)
//			return null;
//		List<List<String>> shardUrls = new ArrayList<List<String>>(urls.size());
//		List<String> list = null;
//
//		int sz;
//		if (numShards == null) {
//			numShards = urls.size();
//		}
//		sz = (int) Math.ceil(urls.size() / (float) numShards);
//		for (int i = 0; i < urls.size(); i++) {
//			if (i % sz == 0) {
//				list = new ArrayList<String>();
//				shardUrls.add(list);
//			}
//			list.add((String) urls.get(i));
//		}
//
//		return shardUrls;
//	}
//
//	static final class Options {
//		String collection;
//		String zkHost;
//		List<Path> inputLists;
//		List<Path> inputFiles;
//		Path outputDir;
//		int mappers;
//		int reducers;
//		String updateConflictResolver;
//		int fanout;
//		Integer shards;
//		int maxSegments;
//		File solrHomeDir;
//		String fairSchedulerPool;
//		boolean isDryRun;
//		File log4jConfigFile;
//		boolean isVerbose;
//		String configPath;
//		boolean dumpConfig;
//	}
//
//	// END OF INNER CLASS
//
//
//	/** API for command line clients */
//	public static void main(String[] args) throws Exception {
//		int res = ToolRunner.run(new Configuration(),
// new WARCIndexerJob(),
//				args);
//		System.exit(res);
//	}
//
//	public WARCIndexerJob() {
//	}
//
//	@Override
//	public int run(String[] args) throws Exception {
//		Options opts = new Options();
//		Integer exitCode = new MyArgumentParser().parseArgs(args, getConf(),
//				opts);
//		if (exitCode != null) {
//			return exitCode;
//		}
//		return run(opts);
//	}
//
//	/**
//	 * API for Java clients; visible for testing; may become a public API
//	 * eventually
//	 */
//	int run(Options options) throws Exception {
//		if (getConf().getBoolean("isMR1", false)
//				&& "local".equals(getConf().get("mapred.job.tracker"))) {
//			throw new IllegalStateException(
//					"Running with LocalJobRunner (i.e. all of Hadoop inside a single JVM) is not supported "
//							+ "because LocalJobRunner does not (yet) implement the Hadoop Distributed Cache feature, "
//							+ "which is required for passing files via --files and --libjars");
//		}
//
//		long programStartTime = System.nanoTime();
//		if (options.fairSchedulerPool != null) {
//			getConf().set("mapred.fairscheduler.pool",
//					options.fairSchedulerPool);
//		}
//		getConf().setInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS,
//				options.maxSegments);
//
//		// switch off a false warning about allegedly not implementing Tool
//		// also see
//		// http://hadoop.6.n7.nabble.com/GenericOptionsParser-warning-td8103.html
//		// also see https://issues.apache.org/jira/browse/HADOOP-8183
//		getConf().setBoolean("mapred.used.genericoptionsparser", true);
//
//		if (options.log4jConfigFile != null) {
//			Utils.setLogConfigFile(options.log4jConfigFile, getConf());
//			addDistributedCacheFile(options.log4jConfigFile, getConf());
//		}
//
//		job = Job.getInstance(getConf());
//		job.setJarByClass(getClass());
//
//		int mappers = new JobClient(job.getConfiguration()).getClusterStatus()
//				.getMaxMapTasks(); // MR1
//		// int mappers =
//		// job.getCluster().getClusterStatus().getMapSlotCapacity(); // Yarn
//		// only
//		LOG.info("Cluster reports {} mapper slots", mappers);
//
//		if (options.mappers == -1) {
//			mappers = 8 * mappers; // better accomodate stragglers
//		} else {
//			mappers = options.mappers;
//		}
//		if (mappers <= 0) {
//			throw new IllegalStateException("Illegal number of mappers: "
//					+ mappers);
//		}
//		options.mappers = mappers;
//
//		FileSystem fs = options.outputDir.getFileSystem(job.getConfiguration());
//		if (fs.exists(options.outputDir)
//				&& !delete(options.outputDir, true, fs)) {
//			return -1;
//		}
//		Path outputResultsDir = new Path(options.outputDir, RESULTS_DIR);
//		Path outputReduceDir = new Path(options.outputDir, "reducers");
//		Path outputStep1Dir = new Path(options.outputDir, "tmp1");
//		Path outputStep2Dir = new Path(options.outputDir, "tmp2");
//		Path outputTreeMergeStep = new Path(options.outputDir,
//				"mtree-merge-output");
//		Path fullInputList = new Path(outputStep1Dir, FULL_INPUT_LIST);
//
//		LOG.debug("Creating list of input files for mappers: {}", fullInputList);
//		long numFiles = addInputFiles(options.inputFiles, options.inputLists,
//				fullInputList, job.getConfiguration());
//		if (numFiles == 0) {
//			LOG.info("No input files found - nothing to process");
//			return 0;
//		}
//		int numLinesPerSplit = (int) ceilDivide(numFiles, mappers);
//		if (numLinesPerSplit < 0) { // numeric overflow from downcasting long to
//									// int?
//			numLinesPerSplit = Integer.MAX_VALUE;
//		}
//		numLinesPerSplit = Math.max(1, numLinesPerSplit);
//
//		int realMappers = Math.min(mappers,
//				(int) ceilDivide(numFiles, numLinesPerSplit));
//		calculateNumReducers(options, realMappers);
//		int reducers = options.reducers;
//		LOG.info(
//				"Using these parameters: "
//						+ "numFiles: {}, mappers: {}, realMappers: {}, reducers: {}, shards: {}, fanout: {}, maxSegments: {}",
//				new Object[] { numFiles, mappers, realMappers, reducers,
//						options.shards, options.fanout, options.maxSegments });
//
//
//		long startTime = System.nanoTime();
//		float secs = (System.nanoTime() - startTime) / (float) (10 ^ 9);
//
//		// ---- ---- ----
//
//		// Store application properties where the mappers/reducers can access
//		// them
//		Config index_conf;
//		if (options.configPath != null) {
//			index_conf = ConfigFactory.parseFile(new File(options.configPath));
//		} else {
//			index_conf = ConfigFactory.load();
//		}
//		if (options.dumpConfig) {
//			ConfigPrinter.print(index_conf);
//			System.exit(0);
//		}
//		job.getConfiguration().set(
//				WARCIndexerRunner.CONFIG_PROPERTIES,
//				index_conf.withOnlyPath("warc").root()
//				.render(ConfigRenderOptions.concise()));
//
//		job.setInputFormatClass(WebArchiveFileInputFormat.class);
//		Class<WebArchiveIndexerMapper> mapperClass = WebArchiveIndexerMapper.class;
//		job.setMapperClass(mapperClass);
//
//		// ---- ---- ----
//
//		FileOutputFormat.setOutputPath(job, outputReduceDir);
//		job.setJobName(getClass().getName() + "/"
//				+ Utils.getShortClassName(mapperClass));
//
//		if (job.getConfiguration().get(JobContext.REDUCE_CLASS_ATTR) == null) { // enable
//																				// customization
//			job.setReducerClass(SolrReducer.class);
//		}
//		if (options.updateConflictResolver == null) {
//			throw new IllegalArgumentException(
//					"updateConflictResolver must not be null");
//		}
//		job.getConfiguration().set(SolrReducer.UPDATE_CONFLICT_RESOLVER,
//				options.updateConflictResolver);
//
//		if (options.zkHost != null) {
//			assert options.collection != null;
//			/*
//			 * MapReduce partitioner that partitions the Mapper output such that
//			 * each SolrInputDocument gets sent to the SolrCloud shard that it
//			 * would have been sent to if the document were ingested via the
//			 * standard SolrCloud Near Real Time (NRT) API.
//			 * 
//			 * In other words, this class implements the same partitioning
//			 * semantics as the standard SolrCloud NRT API. This enables to mix
//			 * batch updates from MapReduce ingestion with updates from standard
//			 * NRT ingestion on the same SolrCloud cluster, using identical
//			 * unique document keys.
//			 */
//			if (job.getConfiguration().get(JobContext.PARTITIONER_CLASS_ATTR) == null) { // enable
//																							// customization
//				job.setPartitionerClass(SolrCloudPartitioner.class);
//			}
//			job.getConfiguration().set(SolrCloudPartitioner.ZKHOST,
//					options.zkHost);
//			job.getConfiguration().set(SolrCloudPartitioner.COLLECTION,
//					options.collection);
//		}
//		job.getConfiguration().setInt(SolrCloudPartitioner.SHARDS,
//				options.shards);
//
//		job.setOutputFormatClass(SolrOutputFormat.class);
//		if (options.solrHomeDir != null) {
//			SolrOutputFormat.setupSolrHomeCache(options.solrHomeDir, job);
//		} else {
//			assert options.zkHost != null;
//			// use the config that this collection uses for the SolrHomeCache.
//			ZooKeeperInspector zki = new ZooKeeperInspector();
//			SolrZkClient zkClient = zki.getZkClient(options.zkHost);
//			try {
//				String configName = zki.readConfigName(zkClient,
//						options.collection);
//				File tmpSolrHomeDir = zki.downloadConfigDir(zkClient,
//						configName);
//				SolrOutputFormat.setupSolrHomeCache(tmpSolrHomeDir, job);
//				options.solrHomeDir = tmpSolrHomeDir;
//			} finally {
//				zkClient.close();
//			}
//		}
//
//		job.setNumReduceTasks(reducers);
//		job.setOutputKeyClass(Text.class);
//		job.setOutputValueClass(SolrInputDocumentWritable.class);
//		LOG.info("Indexing {} files using {} real mappers into {} reducers",
//				new Object[] { numFiles, realMappers, reducers });
//		startTime = System.nanoTime();
//		if (!waitForCompletion(job, options.isVerbose)) {
//			return -1; // job failed
//		}
//
//		secs = (System.nanoTime() - startTime) / (float) (10 ^ 9);
//		LOG.info(
//				"Done. Indexing {} files using {} real mappers into {} reducers took {} secs",
//				new Object[] { numFiles, realMappers, reducers, secs });
//
//		int mtreeMergeIterations = 0;
//		if (reducers > options.shards) {
//			mtreeMergeIterations = (int) Math.round(log(options.fanout,
//					reducers / options.shards));
//		}
//		LOG.debug("MTree merge iterations to do: {}", mtreeMergeIterations);
//		int mtreeMergeIteration = 1;
//		while (reducers > options.shards) { // run a mtree merge iteration
//			job = Job.getInstance(getConf());
//			job.setJarByClass(getClass());
//			job.setJobName(getClass().getName() + "/"
//					+ Utils.getShortClassName(TreeMergeMapper.class));
//			job.setMapperClass(TreeMergeMapper.class);
//			job.setOutputFormatClass(TreeMergeOutputFormat.class);
//			job.setNumReduceTasks(0);
//			job.setOutputKeyClass(Text.class);
//			job.setOutputValueClass(NullWritable.class);
//			job.setInputFormatClass(NLineInputFormat.class);
//
//			Path inputStepDir = new Path(options.outputDir,
//					"mtree-merge-input-iteration" + mtreeMergeIteration);
//			fullInputList = new Path(inputStepDir, FULL_INPUT_LIST);
//			LOG.debug(
//					"MTree merge iteration {}/{}: Creating input list file for mappers {}",
//					new Object[] { mtreeMergeIteration, mtreeMergeIterations,
//							fullInputList });
//			numFiles = createTreeMergeInputDirList(outputReduceDir, fs,
//					fullInputList);
//			if (numFiles != reducers) {
//				throw new IllegalStateException("Not same reducers: "
//						+ reducers + ", numFiles: " + numFiles);
//			}
//			NLineInputFormat.addInputPath(job, fullInputList);
//			NLineInputFormat.setNumLinesPerSplit(job, options.fanout);
//			FileOutputFormat.setOutputPath(job, outputTreeMergeStep);
//
//			LOG.info(
//					"MTree merge iteration {}/{}: Merging {} shards into {} shards using fanout {}",
//					new Object[] { mtreeMergeIteration, mtreeMergeIterations,
//							reducers, (reducers / options.fanout),
//							options.fanout });
//			startTime = System.nanoTime();
//			if (!waitForCompletion(job, options.isVerbose)) {
//				return -1; // job failed
//			}
//			if (!renameTreeMergeShardDirs(outputTreeMergeStep, job, fs)) {
//				return -1;
//			}
//			secs = (System.nanoTime() - startTime) / (float) (10 ^ 9);
//			LOG.info(
//					"MTree merge iteration {}/{}: Done. Merging {} shards into {} shards using fanout {} took {} secs",
//					new Object[] { mtreeMergeIteration, mtreeMergeIterations,
//							reducers, (reducers / options.fanout),
//							options.fanout, secs });
//
//			if (!delete(outputReduceDir, true, fs)) {
//				return -1;
//			}
//			if (!rename(outputTreeMergeStep, outputReduceDir, fs)) {
//				return -1;
//			}
//			assert reducers % options.fanout == 0;
//			reducers = reducers / options.fanout;
//			mtreeMergeIteration++;
//		}
//		assert reducers == options.shards;
//
//		// normalize output shard dir prefix, i.e.
//		// rename part-r-00000 to part-00000 (stems from zero tree merge
//		// iterations)
//		// rename part-m-00000 to part-00000 (stems from > 0 tree merge
//		// iterations)
//		for (FileStatus stats : fs.listStatus(outputReduceDir)) {
//			String dirPrefix = SolrOutputFormat.getOutputName(job);
//			Path srcPath = stats.getPath();
//			if (stats.isDirectory() && srcPath.getName().startsWith(dirPrefix)) {
//				String dstName = dirPrefix
//						+ srcPath.getName().substring(
//								dirPrefix.length() + "-m".length());
//				Path dstPath = new Path(srcPath.getParent(), dstName);
//				if (!rename(srcPath, dstPath, fs)) {
//					return -1;
//				}
//			}
//		}
//		;
//
//		// publish results dir
//		if (!rename(outputReduceDir, outputResultsDir, fs)) {
//			return -1;
//		}
//
//		goodbye(job, programStartTime);
//		return 0;
//	}
//
//	private void calculateNumReducers(Options options, int realMappers)
//			throws IOException {
//		if (options.shards <= 0) {
//			throw new IllegalStateException("Illegal number of shards: "
//					+ options.shards);
//		}
//		if (options.fanout <= 1) {
//			throw new IllegalStateException("Illegal fanout: " + options.fanout);
//		}
//		if (realMappers <= 0) {
//			throw new IllegalStateException("Illegal realMappers: "
//					+ realMappers);
//		}
//
//		int reducers = new JobClient(job.getConfiguration()).getClusterStatus()
//				.getMaxReduceTasks(); // MR1
//		// reducers =
//		// job.getCluster().getClusterStatus().getReduceSlotCapacity(); // Yarn
//		// only
//		LOG.info("Cluster reports {} reduce slots", reducers);
//
//		if (options.reducers == -2) {
//			reducers = options.shards;
//		} else if (options.reducers == -1) {
//			reducers = Math.min(reducers, realMappers); // no need to use many
//														// reducers when using
//														// few mappers
//		} else {
//			if (options.reducers == 0) {
//				throw new IllegalStateException("Illegal zero reducers");
//			}
//			reducers = options.reducers;
//		}
//		reducers = Math.max(reducers, options.shards);
//
//		if (reducers != options.shards) {
//			// Ensure fanout isn't misconfigured. fanout can't meaningfully be
//			// larger than what would be
//			// required to merge all leaf shards in one single tree merge
//			// iteration into root shards
//			options.fanout = Math.min(options.fanout,
//					(int) ceilDivide(reducers, options.shards));
//
//			// Ensure invariant reducers == options.shards * (fanout ^ N) where
//			// N is an integer >= 1.
//			// N is the number of mtree merge iterations.
//			// This helps to evenly spread docs among root shards and simplifies
//			// the impl of the mtree merge algorithm.
//			int s = options.shards;
//			while (s < reducers) {
//				s = s * options.fanout;
//			}
//			reducers = s;
//			assert reducers % options.fanout == 0;
//		}
//		options.reducers = reducers;
//	}
//
//	private long addInputFiles(List<Path> inputFiles, List<Path> inputLists,
//			Path fullInputList, Configuration conf) throws IOException {
//
//		long numFiles = 0;
//		FileSystem fs = fullInputList.getFileSystem(conf);
//		FSDataOutputStream out = fs.create(fullInputList);
//		try {
//			Writer writer = new BufferedWriter(new OutputStreamWriter(out,
//					"UTF-8"));
//
//			for (Path inputFile : inputFiles) {
//				FileSystem inputFileFs = inputFile.getFileSystem(conf);
//				if (inputFileFs.exists(inputFile)) {
//					PathFilter pathFilter = new PathFilter() {
//						@Override
//						public boolean accept(Path path) { // ignore "hidden"
//															// files and dirs
//							return !(path.getName().startsWith(".") || path
//									.getName().startsWith("_"));
//						}
//					};
//					numFiles += addInputFilesRecursively(inputFile, writer,
//							inputFileFs, pathFilter);
//				}
//			}
//
//			for (Path inputList : inputLists) {
//				InputStream in;
//				if (inputList.toString().equals("-")) {
//					in = System.in;
//				} else if (inputList.isAbsoluteAndSchemeAuthorityNull()) {
//					in = new BufferedInputStream(new FileInputStream(
//							inputList.toString()));
//				} else {
//					in = inputList.getFileSystem(conf).open(inputList);
//				}
//				try {
//					BufferedReader reader = new BufferedReader(
//							new InputStreamReader(in, "UTF-8"));
//					String line;
//					while ((line = reader.readLine()) != null) {
//						writer.write(line + "\n");
//						numFiles++;
//					}
//					reader.close();
//				} finally {
//					in.close();
//				}
//			}
//
//			writer.close();
//		} finally {
//			out.close();
//		}
//		return numFiles;
//	}
//
//	/**
//	 * Add the specified file to the input set, if path is a directory then add
//	 * the files contained therein.
//	 */
//	private long addInputFilesRecursively(Path path, Writer writer,
//			FileSystem fs, PathFilter pathFilter) throws IOException {
//		long numFiles = 0;
//		for (FileStatus stat : fs.listStatus(path, pathFilter)) {
//			LOG.debug("Adding path {}", stat.getPath());
//			if (stat.isDirectory()) {
//				numFiles += addInputFilesRecursively(stat.getPath(), writer,
//						fs, pathFilter);
//			} else {
//				writer.write(stat.getPath().toString() + "\n");
//				numFiles++;
//			}
//		}
//		return numFiles;
//	}
//
//	// do the same as if the user had typed 'hadoop ... --files <file>'
//	private void addDistributedCacheFile(File file, Configuration conf)
//			throws IOException {
//		String HADOOP_TMP_FILES = "tmpfiles"; // see Hadoop's
//												// GenericOptionsParser
//		String tmpFiles = conf.get(HADOOP_TMP_FILES, "");
//		if (tmpFiles.length() > 0) { // already present?
//			tmpFiles = tmpFiles + ",";
//		}
//		GenericOptionsParser parser = new GenericOptionsParser(
//				new Configuration(conf), new String[] { "--files",
//						file.getCanonicalPath() });
//		String additionalTmpFiles = parser.getConfiguration().get(
//				HADOOP_TMP_FILES);
//		assert additionalTmpFiles != null;
//		assert additionalTmpFiles.length() > 0;
//		tmpFiles += additionalTmpFiles;
//		conf.set(HADOOP_TMP_FILES, tmpFiles);
//	}
//
//	private int createTreeMergeInputDirList(Path outputReduceDir,
//			FileSystem fs, Path fullInputList) throws FileNotFoundException,
//			IOException {
//
//		FileStatus[] dirs = listSortedOutputShardDirs(outputReduceDir, fs);
//		int numFiles = 0;
//		FSDataOutputStream out = fs.create(fullInputList);
//		try {
//			Writer writer = new BufferedWriter(new OutputStreamWriter(out,
//					"UTF-8"));
//			for (FileStatus stat : dirs) {
//				LOG.debug("Adding path {}", stat.getPath());
//				Path dir = new Path(stat.getPath(), "data/index");
//				if (!fs.isDirectory(dir)) {
//					throw new IllegalStateException("Not a directory: " + dir);
//				}
//				writer.write(dir.toString() + "\n");
//				numFiles++;
//			}
//			writer.close();
//		} finally {
//			out.close();
//		}
//		return numFiles;
//	}
//
//	private FileStatus[] listSortedOutputShardDirs(Path outputReduceDir,
//			FileSystem fs) throws FileNotFoundException, IOException {
//
//		final String dirPrefix = SolrOutputFormat.getOutputName(job);
//		FileStatus[] dirs = fs.listStatus(outputReduceDir, new PathFilter() {
//			@Override
//			public boolean accept(Path path) {
//				return path.getName().startsWith(dirPrefix);
//			}
//		});
//		for (FileStatus dir : dirs) {
//			if (!dir.isDirectory()) {
//				throw new IllegalStateException("Not a directory: "
//						+ dir.getPath());
//			}
//		}
//
//		// use alphanumeric sort (rather than lexicographical sort) to properly
//		// handle more than 99999 shards
//		Arrays.sort(dirs, new Comparator<FileStatus>() {
//			@Override
//			public int compare(FileStatus f1, FileStatus f2) {
//				return new AlphaNumericComparator().compare(f1.getPath()
//						.getName(), f2.getPath().getName());
//			}
//		});
//
//		return dirs;
//	}
//
//	/*
//	 * You can run MapReduceIndexerTool in Solrcloud mode, and once the MR job
//	 * completes, you can use the standard solrj Solrcloud API to send doc
//	 * updates and deletes to SolrCloud, and those updates and deletes will go
//	 * to the right Solr shards, and it will work just fine.
//	 * 
//	 * The MapReduce framework doesn't guarantee that input split N goes to the
//	 * map task with the taskId = N. The job tracker and Yarn schedule and
//	 * assign tasks, considering data locality aspects, but without regard of
//	 * the input split# withing the overall list of input splits. In other
//	 * words, split# != taskId can be true.
//	 * 
//	 * To deal with this issue, our mapper tasks write a little auxiliary
//	 * metadata file (per task) that tells the job driver which taskId processed
//	 * which split#. Once the mapper-only job is completed, the job driver
//	 * renames the output dirs such that the dir name contains the true solr
//	 * shard id, based on these auxiliary files.
//	 * 
//	 * This way each doc gets assigned to the right Solr shard even with
//	 * #reducers > #solrshards
//	 * 
//	 * Example for a merge with two shards:
//	 * 
//	 * part-m-00000 and part-m-00001 goes to outputShardNum = 0 and will end up
//	 * in merged part-m-00000 part-m-00002 and part-m-00003 goes to
//	 * outputShardNum = 1 and will end up in merged part-m-00001 part-m-00004
//	 * and part-m-00005 goes to outputShardNum = 2 and will end up in merged
//	 * part-m-00002 ... and so on
//	 * 
//	 * Also see run() method above where it uses
//	 * NLineInputFormat.setNumLinesPerSplit(job, options.fanout)
//	 * 
//	 * Also see
//	 * TreeMergeOutputFormat.TreeMergeRecordWriter.writeShardNumberFile()
//	 */
//	private boolean renameTreeMergeShardDirs(Path outputTreeMergeStep, Job job,
//			FileSystem fs) throws IOException {
//		final String dirPrefix = SolrOutputFormat.getOutputName(job);
//		FileStatus[] dirs = fs.listStatus(outputTreeMergeStep,
//				new PathFilter() {
//					@Override
//					public boolean accept(Path path) {
//						return path.getName().startsWith(dirPrefix);
//					}
//				});
//
//		for (FileStatus dir : dirs) {
//			if (!dir.isDirectory()) {
//				throw new IllegalStateException("Not a directory: "
//						+ dir.getPath());
//			}
//		}
//
//		// Example: rename part-m-00004 to _part-m-00004
//		for (FileStatus dir : dirs) {
//			Path path = dir.getPath();
//			Path renamedPath = new Path(path.getParent(), "_" + path.getName());
//			if (!rename(path, renamedPath, fs)) {
//				return false;
//			}
//		}
//
//		// Example: rename _part-m-00004 to part-m-00002
//		for (FileStatus dir : dirs) {
//			Path path = dir.getPath();
//			Path renamedPath = new Path(path.getParent(), "_" + path.getName());
//
//			// read auxiliary metadata file (per task) that tells which taskId
//			// processed which split# aka solrShard
//			Path solrShardNumberFile = new Path(renamedPath,
//					TreeMergeMapper.SOLR_SHARD_NUMBER);
//			InputStream in = fs.open(solrShardNumberFile);
//			byte[] bytes = ByteStreams.toByteArray(in);
//			in.close();
//			Preconditions.checkArgument(bytes.length > 0);
//			int solrShard = Integer.parseInt(new String(bytes, Charsets.UTF_8));
//			if (!delete(solrShardNumberFile, false, fs)) {
//				return false;
//			}
//
//			// same as FileOutputFormat.NUMBER_FORMAT
//			NumberFormat numberFormat = NumberFormat
//					.getInstance(Locale.ENGLISH);
//			numberFormat.setMinimumIntegerDigits(5);
//			numberFormat.setGroupingUsed(false);
//			Path finalPath = new Path(renamedPath.getParent(), dirPrefix
//					+ "-m-" + numberFormat.format(solrShard));
//
//			LOG.info("MTree merge renaming solr shard: " + solrShard
//					+ " from dir: " + dir.getPath() + " to dir: " + finalPath);
//			if (!rename(renamedPath, finalPath, fs)) {
//				return false;
//			}
//		}
//		return true;
//	}
//
//	private boolean waitForCompletion(Job job, boolean isVerbose)
//			throws IOException, InterruptedException, ClassNotFoundException {
//
//		LOG.debug("Running job: " + getJobInfo(job));
//		boolean success = job.waitForCompletion(isVerbose);
//		if (!success) {
//			LOG.error("Job failed! " + getJobInfo(job));
//		}
//		return success;
//	}
//
//	private void goodbye(Job job, long startTime) {
//		float secs = (System.nanoTime() - startTime) / (float) (10 ^ 9);
//		if (job != null) {
//			LOG.info("Succeeded with job: " + getJobInfo(job));
//		}
//		LOG.info("Success. Done. Program took {} secs. Goodbye.", secs);
//	}
//
//	private String getJobInfo(Job job) {
//		return "jobName: " + job.getJobName() + ", jobId: " + job.getJobID();
//	}
//
//	private boolean rename(Path src, Path dst, FileSystem fs)
//			throws IOException {
//		boolean success = fs.rename(src, dst);
//		if (!success) {
//			LOG.error("Cannot rename " + src + " to " + dst);
//		}
//		return success;
//	}
//
//	private boolean delete(Path path, boolean recursive, FileSystem fs)
//			throws IOException {
//		boolean success = fs.delete(path, recursive);
//		if (!success) {
//			LOG.error("Cannot delete " + path);
//		}
//		return success;
//	}
//
//	// same as IntMath.divide(p, q, RoundingMode.CEILING)
//	private long ceilDivide(long p, long q) {
//		long result = p / q;
//		if (p % q != 0) {
//			result++;
//		}
//		return result;
//	}
//
//	/**
//	 * Returns <tt>log<sub>base</sub>value</tt>.
//	 */
//	private double log(double base, double value) {
//		return Math.log(value) / Math.log(base);
//	}
//
// }