package uk.bl.wa.apache.solr.hadoop;
///*
// * Licensed to the Apache Software Foundation (ASF) under one or more
// * contributor license agreements. See the NOTICE file distributed with
// * this work for additional information regarding copyright ownership.
// * The ASF licenses this file to You under the Apache License, Version 2.0
// * (the "License"); you may not use this file except in compliance with
// * the License. You may obtain a copy of the License at
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// */
//package org.apache.solr.hadoop;
//
//import java.io.BufferedInputStream;
//import java.io.BufferedReader;
//import java.io.BufferedWriter;
//import java.io.File;
//import java.io.FileInputStream;
//import java.io.FileNotFoundException;
//import java.io.IOException;
//import java.io.InputStream;
//import java.io.InputStreamReader;
//import java.io.OutputStreamWriter;
//import java.io.Writer;
//import java.text.NumberFormat;
//import java.util.ArrayList;
//import java.util.Arrays;
//import java.util.Collections;
//import java.util.Comparator;
//import java.util.List;
//import java.util.Locale;
//import java.util.Map;
//
//import net.sourceforge.argparse4j.ArgumentParsers;
//import net.sourceforge.argparse4j.impl.Arguments;
//import net.sourceforge.argparse4j.impl.action.HelpArgumentAction;
//import net.sourceforge.argparse4j.impl.choice.RangeArgumentChoice;
//import net.sourceforge.argparse4j.impl.type.FileArgumentType;
//import net.sourceforge.argparse4j.inf.Argument;
//import net.sourceforge.argparse4j.inf.ArgumentGroup;
//import net.sourceforge.argparse4j.inf.ArgumentParser;
//import net.sourceforge.argparse4j.inf.ArgumentParserException;
//import net.sourceforge.argparse4j.inf.FeatureControl;
//import net.sourceforge.argparse4j.inf.Namespace;
//
//import org.apache.hadoop.conf.Configuration;
//import org.apache.hadoop.conf.Configured;
//import org.apache.hadoop.fs.FSDataOutputStream;
//import org.apache.hadoop.fs.FileStatus;
//import org.apache.hadoop.fs.FileSystem;
//import org.apache.hadoop.fs.Path;
//import org.apache.hadoop.fs.PathFilter;
//import org.apache.hadoop.io.NullWritable;
//import org.apache.hadoop.io.Text;
//import org.apache.hadoop.mapred.JobClient;
//import org.apache.hadoop.mapreduce.Job;
//import org.apache.hadoop.mapreduce.JobContext;
//import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
//import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
//import org.apache.hadoop.util.GenericOptionsParser;
//import org.apache.hadoop.util.Tool;
//import org.apache.hadoop.util.ToolRunner;
//import org.apache.log4j.PropertyConfigurator;
//import org.apache.solr.common.cloud.SolrZkClient;
//import org.apache.solr.hadoop.dedup.NoChangeUpdateConflictResolver;
//import org.apache.solr.hadoop.morphline.MorphlineMapRunner;
//import org.kitesdk.morphline.base.Fields;
//import org.restlet.engine.util.AlphaNumericComparator;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import uk.bl.wa.hadoop.indexer.WARCIndexerRunner;
//import uk.bl.wa.util.ConfigPrinter;
//
//import com.google.common.base.Charsets;
//import com.google.common.base.Preconditions;
//import com.google.common.io.ByteStreams;
//import com.typesafe.config.Config;
//import com.typesafe.config.ConfigFactory;
//import com.typesafe.config.ConfigRenderOptions;
//
//
///**
// * This is based on MapReduceIndexerTool and should be more powerful than
// * hacking it together manually, but depends on Hadoop 2.x.x series code so has
// * problems running against our aging cluster.
// *
// * Public API for a MapReduce batch job driver that creates a set of Solr index
// * shards from a set of input files and writes the indexes into HDFS, in a
// * flexible, scalable and fault-tolerant manner. Also supports merging the
// * output shards into a set of live customer facing Solr servers, typically a
// * SolrCloud.
// */
//public class WARCIndexerJob extends Configured implements Tool {
//
// Job job; // visible for testing only
//
// public static final String RESULTS_DIR = "results";
//
// static final String MAIN_MEMORY_RANDOMIZATION_THRESHOLD = WARCIndexerJob.class
// .getName() + ".mainMemoryRandomizationThreshold";
//
// private static final String FULL_INPUT_LIST = "full-input-list.txt";
//
// private static final Logger LOG = LoggerFactory
// .getLogger(WARCIndexerJob.class);
//
//
// /**
// * See http://argparse4j.sourceforge.net and for details see
// * http://argparse4j.sourceforge.net/usage.html
// */
// static final class MyArgumentParser {
//
// private static final String SHOW_NON_SOLR_CLOUD = "--show-non-solr-cloud";
//
// private boolean showNonSolrCloud = false;
//
// /**
// * Parses the given command line arguments.
// *
// * @return exitCode null indicates the caller shall proceed with
// * processing, non-null indicates the caller shall exit the
// * program with the given exit status code.
// */
// public Integer parseArgs(String[] args, Configuration conf, Options opts) {
// assert args != null;
// assert conf != null;
// assert opts != null;
//
// if (args.length == 0) {
// args = new String[] { "--help" };
// }
//
// showNonSolrCloud = Arrays.asList(args)
// .contains(SHOW_NON_SOLR_CLOUD); // intercept it first
//
// ArgumentParser parser = ArgumentParsers
// .newArgumentParser(
// "hadoop [GenericOptions]... jar solr-map-reduce-*.jar ",
// false)
// .defaultHelp(true)
// .description(
// "MapReduce batch job driver that takes a morphline and creates a set of Solr index shards from a set of input files "
// + "and writes the indexes into HDFS, in a flexible, scalable and fault-tolerant manner. "
// + "It also supports merging the output shards into a set of live customer facing Solr servers, "
// + "typically a SolrCloud. The program proceeds in several consecutive MapReduce based phases, as follows:"
// + "\n\n"
// + "1) Randomization phase: This (parallel) phase randomizes the list of input files in order to spread "
// + "indexing load more evenly among the mappers of the subsequent phase."
// + "\n\n"
// + "2) Mapper phase: This (parallel) phase takes the input files, extracts the relevant content, transforms it "
// + "and hands SolrInputDocuments to a set of reducers. "
// + "The ETL functionality is flexible and "
// + "customizable using chains of arbitrary morphline commands that pipe records from one transformation command to another. "
// + "Commands to parse and transform a set of standard data formats such as Avro, CSV, Text, HTML, XML, "
// + "PDF, Word, Excel, etc. are provided out of the box, and additional custom commands and parsers for additional "
// + "file or data formats can be added as morphline plugins. "
// + "This is done by implementing a simple Java interface that consumes a record (e.g. a file in the form of an InputStream "
// + "plus some headers plus contextual metadata) and generates as output zero or more records. "
// + "Any kind of data format can be indexed and any Solr documents for any kind of Solr schema can be generated, "
// + "and any custom ETL logic can be registered and executed.\n"
// + "Record fields, including MIME types, can also explicitly be passed by force from the CLI to the morphline, for example: "
// + "hadoop ... -D "
// + MorphlineMapRunner.MORPHLINE_FIELD_PREFIX
// + Fields.ATTACHMENT_MIME_TYPE
// + "=text/csv"
// + "\n\n"
// + "3) Reducer phase: This (parallel) phase loads the mapper's SolrInputDocuments into one EmbeddedSolrServer per reducer. "
// + "Each such reducer and Solr server can be seen as a (micro) shard. The Solr servers store their "
// + "data in HDFS."
// + "\n\n"
// + "4) Mapper-only merge phase: This (parallel) phase merges the set of reducer shards into the number of solr "
// + "shards expected by the user, using a mapper-only job. This phase is omitted if the number "
// + "of shards is already equal to the number of shards expected by the user. "
// + "\n\n"
// + "5) Go-live phase: This optional (parallel) phase merges the output shards of the previous phase into a set of "
// + "live customer facing Solr servers, typically a SolrCloud. "
// + "If this phase is omitted you can explicitly point each Solr server to one of the HDFS output shard directories."
// + "\n\n"
// + "Fault Tolerance: Mapper and reducer task attempts are retried on failure per the standard MapReduce semantics. "
// + "On program startup all data in the --output-dir is deleted if that output directory already exists. "
// + "If the whole job fails you can retry simply by rerunning the program again using the same arguments.");
//
// parser.addArgument("--help", "-help", "-h")
// .help("Show this help message and exit")
// .action(new HelpArgumentAction() {
// @Override
// public void run(ArgumentParser parser, Argument arg,
// Map<String, Object> attrs, String flag,
// Object value) throws ArgumentParserException {
// parser.printHelp();
// System.out.println();
// System.out.print(ToolRunnerHelpFormatter
// .getGenericCommandUsage());
// // ToolRunner.printGenericCommandUsage(System.out);
// System.out
// .println("Examples: \n\n"
// +
//
// "# (Re)index an Avro based Twitter tweet file:\n" +
// "sudo -u hdfs hadoop \\\n" +
// " --config /etc/hadoop/conf.cloudera.mapreduce1 \\\n" +
// " jar target/solr-map-reduce-*.jar \\\n" +
// " -D 'mapred.child.java.opts=-Xmx500m' \\\n" +
// // " -D 'mapreduce.child.java.opts=-Xmx500m' \\\n"
// // +
// " --log4j src/test/resources/log4j.properties \\\n" +
// " --morphline-file ../search-core/src/test/resources/test-morphlines/tutorialReadAvroContainer.conf \\\n" +
// " --solr-home-dir src/test/resources/solr/minimr \\\n" +
// " --output-dir hdfs://c2202.mycompany.com/user/$USER/test \\\n" +
// " --shards 1 \\\n" +
// " hdfs:///user/$USER/test-documents/sample-statuses-20120906-141433.avro\n" +
// "\n" +
// "# Go live by merging resulting index shards into a live Solr cluster\n" +
// "# (explicitly specify Solr URLs - for a SolrCloud cluster see next example):\n" +
// "sudo -u hdfs hadoop \\\n" +
// " --config /etc/hadoop/conf.cloudera.mapreduce1 \\\n" +
// " jar target/solr-map-reduce-*.jar \\\n" +
// " -D 'mapred.child.java.opts=-Xmx500m' \\\n" +
// // " -D 'mapreduce.child.java.opts=-Xmx500m' \\\n"
// // +
// " --log4j src/test/resources/log4j.properties \\\n" +
// " --morphline-file ../search-core/src/test/resources/test-morphlines/tutorialReadAvroContainer.conf \\\n" +
// " --solr-home-dir src/test/resources/solr/minimr \\\n" +
// " --output-dir hdfs://c2202.mycompany.com/user/$USER/test \\\n" +
// " --shard-url http://solr001.mycompany.com:8983/solr/collection1 \\\n" +
// " --shard-url http://solr002.mycompany.com:8983/solr/collection1 \\\n" +
// " --go-live \\\n" +
// " hdfs:///user/foo/indir\n" +
// "\n" +
// "# Go live by merging resulting index shards into a live SolrCloud cluster\n" +
// "# (discover shards and Solr URLs through ZooKeeper):\n" +
// "sudo -u hdfs hadoop \\\n" +
// " --config /etc/hadoop/conf.cloudera.mapreduce1 \\\n" +
// " jar target/solr-map-reduce-*.jar \\\n" +
// " -D 'mapred.child.java.opts=-Xmx500m' \\\n" +
// // " -D 'mapreduce.child.java.opts=-Xmx500m' \\\n"
// // +
// " --log4j src/test/resources/log4j.properties \\\n" +
// " --morphline-file ../search-core/src/test/resources/test-morphlines/tutorialReadAvroContainer.conf \\\n" +
// " --output-dir hdfs://c2202.mycompany.com/user/$USER/test \\\n" +
// " --zk-host zk01.mycompany.com:2181/solr \\\n" +
// " --collection collection1 \\\n" +
// " --go-live \\\n" +
// " hdfs:///user/foo/indir\n"
//);
// throw new FoundHelpArgument(); // Trick to prevent
// // processing of any
// // remaining
// // arguments
// }
// });
//
// ArgumentGroup requiredGroup = parser
// .addArgumentGroup("Required arguments");
//
// Argument outputDirArg = requiredGroup
// .addArgument("--output-dir")
// .metavar("HDFS_URI")
// .type(new PathArgumentType(conf) {
// @Override
// public Path convert(ArgumentParser parser,
// Argument arg, String value)
// throws ArgumentParserException {
// Path path = super.convert(parser, arg, value);
// if ("hdfs".equals(path.toUri().getScheme())
// && path.toUri().getAuthority() == null) {
// // TODO: consider defaulting to hadoop's
// // fs.default.name here or in
// // SolrRecordWriter.createEmbeddedSolrServer()
// throw new ArgumentParserException(
// "Missing authority in path URI: "
// + path, parser);
// }
// return path;
// }
// }.verifyHasScheme().verifyIsAbsolute()
// .verifyCanWriteParent())
// .required(true)
// .help("HDFS directory to write Solr indexes to. Inside there one output directory per shard will be generated. "
// + "Example: hdfs://c2202.mycompany.com/user/$USER/test");
//
// Argument inputListArg = parser
// .addArgument("--input-list")
// .action(Arguments.append())
// .metavar("URI")
// // .type(new
// // PathArgumentType(fs).verifyExists().verifyCanRead())
// .type(Path.class)
// .help("Local URI or HDFS URI of a UTF-8 encoded file containing a list of HDFS URIs to index, "
// + "one URI per line in the file. If '-' is specified, URIs are read from the standard input. "
// + "Multiple --input-list arguments can be specified.");
//
// Argument solrHomeDirArg = nonSolrCloud(parser
// .addArgument("--solr-home-dir")
// .metavar("DIR")
// .type(new FileArgumentType() {
// @Override
// public File convert(ArgumentParser parser,
// Argument arg, String value)
// throws ArgumentParserException {
// File solrHomeDir = super
// .convert(parser, arg, value);
// File solrConfigFile = new File(new File(
// solrHomeDir, "conf"), "solrconfig.xml");
// new FileArgumentType()
// .verifyExists()
// .verifyIsFile()
// .verifyCanRead()
// .convert(parser, arg,
// solrConfigFile.getPath());
// return solrHomeDir;
// }
// }.verifyIsDirectory().verifyCanRead())
// .required(false)
// .help("Relative or absolute path to a local dir containing Solr conf/ dir and in particular "
// + "conf/solrconfig.xml and optionally also lib/ dir. This directory will be uploaded to each MR task. "
// + "Example: src/test/resources/solr/minimr"));
//
// Argument updateConflictResolverArg = parser
// .addArgument("--update-conflict-resolver")
// .metavar("FQCN")
// .type(String.class)
// .setDefault(NoChangeUpdateConflictResolver.class.getName())
// .help("Fully qualified class name of a Java class that implements the UpdateConflictResolver interface. "
// + "This enables deduplication and ordering of a series of document updates for the same unique document "
// + "key. For example, a MapReduce batch job might index multiple files in the same job where some of the "
// + "files contain old and new versions of the very same document, using the same unique document key.\n"
// + "Typically, implementations of this interface forbid collisions by throwing an exception, or ignore all but "
// + "the most recent document version, or, in the general case, order colliding updates ascending from least "
// + "recent to most recent (partial) update. The caller of this interface (i.e. the Hadoop Reducer) will then "
// + "apply the updates to Solr in the order returned by the orderUpdates() method.\n"
// + "The default RetainMostRecentUpdateConflictResolver implementation ignores all but the most recent document "
// + "version, based on a configurable numeric Solr field, which defaults to the file_last_modified timestamp");
//
// Argument mappersArg = parser
// .addArgument("--mappers")
// .metavar("INTEGER")
// .type(Integer.class)
// .choices(new RangeArgumentChoice(-1, Integer.MAX_VALUE))
// // TODO: also support X% syntax where X is an integer
// .setDefault(-1)
// .help("Tuning knob that indicates the maximum number of MR mapper tasks to use. -1 indicates use all map slots "
// + "available on the cluster.");
//
// Argument reducersArg = parser
// .addArgument("--reducers")
// .metavar("INTEGER")
// .type(Integer.class)
// .choices(new RangeArgumentChoice(-2, Integer.MAX_VALUE))
// // TODO: also support X% syntax where X is an integer
// .setDefault(-1)
// .help("Tuning knob that indicates the number of reducers to index into. "
// + "0 is reserved for a mapper-only feature that may ship in a future release. "
// + "-1 indicates use all reduce slots available on the cluster. "
// + "-2 indicates use one reducer per output shard, which disables the mtree merge MR algorithm. "
// + "The mtree merge MR algorithm improves scalability by spreading load "
// + "(in particular CPU load) among a number of parallel reducers that can be much larger than the number "
// + "of solr shards expected by the user. It can be seen as an extension of concurrent lucene merges "
// + "and tiered lucene merges to the clustered case. The subsequent mapper-only phase "
// + "merges the output of said large number of reducers to the number of shards expected by the user, "
// + "again by utilizing more available parallelism on the cluster.");
//
// Argument fanoutArg = parser.addArgument("--fanout")
// .metavar("INTEGER").type(Integer.class)
// .choices(new RangeArgumentChoice(2, Integer.MAX_VALUE))
// .setDefault(Integer.MAX_VALUE)
// .help(FeatureControl.SUPPRESS);
//
// Argument maxSegmentsArg = parser
// .addArgument("--max-segments")
// .metavar("INTEGER")
// .type(Integer.class)
// .choices(new RangeArgumentChoice(1, Integer.MAX_VALUE))
// .setDefault(1)
// .help("Tuning knob that indicates the maximum number of segments to be contained on output in the index of "
// + "each reducer shard. After a reducer has built its output index it applies a merge policy to merge segments "
// + "until there are <= maxSegments lucene segments left in this index. "
// + "Merging segments involves reading and rewriting all data in all these segment files, "
// + "potentially multiple times, which is very I/O intensive and time consuming. "
// + "However, an index with fewer segments can later be merged faster, "
// + "and it can later be queried faster once deployed to a live Solr serving shard. "
// + "Set maxSegments to 1 to optimize the index for low query latency. "
// + "In a nutshell, a small maxSegments value trades indexing latency for subsequently improved query latency. "
// + "This can be a reasonable trade-off for batch indexing systems.");
//
// Argument fairSchedulerPoolArg = parser
// .addArgument("--fair-scheduler-pool")
// .metavar("STRING")
// .help("Optional tuning knob that indicates the name of the fair scheduler pool to submit jobs to. "
// + "The Fair Scheduler is a pluggable MapReduce scheduler that provides a way to share large clusters. "
// + "Fair scheduling is a method of assigning resources to jobs such that all jobs get, on average, an "
// + "equal share of resources over time. When there is a single job running, that job uses the entire "
//							+ "cluster. When other jobs are submitted, task slots that free up are assigned to the new jobs, so "
// + "that each job gets roughly the same amount of CPU time. Unlike the default Hadoop scheduler, which "
// + "forms a queue of jobs, this lets short jobs finish in reasonable time while not starving long jobs. "
//							+ "It is also an easy way to share a cluster between multiple users. Fair sharing can also work with "
// + "job priorities - the priorities are used as weights to determine the fraction of total compute time "
// + "that each job gets.");
//
// Argument dryRunArg = parser
// .addArgument("--dry-run")
// .action(Arguments.storeTrue())
// .help("Run in local mode and print documents to stdout instead of loading them into Solr. This executes "
// + "the morphline in the client process (without submitting a job to MR) for quicker turnaround during "
// + "early trial & debug sessions.");
//
// Argument log4jConfigFileArg = parser
// .addArgument("--log4j")
// .metavar("FILE")
// .type(new FileArgumentType().verifyExists().verifyIsFile()
// .verifyCanRead())
// .help("Relative or absolute path to a log4j.properties config file on the local file system. This file "
// + "will be uploaded to each MR task. Example: /path/to/log4j.properties");
//
// Argument verboseArg = parser.addArgument("--verbose", "-v")
// .action(Arguments.storeTrue())
// .help("Turn on verbose output.");
//
// parser.addArgument(SHOW_NON_SOLR_CLOUD)
// .action(Arguments.storeTrue())
// .help("Also show options for Non-SolrCloud mode as part of --help.");
//
// ArgumentGroup clusterInfoGroup = parser
// .addArgumentGroup("Cluster arguments")
// .description(
// "Arguments that provide information about your Solr cluster. "
// + nonSolrCloud("If you are building shards for a SolrCloud cluster, pass the --zk-host argument. "
// + "If you are building shards for "
// + "a Non-SolrCloud cluster, pass the --shard-url argument one or more times. To build indexes for "
// + "a replicated Non-SolrCloud cluster with --shard-url, pass replica urls consecutively and also pass --shards. "
// + "Using --go-live requires either --zk-host or --shard-url."));
//
// Argument zkHostArg = clusterInfoGroup
// .addArgument("--zk-host")
// .metavar("STRING")
// .type(String.class)
// .help("The address of a ZooKeeper ensemble being used by a SolrCloud cluster. "
// + "This ZooKeeper ensemble will be examined to determine the number of output "
// + "shards to create as well as the Solr URLs to merge the output shards into when using the --go-live option. "
// + "Requires that you also pass the --collection to merge the shards into.\n"
// + "\n"
// + "The --zk-host option implements the same partitioning semantics as the standard SolrCloud "
//						+ "Near-Real-Time (NRT) API. This enables mixing batch updates from MapReduce ingestion with "
// + "updates from standard Solr NRT ingestion on the same SolrCloud cluster, "
// + "using identical unique document keys.\n"
// + "\n"
// + "Format is: a list of comma separated host:port pairs, each corresponding to a zk "
// + "server. Example: '127.0.0.1:2181,127.0.0.1:2182,127.0.0.1:2183' If "
// + "the optional chroot suffix is used the example would look "
// + "like: '127.0.0.1:2181/solr,127.0.0.1:2182/solr,127.0.0.1:2183/solr' "
// + "where the client would be rooted at '/solr' and all paths "
// + "would be relative to this root - i.e. getting/setting/etc... "
// + "'/foo/bar' would result in operations being run on "
// + "'/solr/foo/bar' (from the server perspective).\n"
// + nonSolrCloud("\n"
// + "If --solr-home-dir is not specified, the Solr home directory for the collection "
// + "will be downloaded from this ZooKeeper ensemble."));
//
// Argument shardsArg = nonSolrCloud(clusterInfoGroup
// .addArgument("--shards").metavar("INTEGER")
// .type(Integer.class)
// .choices(new RangeArgumentChoice(1, Integer.MAX_VALUE))
// .help("Number of output shards to generate."));
//
// Argument collectionArg = parser
// .addArgument("--collection")
// .metavar("STRING")
// .help("The SolrCloud collection to merge shards into when using --zk-host. Example: collection1");
//
// // trailing positional arguments
// Argument inputFilesArg = parser
// .addArgument("input-files")
// .metavar("HDFS_URI")
// .type(new PathArgumentType(conf).verifyHasScheme()
// .verifyExists().verifyCanRead()).nargs("*")
// .setDefault()
// .help("HDFS URI of file or directory tree to index.");
//
// Argument configPathArg = parser.addArgument("--config-path")
// .metavar("STRING").help("The indexer config file to load.");
//
// Argument dumpConfigArg = parser.addArgument("--dump-config")
// .action(Arguments.storeTrue()).help("Dump indexer config.");
//
// Namespace ns;
// try {
// ns = parser.parseArgs(args);
// } catch (FoundHelpArgument e) {
// return 0;
// } catch (ArgumentParserException e) {
// parser.handleError(e);
// return 1;
// }
//
// opts.log4jConfigFile = (File) ns.get(log4jConfigFileArg.getDest());
// if (opts.log4jConfigFile != null) {
// PropertyConfigurator.configure(opts.log4jConfigFile.getPath());
// }
// LOG.debug("Parsed command line args: {}", ns);
//
// opts.inputLists = ns.getList(inputListArg.getDest());
// if (opts.inputLists == null) {
// opts.inputLists = Collections.EMPTY_LIST;
// }
// opts.inputFiles = ns.getList(inputFilesArg.getDest());
// opts.outputDir = (Path) ns.get(outputDirArg.getDest());
// opts.mappers = ns.getInt(mappersArg.getDest());
// opts.reducers = ns.getInt(reducersArg.getDest());
// opts.updateConflictResolver = ns
// .getString(updateConflictResolverArg.getDest());
// opts.fanout = ns.getInt(fanoutArg.getDest());
// opts.maxSegments = ns.getInt(maxSegmentsArg.getDest());
// opts.solrHomeDir = (File) ns.get(solrHomeDirArg.getDest());
// opts.fairSchedulerPool = ns.getString(fairSchedulerPoolArg
// .getDest());
// opts.isDryRun = ns.getBoolean(dryRunArg.getDest());
// opts.isVerbose = ns.getBoolean(verboseArg.getDest());
// opts.zkHost = ns.getString(zkHostArg.getDest());
// opts.shards = ns.getInt(shardsArg.getDest());
// opts.collection = ns.getString(collectionArg.getDest());
//
// opts.configPath = ns.getString(configPathArg.getDest());
// opts.dumpConfig = ns.getBoolean(dumpConfigArg.getDest());
//
// try {
// if (opts.reducers == 0) {
// throw new ArgumentParserException(
// "--reducers must not be zero", parser);
// }
// } catch (ArgumentParserException e) {
// parser.handleError(e);
// return 1;
// }
//
// if (opts.inputLists.isEmpty() && opts.inputFiles.isEmpty()) {
// LOG.info("No input files specified - nothing to process");
// return 0; // nothing to process
// }
// return null;
// }
//
// // make it a "hidden" option, i.e. the option is functional and enabled
// // but not shown in --help output
// private Argument nonSolrCloud(Argument arg) {
// return showNonSolrCloud ? arg : arg.help(FeatureControl.SUPPRESS);
// }
//
// private String nonSolrCloud(String msg) {
// return showNonSolrCloud ? msg : "";
// }
//
// /**
// * Marker trick to prevent processing of any remaining arguments once
// * --help option has been parsed
// */
// private static final class FoundHelpArgument extends RuntimeException {
// }
// }
//
// // END OF INNER CLASS
//
// static List<List<String>> buildShardUrls(List<Object> urls,
// Integer numShards) {
// if (urls == null)
// return null;
// List<List<String>> shardUrls = new ArrayList<List<String>>(urls.size());
// List<String> list = null;
//
// int sz;
// if (numShards == null) {
// numShards = urls.size();
// }
// sz = (int) Math.ceil(urls.size() / (float) numShards);
// for (int i = 0; i < urls.size(); i++) {
// if (i % sz == 0) {
// list = new ArrayList<String>();
// shardUrls.add(list);
// }
// list.add((String) urls.get(i));
// }
//
// return shardUrls;
// }
//
// static final class Options {
// String collection;
// String zkHost;
// List<Path> inputLists;
// List<Path> inputFiles;
// Path outputDir;
// int mappers;
// int reducers;
// String updateConflictResolver;
// int fanout;
// Integer shards;
// int maxSegments;
// File solrHomeDir;
// String fairSchedulerPool;
// boolean isDryRun;
// File log4jConfigFile;
// boolean isVerbose;
// String configPath;
// boolean dumpConfig;
// }
//
// // END OF INNER CLASS
//
//
// /** API for command line clients */
// public static void main(String[] args) throws Exception {
// int res = ToolRunner.run(new Configuration(),
// new WARCIndexerJob(),
// args);
// System.exit(res);
// }
//
// public WARCIndexerJob() {
// }
//
// @Override
// public int run(String[] args) throws Exception {
// Options opts = new Options();
// Integer exitCode = new MyArgumentParser().parseArgs(args, getConf(),
// opts);
// if (exitCode != null) {
// return exitCode;
// }
// return run(opts);
// }
//
// /**
// * API for Java clients; visible for testing; may become a public API
// * eventually
// */
// int run(Options options) throws Exception {
// if (getConf().getBoolean("isMR1", false)
// && "local".equals(getConf().get("mapred.job.tracker"))) {
// throw new IllegalStateException(
// "Running with LocalJobRunner (i.e. all of Hadoop inside a single JVM) is not supported "
// + "because LocalJobRunner does not (yet) implement the Hadoop Distributed Cache feature, "
// + "which is required for passing files via --files and --libjars");
// }
//
// long programStartTime = System.nanoTime();
// if (options.fairSchedulerPool != null) {
// getConf().set("mapred.fairscheduler.pool",
// options.fairSchedulerPool);
// }
// getConf().setInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS,
// options.maxSegments);
//
// // switch off a false warning about allegedly not implementing Tool
// // also see
// // http://hadoop.6.n7.nabble.com/GenericOptionsParser-warning-td8103.html
// // also see https://issues.apache.org/jira/browse/HADOOP-8183
// getConf().setBoolean("mapred.used.genericoptionsparser", true);
//
// if (options.log4jConfigFile != null) {
// Utils.setLogConfigFile(options.log4jConfigFile, getConf());
// addDistributedCacheFile(options.log4jConfigFile, getConf());
// }
//
// job = Job.getInstance(getConf());
// job.setJarByClass(getClass());
//
// int mappers = new JobClient(job.getConfiguration()).getClusterStatus()
// .getMaxMapTasks(); // MR1
// // int mappers =
// // job.getCluster().getClusterStatus().getMapSlotCapacity(); // Yarn
// // only
// LOG.info("Cluster reports {} mapper slots", mappers);
//
// if (options.mappers == -1) {
//			mappers = 8 * mappers; // better accommodate stragglers
// } else {
// mappers = options.mappers;
// }
// if (mappers <= 0) {
// throw new IllegalStateException("Illegal number of mappers: "
// + mappers);
// }
// options.mappers = mappers;
//
// FileSystem fs = options.outputDir.getFileSystem(job.getConfiguration());
// if (fs.exists(options.outputDir)
// && !delete(options.outputDir, true, fs)) {
// return -1;
// }
// Path outputResultsDir = new Path(options.outputDir, RESULTS_DIR);
// Path outputReduceDir = new Path(options.outputDir, "reducers");
// Path outputStep1Dir = new Path(options.outputDir, "tmp1");
// Path outputStep2Dir = new Path(options.outputDir, "tmp2");
// Path outputTreeMergeStep = new Path(options.outputDir,
// "mtree-merge-output");
// Path fullInputList = new Path(outputStep1Dir, FULL_INPUT_LIST);
//
// LOG.debug("Creating list of input files for mappers: {}", fullInputList);
// long numFiles = addInputFiles(options.inputFiles, options.inputLists,
// fullInputList, job.getConfiguration());
// if (numFiles == 0) {
// LOG.info("No input files found - nothing to process");
// return 0;
// }
// int numLinesPerSplit = (int) ceilDivide(numFiles, mappers);
// if (numLinesPerSplit < 0) { // numeric overflow from downcasting long to
// // int?
// numLinesPerSplit = Integer.MAX_VALUE;
// }
// numLinesPerSplit = Math.max(1, numLinesPerSplit);
//
// int realMappers = Math.min(mappers,
// (int) ceilDivide(numFiles, numLinesPerSplit));
// calculateNumReducers(options, realMappers);
// int reducers = options.reducers;
// LOG.info(
// "Using these parameters: "
// + "numFiles: {}, mappers: {}, realMappers: {}, reducers: {}, shards: {}, fanout: {}, maxSegments: {}",
// new Object[] { numFiles, mappers, realMappers, reducers,
// options.shards, options.fanout, options.maxSegments });
//
//
// long startTime = System.nanoTime();
// float secs = (System.nanoTime() - startTime) / (float) (10 ^ 9);
//
// // ---- ---- ----
//
// // Store application properties where the mappers/reducers can access
// // them
// Config index_conf;
// if (options.configPath != null) {
// index_conf = ConfigFactory.parseFile(new File(options.configPath));
// } else {
// index_conf = ConfigFactory.load();
// }
// if (options.dumpConfig) {
// ConfigPrinter.print(index_conf);
// System.exit(0);
// }
// job.getConfiguration().set(
// WARCIndexerRunner.CONFIG_PROPERTIES,
// index_conf.withOnlyPath("warc").root()
// .render(ConfigRenderOptions.concise()));
//
// job.setInputFormatClass(WebArchiveFileInputFormat.class);
// Class<WebArchiveIndexerMapper> mapperClass = WebArchiveIndexerMapper.class;
// job.setMapperClass(mapperClass);
//
// // ---- ---- ----
//
// FileOutputFormat.setOutputPath(job, outputReduceDir);
// job.setJobName(getClass().getName() + "/"
// + Utils.getShortClassName(mapperClass));
//
// if (job.getConfiguration().get(JobContext.REDUCE_CLASS_ATTR) == null) { // enable
// // customization
// job.setReducerClass(SolrReducer.class);
// }
// if (options.updateConflictResolver == null) {
// throw new IllegalArgumentException(
// "updateConflictResolver must not be null");
// }
// job.getConfiguration().set(SolrReducer.UPDATE_CONFLICT_RESOLVER,
// options.updateConflictResolver);
//
// if (options.zkHost != null) {
// assert options.collection != null;
// /*
// * MapReduce partitioner that partitions the Mapper output such that
// * each SolrInputDocument gets sent to the SolrCloud shard that it
// * would have been sent to if the document were ingested via the
// * standard SolrCloud Near Real Time (NRT) API.
// *
// * In other words, this class implements the same partitioning
// * semantics as the standard SolrCloud NRT API. This makes it possible to
// * mix batch updates from MapReduce ingestion with updates from standard
// * NRT ingestion on the same SolrCloud cluster, using identical
// * unique document keys.
// */
// if (job.getConfiguration().get(JobContext.PARTITIONER_CLASS_ATTR) == null) { // enable
// // customization
// job.setPartitionerClass(SolrCloudPartitioner.class);
// }
// job.getConfiguration().set(SolrCloudPartitioner.ZKHOST,
// options.zkHost);
// job.getConfiguration().set(SolrCloudPartitioner.COLLECTION,
// options.collection);
// }
// job.getConfiguration().setInt(SolrCloudPartitioner.SHARDS,
// options.shards);
//
// job.setOutputFormatClass(SolrOutputFormat.class);
// if (options.solrHomeDir != null) {
// SolrOutputFormat.setupSolrHomeCache(options.solrHomeDir, job);
// } else {
// assert options.zkHost != null;
// // use the config that this collection uses for the SolrHomeCache.
// ZooKeeperInspector zki = new ZooKeeperInspector();
// SolrZkClient zkClient = zki.getZkClient(options.zkHost);
// try {
// String configName = zki.readConfigName(zkClient,
// options.collection);
// File tmpSolrHomeDir = zki.downloadConfigDir(zkClient,
// configName);
// SolrOutputFormat.setupSolrHomeCache(tmpSolrHomeDir, job);
// options.solrHomeDir = tmpSolrHomeDir;
// } finally {
// zkClient.close();
// }
// }
//
// job.setNumReduceTasks(reducers);
// job.setOutputKeyClass(Text.class);
// job.setOutputValueClass(SolrInputDocumentWritable.class);
// LOG.info("Indexing {} files using {} real mappers into {} reducers",
// new Object[] { numFiles, realMappers, reducers });
// startTime = System.nanoTime();
// if (!waitForCompletion(job, options.isVerbose)) {
// return -1; // job failed
// }
//
// secs = (System.nanoTime() - startTime) / (float) 1e9; // was (10 ^ 9): ^ is XOR (= 3), not 10^9
// LOG.info(
// "Done. Indexing {} files using {} real mappers into {} reducers took {} secs",
// new Object[] { numFiles, realMappers, reducers, secs });
//
// int mtreeMergeIterations = 0;
// if (reducers > options.shards) {
// mtreeMergeIterations = (int) Math.round(log(options.fanout,
// reducers / options.shards));
// }
// LOG.debug("MTree merge iterations to do: {}", mtreeMergeIterations);
// int mtreeMergeIteration = 1;
// while (reducers > options.shards) { // run a mtree merge iteration
// job = Job.getInstance(getConf());
// job.setJarByClass(getClass());
// job.setJobName(getClass().getName() + "/"
// + Utils.getShortClassName(TreeMergeMapper.class));
// job.setMapperClass(TreeMergeMapper.class);
// job.setOutputFormatClass(TreeMergeOutputFormat.class);
// job.setNumReduceTasks(0);
// job.setOutputKeyClass(Text.class);
// job.setOutputValueClass(NullWritable.class);
// job.setInputFormatClass(NLineInputFormat.class);
//
// Path inputStepDir = new Path(options.outputDir,
// "mtree-merge-input-iteration" + mtreeMergeIteration);
// fullInputList = new Path(inputStepDir, FULL_INPUT_LIST);
// LOG.debug(
// "MTree merge iteration {}/{}: Creating input list file for mappers {}",
// new Object[] { mtreeMergeIteration, mtreeMergeIterations,
// fullInputList });
// numFiles = createTreeMergeInputDirList(outputReduceDir, fs,
// fullInputList);
// if (numFiles != reducers) {
// throw new IllegalStateException("Not same reducers: "
// + reducers + ", numFiles: " + numFiles);
// }
// NLineInputFormat.addInputPath(job, fullInputList);
// NLineInputFormat.setNumLinesPerSplit(job, options.fanout);
// FileOutputFormat.setOutputPath(job, outputTreeMergeStep);
//
// LOG.info(
// "MTree merge iteration {}/{}: Merging {} shards into {} shards using fanout {}",
// new Object[] { mtreeMergeIteration, mtreeMergeIterations,
// reducers, (reducers / options.fanout),
// options.fanout });
// startTime = System.nanoTime();
// if (!waitForCompletion(job, options.isVerbose)) {
// return -1; // job failed
// }
// if (!renameTreeMergeShardDirs(outputTreeMergeStep, job, fs)) {
// return -1;
// }
// secs = (System.nanoTime() - startTime) / (float) 1e9; // was (10 ^ 9): ^ is XOR (= 3), not 10^9
// LOG.info(
// "MTree merge iteration {}/{}: Done. Merging {} shards into {} shards using fanout {} took {} secs",
// new Object[] { mtreeMergeIteration, mtreeMergeIterations,
// reducers, (reducers / options.fanout),
// options.fanout, secs });
//
// if (!delete(outputReduceDir, true, fs)) {
// return -1;
// }
// if (!rename(outputTreeMergeStep, outputReduceDir, fs)) {
// return -1;
// }
// assert reducers % options.fanout == 0;
// reducers = reducers / options.fanout;
// mtreeMergeIteration++;
// }
// assert reducers == options.shards;
//
// // normalize output shard dir prefix, i.e.
// // rename part-r-00000 to part-00000 (stems from zero tree merge
// // iterations)
// // rename part-m-00000 to part-00000 (stems from > 0 tree merge
// // iterations)
// for (FileStatus stats : fs.listStatus(outputReduceDir)) {
// String dirPrefix = SolrOutputFormat.getOutputName(job);
// Path srcPath = stats.getPath();
// if (stats.isDirectory() && srcPath.getName().startsWith(dirPrefix)) {
// String dstName = dirPrefix
// + srcPath.getName().substring(
// dirPrefix.length() + "-m".length());
// Path dstPath = new Path(srcPath.getParent(), dstName);
// if (!rename(srcPath, dstPath, fs)) {
// return -1;
// }
// }
// }
// ;
//
// // publish results dir
// if (!rename(outputReduceDir, outputResultsDir, fs)) {
// return -1;
// }
//
// goodbye(job, programStartTime);
// return 0;
// }
//
// private void calculateNumReducers(Options options, int realMappers)
// throws IOException {
// if (options.shards <= 0) {
// throw new IllegalStateException("Illegal number of shards: "
// + options.shards);
// }
// if (options.fanout <= 1) {
// throw new IllegalStateException("Illegal fanout: " + options.fanout);
// }
// if (realMappers <= 0) {
// throw new IllegalStateException("Illegal realMappers: "
// + realMappers);
// }
//
// int reducers = new JobClient(job.getConfiguration()).getClusterStatus()
// .getMaxReduceTasks(); // MR1
// // reducers =
// // job.getCluster().getClusterStatus().getReduceSlotCapacity(); // Yarn
// // only
// LOG.info("Cluster reports {} reduce slots", reducers);
//
// if (options.reducers == -2) {
// reducers = options.shards;
// } else if (options.reducers == -1) {
// reducers = Math.min(reducers, realMappers); // no need to use many
// // reducers when using
// // few mappers
// } else {
// if (options.reducers == 0) {
// throw new IllegalStateException("Illegal zero reducers");
// }
// reducers = options.reducers;
// }
// reducers = Math.max(reducers, options.shards);
//
// if (reducers != options.shards) {
// // Ensure fanout isn't misconfigured. fanout can't meaningfully be
// // larger than what would be
// // required to merge all leaf shards in one single tree merge
// // iteration into root shards
// options.fanout = Math.min(options.fanout,
// (int) ceilDivide(reducers, options.shards));
//
// // Ensure invariant reducers == options.shards * (fanout ^ N) where
// // N is an integer >= 1.
// // N is the number of mtree merge iterations.
// // This helps to evenly spread docs among root shards and simplifies
// // the impl of the mtree merge algorithm.
// int s = options.shards;
// while (s < reducers) {
// s = s * options.fanout;
// }
// reducers = s;
// assert reducers % options.fanout == 0;
// }
// options.reducers = reducers;
// }
//
// private long addInputFiles(List<Path> inputFiles, List<Path> inputLists,
// Path fullInputList, Configuration conf) throws IOException {
//
// long numFiles = 0;
// FileSystem fs = fullInputList.getFileSystem(conf);
// FSDataOutputStream out = fs.create(fullInputList);
// try {
// Writer writer = new BufferedWriter(new OutputStreamWriter(out,
// "UTF-8"));
//
// for (Path inputFile : inputFiles) {
// FileSystem inputFileFs = inputFile.getFileSystem(conf);
// if (inputFileFs.exists(inputFile)) {
// PathFilter pathFilter = new PathFilter() {
// @Override
// public boolean accept(Path path) { // ignore "hidden"
// // files and dirs
// return !(path.getName().startsWith(".") || path
// .getName().startsWith("_"));
// }
// };
// numFiles += addInputFilesRecursively(inputFile, writer,
// inputFileFs, pathFilter);
// }
// }
//
// for (Path inputList : inputLists) {
// InputStream in;
// if (inputList.toString().equals("-")) {
// in = System.in;
// } else if (inputList.isAbsoluteAndSchemeAuthorityNull()) {
// in = new BufferedInputStream(new FileInputStream(
// inputList.toString()));
// } else {
// in = inputList.getFileSystem(conf).open(inputList);
// }
// try {
// BufferedReader reader = new BufferedReader(
// new InputStreamReader(in, "UTF-8"));
// String line;
// while ((line = reader.readLine()) != null) {
// writer.write(line + "\n");
// numFiles++;
// }
// reader.close();
// } finally {
// in.close();
// }
// }
//
// writer.close();
// } finally {
// out.close();
// }
// return numFiles;
// }
//
// /**
// * Add the specified file to the input set, if path is a directory then add
// * the files contained therein.
// */
// private long addInputFilesRecursively(Path path, Writer writer,
// FileSystem fs, PathFilter pathFilter) throws IOException {
// long numFiles = 0;
// for (FileStatus stat : fs.listStatus(path, pathFilter)) {
// LOG.debug("Adding path {}", stat.getPath());
// if (stat.isDirectory()) {
// numFiles += addInputFilesRecursively(stat.getPath(), writer,
// fs, pathFilter);
// } else {
// writer.write(stat.getPath().toString() + "\n");
// numFiles++;
// }
// }
// return numFiles;
// }
//
// // do the same as if the user had typed 'hadoop ... --files <file>'
// private void addDistributedCacheFile(File file, Configuration conf)
// throws IOException {
// String HADOOP_TMP_FILES = "tmpfiles"; // see Hadoop's
// // GenericOptionsParser
// String tmpFiles = conf.get(HADOOP_TMP_FILES, "");
// if (tmpFiles.length() > 0) { // already present?
// tmpFiles = tmpFiles + ",";
// }
// GenericOptionsParser parser = new GenericOptionsParser(
// new Configuration(conf), new String[] { "--files",
// file.getCanonicalPath() });
// String additionalTmpFiles = parser.getConfiguration().get(
// HADOOP_TMP_FILES);
// assert additionalTmpFiles != null;
// assert additionalTmpFiles.length() > 0;
// tmpFiles += additionalTmpFiles;
// conf.set(HADOOP_TMP_FILES, tmpFiles);
// }
//
// private int createTreeMergeInputDirList(Path outputReduceDir,
// FileSystem fs, Path fullInputList) throws FileNotFoundException,
// IOException {
//
// FileStatus[] dirs = listSortedOutputShardDirs(outputReduceDir, fs);
// int numFiles = 0;
// FSDataOutputStream out = fs.create(fullInputList);
// try {
// Writer writer = new BufferedWriter(new OutputStreamWriter(out,
// "UTF-8"));
// for (FileStatus stat : dirs) {
// LOG.debug("Adding path {}", stat.getPath());
// Path dir = new Path(stat.getPath(), "data/index");
// if (!fs.isDirectory(dir)) {
// throw new IllegalStateException("Not a directory: " + dir);
// }
// writer.write(dir.toString() + "\n");
// numFiles++;
// }
// writer.close();
// } finally {
// out.close();
// }
// return numFiles;
// }
//
// private FileStatus[] listSortedOutputShardDirs(Path outputReduceDir,
// FileSystem fs) throws FileNotFoundException, IOException {
//
// final String dirPrefix = SolrOutputFormat.getOutputName(job);
// FileStatus[] dirs = fs.listStatus(outputReduceDir, new PathFilter() {
// @Override
// public boolean accept(Path path) {
// return path.getName().startsWith(dirPrefix);
// }
// });
// for (FileStatus dir : dirs) {
// if (!dir.isDirectory()) {
// throw new IllegalStateException("Not a directory: "
// + dir.getPath());
// }
// }
//
// // use alphanumeric sort (rather than lexicographical sort) to properly
// // handle more than 99999 shards
// Arrays.sort(dirs, new Comparator<FileStatus>() {
// @Override
// public int compare(FileStatus f1, FileStatus f2) {
// return new AlphaNumericComparator().compare(f1.getPath()
// .getName(), f2.getPath().getName());
// }
// });
//
// return dirs;
// }
//
// /*
// * You can run MapReduceIndexerTool in Solrcloud mode, and once the MR job
// * completes, you can use the standard solrj Solrcloud API to send doc
// * updates and deletes to SolrCloud, and those updates and deletes will go
// * to the right Solr shards, and it will work just fine.
// *
// * The MapReduce framework doesn't guarantee that input split N goes to the
// * map task with the taskId = N. The job tracker and Yarn schedule and
// * assign tasks, considering data locality aspects, but without regard of
// * the input split# within the overall list of input splits. In other
// * words, split# != taskId can be true.
// *
// * To deal with this issue, our mapper tasks write a little auxiliary
// * metadata file (per task) that tells the job driver which taskId processed
// * which split#. Once the mapper-only job is completed, the job driver
// * renames the output dirs such that the dir name contains the true solr
// * shard id, based on these auxiliary files.
// *
// * This way each doc gets assigned to the right Solr shard even with
// * #reducers > #solrshards
// *
// * Example for a merge with two shards:
// *
// * part-m-00000 and part-m-00001 goes to outputShardNum = 0 and will end up
// * in merged part-m-00000 part-m-00002 and part-m-00003 goes to
// * outputShardNum = 1 and will end up in merged part-m-00001 part-m-00004
// * and part-m-00005 goes to outputShardNum = 2 and will end up in merged
// * part-m-00002 ... and so on
// *
// * Also see run() method above where it uses
// * NLineInputFormat.setNumLinesPerSplit(job, options.fanout)
// *
// * Also see
// * TreeMergeOutputFormat.TreeMergeRecordWriter.writeShardNumberFile()
// */
// private boolean renameTreeMergeShardDirs(Path outputTreeMergeStep, Job job,
// FileSystem fs) throws IOException {
// final String dirPrefix = SolrOutputFormat.getOutputName(job);
// FileStatus[] dirs = fs.listStatus(outputTreeMergeStep,
// new PathFilter() {
// @Override
// public boolean accept(Path path) {
// return path.getName().startsWith(dirPrefix);
// }
// });
//
// for (FileStatus dir : dirs) {
// if (!dir.isDirectory()) {
// throw new IllegalStateException("Not a directory: "
// + dir.getPath());
// }
// }
//
// // Example: rename part-m-00004 to _part-m-00004
// for (FileStatus dir : dirs) {
// Path path = dir.getPath();
// Path renamedPath = new Path(path.getParent(), "_" + path.getName());
// if (!rename(path, renamedPath, fs)) {
// return false;
// }
// }
//
// // Example: rename _part-m-00004 to part-m-00002
// for (FileStatus dir : dirs) {
// Path path = dir.getPath();
// Path renamedPath = new Path(path.getParent(), "_" + path.getName());
//
// // read auxiliary metadata file (per task) that tells which taskId
// // processed which split# aka solrShard
// Path solrShardNumberFile = new Path(renamedPath,
// TreeMergeMapper.SOLR_SHARD_NUMBER);
// InputStream in = fs.open(solrShardNumberFile);
// byte[] bytes = ByteStreams.toByteArray(in);
// in.close();
// Preconditions.checkArgument(bytes.length > 0);
// int solrShard = Integer.parseInt(new String(bytes, Charsets.UTF_8));
// if (!delete(solrShardNumberFile, false, fs)) {
// return false;
// }
//
// // same as FileOutputFormat.NUMBER_FORMAT
// NumberFormat numberFormat = NumberFormat
// .getInstance(Locale.ENGLISH);
// numberFormat.setMinimumIntegerDigits(5);
// numberFormat.setGroupingUsed(false);
// Path finalPath = new Path(renamedPath.getParent(), dirPrefix
// + "-m-" + numberFormat.format(solrShard));
//
// LOG.info("MTree merge renaming solr shard: " + solrShard
// + " from dir: " + dir.getPath() + " to dir: " + finalPath);
// if (!rename(renamedPath, finalPath, fs)) {
// return false;
// }
// }
// return true;
// }
//
// private boolean waitForCompletion(Job job, boolean isVerbose)
// throws IOException, InterruptedException, ClassNotFoundException {
//
// LOG.debug("Running job: " + getJobInfo(job));
// boolean success = job.waitForCompletion(isVerbose);
// if (!success) {
// LOG.error("Job failed! " + getJobInfo(job));
// }
// return success;
// }
//
// private void goodbye(Job job, long startTime) {
// float secs = (System.nanoTime() - startTime) / (float) 1e9; // was (10 ^ 9): ^ is XOR (= 3), not 10^9
// if (job != null) {
// LOG.info("Succeeded with job: " + getJobInfo(job));
// }
// LOG.info("Success. Done. Program took {} secs. Goodbye.", secs);
// }
//
// private String getJobInfo(Job job) {
// return "jobName: " + job.getJobName() + ", jobId: " + job.getJobID();
// }
//
// private boolean rename(Path src, Path dst, FileSystem fs)
// throws IOException {
// boolean success = fs.rename(src, dst);
// if (!success) {
// LOG.error("Cannot rename " + src + " to " + dst);
// }
// return success;
// }
//
// private boolean delete(Path path, boolean recursive, FileSystem fs)
// throws IOException {
// boolean success = fs.delete(path, recursive);
// if (!success) {
// LOG.error("Cannot delete " + path);
// }
// return success;
// }
//
// // same as IntMath.divide(p, q, RoundingMode.CEILING)
// private long ceilDivide(long p, long q) {
// long result = p / q;
// if (p % q != 0) {
// result++;
// }
// return result;
// }
//
// /**
// * Returns <tt>log<sub>base</sub>value</tt>.
// */
// private double log(double base, double value) {
// return Math.log(value) / Math.log(base);
// }
//
// }