/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.hadoop;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileInputStream;
import java.io.InputStream;
public class SolrOutputFormat<K, V> extends FileOutputFormat<K, V> {

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * The parameter used to pass the solr config zip file information. This will
   * be the hdfs path to the configuration zip file
   */
  public static final String SETUP_OK = "solr.output.format.setup";

  /** The key used to pass the zip file name through the configuration. */
  public static final String ZIP_NAME = "solr.zip.name";

  /**
   * The base name of the zip file containing the configuration information.
   * This file is passed via the distributed cache using a unique name, obtained
   * via {@link #getZipName(Configuration jobConf)}.
   */
  public static final String ZIP_FILE_BASE_NAME = "solr.zip";

  /**
   * The key used to pass the boolean configuration parameter that instructs for
   * regular or zip file output
   */
  public static final String OUTPUT_ZIP_FILE = "solr.output.zip.format";

  static int defaultSolrWriterThreadCount = 0;

  public static final String SOLR_WRITER_THREAD_COUNT = "solr.record.writer.num.threads";

  static int defaultSolrWriterQueueSize = 1;

  public static final String SOLR_WRITER_QUEUE_SIZE = "solr.record.writer.max.queues.size";

  static int defaultSolrBatchSize = 20;

  public static final String SOLR_RECORD_WRITER_BATCH_SIZE = "solr.record.writer.batch.size";

  public static final String SOLR_RECORD_WRITER_MAX_SEGMENTS = "solr.record.writer.maxSegments";

  /** Return the configuration key under which the setup marker is stored. */
  public static String getSetupOk() {
    return SETUP_OK;
  }

  /** Set the number of threads used for index writing. */
  public static void setSolrWriterThreadCount(int count, Configuration conf) {
    conf.setInt(SOLR_WRITER_THREAD_COUNT, count);
  }

  /** Get the number of threads used for index writing. */
  public static int getSolrWriterThreadCount(Configuration conf) {
    return conf.getInt(SOLR_WRITER_THREAD_COUNT, defaultSolrWriterThreadCount);
  }

  /**
   * Set the maximum size of the queue for documents to be written to the
   * index.
   */
  public static void setSolrWriterQueueSize(int count, Configuration conf) {
    conf.setInt(SOLR_WRITER_QUEUE_SIZE, count);
  }

  /** Return the maximum size for the number of documents pending index writing. */
  public static int getSolrWriterQueueSize(Configuration conf) {
    return conf.getInt(SOLR_WRITER_QUEUE_SIZE, defaultSolrWriterQueueSize);
  }

  /**
   * Return the file name portion of the configuration zip file, from the
   * configuration.
   */
  public static String getZipName(Configuration conf) {
    return conf.get(ZIP_NAME, ZIP_FILE_BASE_NAME);
  }

  /**
   * configure the job to output zip files of the output index, or full
   * directory trees. Zip files are about 1/5th the size of the raw index, and
   * much faster to write, but take more cpu to create.
   *
   * @param output true if should output zip files
   * @param conf to use
   */
  public static void setOutputZipFormat(boolean output, Configuration conf) {
    conf.setBoolean(OUTPUT_ZIP_FILE, output);
  }

  /**
   * return true if the output should be a zip file of the index, rather than
   * the raw index
   *
   * @param conf to use
   * @return true if output zip files is on
   */
  public static boolean isOutputZipFormat(Configuration conf) {
    return conf.getBoolean(OUTPUT_ZIP_FILE, false);
  }

  /** Return the base output name configured for this job. */
  public static String getOutputName(JobContext job) {
    return FileOutputFormat.getOutputName(job);
  }

  /**
   * {@inheritDoc}
   *
   * <p>Additionally verifies that {@link #setupSolrHomeCache(File, Job)} has
   * been run, i.e. the {@link #SETUP_OK} marker is present in the job
   * configuration.
   *
   * @throws IOException if the solr home cache was never set up
   */
  @Override
  public void checkOutputSpecs(JobContext job) throws IOException {
    super.checkOutputSpecs(job);
    if (job.getConfiguration().get(SETUP_OK) == null) {
      throw new IOException("Solr home cache not set up!");
    }
  }

  /**
   * Create a {@link SolrRecordWriter} that writes an embedded Solr index under
   * this task's default work file, using the configured batch size.
   */
  @Override
  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
    Utils.getLogConfigFile(context.getConfiguration());
    Path workDir = getDefaultWorkFile(context, "");
    int batchSize = getBatchSize(context.getConfiguration());
    return new SolrRecordWriter<>(context, workDir, batchSize);
  }

  /**
   * Zip the given solr home directory and register the archive with the job's
   * distributed cache, marking the configuration with {@link #SETUP_OK}.
   *
   * @param solrHomeDir local solr home directory to distribute
   * @param job the job whose configuration and cache are updated
   * @throws IOException if the directory is invalid or the upload fails
   */
  public static void setupSolrHomeCache(File solrHomeDir, Job job) throws IOException {
    File solrHomeZip = createSolrHomeZip(solrHomeDir);
    addSolrConfToDistributedCache(job, solrHomeZip);
  }

  /**
   * Zip the given solr home directory into a temporary file.
   *
   * @param solrHomeDir local solr home directory; must exist and be a directory
   * @return the temporary zip file containing the solr home
   * @throws IOException if {@code solrHomeDir} is missing or not a directory
   */
  public static File createSolrHomeZip(File solrHomeDir) throws IOException {
    return createSolrHomeZip(solrHomeDir, false);
  }

  /**
   * Implementation of {@link #createSolrHomeZip(File)}.
   *
   * @param safeToModify currently unused; retained for source compatibility
   *          with callers of this private overload
   */
  private static File createSolrHomeZip(File solrHomeDir, boolean safeToModify) throws IOException {
    if (solrHomeDir == null || !(solrHomeDir.exists() && solrHomeDir.isDirectory())) {
      throw new IOException("Invalid solr home: " + solrHomeDir);
    }
    File solrHomeZip = File.createTempFile("solr", ".zip");
    createZip(solrHomeDir, solrHomeZip);
    return solrHomeZip;
  }

  /**
   * Upload the solr home zip to HDFS under a unique name and add it to the
   * job's distributed cache. Records the uploaded path under
   * {@link #SETUP_OK} so {@link #checkOutputSpecs(JobContext)} can verify the
   * cache was prepared.
   *
   * @param job the job to configure
   * @param solrHomeZip local zip file produced by {@link #createSolrHomeZip(File)}
   * @throws IOException if the HDFS copy fails
   */
  public static void addSolrConfToDistributedCache(Job job, File solrHomeZip)
      throws IOException {
    // Make a reasonably unique name for the zip file in the distributed cache
    // to avoid collisions if multiple jobs are running.
    String hdfsZipName = UUID.randomUUID().toString() + '.'
        + ZIP_FILE_BASE_NAME;
    Configuration jobConf = job.getConfiguration();
    jobConf.set(ZIP_NAME, hdfsZipName);

    Path zipPath = new Path("/tmp", getZipName(jobConf));
    FileSystem fs = FileSystem.get(jobConf);
    fs.copyFromLocalFile(new Path(solrHomeZip.toString()), zipPath);
    // The '#<name>' fragment controls the symlink name in the task's
    // working directory.
    final URI baseZipUrl = fs.getUri().resolve(
        zipPath.toString() + '#' + getZipName(jobConf));

    DistributedCache.addCacheArchive(baseZipUrl, jobConf);
    LOG.debug("Set Solr distributed cache: {}", Arrays.asList(job.getCacheArchives()));
    LOG.debug("Set zipPath: {}", zipPath);
    // Actually send the path for the configuration zip file
    jobConf.set(SETUP_OK, zipPath.toString());
  }

  /**
   * Zip the allowed configuration directories (see
   * {@link SolrRecordWriter#getAllowedConfigDirectories()}) found under
   * {@code dir} into {@code out}, adding a minimal {@code solr.xml} at the
   * zip root. Any existing {@code out} file is overwritten.
   *
   * @param dir the solr home directory to zip
   * @param out the destination zip file
   * @throws IOException if a required configuration directory is missing or a
   *           file cannot be read or written
   */
  private static void createZip(File dir, File out) throws IOException {
    HashSet<File> files = new HashSet<>();
    // take only conf/ and lib/
    for (String allowedDirectory : SolrRecordWriter
        .getAllowedConfigDirectories()) {
      File configDir = new File(dir, allowedDirectory);
      boolean configDirExists;
      // If the directory does not exist, and is required, bail out
      if (!(configDirExists = configDir.exists())
          && SolrRecordWriter.isRequiredConfigDirectory(allowedDirectory)) {
        throw new IOException(String.format(Locale.ENGLISH,
            "required configuration directory %s is not present in %s",
            allowedDirectory, dir));
      }
      if (!configDirExists) {
        continue;
      }
      listFiles(configDir, files); // Store the files in the existing, allowed
                                   // directory configDir, in the list of files
                                   // to store in the zip file
    }

    Files.deleteIfExists(out.toPath());
    int subst = dir.toString().length();
    // try-with-resources guarantees the zip stream is closed (and thus the
    // archive finalized or the handle released) even if an entry fails.
    try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(out))) {
      for (File f : files) {
        // Entry names are paths relative to the solr home directory.
        ZipEntry ze = new ZipEntry(f.toString().substring(subst));
        zos.putNextEntry(ze);
        Files.copy(f.toPath(), zos);
        zos.flush();
        zos.closeEntry();
      }
      // Add a minimal solr.xml so the zip is a self-contained solr home.
      ZipEntry ze = new ZipEntry("solr.xml");
      zos.putNextEntry(ze);
      zos.write("<solr></solr>".getBytes(StandardCharsets.UTF_8));
      zos.flush();
      zos.closeEntry();
    }
  }

  /**
   * Recursively collect all regular files under {@code dir} into
   * {@code files}. If {@code dir} is itself a regular file it is added
   * directly.
   *
   * @throws IOException if a directory's contents cannot be listed (e.g. due
   *           to an I/O error or insufficient permissions)
   */
  private static void listFiles(File dir, Set<File> files) throws IOException {
    File[] list = dir.listFiles();
    if (list == null) {
      if (dir.isFile()) {
        files.add(dir);
        return;
      }
      // File.listFiles() returns null on I/O error or unreadable directory;
      // fail loudly instead of throwing a NullPointerException below.
      throw new IOException("Unable to list contents of directory: " + dir);
    }
    for (File f : list) {
      if (f.isFile()) {
        files.add(f);
      } else {
        listFiles(f, files);
      }
    }
  }

  /**
   * Return the number of documents sent to Solr per update request, from the
   * configuration (default {@code 20}).
   */
  public static int getBatchSize(Configuration jobConf) {
    return jobConf.getInt(SolrOutputFormat.SOLR_RECORD_WRITER_BATCH_SIZE,
        defaultSolrBatchSize);
  }

  /** Set the number of documents sent to Solr per update request. */
  public static void setBatchSize(int count, Configuration jobConf) {
    jobConf.setInt(SOLR_RECORD_WRITER_BATCH_SIZE, count);
  }

}