/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.solr.hadoop.morphline;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.hadoop.HdfsFileFieldNames;
import org.apache.solr.hadoop.PathParts;
import org.apache.solr.hadoop.Utils;
import org.apache.solr.morphlines.solr.DocumentLoader;
import org.apache.solr.morphlines.solr.SolrLocator;
import org.apache.solr.morphlines.solr.SolrMorphlineContext;
import org.apache.solr.schema.IndexSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.Compiler;
import org.kitesdk.morphline.base.FaultTolerance;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Metrics;
import org.kitesdk.morphline.base.Notifications;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.google.common.annotations.Beta;
import com.google.common.base.Joiner;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

/**
 * Internal helper for {@link MorphlineMapper} and dryRun mode. This API is for *INTERNAL* use only
 * and should not be considered public.
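 * <p>
 * Typical lifecycle, sketched here for illustration only (variable names such as {@code conf},
 * {@code loader}, {@code solrHomeDir} and {@code inputFiles} are placeholders, not part of this
 * API): construct the runner once, call {@code map()} for each input file, then {@code cleanup()}:
 * <pre>
 *   MorphlineMapRunner runner = new MorphlineMapRunner(conf, loader, solrHomeDir);
 *   for (String hdfsFilePath : inputFiles) {
 *     runner.map(hdfsFilePath, conf, null); // a null Context simply skips the MapReduce counters
 *   }
 *   runner.cleanup();
 * </pre>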
*/
@Beta
public final class MorphlineMapRunner {
private MorphlineContext morphlineContext;
private Command morphline;
private IndexSchema schema;
private Map<String, String> commandLineMorphlineHeaders;
private boolean disableFileOpen;
private String morphlineFileAndId;
private final Timer elapsedTime;
public static final String MORPHLINE_FILE_PARAM = "morphlineFile";
public static final String MORPHLINE_ID_PARAM = "morphlineId";
/**
* Morphline variables can be passed from the CLI to the Morphline, e.g.:
* hadoop ... -D morphlineVariable.zkHost=127.0.0.1:2181/solr
*/
public static final String MORPHLINE_VARIABLE_PARAM = "morphlineVariable";
/**
 * Headers, including MIME types, can also be passed explicitly (forced) from the CLI to the morphline, e.g.:
* hadoop ... -D morphlineField._attachment_mimetype=text/csv
*/
public static final String MORPHLINE_FIELD_PREFIX = "morphlineField.";
/**
* Flag to disable reading of file contents if indexing just file metadata is sufficient.
* This improves performance and confidentiality.
*/
public static final String DISABLE_FILE_OPEN = "morphlineDisableFileOpen";
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
MorphlineContext getMorphlineContext() {
return morphlineContext;
}
IndexSchema getSchema() {
return schema;
}

  public MorphlineMapRunner(Configuration configuration, DocumentLoader loader, String solrHomeDir) throws IOException {
if (LOG.isTraceEnabled()) {
LOG.trace("CWD is {}", new File(".").getCanonicalPath());
      TreeMap<String, String> map = new TreeMap<>();
for (Map.Entry<String,String> entry : configuration) {
map.put(entry.getKey(), entry.getValue());
}
LOG.trace("Configuration:\n{}", Joiner.on("\n").join(map.entrySet()));
}
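    // Configure fault tolerance from the job configuration: production mode, whether recoverable
    // exceptions are ignored, and which exception classes count as recoverable.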
FaultTolerance faultTolerance = new FaultTolerance(
configuration.getBoolean(FaultTolerance.IS_PRODUCTION_MODE, false),
configuration.getBoolean(FaultTolerance.IS_IGNORING_RECOVERABLE_EXCEPTIONS, false),
configuration.get(FaultTolerance.RECOVERABLE_EXCEPTION_CLASSES, SolrServerException.class.getName())
);
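    // Build a preliminary morphline context (without a schema yet) so a SolrLocator can be
    // created to resolve the IndexSchema; the context is rebuilt below once the schema is known.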
morphlineContext = new SolrMorphlineContext.Builder()
.setDocumentLoader(loader)
.setExceptionHandler(faultTolerance)
.setMetricRegistry(new MetricRegistry())
.build();
class MySolrLocator extends SolrLocator { // trick to access protected ctor
public MySolrLocator(MorphlineContext ctx) {
super(ctx);
}
}
SolrLocator locator = new MySolrLocator(morphlineContext);
locator.setSolrHomeDir(solrHomeDir);
schema = locator.getIndexSchema();
// rebuild context, now with schema
morphlineContext = new SolrMorphlineContext.Builder()
.setIndexSchema(schema)
.setDocumentLoader(loader)
.setExceptionHandler(faultTolerance)
.setMetricRegistry(morphlineContext.getMetricRegistry())
.build();
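    // Read the morphline file (required) and morphline id (optional) from the job configuration.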
String morphlineFile = configuration.get(MORPHLINE_FILE_PARAM);
String morphlineId = configuration.get(MORPHLINE_ID_PARAM);
if (morphlineFile == null || morphlineFile.trim().length() == 0) {
throw new MorphlineCompilationException("Missing parameter: " + MORPHLINE_FILE_PARAM, null);
}
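    // Collect all -D morphlineVariable.* job parameters; they are passed to the morphline
    // compiler as config overrides.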
    Map<String, String> morphlineVariables = new HashMap<>();
for (Map.Entry<String, String> entry : configuration) {
String variablePrefix = MORPHLINE_VARIABLE_PARAM + ".";
if (entry.getKey().startsWith(variablePrefix)) {
morphlineVariables.put(entry.getKey().substring(variablePrefix.length()), entry.getValue());
}
}
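    // Compile the morphline from the file, applying the collected variable overrides; the
    // compiled command chain is reused for every record processed by this runner.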
Config override = ConfigFactory.parseMap(morphlineVariables);
morphline = new Compiler().compile(new File(morphlineFile), morphlineId, morphlineContext, null, override);
morphlineFileAndId = morphlineFile + "@" + morphlineId;
disableFileOpen = configuration.getBoolean(DISABLE_FILE_OPEN, false);
LOG.debug("disableFileOpen: {}", disableFileOpen);
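    // Collect all -D morphlineField.* job parameters; these headers are later forced onto
    // every record in map().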
    commandLineMorphlineHeaders = new HashMap<>();
for (Map.Entry<String,String> entry : configuration) {
if (entry.getKey().startsWith(MORPHLINE_FIELD_PREFIX)) {
commandLineMorphlineHeaders.put(entry.getKey().substring(MORPHLINE_FIELD_PREFIX.length()), entry.getValue());
}
}
    LOG.debug("Headers, including MIME types, forced from the CLI onto the morphline: {}", commandLineMorphlineHeaders);
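    // Register a timer that measures the elapsed time spent processing each file.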
String metricName = MetricRegistry.name(Utils.getShortClassName(getClass()), Metrics.ELAPSED_TIME);
this.elapsedTime = morphlineContext.getMetricRegistry().timer(metricName);
Notifications.notifyBeginTransaction(morphline);
}

  /**
   * Extracts content from the file at the HDFS path given in the value; the corresponding
   * MapReduce key is ignored.
   */
public void map(String value, Configuration configuration, Context context) throws IOException {
LOG.info("Processing file {}", value);
InputStream in = null;
Record record = null;
Timer.Context timerContext = elapsedTime.time();
try {
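      // Resolve the HDFS path into its constituent parts and build a metadata record; if the
      // file has vanished since job submission, getRecord() returns null and the file is skipped.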
      PathParts parts = new PathParts(value, configuration);
record = getRecord(parts);
if (record == null) {
return; // ignore
}
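      // Force any -D morphlineField.* headers onto the record, overriding the corresponding
      // metadata values.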
for (Map.Entry<String, String> entry : commandLineMorphlineHeaders.entrySet()) {
record.replaceValues(entry.getKey(), entry.getValue());
}
long fileLength = parts.getFileStatus().getLen();
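      // With file opening disabled, stream an empty body so only file metadata gets indexed;
      // otherwise stream the file contents from HDFS.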
if (disableFileOpen) {
in = new ByteArrayInputStream(new byte[0]);
} else {
in = new BufferedInputStream(parts.getFileSystem().open(parts.getUploadPath()));
}
record.put(Fields.ATTACHMENT_BODY, in);
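      // Pipe the record through the morphline command chain; process() returns false if the
      // morphline failed to process the record.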
Notifications.notifyStartSession(morphline);
if (!morphline.process(record)) {
LOG.warn("Morphline {} failed to process record: {}", morphlineFileAndId, record);
}
if (context != null) {
context.getCounter(MorphlineCounters.class.getName(), MorphlineCounters.FILES_READ.toString()).increment(1);
context.getCounter(MorphlineCounters.class.getName(), MorphlineCounters.FILE_BYTES_READ.toString()).increment(fileLength);
}
} catch (Exception e) {
LOG.error("Unable to process file " + value, e);
if (context != null) {
context.getCounter(getClass().getName() + ".errors", e.getClass().getName()).increment(1);
}
morphlineContext.getExceptionHandler().handleException(e, record);
} finally {
timerContext.stop();
if (in != null) {
in.close();
}
}
}

  protected Record getRecord(PathParts parts) {
FileStatus stats;
try {
stats = parts.getFileStatus();
} catch (IOException e) {
stats = null;
}
if (stats == null) {
LOG.warn("Ignoring file that somehow has become unavailable since the job was submitted: {}",
parts.getUploadURL());
return null;
}
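    // Build a record that carries only file metadata; the file body is attached later in map().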
Record headers = new Record();
//headers.put(getSchema().getUniqueKeyField().getName(), parts.getId()); // use HDFS file path as docId if no docId is specified
headers.put(Fields.BASE_ID, parts.getId()); // with sanitizeUniqueKey command, use HDFS file path as docId if no docId is specified
headers.put(Fields.ATTACHMENT_NAME, parts.getName()); // Tika can use the file name in guessing the right MIME type
// enable indexing and storing of file meta data in Solr
headers.put(HdfsFileFieldNames.FILE_UPLOAD_URL, parts.getUploadURL());
headers.put(HdfsFileFieldNames.FILE_DOWNLOAD_URL, parts.getDownloadURL());
headers.put(HdfsFileFieldNames.FILE_SCHEME, parts.getScheme());
headers.put(HdfsFileFieldNames.FILE_HOST, parts.getHost());
headers.put(HdfsFileFieldNames.FILE_PORT, String.valueOf(parts.getPort()));
headers.put(HdfsFileFieldNames.FILE_PATH, parts.getURIPath());
headers.put(HdfsFileFieldNames.FILE_NAME, parts.getName());
headers.put(HdfsFileFieldNames.FILE_LAST_MODIFIED, String.valueOf(stats.getModificationTime())); // FIXME also add in SpoolDirectorySource
headers.put(HdfsFileFieldNames.FILE_LENGTH, String.valueOf(stats.getLen())); // FIXME also add in SpoolDirectorySource
headers.put(HdfsFileFieldNames.FILE_OWNER, stats.getOwner());
headers.put(HdfsFileFieldNames.FILE_GROUP, stats.getGroup());
headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_USER, stats.getPermission().getUserAction().SYMBOL);
headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_GROUP, stats.getPermission().getGroupAction().SYMBOL);
headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_OTHER, stats.getPermission().getOtherAction().SYMBOL);
headers.put(HdfsFileFieldNames.FILE_PERMISSIONS_STICKYBIT, String.valueOf(stats.getPermission().getStickyBit()));
    // TODO: consider adding stats.getAccessTime(), stats.getReplication(), stats.isSymlink(), stats.getBlockSize()
return headers;
}

  public void cleanup() {
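    // Signal the end of the logical transaction and shut the morphline down, so that commands
    // can flush any buffered state.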
Notifications.notifyCommitTransaction(morphline);
Notifications.notifyShutdown(morphline);
}
}