/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.aliyun.odps.mapred.bridge.streaming;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.aliyun.odps.Column;
import com.aliyun.odps.OdpsType;
import com.aliyun.odps.conf.Configuration;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.io.Text;
import com.aliyun.odps.mapred.Mapper;
import com.aliyun.odps.mapred.Reducer;
import com.aliyun.odps.mapred.TaskContext;
import com.aliyun.odps.mapred.bridge.streaming.io.InputWriter;
import com.aliyun.odps.mapred.bridge.streaming.io.LineReader;
import com.aliyun.odps.mapred.bridge.streaming.io.OutputReader;
import com.aliyun.odps.mapred.bridge.streaming.io.RecordOutputReader;
import com.aliyun.odps.mapred.bridge.streaming.io.TextInputWriter;
import com.aliyun.odps.mapred.bridge.streaming.io.TextOutputReader;
import com.aliyun.odps.mapred.conf.BridgeJobConf;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.utils.ReflectionUtils;
/**
* Shared functionality for PipeMapper, PipeReducer.
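* It spawns the streaming command as a subprocess, writes input records to its
* stdin, and reads output key/value pairs from its stdout and reporter updates
* from its stderr.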
*/
public abstract class PipeMapRed {
protected static final Log LOG = LogFactory.getLog(PipeMapRed.class.getName());
/**
* Returns the Configuration.
*/
public Configuration getConfiguration() {
return job_;
}
/**
* Returns the DataOutput to which input for the client process is written.
*/
public DataOutput getClientOutput() {
return clientOut_;
}
/**
* Returns the DataInput from which the client process's output is read.
*/
public DataInput getClientInput() {
return clientIn_;
}
/**
* Returns the input separator to be used.
*/
public abstract byte[] getInputSeparator();
/**
* Returns the field separator to be used.
*/
public abstract byte[] getFieldSeparator();
/**
* Returns the number of key fields.
*/
public abstract int getNumOfKeyFields();
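/**
* Returns whether the streaming subprocess pipe should be set up at all.
*/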
abstract boolean getDoPipe();
/**
* Returns the command to be spawned as a subprocess.
* Mapper/Reducer operations will delegate to it.
*/
abstract String getPipeCommand(JobConf job);
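/**
* Returns the number of output fields, i.e. the width of the output schema.
*/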
public int getNumOfOutputFields() {
return job_.getOutputSchema().length;
}
final static int OUTSIDE = 1;
final static int SINGLEQ = 2;
final static int DOUBLEQ = 3;
private final static int BUFFER_SIZE = 128 * 1024;
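/**
* Splits a command line into arguments on unquoted spaces; single and double
* quotes group words and are stripped from the result. For example,
* splitArgs("/bin/cat 'a b'") returns {"/bin/cat", "a b"}.
*/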
static String[] splitArgs(String args) {
ArrayList<String> argList = new ArrayList<String>();
char[] ch = args.toCharArray();
int clen = ch.length;
int state = OUTSIDE;
int argstart = 0;
for (int c = 0; c <= clen; c++) {
boolean last = (c == clen);
int lastState = state;
boolean endToken = false;
if (!last) {
if (ch[c] == '\'') {
if (state == OUTSIDE) {
state = SINGLEQ;
} else if (state == SINGLEQ) {
state = OUTSIDE;
}
endToken = (state != lastState);
} else if (ch[c] == '"') {
if (state == OUTSIDE) {
state = DOUBLEQ;
} else if (state == DOUBLEQ) {
state = OUTSIDE;
}
endToken = (state != lastState);
} else if (ch[c] == ' ') {
if (state == OUTSIDE) {
endToken = true;
}
}
}
if (last || endToken) {
if (c == argstart) {
// unquoted space
} else {
String a;
a = args.substring(argstart, c);
argList.add(a);
}
argstart = c + 1;
lastState = state;
}
}
return argList.toArray(new String[0]);
}
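/**
* Configures the pipe: resolves the reader/writer classes, builds the child
* process environment from the job configuration, and spawns the streaming
* command as a subprocess with buffered stdin/stdout/stderr streams.
*/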
public void configure(JobConf job) {
try {
String argv = getPipeCommand(job);
if (argv == null) {
throw new RuntimeException("streaming pipe cmd is null");
}
joinDelay_ = job.getLong("stream.joindelay.milli", 0);
job_ = new BridgeJobConf(job);
mapInputWriterClass_ =
job_.getClass("stream.map.input.writer.class",
TextInputWriter.class, InputWriter.class);
mapOutputReaderClass_ =
job_.getClass("stream.map.output.reader.class",
TextOutputReader.class, OutputReader.class);
reduceInputWriterClass_ =
job_.getClass("stream.reduce.input.writer.class",
TextInputWriter.class, InputWriter.class);
reduceOutputReaderClass_ =
job_.getClass("stream.reduce.output.reader.class",
TextOutputReader.class, OutputReader.class);
nonZeroExitIsFailure_ = job_.getBoolean("stream.non.zero.exit.is.failure", true);
doPipe_ = getDoPipe();
if (!doPipe_) {
return;
}
setStreamJobDetails(job);
String[] argvSplit = splitArgs(argv);
String prog = argvSplit[0];
//File currentDir = new File(".").getAbsoluteFile();
//if (new File(prog).isAbsolute()) {
// // we don't own it. Hope it is executable
//} else {
// FileUtil.chmod(new File(currentDir, prog).toString(), "a+x");
//}
//
// argvSplit[0]:
// An absolute path should be a preexisting valid path on all TaskTrackers
// A relative path is converted into an absolute pathname by looking
// up the PATH env variable. If it still fails, look it up in the
// tasktracker's local working directory
//
//if (!new File(argvSplit[0]).isAbsolute()) {
// PathFinder finder = new PathFinder("PATH");
// finder.prependPathComponent(currentDir.toString());
// File f = finder.getAbsolutePath(argvSplit[0]);
// if (f != null) {
// argvSplit[0] = f.getAbsolutePath();
// }
// f = null;
//}
LOG.info("PipeMapRed exec " + Arrays.asList(argvSplit));
Properties childEnv = new Properties();
addJobConfToEnvironment(job_, childEnv);
addEnvironment(childEnv, job_.get("stream.addenvironment"));
// add TMPDIR environment variable with the value of java.io.tmpdir
// FIXME
envPut(childEnv, "TMPDIR", System.getProperty("java.io.tmpdir"));
envPut(childEnv, "TABLE_RESOURCE_READER", "../table_resource_reader");
final Map<String, String> envMap = new HashMap<String, String>();
for (String key : childEnv.stringPropertyNames()) {
envMap.put(key, childEnv.getProperty(key));
}
// Start the process
sim = StreamSecurityHelper.startChildProcess(argvSplit, envMap);
clientOut_ = new DataOutputStream(new BufferedOutputStream(
sim.getOutputStream(),
BUFFER_SIZE));
clientIn_ = new DataInputStream(new BufferedInputStream(
sim.getInputStream(),
BUFFER_SIZE));
clientErr_ = new DataInputStream(new BufferedInputStream(sim.getErrorStream()));
startTime_ = System.currentTimeMillis();
} catch (IOException e) {
LOG.error("configuration exception", e);
throw new RuntimeException("configuration exception", e);
//} catch (InterruptedException e) {
// LOG.error("configuration exception", e);
// throw new RuntimeException("configuration exception", e);
}
}
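/**
* Picks up optional streaming settings from the job configuration.
*/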
void setStreamJobDetails(JobConf job) {
String s = job.get("stream.minRecWrittenToEnableSkip_");
if (s != null) {
minRecWrittenToEnableSkip_ = Long.parseLong(s);
LOG.info("JobConf set minRecWrittenToEnableSkip_ ="
+ minRecWrittenToEnableSkip_);
}
}
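/**
* Exports every job configuration entry (with variable expansion) into the
* child environment, mapping non-alphanumeric characters in names to '_',
* e.g. stream.map.input.writer.class becomes stream_map_input_writer_class.
*/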
void addJobConfToEnvironment(JobConf conf, Properties env) {
Iterator it = conf.iterator();
while (it.hasNext()) {
Map.Entry en = (Map.Entry) it.next();
String name = (String) en.getKey();
//String value = (String)en.getValue(); // does not apply variable expansion
String value = conf.get(name); // does variable expansion
name = safeEnvVarName(name);
envPut(env, name, value);
}
}
String safeEnvVarName(String var) {
StringBuilder safe = new StringBuilder();
int len = var.length();
for (int i = 0; i < len; i++) {
char c = var.charAt(i);
char s;
if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
s = c;
} else {
s = '_';
}
safe.append(s);
}
return safe.toString();
}
void addEnvironment(Properties env, String nameVals) {
// encoding "a=b c=d" from StreamJob
if (nameVals == null) {
return;
}
String[] nv = nameVals.split(" ");
for (int i = 0; i < nv.length; i++) {
String[] pair = nv[i].split("=", 2);
if (pair.length != 2) {
LOG.info("Skip env entry:" + nv[i]);
} else {
envPut(env, pair[0], pair[1]);
}
}
}
void envPut(Properties env, String name, String value) {
if (LOG.isDebugEnabled()) {
LOG.debug("Add env entry:" + name + "=" + value);
}
env.put(name, value);
}
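/**
* Creates the input writer and output reader and starts the threads that
* drain the subprocess stdout (MROutputThread) and stderr (MRErrorThread).
*/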
void startOutputThreads(TaskContext context)
throws IOException {
inWriter_ = createInputWriter();
outReader_ = createOutputReader();
outThread_ = new MROutputThread(outReader_, context);
outThread_.start();
errThread_ = new MRErrorThread();
errThread_.setReporter(context);
errThread_.start();
}
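/**
* Waits for the subprocess to exit and for the output/error threads to
* finish, then propagates any failure they recorded.
*/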
void waitOutputThreads() throws IOException {
try {
if (outThread_ == null) {
// This happens only when the reducer has empty input, so reduce() is
// never called in this task. If the reducer still produces output
// (which is very uncommon), we don't write it anywhere; we only consume
// it so that the subprocess does not block forever on a full pipe.
startOutputThreads(null);
}
int exitVal = sim.waitFor();
System.err.println("Streaming subprocess exited with code " + exitVal);
if (outThread_ != null) {
outThread_.join(joinDelay_);
}
if (errThread_ != null) {
errThread_.join(joinDelay_);
}
if (outerrThreadsThrowable != null) {
throw new RuntimeException(outerrThreadsThrowable);
}
if (exitVal != 0 && nonZeroExitIsFailure_) {
throw new RuntimeException("Streaming subprocess failed with code " + exitVal +
", see the stderr of the failed worker for more information.");
}
} catch (InterruptedException e) {
//ignore
}
}
abstract InputWriter createInputWriter() throws IOException;
InputWriter createInputWriter(Class<? extends InputWriter> inputWriterClass)
throws IOException {
InputWriter inputWriter =
ReflectionUtils.newInstance(inputWriterClass, job_);
inputWriter.initialize(this);
return inputWriter;
}
abstract OutputReader createOutputReader() throws IOException;
OutputReader createOutputReader(Class<? extends OutputReader> outputReaderClass)
throws IOException {
OutputReader outputReader =
ReflectionUtils.newInstance(outputReaderClass, job_);
outputReader.initialize(this);
return outputReader;
}
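/**
* Reads key/value pairs from the subprocess stdout and writes them to the
* task output: to the final table record when this is a reducer (or a
* map-only job), otherwise to the intermediate shuffle key/value records.
*/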
class MROutputThread extends Thread {
MROutputThread(OutputReader outReader, TaskContext context) {
setDaemon(true);
this.outReader = outReader;
this.context = context;
if (this.context != null) {
badDataBehavior = context.getJobConf().get("stream.bad.data.behavior", "strict");
ignoreBadCast = !badDataBehavior.equals("strict");
}
}
public void run() {
try {
// 3/4 Tool to framework: read the tool's output back in
while (outReader.readKeyValue()) {
Object key = outReader.getCurrentKey();
Object value = outReader.getCurrentValue();
// XXX dummy read and discard, as explained in waitOutputThreads()
if (context == null || key == null) {
continue;
}
if ((!context.getTaskID().isMap()) || context.getNumReduceTasks() == 0) {
// TODO split key-value to fields
// FIXME key-value to table record mapping
// TODO convert string fields to output schema types
Record record = context.createOutputRecord();
if (outReader instanceof RecordOutputReader) {
Text[] fields = (Text[]) value;
fillOutputRecord(record, fields, ignoreBadCast);
} else {
record.setString(0, key.toString());
record.setString(1, value.toString());
}
if (context.getTaskID().isMap()) {
((Mapper.TaskContext) context).write(record);
} else {
((Reducer.TaskContext) context).write(record);
}
} else {
// FIXME shuffle key and value are each just a single Text field for now.
Record keyRecord = context.createMapOutputKeyRecord();
keyRecord.setString(0, key.toString());
Record valueRecord = context.createMapOutputValueRecord();
valueRecord.setString(0, value.toString());
((Mapper.TaskContext) context).write(keyRecord, valueRecord);
}
numRecWritten_++;
long now = System.currentTimeMillis();
if (now - lastStdoutReport > reporterOutDelay_) {
lastStdoutReport = now;
String hline = "Records R/W=" + numRecRead_ + "/" + numRecWritten_;
if (!processProvidedStatus_) {
//reporter.setStatus(hline);
} else {
context.progress();
}
LOG.info(hline);
}
}
} catch (Throwable th) {
outerrThreadsThrowable = th;
LOG.warn(th);
} finally {
try {
if (clientIn_ != null) {
clientIn_.close();
clientIn_ = null;
}
} catch (IOException io) {
LOG.info(io);
}
}
}
/**
* Fill Text fields into the table output record.
* Type conversion rules:
* STRING: set the Text directly, no NULL processing
* ALL OTHER: an empty field becomes NULL
* BIGINT, DOUBLE, BOOLEAN: parsed via the Java type; a bad value becomes
* NULL if bad casts are ignored, otherwise it raises a RuntimeException
* DATETIME, DECIMAL: not supported, rejected at runtime
*/
void fillOutputRecord(Record record, Text[] fields, boolean ignoreBadCast) {
if (fields.length != record.getColumnCount()) {
// should never happen; already checked in RecordOutputReader
throw new RuntimeException("output record does not match output schema");
}
Column[] columns = record.getColumns();
for (int i = 0; i < fields.length; i++) {
Column col = columns[i];
Text field = fields[i];
Object val = null;
if (col.getType().equals(OdpsType.STRING)) {
val = field;
} else if (field.getLength() == 0) {
val = null;
} else {
String fieldStr = null;
try {
fieldStr = field.toString();
switch (col.getType()) {
case BIGINT:
val = Long.valueOf(fieldStr);
break;
case DOUBLE:
val = Double.valueOf(fieldStr);
break;
case BOOLEAN:
val = Boolean.valueOf(fieldStr);
break;
default:
// FIXME should have checked at client side?
throw new RuntimeException(
"output column " + col.getName() + "'s type " + col.getType()
+ " not supported by streaming job");
}
} catch (Exception e) {
if (!ignoreBadCast) {
if (fieldStr == null) {
// invalid utf-8
// TODO provide escaped field content with error msg?
throw new RuntimeException(
"Failed to decode streaming field as UTF-8 for column " + col.getName());
}
throw new RuntimeException(
"Invalid streaming field value for " + col.getType() + " column " + col.getName()
+ ":" + fieldStr, e);
}
}
}
record.set(i, val);
}
}
OutputReader outReader = null;
TaskContext context = null;
long lastStdoutReport = 0;
String badDataBehavior;
boolean ignoreBadCast;
}
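/**
* Drains the subprocess stderr. Lines starting with the reporter prefix
* (default "reporter:") are parsed as framework updates, e.g.:
*   reporter:counter:MyGroup,MyCounter,1
*   reporter:status:half done
* All other lines are echoed to this task's stderr.
*/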
class MRErrorThread extends Thread {
public MRErrorThread() {
this.reporterPrefix = job_.get("stream.stderr.reporter.prefix", "reporter:");
this.counterPrefix = reporterPrefix + "counter:";
this.statusPrefix = reporterPrefix + "status:";
setDaemon(true);
}
public void setReporter(TaskContext reporter) {
this.reporter = reporter;
}
public void run() {
Text line = new Text();
LineReader lineReader = null;
try {
lineReader = new LineReader((InputStream) clientErr_, job_);
while (lineReader.readLine(line) > 0) {
String lineStr = line.toString();
if (matchesReporter(lineStr)) {
if (reporter != null) {
if (matchesCounter(lineStr)) {
incrCounter(lineStr);
} else if (matchesStatus(lineStr)) {
processProvidedStatus_ = true;
setStatus(lineStr);
} else {
LOG.warn("Cannot parse reporter line: " + lineStr);
}
}
} else {
System.err.println(lineStr);
}
long now = System.currentTimeMillis();
if (reporter != null && now - lastStderrReport > reporterErrDelay_) {
lastStderrReport = now;
reporter.progress();
}
line.clear();
}
if (lineReader != null) {
lineReader.close();
}
if (clientErr_ != null) {
clientErr_.close();
clientErr_ = null;
LOG.info("MRErrorThread done");
}
} catch (Throwable th) {
outerrThreadsThrowable = th;
LOG.warn(th);
try {
if (lineReader != null) {
lineReader.close();
}
if (clientErr_ != null) {
clientErr_.close();
clientErr_ = null;
}
} catch (IOException io) {
LOG.info(io);
}
}
}
private boolean matchesReporter(String line) {
return line.startsWith(reporterPrefix);
}
private boolean matchesCounter(String line) {
return line.startsWith(counterPrefix);
}
private boolean matchesStatus(String line) {
return line.startsWith(statusPrefix);
}
private void incrCounter(String line) {
String trimmedLine = line.substring(counterPrefix.length()).trim();
String[] columns = trimmedLine.split(",");
if (columns.length == 3) {
try {
reporter.getCounter(columns[0], columns[1]).increment(
Long.parseLong(columns[2]));
} catch (NumberFormatException e) {
LOG.warn("Cannot parse counter increment '" + columns[2] +
"' from line: " + line);
}
} else {
LOG.warn("Cannot parse counter line: " + line);
}
}
private void setStatus(String line) {
//reporter.setStatus(line.substring(statusPrefix.length()).trim());
}
long lastStderrReport = 0;
volatile TaskContext reporter;
private final String reporterPrefix;
private final String counterPrefix;
private final String statusPrefix;
}
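/**
* Shuts down the pipe: flushes and closes the subprocess stdin, waits for
* the output/error threads, then destroys the subprocess.
*/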
public void mapRedFinished() {
try {
if (!doPipe_) {
LOG.info("mapRedFinished");
return;
}
if (clientOut_ != null) {
try {
clientOut_.flush();
clientOut_.close();
} catch (IOException io) {
LOG.warn(io);
}
}
try {
waitOutputThreads();
} catch (IOException io) {
LOG.warn(io);
}
if (sim != null) {
sim.destroy();
}
LOG.info("mapRedFinished");
} catch (RuntimeException e) {
LOG.info("PipeMapRed failed!", e);
throw e;
}
}
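/**
* Logs record-count progress at 1, 10, 100, ... records, then every 100000.
*/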
void maybeLogRecord() {
if (numRecRead_ >= nextRecReadLog_) {
String info = numRecInfo();
LOG.info(info);
if (nextRecReadLog_ < 100000) {
nextRecReadLog_ *= 10;
} else {
nextRecReadLog_ += 100000;
}
}
}
public String getContext() {
String s = numRecInfo() + "\n";
if (outThread_ != null) {
s += "last tool output: |" + outReader_.getLastOutput() + "|\n";
}
return s;
}
String envline(String var) {
return var + "=" + StreamUtil.env().get(var) + "\n";
}
String numRecInfo() {
long elapsed = (System.currentTimeMillis() - startTime_) / 1000;
return "R/W/S=" + numRecRead_ + "/" + numRecWritten_ + "/" + numRecSkipped_ + " in:"
+ safeDiv(numRecRead_, elapsed) + " [rec/s]" + " out:" + safeDiv(numRecWritten_, elapsed)
+ " [rec/s]";
}
String safeDiv(long n, long d) {
return (d == 0) ? "NA" : "" + n / d + "=" + n + "/" + d;
}
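// Record counters and timing used for periodic progress logging.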
long startTime_;
long numRecRead_ = 0;
long numRecWritten_ = 0;
long numRecSkipped_ = 0;
long nextRecReadLog_ = 1;
long minRecWrittenToEnableSkip_ = Long.MAX_VALUE;
long reporterOutDelay_ = 10 * 1000L;
long reporterErrDelay_ = 10 * 1000L;
long joinDelay_;
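// Pipe configuration and subprocess plumbing.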
BridgeJobConf job_;
boolean doPipe_;
Class<? extends InputWriter> mapInputWriterClass_;
Class<? extends OutputReader> mapOutputReaderClass_;
Class<? extends InputWriter> reduceInputWriterClass_;
Class<? extends OutputReader> reduceOutputReaderClass_;
boolean nonZeroExitIsFailure_;
Process sim;
InputWriter inWriter_;
OutputReader outReader_;
MROutputThread outThread_;
MRErrorThread errThread_;
DataOutputStream clientOut_;
DataInputStream clientErr_;
DataInputStream clientIn_;
// set in PipeMapper/PipeReducer subclasses
int numExceptions_;
protected volatile Throwable outerrThreadsThrowable;
volatile boolean processProvidedStatus_ = false;
}