package com.alibaba.datax.plugin.reader.hdfsreader;

import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordSender;
import com.alibaba.datax.common.spi.Reader;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.unstructuredstorage.reader.UnstructuredStorageReaderUtil;
import org.apache.commons.io.Charsets;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

public class HdfsReader extends Reader {
    /**
     * Methods on Job run exactly once; methods on Task are run by the framework
     * on multiple Task threads in parallel.
     * <p/>
     * The overall Reader lifecycle is:
     * <pre>
     * Job:  init --> prepare --> split
     *
     * Task: init --> prepare --> startRead --> post --> destroy
     * Task: init --> prepare --> startRead --> post --> destroy
     *
     * Job:  post --> destroy
     * </pre>
     */
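
    /*
     * For orientation, a minimal sketch of a reader configuration that would
     * pass validate() below. The concrete key strings are assumed to match the
     * constants in this plugin's Key class ("defaultFS", "path", "fileType",
     * "encoding", "column"); all values are purely illustrative:
     *
     *   {
     *     "defaultFS": "hdfs://localhost:9000",
     *     "path": ["/user/hive/warehouse/mytable"],
     *     "fileType": "orc",
     *     "encoding": "UTF-8",
     *     "column": [
     *       {"index": 0, "type": "long"},
     *       {"index": 1, "type": "string"}
     *     ]
     *   }
     */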

    public static class Job extends Reader.Job {
        private static final Logger LOG = LoggerFactory.getLogger(Job.class);

        private Configuration readerOriginConfig = null;
        private String encoding = null;
        private HashSet<String> sourceFiles;
        private String specifiedFileType = null;
        private DFSUtil dfsUtil = null;
        private List<String> path = null;

        @Override
        public void init() {
            LOG.info("init() begin...");
            this.readerOriginConfig = super.getPluginJobConf();
            this.validate();
            dfsUtil = new DFSUtil(this.readerOriginConfig);
            LOG.info("init() ok and end...");
        }

        public void validate() {
            this.readerOriginConfig.getNecessaryValue(Key.DEFAULT_FS,
                    HdfsReaderErrorCode.DEFAULT_FS_NOT_FIND_ERROR);

            // path check
            String pathInString = this.readerOriginConfig.getNecessaryValue(Key.PATH, HdfsReaderErrorCode.REQUIRED_VALUE);
            if (!pathInString.startsWith("[") && !pathInString.endsWith("]")) {
                // a single path configured as a plain string
                path = new ArrayList<String>();
                path.add(pathInString);
            } else {
                // a JSON array of paths
                path = this.readerOriginConfig.getList(Key.PATH, String.class);
                if (null == path || path.isEmpty()) {
                    throw DataXException.asDataXException(HdfsReaderErrorCode.REQUIRED_VALUE,
                            "You must specify the source directories or files to read.");
                }
                for (String eachPath : path) {
                    if (!eachPath.startsWith("/")) {
                        String message = String.format("Please check the parameter path: [%s]; it must be an absolute path.", eachPath);
                        LOG.error(message);
                        throw DataXException.asDataXException(HdfsReaderErrorCode.ILLEGAL_VALUE, message);
                    }
                }
            }

            specifiedFileType = this.readerOriginConfig.getNecessaryValue(Key.FILETYPE, HdfsReaderErrorCode.REQUIRED_VALUE);
            if (!specifiedFileType.equalsIgnoreCase(Constant.ORC) &&
                    !specifiedFileType.equalsIgnoreCase(Constant.TEXT) &&
                    !specifiedFileType.equalsIgnoreCase(Constant.CSV) &&
                    !specifiedFileType.equalsIgnoreCase(Constant.SEQ) &&
                    !specifiedFileType.equalsIgnoreCase(Constant.RC)) {
                String message = "HdfsReader currently supports five file formats: ORC, TEXT, CSV, SEQUENCE and RC. " +
                        "Please set the fileType option to ORC, TEXT, CSV, SEQUENCE or RC.";
                throw DataXException.asDataXException(HdfsReaderErrorCode.FILE_TYPE_ERROR, message);
            }

            encoding = this.readerOriginConfig.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.ENCODING, "UTF-8");
            try {
                Charsets.toCharset(encoding);
            } catch (UnsupportedCharsetException uce) {
                throw DataXException.asDataXException(
                        HdfsReaderErrorCode.ILLEGAL_VALUE,
                        String.format("Unsupported encoding: [%s]", encoding), uce);
            } catch (Exception e) {
                throw DataXException.asDataXException(
                        HdfsReaderErrorCode.ILLEGAL_VALUE,
                        String.format("Configuration error: %s", e.getMessage()), e);
            }

            // check Kerberos
            Boolean haveKerberos = this.readerOriginConfig.getBool(Key.HAVE_KERBEROS, false);
            if (haveKerberos) {
                this.readerOriginConfig.getNecessaryValue(Key.KERBEROS_KEYTAB_FILE_PATH, HdfsReaderErrorCode.REQUIRED_VALUE);
                this.readerOriginConfig.getNecessaryValue(Key.KERBEROS_PRINCIPAL, HdfsReaderErrorCode.REQUIRED_VALUE);
            }

            // validate the columns
            validateColumns();

            if (this.specifiedFileType.equalsIgnoreCase(Constant.CSV)) {
                // validate compress and CSV reader settings
                UnstructuredStorageReaderUtil.validateCompress(this.readerOriginConfig);
                UnstructuredStorageReaderUtil.validateCsvReaderConfig(this.readerOriginConfig);
            }
        }
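
        /*
         * A sketch of the two path spellings validate() accepts, assuming the
         * path key holds either a plain string or a JSON array (values
         * illustrative):
         *
         *   "path": "/user/data/2024"                  // single absolute path
         *   "path": ["/user/data/a", "/user/data/b"]   // list of absolute paths
         *
         * Relative paths are rejected; every entry must start with "/".
         */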

        private void validateColumns() {
            // If column is configured as ["*"], reset it to an empty list,
            // which downstream code treats as "read all columns".
            List<Configuration> column = this.readerOriginConfig
                    .getListConfiguration(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
            if (null != column
                    && 1 == column.size()
                    && ("\"*\"".equals(column.get(0).toString())
                    || "'*'".equals(column.get(0).toString()))) {
                readerOriginConfig
                        .set(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN, new ArrayList<String>());
            } else {
                // column: 1. index type 2. value type 3. when type is Date, it may also carry a format
                List<Configuration> columns = this.readerOriginConfig
                        .getListConfiguration(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
                if (null == columns || columns.isEmpty()) {
                    throw DataXException.asDataXException(
                            HdfsReaderErrorCode.CONFIG_INVALID_EXCEPTION,
                            "You must specify columns.");
                }
                for (Configuration eachColumnConf : columns) {
                    eachColumnConf.getNecessaryValue(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.TYPE, HdfsReaderErrorCode.REQUIRED_VALUE);
                    Integer columnIndex = eachColumnConf.getInt(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.INDEX);
                    String columnValue = eachColumnConf.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.VALUE);
                    if (null == columnIndex && null == columnValue) {
                        throw DataXException.asDataXException(
                                HdfsReaderErrorCode.NO_INDEX_VALUE,
                                "Since you configured type, you must also configure either index or value.");
                    }
                    if (null != columnIndex && null != columnValue) {
                        throw DataXException.asDataXException(
                                HdfsReaderErrorCode.MIXED_INDEX_VALUE,
                                "You configured both index and value; each column may use only one of them.");
                    }
                }
            }
        }
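
        /*
         * The column shapes validateColumns() accepts, as a hedged sketch
         * (type names and the "format" key are illustrative, inferred from
         * the comment above):
         *
         *   {"index": 0, "type": "long"}                          // read field 0 of each record
         *   {"value": "static-text", "type": "string"}            // emit a constant value
         *   {"index": 2, "type": "date", "format": "yyyy-MM-dd"}  // a date column may carry a format
         *
         * index and value are mutually exclusive per column; "column": ["*"]
         * selects every field.
         */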

        @Override
        public void prepare() {
            LOG.info("prepare(), start to getAllFiles...");
            this.sourceFiles = dfsUtil.getAllFiles(path, specifiedFileType);
            LOG.info(String.format("Number of files to read: [%s], list: [%s]",
                    this.sourceFiles.size(),
                    StringUtils.join(this.sourceFiles, ",")));
        }

        @Override
        public List<Configuration> split(int adviceNumber) {
            LOG.info("split() begin...");
            List<Configuration> readerSplitConfigs = new ArrayList<Configuration>();
            // warn: each slice reads exactly one file, so the split count is
            // the file count rather than the framework's adviceNumber.
            // int splitNumber = adviceNumber;
            int splitNumber = this.sourceFiles.size();
            if (0 == splitNumber) {
                throw DataXException.asDataXException(HdfsReaderErrorCode.EMPTY_DIR_EXCEPTION,
                        String.format("No files found to read; please check your path setting: %s", this.readerOriginConfig.getString(Key.PATH)));
            }
            List<List<String>> splitedSourceFiles = this.splitSourceFiles(new ArrayList<String>(this.sourceFiles), splitNumber);
            for (List<String> files : splitedSourceFiles) {
                Configuration splitedConfig = this.readerOriginConfig.clone();
                splitedConfig.set(Constant.SOURCE_FILES, files);
                readerSplitConfigs.add(splitedConfig);
            }
            return readerSplitConfigs;
        }
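
        /*
         * Example (file names illustrative): with sourceFiles =
         * {/a/1.orc, /a/2.orc, /a/3.orc}, split() returns three cloned
         * configurations, each with Constant.SOURCE_FILES set to a
         * single-element list, so every Task reads exactly one file.
         */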

        private <T> List<List<T>> splitSourceFiles(final List<T> sourceList, int adviceNumber) {
            List<List<T>> splitedList = new ArrayList<List<T>>();
            int averageLength = sourceList.size() / adviceNumber;
            averageLength = averageLength == 0 ? 1 : averageLength;

            for (int begin = 0, end = 0; begin < sourceList.size(); begin = end) {
                end = begin + averageLength;
                if (end > sourceList.size()) {
                    end = sourceList.size();
                }
                splitedList.add(sourceList.subList(begin, end));
            }
            return splitedList;
        }
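
        /*
         * Partitioning sketch: for a 5-element list and adviceNumber = 2,
         * averageLength = 5 / 2 = 2, producing sublists [0,2), [2,4), [4,5).
         * When adviceNumber equals the list size (as in split() above),
         * averageLength is 1 and every sublist holds a single element.
         */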

        @Override
        public void post() {
        }

        @Override
        public void destroy() {
        }
    }

    public static class Task extends Reader.Task {
        private static final Logger LOG = LoggerFactory.getLogger(Task.class);

        private Configuration taskConfig;
        private List<String> sourceFiles;
        private String specifiedFileType;
        private String encoding;
        private DFSUtil dfsUtil = null;
        private int bufferSize;

        @Override
        public void init() {
            this.taskConfig = super.getPluginJobConf();
            this.sourceFiles = this.taskConfig.getList(Constant.SOURCE_FILES, String.class);
            this.specifiedFileType = this.taskConfig.getNecessaryValue(Key.FILETYPE, HdfsReaderErrorCode.REQUIRED_VALUE);
            this.encoding = this.taskConfig.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.ENCODING, "UTF-8");
            this.dfsUtil = new DFSUtil(this.taskConfig);
            this.bufferSize = this.taskConfig.getInt(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.BUFFER_SIZE,
                    com.alibaba.datax.plugin.unstructuredstorage.reader.Constant.DEFAULT_BUFFER_SIZE);
        }

        @Override
        public void prepare() {
        }

        @Override
        public void startRead(RecordSender recordSender) {
            LOG.info("read start");
            for (String sourceFile : this.sourceFiles) {
                LOG.info(String.format("reading file : [%s]", sourceFile));

                if (specifiedFileType.equalsIgnoreCase(Constant.TEXT)
                        || specifiedFileType.equalsIgnoreCase(Constant.CSV)) {
                    InputStream inputStream = dfsUtil.getInputStream(sourceFile);
                    UnstructuredStorageReaderUtil.readFromStream(inputStream, sourceFile, this.taskConfig,
                            recordSender, this.getTaskPluginCollector());
                } else if (specifiedFileType.equalsIgnoreCase(Constant.ORC)) {
                    dfsUtil.orcFileStartRead(sourceFile, this.taskConfig, recordSender, this.getTaskPluginCollector());
                } else if (specifiedFileType.equalsIgnoreCase(Constant.SEQ)) {
                    dfsUtil.sequenceFileStartRead(sourceFile, this.taskConfig, recordSender, this.getTaskPluginCollector());
                } else if (specifiedFileType.equalsIgnoreCase(Constant.RC)) {
                    dfsUtil.rcFileStartRead(sourceFile, this.taskConfig, recordSender, this.getTaskPluginCollector());
                } else {
                    String message = "HdfsReader currently supports five file formats: ORC, TEXT, CSV, SEQUENCE and RC. " +
                            "Please set the fileType option to ORC, TEXT, CSV, SEQUENCE or RC.";
                    throw DataXException.asDataXException(HdfsReaderErrorCode.FILE_TYPE_UNSUPPORT, message);
                }

                if (recordSender != null) {
                    recordSender.flush();
                }
            }
            LOG.info("end read source files...");
        }

        @Override
        public void post() {
        }

        @Override
        public void destroy() {
        }
    }
}