package com.alibaba.datax.plugin.reader.odpsreader;
import com.alibaba.datax.common.element.*;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordSender;
import com.alibaba.datax.plugin.reader.odpsreader.util.OdpsUtil;
import com.aliyun.odps.OdpsType;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.RecordReader;
import com.aliyun.odps.tunnel.TableTunnel;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.text.ParseException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class ReaderProxy {
    private static final Logger LOG = LoggerFactory
            .getLogger(ReaderProxy.class);
    private static boolean IS_DEBUG = LOG.isDebugEnabled();

    /** Maximum read attempts per record (1 initial try + retries) before giving up. */
    private static final int MAX_RETRY_TIMES = 10;
    /** Pause between read retries, in milliseconds. */
    private static final long RETRY_SLEEP_MILLIS = 2000L;

    private RecordSender recordSender;
    private TableTunnel.DownloadSession downloadSession;
    private Map<String, OdpsType> columnTypeMap;
    private List<Pair<String, ColumnType>> parsedColumns;
    private String partition;
    private boolean isPartitionTable;
    private long start;
    private long count;
    private boolean isCompress;

    public ReaderProxy(RecordSender recordSender, TableTunnel.DownloadSession downloadSession,
                       Map<String, OdpsType> columnTypeMap,
                       List<Pair<String, ColumnType>> parsedColumns, String partition,
                       boolean isPartitionTable, long start, long count, boolean isCompress) {
        this.recordSender = recordSender;
        this.downloadSession = downloadSession;
        this.columnTypeMap = columnTypeMap;
        this.parsedColumns = parsedColumns;
        this.partition = partition;
        this.isPartitionTable = isPartitionTable;
        this.start = start;
        this.count = count;
        this.isCompress = isCompress;
    }

    // warn: odps partition columns and normal columns must not share a name,
    // and all column names are case-insensitive
    /**
     * Reads [start, start + count) records from the download session and sends
     * each one to the writer via {@code recordSender}. Transient read failures
     * are retried up to {@link #MAX_RETRY_TIMES} times by reopening the record
     * reader at the current position ({@code start}/{@code count} are advanced
     * per record, so a reopened reader resumes where the failure occurred).
     *
     * @throws DataXException on unrecoverable read/convert failures
     */
    public void doRead() {
        try {
            LOG.info("start={}, count={}", start, count);
            RecordReader recordReader = OdpsUtil.getRecordReader(downloadSession, start, count, isCompress);

            Record odpsRecord;
            Map<String, String> partitionMap = this
                    .parseCurrentPartitionValue();

            int retryTimes = 1;
            while (true) {
                try {
                    odpsRecord = recordReader.read();
                } catch (Exception e) {
                    // Retry transient tunnel read errors by reopening the reader.
                    LOG.warn("warn : odps read exception: {}", e.getMessage());
                    if (retryTimes < MAX_RETRY_TIMES) {
                        try {
                            Thread.sleep(RETRY_SLEEP_MILLIS);
                        } catch (InterruptedException ignored) {
                            // Restore the interrupt flag so callers can observe it.
                            Thread.currentThread().interrupt();
                        }
                        // Fix: reopen through the same retry-aware helper used for the
                        // initial open, instead of calling openRecordReader directly.
                        recordReader = OdpsUtil.getRecordReader(downloadSession, start, count, isCompress);
                        LOG.warn("odps-read-exception, 重试第{}次", retryTimes);
                        retryTimes++;
                        continue;
                    } else {
                        throw DataXException.asDataXException(OdpsReaderErrorCode.ODPS_READ_EXCEPTION, e);
                    }
                }

                // Track the read position so a retry resumes from the right record.
                start++;
                count--;

                if (odpsRecord != null) {
                    com.alibaba.datax.common.element.Record dataXRecord = recordSender
                            .createRecord();
                    // warn: for PARTITION||NORMAL columnTypeMap's key
                    // sets(columnName) is bigger than parsedColumns's left
                    // sets(columnName), always contained
                    for (Pair<String, ColumnType> pair : this.parsedColumns) {
                        String columnName = pair.getLeft();
                        switch (pair.getRight()) {
                            case PARTITION:
                                String partitionColumnValue = this
                                        .getPartitionColumnValue(partitionMap,
                                                columnName);
                                this.odpsColumnToDataXField(odpsRecord, dataXRecord,
                                        this.columnTypeMap.get(columnName),
                                        partitionColumnValue, true);
                                break;
                            case NORMAL:
                                this.odpsColumnToDataXField(odpsRecord, dataXRecord,
                                        this.columnTypeMap.get(columnName), columnName,
                                        false);
                                break;
                            case CONSTANT:
                                // Constant columns carry their value in the name slot.
                                dataXRecord.addColumn(new StringColumn(columnName));
                                break;
                            default:
                                break;
                        }
                    }
                    recordSender.sendToWriter(dataXRecord);
                } else {
                    // A null record signals end of the download slice.
                    break;
                }
            }

            // Best-effort close; confirmed upstream that a failed close is harmless.
            try {
                recordReader.close();
            } catch (Exception e) {
                LOG.warn("recordReader close exception", e);
            }
        } catch (DataXException e) {
            throw e;
        } catch (Exception e) {
            // warn: if dirty
            throw DataXException.asDataXException(
                    OdpsReaderErrorCode.READ_DATA_FAIL, e);
        }
    }

    /**
     * Parses {@code this.partition} (e.g. {@code "pt=1,dt=2"}) into a map of
     * lower-cased partition column name to partition value. Returns an empty
     * map for non-partitioned tables.
     *
     * @return partition name (lower case) -> partition value
     * @throws DataXException when a segment is not of the form {@code name=value}
     */
    private Map<String, String> parseCurrentPartitionValue() {
        Map<String, String> partitionMap = new HashMap<String, String>();
        if (this.isPartitionTable) {
            String[] splitedPartition = this.partition.split(",");
            for (String eachPartition : splitedPartition) {
                String[] partitionDetail = eachPartition.split("=");
                // warn: check partition like partition=1
                if (2 != partitionDetail.length) {
                    throw DataXException
                            .asDataXException(
                                    OdpsReaderErrorCode.ILLEGAL_VALUE,
                                    String.format(
                                            "您的分区 [%s] 解析出现错误,解析后正确的配置方式类似为 [ pt=1,dt=1 ].",
                                            eachPartition));
                }
                // warn: translate to lower case, it's more comfortable to
                // compare with user's input columns
                String partitionName = partitionDetail[0].toLowerCase();
                String partitionValue = partitionDetail[1];
                partitionMap.put(partitionName, partitionValue);
            }
        }

        if (IS_DEBUG) {
            LOG.debug(String.format("partition value details: %s",
                    com.alibaba.fastjson.JSON.toJSONString(partitionMap)));
        }

        return partitionMap;
    }

    /**
     * Looks up the value of one partition column (case-insensitive).
     *
     * @param partitionMap        lower-cased partition name -> value, from
     *                            {@link #parseCurrentPartitionValue()}
     * @param partitionColumnName the partition column to resolve
     * @return the partition value
     * @throws DataXException if the column is not part of the current partition
     *                        (should never happen, defensive check)
     */
    private String getPartitionColumnValue(Map<String, String> partitionMap,
                                           String partitionColumnName) {
        // warn: to lower case
        partitionColumnName = partitionColumnName.toLowerCase();
        // it will never happen, but add this checking
        if (!partitionMap.containsKey(partitionColumnName)) {
            String errorMessage = String.format(
                    "表所有分区信息为: %s 其中找不到 [%s] 对应的分区值.",
                    com.alibaba.fastjson.JSON.toJSONString(partitionMap),
                    partitionColumnName);
            throw DataXException.asDataXException(
                    OdpsReaderErrorCode.READ_DATA_FAIL, errorMessage);
        }

        return partitionMap.get(partitionColumnName);
    }

    /**
     * TODO warn: odpsRecord's String may actually contain binary data
     *
     * warn: there is no dirty data in reader plugin, so do not handle dirty
     * data with TaskPluginCollector
     *
     * warn: odps only supports BIGINT && STRING partition columns actually
     *
     * @param odpsRecord
     *            one line record of the odps table
     * @param dataXRecord
     *            the datax record to be sent to the writer; getXXX() is case sensitive
     * @param type
     *            odps column type
     * @param columnNameValue
     *            for a partition column this is the column VALUE, for a normal
     *            column this is the column NAME
     * @param isPartitionColumn
     *            true means partition column and false means normal column
     * @throws DataXException for unsupported column types or unparsable partition dates
     */
    private void odpsColumnToDataXField(Record odpsRecord,
                                        com.alibaba.datax.common.element.Record dataXRecord, OdpsType type,
                                        String columnNameValue, boolean isPartitionColumn) {
        switch (type) {
            case BIGINT: {
                if (isPartitionColumn) {
                    dataXRecord.addColumn(new LongColumn(columnNameValue));
                } else {
                    dataXRecord.addColumn(new LongColumn(odpsRecord
                            .getBigint(columnNameValue)));
                }
                break;
            }
            case BOOLEAN: {
                if (isPartitionColumn) {
                    dataXRecord.addColumn(new BoolColumn(columnNameValue));
                } else {
                    dataXRecord.addColumn(new BoolColumn(odpsRecord
                            .getBoolean(columnNameValue)));
                }
                break;
            }
            case DATETIME: {
                if (isPartitionColumn) {
                    try {
                        dataXRecord.addColumn(new DateColumn(ColumnCast
                                .string2Date(new StringColumn(columnNameValue))));
                    } catch (ParseException e) {
                        // Fix: removed a broken LOG.error(String.format("", ...)) that
                        // always logged an empty message.
                        String errMessage = String.format(
                                "您读取分区 [%s] 出现日期转换异常, 日期的字符串表示为 [%s].",
                                this.partition, columnNameValue);
                        LOG.error(errMessage);
                        throw DataXException.asDataXException(
                                OdpsReaderErrorCode.READ_DATA_FAIL, errMessage, e);
                    }
                } else {
                    dataXRecord.addColumn(new DateColumn(odpsRecord
                            .getDatetime(columnNameValue)));
                }
                break;
            }
            case DOUBLE: {
                if (isPartitionColumn) {
                    dataXRecord.addColumn(new DoubleColumn(columnNameValue));
                } else {
                    dataXRecord.addColumn(new DoubleColumn(odpsRecord
                            .getDouble(columnNameValue)));
                }
                break;
            }
            case DECIMAL: {
                if (isPartitionColumn) {
                    dataXRecord.addColumn(new DoubleColumn(columnNameValue));
                } else {
                    dataXRecord.addColumn(new DoubleColumn(odpsRecord.getDecimal(columnNameValue)));
                }
                break;
            }
            case STRING: {
                if (isPartitionColumn) {
                    dataXRecord.addColumn(new StringColumn(columnNameValue));
                } else {
                    dataXRecord.addColumn(new StringColumn(odpsRecord
                            .getString(columnNameValue)));
                }
                break;
            }
            default:
                throw DataXException
                        .asDataXException(
                                OdpsReaderErrorCode.ILLEGAL_VALUE,
                                String.format(
                                        "DataX 抽取 ODPS 数据不支持字段类型为:[%s]. 目前支持抽取的字段类型有:bigint, boolean, datetime, double, decimal, string. "
                                                + "您可以选择不抽取 DataX 不支持的字段或者联系 ODPS 管理员寻求帮助.",
                                        type));
        }
    }
}