/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.data.extractors;

import com.google.common.base.Preconditions;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.MetricFieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.data.TimeFieldSpec;
import com.linkedin.pinot.common.data.TimeGranularitySpec;
import com.linkedin.pinot.common.utils.StringUtil;
import com.linkedin.pinot.common.utils.time.TimeConverter;
import com.linkedin.pinot.common.utils.time.TimeConverterProvider;
import com.linkedin.pinot.core.data.GenericRow;
import java.util.HashMap;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * This implementation will only inject columns inside the Schema.
 *
 * <p>For every column listed by the schema (derived metrics excepted) the extractor reads the value from the
 * incoming row, converts it to the column's {@link PinotDataType} when the detected source type differs, and
 * falls back to the field's default null value when the value is missing, is a zero-length array, or fails to
 * convert. Running counters describe how many rows had errors, nulls or conversions.
 *
 * <p>The counters are plain fields and a plain {@link HashMap}; this class performs no synchronization.
 */
public class PlainFieldExtractor implements FieldExtractor {
  private static final Logger LOGGER = LoggerFactory.getLogger(PlainFieldExtractor.class);

  // Java class of an incoming single value -> the PinotDataType used as the conversion source.
  private static final Map<Class<?>, PinotDataType> SINGLE_VALUE_TYPE_MAP = new HashMap<>();
  // Java class of the FIRST element of an incoming Object[] -> the array PinotDataType used as the source.
  private static final Map<Class<?>, PinotDataType> MULTI_VALUE_TYPE_MAP = new HashMap<>();

  static {
    SINGLE_VALUE_TYPE_MAP.put(Boolean.class, PinotDataType.BOOLEAN);
    SINGLE_VALUE_TYPE_MAP.put(Byte.class, PinotDataType.BYTE);
    SINGLE_VALUE_TYPE_MAP.put(Character.class, PinotDataType.CHARACTER);
    SINGLE_VALUE_TYPE_MAP.put(Short.class, PinotDataType.SHORT);
    SINGLE_VALUE_TYPE_MAP.put(Integer.class, PinotDataType.INTEGER);
    SINGLE_VALUE_TYPE_MAP.put(Long.class, PinotDataType.LONG);
    SINGLE_VALUE_TYPE_MAP.put(Float.class, PinotDataType.FLOAT);
    SINGLE_VALUE_TYPE_MAP.put(Double.class, PinotDataType.DOUBLE);
    SINGLE_VALUE_TYPE_MAP.put(String.class, PinotDataType.STRING);

    // Note: there is no Boolean entry here; a Boolean[] falls back to OBJECT_ARRAY in transform().
    MULTI_VALUE_TYPE_MAP.put(Byte.class, PinotDataType.BYTE_ARRAY);
    MULTI_VALUE_TYPE_MAP.put(Character.class, PinotDataType.CHARACTER_ARRAY);
    MULTI_VALUE_TYPE_MAP.put(Short.class, PinotDataType.SHORT_ARRAY);
    MULTI_VALUE_TYPE_MAP.put(Integer.class, PinotDataType.INTEGER_ARRAY);
    MULTI_VALUE_TYPE_MAP.put(Long.class, PinotDataType.LONG_ARRAY);
    MULTI_VALUE_TYPE_MAP.put(Float.class, PinotDataType.FLOAT_ARRAY);
    MULTI_VALUE_TYPE_MAP.put(Double.class, PinotDataType.DOUBLE_ARRAY);
    MULTI_VALUE_TYPE_MAP.put(String.class, PinotDataType.STRING_ARRAY);
  }

  private final Schema _schema;

  // Per-column number of values that were replaced by the default because of an error.
  private final Map<String, Integer> _errorCount = new HashMap<>();
  // Number of transformed rows that had at least one error / null / conversion, respectively.
  private int _totalErrors = 0;
  private int _totalNulls = 0;
  private int _totalConversions = 0;
  // Number of individual column values that were read as null from an incoming row.
  private int _totalNullCols = 0;

  // Destination data type for every schema column.
  private final Map<String, PinotDataType> _columnType = new HashMap<>();

  // Only set when the incoming and outgoing time granularity specs differ (see initTimeConverters()).
  private String _incomingTimeColumnName;
  private String _outgoingTimeColumnName;
  private TimeConverter _timeConverter;

  /**
   * Creates an extractor for the given schema and pre-computes the per-column error counters, the per-column
   * destination types and, if the schema has a time field, the time converter.
   *
   * @param schema schema whose columns are extracted
   */
  public PlainFieldExtractor(Schema schema) {
    _schema = schema;
    initErrorCount();
    initColumnTypes();
    initTimeConverters();
  }

  /**
   * Resets the four aggregate counters (errors, nulls, conversions, null columns) to zero.
   * The per-column map returned by {@link #getErrorCount()} is not modified.
   */
  public void resetCounters() {
    _totalErrors = 0;
    _totalNulls = 0;
    _totalConversions = 0;
    _totalNullCols = 0;
  }

  /** Seeds the per-column error counter with zero for every schema column. */
  private void initErrorCount() {
    for (String column : _schema.getColumnNames()) {
      _errorCount.put(column, 0);
    }
  }

  /**
   * Builds the column name to destination {@link PinotDataType} map.
   *
   * @throws NullPointerException if the schema lists a column for which it has no field spec
   */
  private void initColumnTypes() {
    // Get the map from column name to pinot data type.
    for (String column : _schema.getColumnNames()) {
      FieldSpec fieldSpec = _schema.getFieldSpecFor(column);
      Preconditions.checkNotNull(fieldSpec, "Bad schema: " + _schema.getSchemaName() + ", field: " + column);
      _columnType.put(column, PinotDataType.getPinotDataType(fieldSpec));
    }
  }

  /**
   * Records the outgoing time column name and, only when the incoming and outgoing granularity specs differ,
   * the incoming time column name and a converter between the two. When the specs are equal the converter
   * stays {@code null} and the time column is read like any other column.
   */
  private void initTimeConverters() {
    TimeFieldSpec timeFieldSpec = _schema.getTimeFieldSpec();
    if (timeFieldSpec != null) {
      TimeGranularitySpec incomingGranularitySpec = timeFieldSpec.getIncomingGranularitySpec();
      TimeGranularitySpec outgoingGranularitySpec = timeFieldSpec.getOutgoingGranularitySpec();
      _outgoingTimeColumnName = outgoingGranularitySpec.getName();
      if (!incomingGranularitySpec.equals(outgoingGranularitySpec)) {
        _incomingTimeColumnName = incomingGranularitySpec.getName();
        _timeConverter = TimeConverterProvider.getTimeConverter(incomingGranularitySpec, outgoingGranularitySpec);
      }
    }
  }

  /** Increments the per-column error counter for {@code column}. */
  private void recordError(String column) {
    _errorCount.put(column, _errorCount.get(column) + 1);
  }

  @Override
  public Schema getSchema() {
    return _schema;
  }

  /**
   * Transforms {@code row} into a newly allocated {@link GenericRow}.
   *
   * @param row incoming row
   * @return the new row holding the extracted values
   */
  @Override
  public GenericRow transform(GenericRow row) {
    return transform(row, new GenericRow());
  }

  /**
   * Extracts every non-derived schema column from {@code row} into {@code destinationRow}.
   *
   * <p>For each column the value is fetched (through the time converter for the outgoing time column when one
   * is configured), converted to the column's destination type if the detected source type differs, stripped
   * of trailing null characters for STRING columns, and replaced by the field's default null value when it
   * ends up {@code null}. The aggregate counters are incremented at most once per call; the per-column error
   * counter and the null-column counter are incremented once per affected column.
   *
   * @param row incoming row to read from
   * @param destinationRow row that receives one field per processed column
   * @return {@code destinationRow}
   */
  @Override
  public GenericRow transform(GenericRow row, GenericRow destinationRow) {
    boolean hasError = false;
    boolean hasNull = false;
    boolean hasConversion = false;

    for (String column : _schema.getColumnNames()) {
      FieldSpec fieldSpec = _schema.getFieldSpecFor(column);

      // Ignore transform of DerivedMetric
      if (fieldSpec instanceof MetricFieldSpec && ((MetricFieldSpec) fieldSpec).isDerivedMetric()) {
        continue;
      }

      Object value;

      // Fetch value for this column.
      if (column.equals(_outgoingTimeColumnName) && _timeConverter != null) {
        // Convert incoming time to outgoing time.
        value = row.getValue(_incomingTimeColumnName);
        if (value == null) {
          hasNull = true;
          _totalNullCols++;
        } else {
          try {
            value = _timeConverter.convert(value);
          } catch (Exception e) {
            LOGGER.debug("Caught exception while converting incoming time value: {}", value, e);
            value = null;
            hasError = true;
            recordError(column);
          }
        }
      } else {
        value = row.getValue(column);
        if (value == null) {
          hasNull = true;
          _totalNullCols++;
        }
      }

      // Convert value if necessary.
      PinotDataType dest = _columnType.get(column);
      if (value != null) {
        PinotDataType source = null;
        if (value instanceof Object[]) {
          // Multi-value.
          Object[] valueArray = (Object[]) value;
          if (valueArray.length > 0) {
            // The source type is inferred from the first element only.
            // NOTE(review): a null first element is dereferenced here - confirm callers never pass one.
            source = MULTI_VALUE_TYPE_MAP.get(valueArray[0].getClass());
            if (source == null) {
              source = PinotDataType.OBJECT_ARRAY;
            }
          } else {
            LOGGER.debug("Got 0 length array.");
            // Use default value for 0 length array.
            value = null;
            hasError = true;
            recordError(column);
          }
        } else {
          // Single-value.
          source = SINGLE_VALUE_TYPE_MAP.get(value.getClass());
          if (source == null) {
            source = PinotDataType.OBJECT;
          }
        }

        if (value != null && source != dest) {
          Object before = value;
          try {
            value = dest.convert(before, source);
            hasConversion = true;
          } catch (Exception e) {
            // Pass the exception as the trailing argument so the cause is not lost from the debug log.
            LOGGER.debug("Caught exception while converting value: {} from: {} to: {}", before, source, dest, e);
            value = null;
            hasError = true;
            recordError(column);
          }
        }

        // Null character is the default padding character, we do not allow trailing null chars in strings.
        // Allowing this can cause multiple values to map to the same padded value, breaking segment generation.
        // The value may have been nulled above (0 length array or failed conversion); skip trimming then and
        // let the default-value branch below handle it.
        if (value != null && dest == PinotDataType.STRING) {
          value = StringUtil.trimTrailingNulls((String) value);
        }
      }

      // Assign default value for null value.
      if (value == null) {
        if (fieldSpec.isSingleValueField()) {
          // Single-value field.
          value = fieldSpec.getDefaultNullValue();
        } else {
          // Multi-value field.
          value = new Object[]{fieldSpec.getDefaultNullValue()};
        }
      }

      destinationRow.putField(column, value);
    }

    // Row-level counters: each is bumped at most once per transformed row.
    if (hasError) {
      _totalErrors++;
    }
    if (hasNull) {
      _totalNulls++;
    }
    if (hasConversion) {
      _totalConversions++;
    }

    return destinationRow;
  }

  /**
   * Returns the per-column error counters. This is the extractor's own map, not a copy, and it is not cleared
   * by {@link #resetCounters()}.
   */
  public Map<String, Integer> getErrorCount() {
    return _errorCount;
  }

  /** Returns the number of transformed rows that had at least one error since the last reset. */
  public int getTotalErrors() {
    return _totalErrors;
  }

  /** Returns the number of transformed rows that had at least one null incoming value since the last reset. */
  public int getTotalNulls() {
    return _totalNulls;
  }

  /** Returns the number of transformed rows that had at least one successful conversion since the last reset. */
  public int getTotalConversions() {
    return _totalConversions;
  }

  /** Returns the number of individual column values read as null since the last reset. */
  public int getTotalNullCols() {
    return _totalNullCols;
  }
}