/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.segments.v1.creator;
import com.google.common.base.Preconditions;
import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.FieldSpec.DataType;
import com.linkedin.pinot.common.data.FieldSpec.FieldType;
import com.linkedin.pinot.common.data.MetricFieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.data.TimeFieldSpec;
import com.linkedin.pinot.common.data.TimeGranularitySpec;
import com.linkedin.pinot.core.data.readers.FileFormat;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.indexsegment.generator.SegmentVersion;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericData.Array;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.util.Utf8;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Helper methods shared by segment-creation tests: builds {@link SegmentGeneratorConfig}
 * instances and derives Pinot {@link Schema}s from Avro input files.
 */
public class SegmentTestUtils {
  private static final Logger LOGGER = LoggerFactory.getLogger(SegmentTestUtils.class);

  /**
   * Builds a segment generator config whose schema is extracted from the Avro file
   * (fields whose Avro type cannot be converted are skipped, see
   * {@link #extractSchemaFromAvroWithoutTime(File)}). Inverted indexes are enabled
   * for all columns.
   *
   * @param inputAvro Avro data file to build the segment from
   * @param outputDir directory the segment will be written to
   * @param timeColumn name of the time column to set on the config
   * @param timeUnit time granularity of the time column
   * @param tableName Pinot table name
   * @return a v1 segment generator config for the given inputs
   * @throws IOException if the Avro file cannot be opened or read
   */
  public static SegmentGeneratorConfig getSegmentGenSpecWithSchemAndProjectedColumns(File inputAvro, File outputDir,
      String timeColumn, TimeUnit timeUnit, String tableName) throws FileNotFoundException,
      IOException {
    final SegmentGeneratorConfig segmentGenSpec =
        new SegmentGeneratorConfig(extractSchemaFromAvroWithoutTime(inputAvro));
    segmentGenSpec.setInputFilePath(inputAvro.getAbsolutePath());
    segmentGenSpec.setTimeColumnName(timeColumn);
    segmentGenSpec.setSegmentTimeUnit(timeUnit);
    segmentGenSpec.setFormat(FileFormat.AVRO);
    segmentGenSpec.setSegmentVersion(SegmentVersion.v1);
    segmentGenSpec.setTableName(tableName);
    segmentGenSpec.setOutDir(outputDir.getAbsolutePath());
    segmentGenSpec.createInvertedIndexForAllColumns();
    return segmentGenSpec;
  }

  /**
   * Builds a segment generator config from an explicit, caller-supplied schema.
   * The time column name and time unit are taken from the schema itself.
   *
   * @param inputAvro Avro data file to build the segment from
   * @param outputDir directory the segment will be written to
   * @param tableName Pinot table name
   * @param schema Pinot schema to use
   * @return a v1 segment generator config for the given inputs
   */
  public static SegmentGeneratorConfig getSegmentGeneratorConfigWithSchema(File inputAvro, File outputDir,
      String tableName, Schema schema) {
    SegmentGeneratorConfig segmentGeneratorConfig = new SegmentGeneratorConfig(schema);
    segmentGeneratorConfig.setInputFilePath(inputAvro.getAbsolutePath());
    segmentGeneratorConfig.setOutDir(outputDir.getAbsolutePath());
    segmentGeneratorConfig.setFormat(FileFormat.AVRO);
    segmentGeneratorConfig.setSegmentVersion(SegmentVersion.v1);
    segmentGeneratorConfig.setTableName(tableName);
    segmentGeneratorConfig.setTimeColumnName(schema.getTimeColumnName());
    segmentGeneratorConfig.setSegmentTimeUnit(schema.getOutgoingTimeUnit());
    return segmentGeneratorConfig;
  }

  /**
   * Reads the Avro schema from the file header and returns the field names in
   * schema order.
   *
   * @param avro Avro data file to inspect
   * @return field names declared in the file's Avro schema
   * @throws IOException if the file cannot be opened or its header parsed
   */
  public static List<String> getColumnNamesFromAvro(File avro) throws FileNotFoundException, IOException {
    List<String> ret = new ArrayList<String>();
    // try-with-resources: the original leaked the stream (it was never closed)
    try (DataFileStream<GenericRecord> dataStream =
        new DataFileStream<GenericRecord>(new FileInputStream(avro), new GenericDatumReader<GenericRecord>())) {
      for (final Field field : dataStream.getSchema().getFields()) {
        ret.add(field.name());
      }
    }
    return ret;
  }

  /**
   * Extracts a Pinot schema from an Avro file, classifying each field using the
   * supplied map. Every Avro field must have an entry in the map.
   *
   * @param avroFile Avro data file whose schema is converted
   * @param fieldTypeMap field name to Pinot field type (TIME / DIMENSION / METRIC); must
   *        contain every field present in the Avro schema
   * @param granularity time granularity to apply to the TIME field
   * @return the converted Pinot schema
   * @throws IOException if the file cannot be opened or its header parsed
   * @throws UnsupportedOperationException if the map contains an unhandled field type
   */
  public static Schema extractSchemaFromAvro(File avroFile, Map<String, FieldType> fieldTypeMap, TimeUnit granularity)
      throws IOException {
    Schema schema = new Schema();
    // try-with-resources so the stream is closed even if conversion throws
    try (DataFileStream<GenericRecord> dataStream =
        new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>())) {
      for (final Field field : dataStream.getSchema().getFields()) {
        final String columnName = field.name();
        FieldType fieldType = fieldTypeMap.get(columnName);
        Preconditions.checkNotNull(fieldType, "No field type specified for column: %s", columnName);
        switch (fieldType) {
          case TIME:
            final TimeGranularitySpec gSpec = new TimeGranularitySpec(getColumnType(field), granularity, columnName);
            final TimeFieldSpec fSpec = new TimeFieldSpec(gSpec);
            schema.addField(fSpec);
            continue;
          case DIMENSION:
            final FieldSpec dimensionFieldSpec =
                new DimensionFieldSpec(columnName, getColumnType(field), isSingleValueField(field));
            schema.addField(dimensionFieldSpec);
            continue;
          case METRIC:
            final FieldSpec metricFieldSpec = new MetricFieldSpec(columnName, getColumnType(field));
            schema.addField(metricFieldSpec);
            continue;
          default:
            throw new UnsupportedOperationException("Unsupported field type: " + fieldType);
        }
      }
    }
    return schema;
  }

  /**
   * Extracts a Pinot schema from an Avro file without designating a time column.
   * Fields carrying the Avro property {@code pinotType=METRIC} become metrics;
   * everything else becomes a dimension. Fields whose Avro type cannot be
   * converted are logged and skipped.
   *
   * @param avroFile Avro data file whose schema is converted
   * @return the converted Pinot schema (convertible fields only)
   * @throws IOException if the file cannot be opened or its header parsed
   */
  public static Schema extractSchemaFromAvroWithoutTime(File avroFile) throws FileNotFoundException, IOException {
    Schema schema = new Schema();
    // try-with-resources so the stream is closed even if conversion throws
    try (DataFileStream<GenericRecord> dataStream =
        new DataFileStream<GenericRecord>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>())) {
      for (final Field field : dataStream.getSchema().getFields()) {
        // Convert the type once up front (the original probed it, discarded the
        // result, then recomputed it through a redundant schema lookup).
        final DataType dataType;
        try {
          dataType = getColumnType(field);
        } catch (Exception e) {
          LOGGER.warn("Caught exception while converting Avro field {} of type {}, field will not be in schema.",
              field.name(), field.schema().getType());
          continue;
        }
        // "METRIC".equals(...) is null-safe, so no explicit null check is needed.
        final FieldSpec fieldSpec =
            "METRIC".equals(field.getProp("pinotType")) ? new MetricFieldSpec() : new DimensionFieldSpec();
        fieldSpec.setName(field.name());
        fieldSpec.setDataType(dataType);
        fieldSpec.setSingleValueField(isSingleValueField(field));
        schema.addField(fieldSpec);
      }
    }
    return schema;
  }

  /**
   * Returns whether the field holds a single value, i.e. its (union-unwrapped)
   * Avro type is not ARRAY.
   */
  private static boolean isSingleValueField(Field field) {
    return extractSchemaFromUnionIfNeeded(field.schema()).getType() != Type.ARRAY;
  }

  /**
   * Maps an Avro field to a Pinot {@link DataType}. For ARRAY fields the element
   * type is used; a single-field RECORD element is unwrapped to that field's type.
   *
   * @param field Avro field to convert
   * @return the Pinot data type for the field (element type for multi-value fields)
   * @throws RuntimeException if a multi-value RECORD element has more than one field
   * @throws IllegalArgumentException if the Avro type has no matching Pinot data type
   */
  public static DataType getColumnType(Field field) {
    org.apache.avro.Schema fieldSchema = extractSchemaFromUnionIfNeeded(field.schema());
    final Type type = fieldSchema.getType();
    if (type == Type.ARRAY) {
      org.apache.avro.Schema elementSchema = extractSchemaFromUnionIfNeeded(fieldSchema.getElementType());
      if (elementSchema.getType() == Type.RECORD) {
        if (elementSchema.getFields().size() == 1) {
          elementSchema = elementSchema.getFields().get(0).schema();
        } else {
          throw new RuntimeException("More than one schema in Multi-value column!");
        }
        elementSchema = extractSchemaFromUnionIfNeeded(elementSchema);
      }
      return DataType.valueOf(elementSchema.getType());
    } else {
      return DataType.valueOf(type);
    }
  }

  /**
   * If the schema is a UNION (typically {@code [null, T]}), returns the first
   * non-NULL branch; otherwise returns the schema unchanged.
   */
  private static org.apache.avro.Schema extractSchemaFromUnionIfNeeded(org.apache.avro.Schema fieldSchema) {
    if (fieldSchema.getType() != Type.UNION) {
      return fieldSchema;
    }
    // Plain loop instead of the raw commons-collections Predicate with casts.
    for (org.apache.avro.Schema memberSchema : fieldSchema.getTypes()) {
      if (memberSchema.getType() != Type.NULL) {
        return memberSchema;
      }
    }
    // Matches CollectionUtils.find's behavior of returning null when nothing matches.
    return null;
  }

  /**
   * Converts an Avro array to a plain Object[], unwrapping single-field records
   * to their first value and {@link Utf8} instances to {@link String}.
   *
   * @param arr Avro array, may be null
   * @return converted values; empty array when {@code arr} is null
   */
  private static Object[] transformAvroArrayToObjectArray(Array arr) {
    if (arr == null) {
      return new Object[0];
    }
    final Object[] ret = new Object[arr.size()];
    int i = 0;
    // GenericData.Array is a List, so enhanced-for replaces the raw Iterator.
    for (Object value : arr) {
      if (value instanceof Record) {
        value = ((Record) value).get(0);
      }
      if (value instanceof Utf8) {
        value = ((Utf8) value).toString();
      }
      ret[i++] = value;
    }
    return ret;
  }
}