/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.tools.data.generator;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.math.IntRange;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.json.JSONException;
import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.FieldSpec.DataType;
import com.linkedin.pinot.common.data.FieldSpec.FieldType;
import com.linkedin.pinot.common.data.MetricFieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.data.TimeFieldSpec;
import com.linkedin.pinot.core.data.readers.FileFormat;
/**
 * Generates synthetic data files for the columns described by a {@link DataGeneratorSpec}.
 *
 * <p>Typical usage: construct an instance, call {@link #init(DataGeneratorSpec)} once to prepare
 * the output directory and one value {@link Generator} per column, then call
 * {@link #generate(long, int)} to write the documents through {@link AvroWriter}.
 *
 * Sep 12, 2014
 */
public class DataGenerator {
  private static final Logger LOGGER = LoggerFactory.getLogger(DataGenerator.class);

  /** Directory the generated files are written to; set by {@link #init(DataGeneratorSpec)}. */
  private File outDir;

  /** Specification supplied to {@link #init(DataGeneratorSpec)}; null until then. */
  DataGeneratorSpec genSpec;

  /** Value generator for each column, keyed by column name. */
  private final Map<String, Generator> generators;

  public DataGenerator() {
    generators = new HashMap<String, Generator>();
  }

  /**
   * Prepares the output directory and builds one initialized generator per column.
   *
   * <p>A column listed in the spec's cardinality map gets a cardinality-based generator;
   * otherwise a column listed in the range map gets a range-based generator.
   *
   * @param spec the generation specification (columns, data types, cardinalities/ranges,
   *     output directory and override flag)
   * @throws IOException if the existing output directory cannot be deleted or the new one
   *     cannot be created
   * @throws RuntimeException if the output directory already exists and override is disabled,
   *     or if a column has neither a cardinality nor a range configured
   */
  public void init(DataGeneratorSpec spec) throws IOException {
    genSpec = spec;
    outDir = new File(genSpec.getOutputDir());

    if (outDir.exists()) {
      if (!genSpec.isOverrideOutDir()) {
        LOGGER.error("output directory already exists, and override is set to false");
        throw new RuntimeException("output directory exists");
      }
      FileUtils.deleteDirectory(outDir);
    }

    // mkdirs() also creates missing parent directories; surface a failure here instead of
    // letting the first writer fail later with a less helpful error.
    if (!outDir.mkdirs() && !outDir.isDirectory()) {
      throw new IOException("failed to create output directory : " + outDir.getAbsolutePath());
    }

    // Drop generators left over from a previous init() so columns that are not part of this
    // spec are not handed to the writers.
    generators.clear();

    for (final String column : genSpec.getColumns()) {
      final DataType dataType = genSpec.getDataTypesMap().get(column);
      final Generator generator;
      if (genSpec.getCardinalityMap().containsKey(column)) {
        generator = GeneratorFactory.getGeneratorFor(dataType, genSpec.getCardinalityMap().get(column));
      } else if (genSpec.getRangeMap().containsKey(column)) {
        final IntRange range = genSpec.getRangeMap().get(column);
        generator =
            GeneratorFactory.getGeneratorFor(dataType, range.getMinimumInteger(), range.getMaximumInteger());
      } else {
        LOGGER.error("cardinality for this column does not exist : {}", column);
        throw new RuntimeException("cardinality for this column does not exist");
      }
      generators.put(column, generator);
      generator.init();
    }
  }

  /**
   * Writes {@code totalDocs} documents spread over {@code numFiles} files in the output directory.
   *
   * <p>Each file receives {@code totalDocs / numFiles} documents; when the division is not exact,
   * the first {@code totalDocs % numFiles} files receive one extra document so that the requested
   * total is actually produced.
   *
   * @param totalDocs total number of documents to write across all files
   * @param numFiles number of files to create; must be positive
   * @throws IllegalStateException if {@link #init(DataGeneratorSpec)} has not been called
   * @throws IllegalArgumentException if {@code numFiles} is not positive
   * @throws IOException propagated from the underlying writer
   * @throws JSONException propagated from the underlying writer
   */
  public void generate(long totalDocs, int numFiles) throws IOException, JSONException {
    if (genSpec == null || outDir == null) {
      throw new IllegalStateException("init() must be called before generate()");
    }
    if (numFiles <= 0) {
      throw new IllegalArgumentException("numFiles must be positive, got : " + numFiles);
    }

    // Stay in long arithmetic: an int cast of the quotient could truncate large requests.
    final long docsPerFile = totalDocs / numFiles;
    final long leftoverDocs = totalDocs % numFiles;

    for (int fileIndex = 0; fileIndex < numFiles; fileIndex++) {
      final long docsInThisFile = (fileIndex < leftoverDocs) ? docsPerFile + 1 : docsPerFile;
      final AvroWriter writer = new AvroWriter(outDir, fileIndex, generators, fetchSchema());
      try {
        for (long doc = 0; doc < docsInThisFile; doc++) {
          writer.writeNext();
        }
      } finally {
        // Seal even when a write fails so the writer is not left dangling.
        // NOTE(review): assumes seal() is what releases the underlying file - confirm in AvroWriter.
        writer.seal();
      }
    }
  }

  /**
   * Builds a new {@link Schema} holding one field spec per column of the current spec, in the
   * order returned by {@code genSpec.getColumns()}.
   *
   * @return a freshly created schema
   */
  public Schema fetchSchema() {
    final Schema schema = new Schema();
    for (final String column : genSpec.getColumns()) {
      schema.addField(buildSpec(genSpec, column));
    }
    return schema;
  }

  /**
   * Creates the single-value field spec for one column according to its configured field type.
   *
   * @param genSpec specification providing the data type, field type and time unit maps
   * @param column name of the column to describe
   * @return the field spec with name, data type and single-value flag set
   * @throws RuntimeException if the column has no field type configured or the type is unsupported
   */
  private FieldSpec buildSpec(DataGeneratorSpec genSpec, String column) {
    final DataType dataType = genSpec.getDataTypesMap().get(column);
    final FieldType fieldType = genSpec.getFieldTypesMap().get(column);
    if (fieldType == null) {
      // Switching on a null enum would fail with a bare NullPointerException; name the column.
      throw new RuntimeException("No field type configured for column : " + column);
    }

    final FieldSpec spec;
    switch (fieldType) {
      case DIMENSION:
        spec = new DimensionFieldSpec();
        break;
      case METRIC:
        spec = new MetricFieldSpec();
        break;
      case TIME:
        spec = new TimeFieldSpec(column, dataType, genSpec.getTimeUnitMap().get(column));
        break;
      default:
        throw new RuntimeException("Invalid Field type.");
    }

    spec.setName(column);
    spec.setDataType(dataType);
    spec.setSingleValueField(true);
    return spec;
  }

  /**
   * Sample driver: generates 1,000,000 documents in 2 files under {@code /tmp/out} (replacing
   * that directory if it exists) for five INT dimension columns with cardinality 1000 each.
   *
   * @param args ignored
   */
  public static void main(String[] args) throws IOException, JSONException {
    final String[] columns = { "column1", "column2", "column3", "column4", "column5" };

    final Map<String, DataType> dataTypes = new HashMap<String, DataType>();
    final Map<String, FieldType> fieldTypes = new HashMap<String, FieldType>();
    final Map<String, TimeUnit> timeUnits = new HashMap<String, TimeUnit>();
    final Map<String, Integer> cardinality = new HashMap<String, Integer>();
    final Map<String, IntRange> range = new HashMap<String, IntRange>();

    for (final String col : columns) {
      dataTypes.put(col, DataType.INT);
      fieldTypes.put(col, FieldType.DIMENSION);
      cardinality.put(col, 1000);
    }

    final DataGeneratorSpec spec = new DataGeneratorSpec(Arrays.asList(columns), cardinality,
        range, dataTypes, fieldTypes, timeUnits, FileFormat.AVRO, "/tmp/out", true);

    final DataGenerator gen = new DataGenerator();
    gen.init(spec);
    gen.generate(1000000L, 2);
  }
}