/*
* Copyright [2013-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package ml.shifu.shifu.guagua;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import ml.shifu.guagua.mapreduce.GuaguaMapReduceClient;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.parser.ParserException;
import parquet.hadoop.ParquetInputFormat;
import parquet.hadoop.metadata.GlobalMetaData;
import parquet.io.ParquetDecodingException;
import parquet.pig.PigMetaData;
import parquet.pig.PigSchemaConverter;
import parquet.pig.SchemaConversionException;
import parquet.schema.IncompatibleSchemaModificationException;
import parquet.schema.MessageType;

/**
 * {@link GuaguaParquetMapReduceClient} appends the Parquet schema to the job configuration so that it can be read
 * later in mappers.
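 *
 * <p>
 * A minimal usage sketch (assuming the argument list accepted by {@link GuaguaMapReduceClient#createJob(String[])}
 * and a Parquet-format input):
 *
 * <pre>
 * GuaguaParquetMapReduceClient client = new GuaguaParquetMapReduceClient();
 * Job job = client.createJob(args);
 * // 'parquet.pig.schema' is now set in job.getConfiguration() and visible to mappers
 * </pre>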
*/
public class GuaguaParquetMapReduceClient extends GuaguaMapReduceClient {
    private static final PigSchemaConverter pigSchemaConverter = new PigSchemaConverter(false);

/**
     * Create the Hadoop job according to the arguments from main, and append the Parquet Pig schema to its
     * configuration so that mappers can read it.
*/
@Override
public synchronized Job createJob(String[] args) throws IOException {
Job job = super.createJob(args);
        // For a Parquet-format job, we have to append the Parquet schema field. We can only set
        // 'parquet.pig.schema' here because of the 'Job' dependency, while the other two required list parameters
        // are set in TrainModelProcessor.
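        // GlobalMetaData merges the footers of all input files; its key-value metadata may hold one Pig schema
        // string per file, which getPigSchemaFromMultipleFiles unifies into a single schema.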
@SuppressWarnings("rawtypes")
final GlobalMetaData globalMetaData = new ParquetInputFormat().getGlobalMetaData(job);
Schema schema = getPigSchemaFromMultipleFiles(globalMetaData.getSchema(), globalMetaData.getKeyValueMetaData());
String schemaStr = pigSchemaToString(schema);
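        // 'parquet.pig.schema' is the property that parquet-pig's read support consults for the expected Pig schema.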
job.getConfiguration().set("parquet.pig.schema", schemaStr);
return job;
    }

/**
* @param pigSchema
* the pig schema to turn into a string representation
     * @return the string representation of the schema
*/
static String pigSchemaToString(Schema pigSchema) {
final String pigSchemaString = pigSchema.toString();
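        // Schema#toString() wraps the field list in '{' and '}'; strip the braces so the result can be parsed back
        // by Utils.getSchemaFromString (see parsePigSchema).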
return pigSchemaString.substring(1, pigSchemaString.length() - 1);
    }

/**
* @param fileSchema
* the parquet schema from the file
* @param keyValueMetaData
* the extra meta data from the files
* @return the pig schema according to the file
*/
static Schema getPigSchemaFromMultipleFiles(MessageType fileSchema, Map<String, Set<String>> keyValueMetaData) {
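        // Each Parquet file may carry its own Pig schema string in the key-value metadata; when none is present,
        // fall back to converting the Parquet file schema directly.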
Set<String> pigSchemas = PigMetaData.getPigSchemas(keyValueMetaData);
if(pigSchemas == null) {
return pigSchemaConverter.convert(fileSchema);
}
Schema mergedPigSchema = null;
for(String pigSchemaString: pigSchemas) {
try {
mergedPigSchema = union(mergedPigSchema, parsePigSchema(pigSchemaString));
} catch (FrontendException e) {
throw new ParquetDecodingException("can not merge " + pigSchemaString + " into " + mergedPigSchema, e);
}
}
return mergedPigSchema;
    }

/**
* @param pigSchemaString
* the pig schema to parse
* @return the parsed pig schema
*/
public static Schema parsePigSchema(String pigSchemaString) {
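        // A null schema string means no Pig metadata was stored; propagate null and let the caller decide.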
try {
return pigSchemaString == null ? null : Utils.getSchemaFromString(pigSchemaString);
} catch (ParserException e) {
throw new SchemaConversionException("could not parse Pig schema: " + pigSchemaString, e);
}
    }

private static Schema union(Schema merged, Schema pigSchema) throws FrontendException {
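        // Merge two Pig schemas field by field: fields present in both sides must unify (see the FieldSchema
        // overload below), while fields unique to either side are kept as-is.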
        if(merged == null) {
            return pigSchema;
        }
        List<FieldSchema> fields = new ArrayList<FieldSchema>();
// merging existing fields
for(FieldSchema fieldSchema: merged.getFields()) {
FieldSchema newFieldSchema = pigSchema.getField(fieldSchema.alias);
if(newFieldSchema == null) {
fields.add(fieldSchema);
} else {
fields.add(union(fieldSchema, newFieldSchema));
}
}
// adding new fields
for(FieldSchema newFieldSchema: pigSchema.getFields()) {
FieldSchema oldFieldSchema = merged.getField(newFieldSchema.alias);
if(oldFieldSchema == null) {
fields.add(newFieldSchema);
}
}
return new Schema(fields);
    }

private static FieldSchema union(FieldSchema mergedFieldSchema, FieldSchema newFieldSchema) {
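        // Two fields unify only if both alias and type match; nested schemas (tuples/bags) are merged recursively.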
if(!mergedFieldSchema.alias.equals(newFieldSchema.alias) || mergedFieldSchema.type != newFieldSchema.type) {
            throw new IncompatibleSchemaModificationException("Incompatible Pig schema change: " + mergedFieldSchema
                    + " can not accept " + newFieldSchema);
}
try {
return new FieldSchema(mergedFieldSchema.alias, union(mergedFieldSchema.schema, newFieldSchema.schema),
mergedFieldSchema.type);
} catch (FrontendException e) {
throw new SchemaConversionException(e);
}
}
}