/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.impl.logicalLayer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.Iterator;
import org.apache.pig.PigException;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.ProjectionMap;
import org.apache.pig.impl.plan.RequiredFields;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.MultiMap;
import org.apache.pig.impl.util.Pair;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * LOCross represents the logical CROSS operator: the cartesian product of
 * two or more input relations. The output schema is the concatenation of
 * the input schemas, with named columns disambiguated by prefixing the
 * input's alias (e.g. {@code a::x}).
 */
public class LOCross extends RelationalOperator {

    private static final long serialVersionUID = 2L;
    private static Log log = LogFactory.getLog(LOCross.class);

    // Parallel to the output schema's field list: entry i is the input
    // operator that contributed output column i. Rebuilt by getSchema()
    // and consumed by getRelevantInputs().
    private List<LogicalOperator> mSchemaInputMapping = new ArrayList<LogicalOperator>();

    /**
     * @param plan
     *            Logical plan this operator is a part of.
     * @param k
     *            Operator key to assign to this node.
     */
    public LOCross(LogicalPlan plan, OperatorKey k) {
        super(plan, k);
    }
public List<LogicalOperator> getInputs() {
return mPlan.getPredecessors(this);
}
@Override
public Schema getSchema() throws FrontendException {
List<LogicalOperator> inputs = mPlan.getPredecessors(this);
if (!mIsSchemaComputed) {
List<Schema.FieldSchema> fss = new ArrayList<Schema.FieldSchema>();
mSchemaInputMapping = new ArrayList<LogicalOperator>();
Map<Schema.FieldSchema, String> flattenAlias = new HashMap<Schema.FieldSchema, String>();
Map<String, Boolean> inverseFlattenAlias = new HashMap<String, Boolean>();
Map<String, Integer> aliases = new HashMap<String, Integer>();
for (LogicalOperator op : inputs) {
String opAlias = op.getAlias();
Schema s = op.getSchema();
Schema.FieldSchema newFs;
//need to extract the children and create the aliases
//assumption here is that flatten is only for one column
//i.e., flatten(A), flatten(A.x) and NOT
//flatten(B.(x,y,z))
if(null != s) {
for(Schema.FieldSchema fs: s.getFields()) {
log.debug("fs: " + fs);
log.debug("fs.alias: " + fs.alias);
if(null != fs.alias) {
String disambiguatorAlias = opAlias + "::" + fs.alias;
newFs = new Schema.FieldSchema(disambiguatorAlias, fs.schema, fs.type);
fss.add(newFs);
mSchemaInputMapping.add(op);
Integer count;
count = aliases.get(fs.alias);
if(null == count) {
aliases.put(fs.alias, 1);
} else {
aliases.put(fs.alias, ++count);
}
count = aliases.get(disambiguatorAlias);
if(null == count) {
aliases.put(disambiguatorAlias, 1);
} else {
aliases.put(disambiguatorAlias, ++count);
}
flattenAlias.put(newFs, fs.alias);
inverseFlattenAlias.put(fs.alias, true);
//it's fine if there are duplicates
//we just need to record if its due to
//flattening
} else {
newFs = new Schema.FieldSchema(null, DataType.BYTEARRAY);
fss.add(newFs);
mSchemaInputMapping.add(op);
}
newFs.setParent(fs.canonicalName, op);
}
} else {
mSchema = null;
mIsSchemaComputed = true;
return mSchema;
}
}
//check for duplicate column names and throw an error if there are duplicates
//ensure that flatten gets rid of duplicate column names when the checks are
//being done
log.debug(" flattenAlias: " + flattenAlias);
log.debug(" inverseFlattenAlias: " + inverseFlattenAlias);
log.debug(" aliases: " + aliases);
log.debug(" fss.size: " + fss.size());
boolean duplicates = false;
Set<String> duplicateAliases = new HashSet<String>();
for(Map.Entry<String, Integer> e: aliases.entrySet()) {
Integer count = e.getValue();
if(count > 1) {
Boolean inFlatten = false;
log.debug("inFlatten: " + inFlatten + " inverseFlattenAlias: " + inverseFlattenAlias);
inFlatten = inverseFlattenAlias.get(e.getKey());
log.debug("inFlatten: " + inFlatten + " inverseFlattenAlias: " + inverseFlattenAlias);
if((null != inFlatten) && (!inFlatten)) {
duplicates = true;
duplicateAliases.add(e.getKey());
}
}
}
if(duplicates) {
String errMessage = null;
StringBuilder sb = new StringBuilder("Found duplicates in schema. ");
if(duplicateAliases.size() > 0) {
Iterator<String> iter = duplicateAliases.iterator();
sb.append(": ");
sb.append(iter.next());
while(iter.hasNext()) {
sb.append(", ");
sb.append(iter.next());
}
}
sb.append(". Please alias the columns with unique names.");
errMessage = sb.toString();
int errCode = 1007;
throw new FrontendException(errMessage, errCode, PigException.INPUT, false, null);
}
mSchema = new Schema(fss);
//add the aliases that are unique after flattening
for(Schema.FieldSchema fs: mSchema.getFields()) {
String alias = flattenAlias.get(fs);
Integer count = aliases.get(alias);
if (null == count) count = 1;
log.debug("alias: " + alias);
if((null != alias) && (count == 1)) {
mSchema.addAlias(alias, fs);
}
}
mIsSchemaComputed = true;
}
return mSchema;
}
@Override
public String name() {
return getAliasString() + "Cross " + mKey.scope + "-" + mKey.id;
}
    /**
     * Cross is an n-ary operator, so multiple inputs are supported.
     */
    @Override
    public boolean supportsMultipleInputs() {
        return true;
    }
    /**
     * Standard double-dispatch hook for {@link LOVisitor} traversals.
     *
     * @throws VisitorException if the visitor fails on this node.
     */
    @Override
    public void visit(LOVisitor v) throws VisitorException {
        v.visit(this);
    }
@Override
public byte getType() {
return DataType.BAG ;
}
@Override
public ProjectionMap getProjectionMap() {
if(mIsProjectionMapComputed) return mProjectionMap;
mIsProjectionMapComputed = true;
Schema outputSchema;
try {
outputSchema = getSchema();
} catch (FrontendException fee) {
mProjectionMap = null;
return mProjectionMap;
}
if(outputSchema == null) {
mProjectionMap = null;
return mProjectionMap;
}
List<LogicalOperator> predecessors = (ArrayList<LogicalOperator>)mPlan.getPredecessors(this);
if(predecessors == null) {
mProjectionMap = null;
return mProjectionMap;
}
MultiMap<Integer, ProjectionMap.Column> mapFields = new MultiMap<Integer, ProjectionMap.Column>();
List<Integer> addedFields = new ArrayList<Integer>();
boolean[] unknownSchema = new boolean[predecessors.size()];
boolean anyUnknownInputSchema = false;
int outputColumnNum = 0;
for(int inputNum = 0; inputNum < predecessors.size(); ++inputNum) {
LogicalOperator predecessor = predecessors.get(inputNum);
Schema inputSchema = null;
try {
inputSchema = predecessor.getSchema();
} catch (FrontendException fee) {
mProjectionMap = null;
return mProjectionMap;
}
if(inputSchema == null) {
unknownSchema[inputNum] = true;
outputColumnNum++;
addedFields.add(inputNum);
anyUnknownInputSchema = true;
} else {
unknownSchema[inputNum] = false;
for(int inputColumn = 0; inputColumn < inputSchema.size(); ++inputColumn) {
mapFields.put(outputColumnNum++, new ProjectionMap.Column(new Pair<Integer, Integer>(inputNum, inputColumn)));
}
}
}
//TODO
/*
* For now, if there is any input that has an unknown schema
* flag it and return a null ProjectionMap.
* In the future, when unknown schemas are handled
* mark inputs that have unknown schemas as output columns
* that have been added.
*/
if(anyUnknownInputSchema) {
mProjectionMap = null;
return mProjectionMap;
}
if(addedFields.size() == 0) {
addedFields = null;
}
mProjectionMap = new ProjectionMap(mapFields, null, addedFields);
return mProjectionMap;
}
@Override
public List<RequiredFields> getRequiredFields() {
List<LogicalOperator> predecessors = mPlan.getPredecessors(this);
if(predecessors == null) {
return null;
}
List<RequiredFields> requiredFields = new ArrayList<RequiredFields>();
for(int inputNum = 0; inputNum < predecessors.size(); ++inputNum) {
requiredFields.add(new RequiredFields(true));
}
return (requiredFields.size() == 0? null: requiredFields);
}
@Override
public List<RequiredFields> getRelevantInputs(int output, int column) throws FrontendException {
if (!mIsSchemaComputed)
getSchema();
if (output!=0)
return null;
if (column<0)
return null;
if (mSchema==null)
return null;
if (column>mSchema.size()-1)
return null;
List<LogicalOperator> predecessors = (ArrayList<LogicalOperator>)mPlan.getPredecessors(this);
if(predecessors == null) {
return null;
}
List<RequiredFields> result = new ArrayList<RequiredFields>();
for (int i=0;i<predecessors.size();i++)
result.add(null);
// Figure out the # of input does this output column belong to, and the # of column of that input.
// When we call getSchema, we will cache mSchemaInputMapping for a mapping of output column and it's input.
// We count the number of different inputs we've seen from mSchemaInputMapping[0] to
// mSchemaInputMapping[column] to find out the # of input
int inputNum = -1;
int inputColumn = 0;
LogicalOperator op = null;
for (int i=0;i<=column;i++)
{
if (mSchemaInputMapping.get(i)!=op)
{
inputNum++;
inputColumn = 0;
op = mSchemaInputMapping.get(i);
}
else
inputColumn++;
}
ArrayList<Pair<Integer, Integer>> inputList = new ArrayList<Pair<Integer, Integer>>();
inputList.add(new Pair<Integer, Integer>(inputNum, inputColumn));
RequiredFields requiredFields = new RequiredFields(inputList);
result.set(inputNum, requiredFields);
return result;
}
}