/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.pig.summary;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import org.apache.pig.Algebraic;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.parser.ParserException;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;
/**
 * Pig UDF that computes a summary of the tuples in its input bag and returns
 * it as a JSON string.
 * <p>
 * Usage: {@code B = FOREACH (GROUP A ALL) GENERATE Summary(A);}
 * <p>
 * The function is {@link Algebraic}: {@link Initial} summarizes raw tuples,
 * {@link Intermediate} merges partial summaries and {@link Final} merges the
 * remaining partial summaries and renders the pretty-printed JSON result.
 * The schema of the summarized tuples is captured in
 * {@link #setInputSchema(Schema)} and handed to {@link Initial} through the
 * {@link UDFContext} properties registered for this class and UDF signature.
 *
 * @author Julien Le Dem
 *
 */
public class Summary extends EvalFunc<String> implements Algebraic {

  private static final TupleFactory TF = TupleFactory.getInstance();

  /** UDFContext property under which the input schema string is stored. */
  private static final String INPUT_SCHEMA_PROPERTY = "inputSchema";

  /** Prefix of the error raised when the input schema does not have the expected shape. */
  private static final String USAGE_ERROR_PREFIX =
      "Usage: B = FOREACH (GROUP A ALL) GENERATE Summary(A); Can not get schema from ";

  /** Message of the error raised when a {@link JSONTuple} field other than 0 is accessed. */
  private static final String SINGLE_FIELD_ERROR = "size is 1";

  /** Schema of the tuples contained in the input bag; set by {@link #setInputSchema(Schema)}. */
  private Schema inputSchema;

  /** UDF signature used to scope the UDFContext properties; set by {@link #setUDFContextSignature(String)}. */
  private String signature;

  /**
   * Initial stage of the algebraic evaluation: summarizes the raw tuples of
   * the input bag using the schema previously saved in the UDFContext.
   */
  public static class Initial extends EvalFunc<Tuple> {

    private Schema inputSchema;

    /**
     * Loads the input schema stored in the UDFContext for the given signature.
     *
     * @param signature the UDF signature the schema was saved under
     */
    @Override
    public void setUDFContextSignature(String signature) {
      inputSchema = Summary.getInputSchema(signature);
    }

    /**
     * @param t a tuple whose first field is the bag of tuples to summarize
     * @return a single-field tuple exposing the summary as a JSON string
     */
    @Override
    public Tuple exec(Tuple t) throws IOException {
      return new JSONTuple(sumUp(inputSchema, t));
    }
  }

  /**
   * Intermediate stage of the algebraic evaluation: merges partial summaries
   * into a new partial summary.
   */
  public static class Intermediate extends EvalFunc<Tuple> {

    /**
     * @param t a tuple whose first field is a bag of partial summaries
     * @return a single-field tuple exposing the merged summary as a JSON string
     */
    @Override
    public Tuple exec(Tuple t) throws IOException {
      return new JSONTuple(merge(t));
    }
  }

  /**
   * Final stage of the algebraic evaluation: merges the partial summaries and
   * renders the result as pretty-printed JSON.
   */
  public static class Final extends EvalFunc<String> {

    /**
     * @param t a tuple whose first field is a bag of partial summaries
     * @return the merged summary as pretty-printed JSON
     */
    @Override
    public String exec(Tuple t) throws IOException {
      return SummaryData.toPrettyJSON(merge(t));
    }
  }

  /**
   * Read-only, single-field tuple wrapping a {@link TupleSummaryData}.
   * Field 0 is the JSON rendering of the wrapped data, computed on demand.
   * Keeping the object form lets {@link #getData(Tuple)} skip JSON parsing
   * when it is handed a {@code JSONTuple} directly; {@link #write(DataOutput)}
   * serializes the tuple as a regular tuple holding the JSON string.
   * Mutating operations, {@code readFields} and {@code compareTo} are not supported.
   */
  private static final class JSONTuple implements Tuple {
    private static final long serialVersionUID = 1L;

    private final TupleSummaryData data;

    public JSONTuple(TupleSummaryData data) {
      this.data = data;
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
      throw new UnsupportedOperationException();
    }

    /** Writes this tuple as a regular one-field tuple containing the JSON string. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
      Tuple serializable = TF.newTuple(json());
      serializable.write(dataOutput);
    }

    @Override
    public int compareTo(Object o) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void append(Object o) {
      throw new UnsupportedOperationException();
    }

    /**
     * @param i the field index; only 0 is valid
     * @return the JSON rendering of the wrapped summary
     * @throws ExecException if {@code i} is not 0
     */
    @Override
    public Object get(int i) throws ExecException {
      checkIndex(i);
      return json();
    }

    /** Renders the wrapped summary as (non pretty-printed) JSON. */
    private String json() {
      return SummaryData.toJSON(data);
    }

    /**
     * Rejects any field index other than 0, with the same error for every accessor.
     *
     * @throws ExecException if {@code i} is not 0
     */
    private static void checkIndex(int i) throws ExecException {
      if (i != 0) {
        throw new ExecException(SINGLE_FIELD_ERROR);
      }
    }

    /** @return a new mutable list holding the JSON string as its only element */
    @Override
    public List<Object> getAll() {
      List<Object> fields = new ArrayList<Object>(1);
      fields.add(json());
      return fields;
    }

    @Override
    public long getMemorySize() {
      // The real size is unknown: a fixed rough estimate is returned, on the
      // assumption that these tuples are small and few.
      return 100;
    }

    /**
     * @param i the field index; only 0 is valid
     * @return {@link DataType#CHARARRAY}
     * @throws ExecException if {@code i} is not 0
     */
    @Override
    public byte getType(int i) throws ExecException {
      checkIndex(i);
      return DataType.CHARARRAY;
    }

    /**
     * @param i the field index; only 0 is valid
     * @return always {@code false}
     * @throws ExecException if {@code i} is not 0
     */
    @Override
    public boolean isNull(int i) throws ExecException {
      checkIndex(i);
      return false;
    }

    @Override
    public void reference(Tuple t) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void set(int i, Object o) throws ExecException {
      throw new UnsupportedOperationException();
    }

    @Override
    public int size() {
      return 1;
    }

    /** @return the JSON string; the delimiter is unused since there is a single field */
    @Override
    public String toDelimitedString(String delim) throws ExecException {
      return json();
    }

    @Override
    public Iterator<Object> iterator() {
      return getAll().iterator();
    }
  }

  /**
   * @param signature the UDF signature scoping the properties
   * @return the UDFContext properties registered for {@link Summary} and the given signature
   */
  private static Properties getProperties(String signature) {
    return UDFContext.getUDFContext().getUDFProperties(Summary.class, new String[] { signature });
  }

  /**
   * Parses the input schema saved in the UDFContext by {@link #saveSchemaToUDFContext()}.
   *
   * @param signature the UDF signature the schema was saved under
   * @return the parsed schema
   * @throws RuntimeException wrapping the {@link ParserException} if the stored string can not be parsed
   */
  private static Schema getInputSchema(String signature) {
    String schemaString = getProperties(signature).getProperty(INPUT_SCHEMA_PROPERTY);
    try {
      return Utils.getSchemaFromString(schemaString);
    } catch (ParserException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Extracts the summary carried by a tuple.
   *
   * @param tuple either a {@link JSONTuple} or a tuple whose first field is the JSON string of a summary
   * @return the wrapped data for a {@link JSONTuple}, otherwise the data parsed from the JSON string
   * @throws ExecException if the first field can not be read
   */
  private static TupleSummaryData getData(Tuple tuple) throws ExecException {
    if (tuple instanceof JSONTuple) {
      // still in object form: no need to go through JSON
      return ((JSONTuple) tuple).data;
    }
    String json = (String) tuple.get(0);
    return SummaryData.fromJSON(json, TupleSummaryData.class);
  }

  /**
   * Merges the partial summaries contained in the bag held by the first field
   * of the input tuple. A null bag is treated like an empty one.
   *
   * @param t a tuple whose first field is a bag of tuples accepted by {@link #getData(Tuple)}
   * @return a new summary resulting from merging every element of the bag
   * @throws IOException if a field can not be read or a summary can not be merged
   */
  private static TupleSummaryData merge(Tuple t) throws IOException {
    TupleSummaryData summaryData = new TupleSummaryData();
    DataBag bag = (DataBag) t.get(0);
    if (bag != null) {
      for (Tuple tuple : bag) {
        summaryData.merge(getData(tuple));
      }
    }
    return summaryData;
  }

  /**
   * Summarizes the tuples contained in the bag held by the first field of the
   * input tuple. A null bag is treated like an empty one.
   *
   * @param schema the schema of the tuples in the bag
   * @param t a tuple whose first field is the bag of tuples to sum up
   * @return a new summary of every tuple of the bag
   * @throws ExecException if a field can not be read
   */
  private static TupleSummaryData sumUp(Schema schema, Tuple t) throws ExecException {
    TupleSummaryData summaryData = new TupleSummaryData();
    DataBag bag = (DataBag) t.get(0);
    if (bag != null) {
      for (Tuple tuple : bag) {
        summaryData.addTuple(schema, tuple);
      }
    }
    return summaryData;
  }

  /**
   * Non-algebraic evaluation: summarizes the whole bag in one pass.
   *
   * @param t a tuple whose first field is the bag of tuples to summarize
   * @return the summary as pretty-printed JSON
   */
  @Override
  public String exec(Tuple t) throws IOException {
    return SummaryData.toPrettyJSON(sumUp(inputSchema, t));
  }

  @Override
  public String getInitial() {
    return Initial.class.getName();
  }

  @Override
  public String getIntermed() {
    return Intermediate.class.getName();
  }

  @Override
  public String getFinal() {
    return Final.class.getName();
  }

  /**
   * Captures the schema of the tuples of the input bag and saves it to the
   * UDFContext if the signature is already known.
   *
   * @param input the schema of the UDF arguments: a single bag of tuples is expected
   * @throws RuntimeException with a usage message if the tuple schema can not be extracted or saved
   */
  @Override
  public void setInputSchema(Schema input) {
    try {
      // relation.bag.tuple
      this.inputSchema = input.getField(0).schema.getField(0).schema;
      saveSchemaToUDFContext();
    } catch (FrontendException e) {
      throw new RuntimeException(USAGE_ERROR_PREFIX + input, e);
    } catch (RuntimeException e) {
      // e.g. unexpected nesting leading to a null schema
      throw new RuntimeException(USAGE_ERROR_PREFIX + input, e);
    }
  }

  /**
   * Records the UDF signature and saves the input schema to the UDFContext if
   * the schema is already known.
   *
   * @param signature the signature identifying this UDF instance
   */
  @Override
  public void setUDFContextSignature(String signature) {
    this.signature = signature;
    saveSchemaToUDFContext();
  }

  /**
   * Stores the input schema in the UDFContext once both the signature and the
   * schema are available; does nothing otherwise. The first and last
   * characters of the schema string are dropped before it is stored
   * (presumably the enclosing braces of {@code Schema.toString()} - confirm
   * against the Pig version in use).
   */
  private void saveSchemaToUDFContext() {
    if (signature == null || inputSchema == null) {
      return;
    }
    String schemaString = inputSchema.toString();
    String withoutDelimiters = schemaString.substring(1, schemaString.length() - 1);
    getProperties(signature).setProperty(INPUT_SCHEMA_PROPERTY, withoutDelimiters);
  }
}