/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
* NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package com.manning.hip.ch11;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.pig.*;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.*;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import java.io.IOException;
import java.util.*;
/**
 * ComplexTupleLoader is an example Pig LoadFunc that demonstrates
 * Pig's complex data types. Regardless of the input content, it emits
 * one tuple per input record containing a hard-coded chararray, a map,
 * and a bag of single-field tuples.
 * <p/>
 * Example use of LoadFunc:
 * <p/>
 * raw = LOAD 'input.txt' USING com.manning.hip.ch11.ComplexTupleLoader;
 * <p/>
 * describe raw;
 * raw: {ip: chararray,header: map[],keywords: {(word: chararray)}}
 */
public class ComplexTupleLoader extends FileInputLoadFunc
implements LoadMetadata {
protected RecordReader reader = null;
private TupleFactory tupleFactory = TupleFactory.getInstance();
private BagFactory bagFactory = BagFactory.getInstance();
@Override
public Tuple getNext() throws IOException {
try {
if(!reader.nextKeyValue()) {
return null;
}
} catch (InterruptedException e) {
int errCode = 6018;
String errMsg = "Error while reading input";
throw new ExecException(errMsg, errCode,
PigException.REMOTE_ENVIRONMENT, e);
} ArrayList<Object> tuple = new ArrayList<Object>();
tuple.add("127.0.0.1");
Map<String, Object> header = new HashMap<String, Object>();
header.put("User-Agent", "Mozilla");
tuple.add(header);
ArrayList<Tuple> bodyTuples = new ArrayList<Tuple>();
bodyTuples.add(newBodyTuple("keyword1"));
bodyTuples.add(newBodyTuple("keyword2"));
tuple.add(bagFactory.newDefaultBag(bodyTuples));
return tupleFactory.newTuple(tuple);
}
public Tuple newBodyTuple(String line) {
return tupleFactory.newTuple(Arrays.asList(line));
}
@Override
public void setLocation(String location, Job job)
throws IOException {
FileInputFormat.setInputPaths(job, location);
}
@SuppressWarnings("rawtypes")
@Override
public InputFormat getInputFormat() throws IOException {
return new TextInputFormat();
}
@Override
public void prepareToRead(
@SuppressWarnings("rawtypes") RecordReader reader,
PigSplit split)
throws IOException {
this.reader = reader;
}
public ResourceSchema getSchema(String location, Job job)
throws IOException {
Schema schema = new Schema();
Schema headerSchema = new Schema(
Arrays.asList(
new Schema.FieldSchema("word", DataType.CHARARRAY)
));
schema.add(new Schema.FieldSchema("ip", DataType.CHARARRAY));
schema.add(new Schema.FieldSchema("header", DataType.MAP));
schema.add(new Schema.FieldSchema("keywords", headerSchema));
return new ResourceSchema(schema);
}
public ResourceStatistics getStatistics(String location, Job job)
throws IOException {
return null;
}
public String[] getPartitionKeys(String location, Job job)
throws IOException {
return null;
}
public void setPartitionFilter(Expression partitionFilter)
throws IOException {
}
}