/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPackage;
import org.apache.pig.impl.io.NullableTuple;
import org.apache.pig.impl.io.PigNullableWritable;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.pen.FakeRawKeyValueIterator;

/**
 * Shim that supplies an illustrator-aware {@code Reduce} implementation on top
 * of {@link PigGenericMapReduce}. The inner {@link Reduce.IllustratorContext}
 * simulates the shuffle phase in-memory so the illustrator can drive a reducer
 * without a real MapReduce job.
 */
public class PigMapReduce extends PigGenericMapReduce {

    public static class Reduce extends PigGenericMapReduce.Reduce {

        /**
         * Get reducer's illustrator context.
         *
         * @param job   the job whose configuration supplies the sort/grouping comparators
         * @param input Input buffer as output by maps
         * @param pkg   package
         * @return reducer's illustrator context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        public Context getIllustratorContext(Job job,
                List<Pair<PigNullableWritable, Writable>> input, POPackage pkg)
                throws IOException, InterruptedException {
            return new IllustratorContext(job, input, pkg);
        }

        /**
         * A fake {@code Reducer.Context} that replays a pre-collected list of
         * map outputs: the constructor sorts them with the job's sort
         * comparator, and {@link #nextKey()} groups adjacent entries with the
         * job's grouping comparator, mimicking the shuffle/merge phase.
         */
        @SuppressWarnings("unchecked")
        public class IllustratorContext extends Context {
            private PigNullableWritable currentKey = null, nextKey = null;
            private NullableTuple nextValue = null;
            private List<NullableTuple> currentValues = null;
            private Iterator<Pair<PigNullableWritable, Writable>> it;
            // Shared scratch buffer used to serialize keys for raw comparison.
            private final ByteArrayOutputStream bos;
            private final DataOutputStream dos;
            private final RawComparator sortComparator, groupingComparator;
            POPackage pack = null;

            public IllustratorContext(Job job,
                    List<Pair<PigNullableWritable, Writable>> input,
                    POPackage pkg) throws IOException, InterruptedException {
                // Pass a FakeRawKeyValueIterator so the base Context believes
                // input exists iff the buffered list is non-empty.
                super(job.getJobConf(), new TaskAttemptID(),
                        new FakeRawKeyValueIterator(input.iterator().hasNext()),
                        null, null, null, null, new IllustrateDummyReporter(),
                        null, PigNullableWritable.class, NullableTuple.class);
                bos = new ByteArrayOutputStream();
                dos = new DataOutputStream(bos);
                org.apache.hadoop.mapreduce.Job nwJob =
                        new org.apache.hadoop.mapreduce.Job(job.getJobConf());
                sortComparator = nwJob.getSortComparator();
                groupingComparator = nwJob.getGroupingComparator();

                // Sort the buffered map output the way the shuffle would:
                // serialize both keys into the shared buffer, then compare the
                // raw bytes with the job's sort comparator.
                Collections.sort(input,
                        new Comparator<Pair<PigNullableWritable, Writable>>() {
                            @Override
                            public int compare(Pair<PigNullableWritable, Writable> o1,
                                    Pair<PigNullableWritable, Writable> o2) {
                                try {
                                    o1.first.write(dos);
                                    int l1 = bos.size();
                                    o2.first.write(dos);
                                    int l2 = bos.size();
                                    byte[] bytes = bos.toByteArray();
                                    return sortComparator.compare(bytes, 0, l1,
                                            bytes, l1, l2 - l1);
                                } catch (IOException e) {
                                    // Chain the cause so the original stack
                                    // trace is not lost.
                                    throw new RuntimeException(
                                            "Serialization exception in sort:"
                                                    + e.getMessage(), e);
                                } finally {
                                    // Always reset, even if write() threw
                                    // part-way, so the next comparison does
                                    // not see stale bytes.
                                    bos.reset();
                                }
                            }
                        });

                currentValues = new ArrayList<NullableTuple>();
                it = input.iterator();
                if (it.hasNext()) {
                    Pair<PigNullableWritable, Writable> entry = it.next();
                    nextKey = entry.first;
                    nextValue = (NullableTuple) entry.second;
                }
                pack = pkg;
            }

            @Override
            public PigNullableWritable getCurrentKey() {
                return currentKey;
            }

            /**
             * Advance to the next distinct key group. Collects into
             * {@code currentValues} every buffered entry whose key compares
             * equal to the group's key under the grouping comparator.
             *
             * @return true if a group was loaded, false when input is exhausted
             */
            @Override
            public boolean nextKey() {
                if (nextKey == null)
                    return false;
                currentKey = nextKey;
                currentValues.clear();
                currentValues.add(nextValue);
                nextKey = null;
                for (; it.hasNext();) {
                    Pair<PigNullableWritable, Writable> entry = it.next();
                    // NOTE: an earlier raw-byte comparison (serializing both
                    // keys and using groupingComparator.compare(byte[],...))
                    // was tried and left disabled; object comparison is used
                    // instead.
                    if (groupingComparator.compare(currentKey, entry.first) == 0) {
                        currentValues.add((NullableTuple) entry.second);
                    } else {
                        nextKey = entry.first;
                        nextValue = (NullableTuple) entry.second;
                        break;
                    }
                }
                return true;
            }

            @Override
            public Iterable<NullableTuple> getValues() {
                return currentValues;
            }

            // Output is discarded: the illustrator only observes the groups
            // handed to the reducer, not what it writes.
            @Override
            public void write(PigNullableWritable k, Writable t) {
            }

            @Override
            public void progress() {
            }
        }

        /** @return true when the given context is an illustrator context. */
        @Override
        public boolean inIllustrator(
                org.apache.hadoop.mapreduce.Reducer.Context context) {
            return (context instanceof PigMapReduce.Reduce.IllustratorContext);
        }

        /** @return the POPackage stashed in the illustrator context. */
        @Override
        public POPackage getPack(
                org.apache.hadoop.mapreduce.Reducer.Context context) {
            return ((PigMapReduce.Reduce.IllustratorContext) context).pack;
        }
    }
}