/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.test.pigmix.mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapred.lib.IdentityMapper;

import org.apache.pig.test.pigmix.mapreduce.Library;

/**
 * PigMix query L10: order the page_views data set by query_term (ascending),
 * estimated_revenue (descending) and timespent (ascending), using a custom
 * composite key and a partitioner keyed on the first character of query_term.
 */
public class L10 {

    public static class MyType implements WritableComparable<MyType> {

        public String query_term;
        int timespent;
        double estimated_revenue;

        public MyType() {
            query_term = null;
            timespent = 0;
            estimated_revenue = 0.0;
        }

        public MyType(Text qt, Text ts, Text er) {
            query_term = qt.toString();
            try {
                timespent = Integer.valueOf(ts.toString());
            } catch (NumberFormatException nfe) {
                timespent = 0;
            }
            try {
                estimated_revenue = Double.valueOf(er.toString());
            } catch (NumberFormatException nfe) {
                estimated_revenue = 0.0;
            }
        }

        public void write(DataOutput out) throws IOException {
            out.writeInt(timespent);
            out.writeDouble(estimated_revenue);
            out.writeInt(query_term.length());
            out.writeBytes(query_term);
        }

        public void readFields(DataInput in) throws IOException {
            timespent = in.readInt();
            estimated_revenue = in.readDouble();
            int len = in.readInt();
            byte[] b = new byte[len];
            in.readFully(b);
            query_term = new String(b);
        }

        // Sort by query_term ascending, then estimated_revenue descending,
        // then timespent ascending.
        public int compareTo(MyType other) {
            int rc = query_term.compareTo(other.query_term);
            if (rc != 0) return rc;
            if (estimated_revenue < other.estimated_revenue) return 1;
            else if (estimated_revenue > other.estimated_revenue) return -1;
            if (timespent < other.timespent) return -1;
            else if (timespent > other.timespent) return 1;
            return 0;
        }
    }

    public static class ReadPageViews extends MapReduceBase
        implements Mapper<LongWritable, Text, MyType, Text> {

        public void map(
                LongWritable k,
                Text val,
                OutputCollector<MyType, Text> oc,
                Reporter reporter) throws IOException {
            // Split the line.  PigMix page_views fields are Ctrl-A (0x01)
            // separated; the separator literal was garbled in the source,
            // so '\u0001' is assumed here.
            List<Text> fields = Library.splitLine(val, '\u0001');
            if (fields.size() != 9) return;
            oc.collect(new MyType(fields.get(3), fields.get(2), fields.get(6)), val);
        }
    }

    public static class MyPartitioner implements Partitioner<MyType, Text> {

        public Map<Character, Integer> map;

        public int getPartition(MyType key, Text value, int numPartitions) {
            if (key == null || key.query_term == null || key.query_term.length() < 1) return 0;
            // Look up the partition by the first character of the query term.
            // Characters outside the table fall back to partition 0 rather
            // than raising a NullPointerException.
            Integer partition = map.get(key.query_term.charAt(0));
            return (partition == null) ? 0 : partition;
        }

        public void configure(JobConf conf) {
            // There is no configuration to read; build the first-character ->
            // partition table here because configure() is called every time a
            // MyPartitioner instance is set up.
            map = new HashMap<Character, Integer>(59);
            map.put('A', 1);  map.put('B', 1);
            map.put('C', 2);  map.put('D', 2);
            map.put('E', 3);  map.put('F', 3);
            map.put('G', 4);  map.put('H', 4);
            map.put('I', 5);  map.put('J', 5);
            map.put('K', 6);  map.put('L', 6);
            map.put('M', 7);  map.put('N', 7);
            map.put('O', 8);  map.put('P', 8);
            map.put('Q', 9);  map.put('R', 9);
            map.put('S', 10); map.put('T', 10);
            map.put('U', 11); map.put('V', 11);
            map.put('W', 12); map.put('X', 12);
            map.put('Y', 13); map.put('Z', 13);
            map.put('[', 14); map.put('\\', 14);
            map.put(']', 15); map.put('^', 15);
            map.put('_', 16); map.put('`', 16);
            map.put('a', 17); map.put('b', 17);
            map.put('c', 18); map.put('d', 18);
            map.put('e', 19);
            map.put('f', 20); map.put('g', 20);
            map.put('h', 21); map.put('i', 22); map.put('j', 23); map.put('k', 24);
            map.put('l', 25); map.put('m', 26); map.put('n', 27); map.put('o', 28);
            map.put('p', 29); map.put('q', 30); map.put('r', 31); map.put('s', 32);
            map.put('t', 33); map.put('u', 34); map.put('v', 35); map.put('w', 36);
            map.put('x', 37); map.put('y', 38); map.put('z', 39);
        }
    }

    public static class Group extends MapReduceBase
        implements Reducer<MyType, Text, MyType, Text> {

        public void reduce(
                MyType key,
                Iterator<Text> iter,
                OutputCollector<MyType, Text> oc,
                Reporter reporter) throws IOException {
            // Emit the records in sorted order; the key itself is not written.
            while (iter.hasNext()) {
                oc.collect(null, iter.next());
            }
        }
    }

    public static void main(String[] args) throws IOException {

        if (args.length != 3) {
            System.out.println("Parameters: inputDir outputDir parallel");
            System.exit(1);
        }
        String inputDir = args[0];
        String outputDir = args[1];
        String parallel = args[2];

        JobConf lp = new JobConf(L10.class);
        lp.setJobName("L10 Load Page Views");
        lp.setInputFormat(TextInputFormat.class);
        lp.setOutputKeyClass(MyType.class);
        lp.setOutputValueClass(Text.class);
        lp.setMapperClass(ReadPageViews.class);
        lp.setReducerClass(Group.class);
        lp.setPartitionerClass(MyPartitioner.class);
        Properties props = System.getProperties();
        for (Map.Entry<Object, Object> entry : props.entrySet()) {
            lp.set((String)entry.getKey(), (String)entry.getValue());
        }
        FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views"));
        FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/L10out"));
        // Hardcode the parallelism to 40 reducers, since MyPartitioner assumes
        // exactly 40 partitions; the parallel argument is not used.
        lp.setNumReduceTasks(40);
        Job group = new Job(lp);

        JobControl jc = new JobControl("L10 join");
        jc.addJob(group);

        new Thread(jc).start();

        int i = 0;
        while (!jc.allFinished()) {
            ArrayList<Job> failures = jc.getFailedJobs();
            if (failures != null && failures.size() > 0) {
                for (Job failure : failures) {
                    System.err.println(failure.getMessage());
                }
                break;
            }

            try {
                Thread.sleep(5000);
            } catch (InterruptedException e) {}

            if (i % 10000 == 0) {
                System.out.println("Running jobs");
                ArrayList<Job> running = jc.getRunningJobs();
                if (running != null && running.size() > 0) {
                    for (Job r : running) {
                        System.out.println(r.getJobName());
                    }
                }
                System.out.println("Ready jobs");
                ArrayList<Job> ready = jc.getReadyJobs();
                if (ready != null && ready.size() > 0) {
                    for (Job r : ready) {
                        System.out.println(r.getJobName());
                    }
                }
                System.out.println("Waiting jobs");
                ArrayList<Job> waiting = jc.getWaitingJobs();
                if (waiting != null && waiting.size() > 0) {
                    // Iterate the waiting list, not the ready list.
                    for (Job r : waiting) {
                        System.out.println(r.getJobName());
                    }
                }
                System.out.println("Successful jobs");
                ArrayList<Job> success = jc.getSuccessfulJobs();
                if (success != null && success.size() > 0) {
                    // Iterate the successful list, not the ready list.
                    for (Job r : success) {
                        System.out.println(r.getJobName());
                    }
                }
            }
            i++;
        }
        ArrayList<Job> failures = jc.getFailedJobs();
        if (failures != null && failures.size() > 0) {
            for (Job failure : failures) {
                System.err.println(failure.getMessage());
            }
        }
        jc.stop();
    }
}