package org.commoncrawl.mapred.ec2.parser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.NetworkInterface;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.protocol.CrawlDBService;
import org.commoncrawl.protocol.LongQueryParam;
import org.commoncrawl.protocol.MapReduceTaskIdAndData;
import org.commoncrawl.protocol.SimpleByteResult;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.rpc.base.internal.AsyncContext;
import org.commoncrawl.rpc.base.internal.AsyncServerChannel;
import org.commoncrawl.rpc.base.internal.NullMessage;
import org.commoncrawl.rpc.base.internal.Server;
import org.commoncrawl.rpc.base.internal.AsyncRequest.Status;
import org.commoncrawl.rpc.base.shared.RPCException;
import org.commoncrawl.util.TaskDataUtils;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;

public class EC2TaskDataAwareTask extends Server implements CrawlDBService, Constants {

  public static final Log LOG = LogFactory.getLog(EC2TaskDataAwareTask.class);

  static final int TASK_DATA_PORT = 9200;

  private EventLoop _eventLoop = new EventLoop();
  private static InetAddress _serverAddress = null;

  public EC2TaskDataAwareTask(Configuration conf) throws IOException {
    // start the async event loop thread
    _eventLoop.start();
    // resolve our ip address ...
    _serverAddress = getMasterIPAddress("eth0");
    // set the address and port for the task data server (if available)
    if (_serverAddress == null) {
      throw new IOException("Unable to determine Master IP Address!");
    } else {
      LOG.info("Task Data IP is:" + _serverAddress.getHostAddress()
          + " and Port is:" + TASK_DATA_PORT);
      // ok, establish the rpc server channel ...
      InetSocketAddress taskDataServerAddress
          = new InetSocketAddress(_serverAddress.getHostAddress(), TASK_DATA_PORT);
      AsyncServerChannel channel
          = new AsyncServerChannel(this, _eventLoop, taskDataServerAddress, null);
      // register the task data service
      registerService(channel, CrawlDBService.spec);
      // and start processing rpc requests for it ...
      start();
    }
  }

  public void shutdown() throws IOException {
    _eventLoop.stop();
  }

  /**
   * get the ip address for the master on the given interface
   */
  protected static InetAddress getMasterIPAddress(String intfc) throws IOException {
    NetworkInterface netIF = NetworkInterface.getByName(intfc);
    if (netIF != null) {
      Enumeration<InetAddress> e = netIF.getInetAddresses();
      while (e.hasMoreElements()) {
        InetAddress address = e.nextElement();
        // only allow ipv4 addresses for now ...
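        // (a raw IPv4 address is 4 bytes; IPv6 addresses are 16 bytes, so the
        // length check below filters out any IPv6 entries on the interface)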
        if (address.getAddress().length == 4) {
          LOG.info("IP for Master on interface:" + intfc + " is:" + address.getHostAddress());
          return address;
        }
      }
    }
    return null;
  }

  protected static void initializeTaskDataAwareJob(JobConf jobConf, long segmentId) throws IOException {
    // initialize task data client info
    TaskDataUtils.initializeTaskDataJobConfig(jobConf, segmentId,
        new InetSocketAddress(_serverAddress, TASK_DATA_PORT));
  }

  protected static void finalizeJob(FileSystem fs, Configuration conf, JobConf jobConf,
      Path outputPath, long segmentId) throws IOException {
    writeSplitsManifest(fs, conf, jobConf, outputPath, segmentId);
    writeTrailingSplitsFile(fs, conf, jobConf, outputPath, segmentId);
    // purge the maps
    synchronized (_badTaskIdMap) {
      _badTaskIdMap.removeAll(Long.toString(segmentId));
    }
    synchronized (_goodTaskIdMap) {
      _goodTaskIdMap.removeAll(Long.toString(segmentId));
    }
  }

  protected static void writeTrailingSplitsFile(FileSystem fs, Configuration conf,
      JobConf jobConf, Path outputPath, long segmentTimestamp) throws IOException {
    ImmutableList.Builder<String> listBuilder = new ImmutableList.Builder<String>();
    // collect the entries from the bad splits map ...
    synchronized (_badTaskIdMap) {
      for (String badTaskEntry : _badTaskIdMap.get(Long.toString(segmentTimestamp))) {
        listBuilder.add(badTaskEntry);
      }
    }
    listToTextFile(listBuilder.build(), fs, new Path(outputPath, TRAILING_SPLITS_MANIFEST_FILE));
  }

  protected static void writeSplitsManifest(FileSystem fs, Configuration conf, JobConf jobConf,
      Path outputPath, long segmentTimestamp) throws IOException {
    // calculate splits ...
    InputSplit[] splits = jobConf.getInputFormat().getSplits(jobConf, jobConf.getNumMapTasks());
    LOG.info("Writing Splits Manifest for Segment: " + segmentTimestamp
        + " splitCount:" + splits.length);

    ImmutableList.Builder<String> allListBuilder = new ImmutableList.Builder<String>();
    ImmutableList.Builder<String> failedListBuilder = new ImmutableList.Builder<String>();

    // (taken from hadoop code to replicate split order and generate the proper
    // task-id-to-split mapping)
    // sort the splits into order based on size, so that the biggest go first
    Arrays.sort(splits, new Comparator<org.apache.hadoop.mapred.InputSplit>() {
      public int compare(org.apache.hadoop.mapred.InputSplit a,
                         org.apache.hadoop.mapred.InputSplit b) {
        try {
          long left = a.getLength();
          long right = b.getLength();
          if (left == right) {
            return 0;
          } else if (left < right) {
            return 1;
          } else {
            return -1;
          }
        } catch (IOException ie) {
          throw new RuntimeException("Problem getting input split size", ie);
        }
      }
    });

    String segmentIdStr = Long.toString(segmentTimestamp);
    int splitIndex = 0;
    for (InputSplit sortedSplit : splits) {
      allListBuilder.add(sortedSplit.toString());
      synchronized (_goodTaskIdMap) {
        // check to see if the task data "good task" map contains the specified split ...
        if (!_goodTaskIdMap.containsEntry(segmentIdStr, Integer.toString(splitIndex))) {
          // if not, add it to the failed list ...
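          // (each failed entry is written as "splitIndex,splitDescription",
          // preserving the task-index-to-split mapping in the manifest)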
          failedListBuilder.add(Integer.toString(splitIndex) + "," + sortedSplit.toString());
        }
      }
      ++splitIndex;
    }
    // emit ALL splits file
    listToTextFile(allListBuilder.build(), fs, new Path(outputPath, SPLITS_MANIFEST_FILE));
    // emit FAILED splits file (subset of all)
    listToTextFile(failedListBuilder.build(), fs, new Path(outputPath, FAILED_SPLITS_MANIFEST_FILE));
  }

  protected static List<String> textFileToList(FileSystem fs, Path path) throws IOException {
    ImmutableList.Builder<String> builder = new ImmutableList.Builder<String>();
    BufferedReader reader
        = new BufferedReader(new InputStreamReader(fs.open(path), Charset.forName("UTF-8")));
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        // skip blank lines and comment lines
        if (line.length() != 0 && !line.startsWith("#"))
          builder.add(line);
      }
    } finally {
      reader.close();
    }
    return builder.build();
  }

  protected static void listToTextFile(List<? extends Object> objects, FileSystem fs, Path path)
      throws IOException {
    Writer writer = new OutputStreamWriter(fs.create(path), Charset.forName("UTF-8"));
    try {
      for (Object obj : objects) {
        writer.write(obj.toString());
        writer.append("\n");
      }
      writer.flush();
    } finally {
      writer.close();
    }
  }

  /**
   * Task Data RPC Spec Implementation
   */

  // maps segment id to "taskId,dataValue" entries; the value comparator keys
  // off of the task id prefix (the text before the first comma) only
  static Multimap<String, String> _badTaskIdMap = TreeMultimap.create(
      String.CASE_INSENSITIVE_ORDER, new Comparator<String>() {
        @Override
        public int compare(String o1, String o2) {
          String tid1 = o1.substring(0, o1.indexOf(","));
          String tid2 = o2.substring(0, o2.indexOf(","));
          return tid1.compareToIgnoreCase(tid2);
        }
      });

  // maps segment id to the task ids that completed successfully
  static Multimap<String, String> _goodTaskIdMap = TreeMultimap.create(
      String.CASE_INSENSITIVE_ORDER, String.CASE_INSENSITIVE_ORDER);

  @Override
  public void updateMapReduceTaskValue(
      AsyncContext<MapReduceTaskIdAndData, NullMessage> rpcContext) throws RPCException {
    rpcContext.setStatus(Status.Error_RequestFailed);
    // one big hack ... if we get the "bad" task data key, add the job/task to the bad task id map
    if (rpcContext.getInput().getDataKey().equalsIgnoreCase(ParserMapper.BAD_TASK_TASKDATA_KEY)) {
      synchronized (_badTaskIdMap) {
        _badTaskIdMap.put(rpcContext.getInput().getJobId(),
            rpcContext.getInput().getTaskId() + "," + rpcContext.getInput().getDataValue());
      }
      rpcContext.setStatus(Status.Success);
    } else if (rpcContext.getInput().getDataKey().equalsIgnoreCase(ParserMapper.GOOD_TASK_TASKDATA_KEY)) {
      synchronized (_goodTaskIdMap) {
        _goodTaskIdMap.put(rpcContext.getInput().getJobId(), rpcContext.getInput().getTaskId());
      }
      rpcContext.setStatus(Status.Success);
    }
    rpcContext.completeRequest();
  }

  @Override
  public void queryMapReduceTaskValue(
      AsyncContext<MapReduceTaskIdAndData, MapReduceTaskIdAndData> rpcContext) throws RPCException {
    rpcContext.setStatus(Status.Error_RequestFailed);
    // similarly, if we are asked for the "bad" task data key, check to see if the
    // job/task is in the map, and if so, return 1
    if (rpcContext.getInput().getDataKey().equalsIgnoreCase(ParserMapper.BAD_TASK_TASKDATA_KEY)) {
      try {
        rpcContext.getOutput().merge(rpcContext.getInput());
      } catch (CloneNotSupportedException e) {
        // merging an identical record type should never fail
      }
      synchronized (_badTaskIdMap) {
        if (_badTaskIdMap.containsEntry(
            rpcContext.getInput().getJobId(),
            rpcContext.getInput().getTaskId() + ",")) {
          // hack ... caller doesn't need split info ...
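          // (the probe value "taskId," matches any stored "taskId,value" entry,
          // since the map's value comparator compares only the task id prefix
          // before the first comma)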
          rpcContext.getOutput().setDataValue("1");
        }
        rpcContext.setStatus(Status.Success);
      }
    }
    rpcContext.completeRequest();
  }

  @Override
  public void purgeMapReduceTaskValue(
      AsyncContext<MapReduceTaskIdAndData, NullMessage> rpcContext) throws RPCException {
    // NOOP - NOT SUPPORTED
    rpcContext.setStatus(Status.Error_RequestFailed);
    rpcContext.completeRequest();
  }

  @Override
  public void queryDuplicateStatus(
      AsyncContext<URLFPV2, SimpleByteResult> rpcContext) throws RPCException {
    // NOOP - NOT SUPPORTED
    rpcContext.setStatus(Status.Error_RequestFailed);
    rpcContext.completeRequest();
  }

  @Override
  public void queryLongValue(
      AsyncContext<LongQueryParam, LongQueryParam> rpcContext) throws RPCException {
    // NOOP - NOT SUPPORTED
    rpcContext.setStatus(Status.Error_RequestFailed);
    rpcContext.completeRequest();
  }

  @Override
  public void queryFingerprintStatus(
      AsyncContext<URLFPV2, SimpleByteResult> rpcContext) throws RPCException {
    // NOOP - NOT SUPPORTED
    rpcContext.setStatus(Status.Error_RequestFailed);
    rpcContext.completeRequest();
  }
}
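
/*
 * Minimal usage sketch (hypothetical, not part of the original source): a
 * driver in this package would typically construct the task data server
 * before launching the parse job, point the job's tasks at it, and write the
 * split manifests once the job completes. The segment id and output path
 * below are illustrative placeholders.
 *
 *   Configuration conf = new Configuration();
 *   FileSystem fs = FileSystem.get(conf);
 *   EC2TaskDataAwareTask taskDataServer = new EC2TaskDataAwareTask(conf);
 *
 *   JobConf jobConf = new JobConf(conf);
 *   long segmentId = System.currentTimeMillis();      // hypothetical segment id
 *   initializeTaskDataAwareJob(jobConf, segmentId);   // point tasks at the RPC server
 *   // ... configure input format, paths, mapper, and run the job ...
 *
 *   Path outputPath = new Path("/segments/" + segmentId); // hypothetical path
 *   finalizeJob(fs, conf, jobConf, outputPath, segmentId);
 *   taskDataServer.shutdown();
 */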