/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.util.concurrent.Semaphore;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.protocol.CrawlDBService;
import org.commoncrawl.protocol.MapReduceTaskIdAndData;
import org.commoncrawl.protocol.SimpleByteResult;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.rpc.base.internal.AsyncClientChannel;
import org.commoncrawl.rpc.base.internal.AsyncRequest;
import org.commoncrawl.rpc.base.internal.NullMessage;
/**
*
* @author rana
*
*/
public class TaskDataUtils {
private static final Log LOG = LogFactory.getLog(TaskDataUtils.class);
/**
 * Stores the task data service connection info and job id in the job config.
 * Must be called BEFORE the job is submitted to the JobTracker.
 *
 * @param jobconf        job configuration to populate
 * @param jobId          job id, written under "taskdata.service.jobid"
 * @param connectionInfo service endpoint, written as "hostAddress:port" under
 *                       "taskdata.service.connectionInfo"
 */
public static final void initializeTaskDataJobConfig(JobConf jobconf,long jobId,InetSocketAddress connectionInfo) {
  final String hostAddress = connectionInfo.getAddress().getHostAddress();
  final int port = connectionInfo.getPort();
  jobconf.set("taskdata.service.connectionInfo", hostAddress + ":" + port);
  jobconf.set("taskdata.service.jobid", String.valueOf(jobId));
}
/**
 * Reads the task data job id from the job config.
 *
 * @param jobconf job configuration to read "taskdata.service.jobid" from
 * @return the parsed job id; the lookup is made with a default of "0"
 */
public static final long getTaskDataJobIdFromJobConfig(JobConf jobconf) {
  final String jobIdText = jobconf.get("taskdata.service.jobid", "0");
  return Long.parseLong(jobIdText);
}
/**
 * Used by the Map/Reduce task to construct a task data client from its job config.
 *
 * @param jobconf job configuration handed to the {@link TaskDataClient} constructor
 * @return a newly constructed client
 * @throws IOException if the client constructor fails
 */
public static final TaskDataClient getTaskDataClientForTask(JobConf jobconf) throws IOException {
  final TaskDataClient client = new TaskDataClient(jobconf);
  return client;
}
public static class TaskDataClient implements AsyncClientChannel.ConnectionCallback {
// event loop created and started by the constructor; stopped by shutdown()
EventLoop _eventLoop = null;
// channel to the task data server; closed and set to null by shutdown()
AsyncClientChannel _channel;
// stub constructed over _channel; every request in this class is issued through it
CrawlDBService.AsyncStub _asyncStub;
// semaphore the calling thread waits on; released by request and connection callbacks
Semaphore _blockingCallSemaphore = null;
// failure recorded by a request callback; rethrown by the blocking method once it wakes up
IOException _lastIOException = null;
// value of "taskdata.service.connectionInfo" read from the job config
private String _connectionData;
// value of "taskdata.service.jobid" read from the job config
private String _jobId;
// numeric task id derived from the "mapred.task.id" attempt id
private String _taskId;
/** internal constructor used to create a new taskdataclient and establish a connection to the
* task data server **/
TaskDataClient(JobConf jobConf) throws IOException {
_connectionData = jobConf.get("taskdata.service.connectionInfo");
_jobId = jobConf.get("taskdata.service.jobid");
if (_connectionData == null || _jobId == null) {
throw new IOException("No TaskData Server Information in JobConfig!");
}
// extract task id ...
TaskAttemptID attemptId = TaskAttemptID.forName(jobConf.get("mapred.task.id"));
_taskId = Integer.toString(attemptId.getTaskID().getId());
// start event loop ...
_eventLoop = new EventLoop();
_eventLoop.start();
// parse connection sting
String connectionParts[] = _connectionData.split(":");
if (connectionParts.length != 2) {
throw new IOException("Invalid Connection String!");
}
else {
// construct socket address
InetSocketAddress endPoint = new InetSocketAddress(InetAddress.getByName(connectionParts[0]),Integer.parseInt(connectionParts[1]));
LOG.info("Connecting to server at:" + endPoint);
_channel = new AsyncClientChannel(_eventLoop,null,endPoint,this);
_blockingCallSemaphore = new Semaphore(0);
_channel.open();
_asyncStub = new CrawlDBService.AsyncStub(_channel);
LOG.info("Waiting on Connect... ");
_blockingCallSemaphore.acquireUninterruptibly();
LOG.info("Connect Semaphore Released... ");
_blockingCallSemaphore = null;
if (!_channel.isOpen()) {
throw new IOException("Connection Failed!");
}
}
}
/**
 * Used by the map/reduce task to set a task data value by key.
 * Issues the request through the async stub and blocks the calling thread until the blocking
 * semaphore is released.
 *
 * @param key   data key to set
 * @param value data value to store under the key
 * @throws IOException if the request completed with a non-success status
 */
public void updateTaskData(String key,String value) throws IOException {
  // reset per-call state
  _blockingCallSemaphore = new Semaphore(0);
  _lastIOException = null;

  // assemble the request payload
  final MapReduceTaskIdAndData payload = new MapReduceTaskIdAndData();
  payload.setJobId(_jobId);
  payload.setTaskId(_taskId);
  payload.setDataKey(key);
  payload.setDataValue(value);

  final AsyncRequest.Callback<MapReduceTaskIdAndData, NullMessage> onComplete =
      new AsyncRequest.Callback<MapReduceTaskIdAndData, NullMessage>() {
        @Override
        public void requestComplete(AsyncRequest<MapReduceTaskIdAndData, NullMessage> request) {
          final boolean succeeded = (request.getStatus() == AsyncRequest.Status.Success);
          if (!succeeded) {
            LOG.error("updateMapReduceTaskValue Request Failed");
            _lastIOException = new IOException("updateMapReduceTaskValue Request Failed");
          }
          // wake up the thread blocked below
          _blockingCallSemaphore.release();
        }
      };

  _asyncStub.updateMapReduceTaskValue(payload, onComplete);

  // park until the async request completes ...
  _blockingCallSemaphore.acquireUninterruptibly();

  // surface a failure recorded by the callback
  final IOException failure = _lastIOException;
  if (failure != null) {
    throw failure;
  }
}
/**
 * Used by the map/reduce task to query for a task data value by key.
 * Issues the request through the async stub and blocks the calling thread until the blocking
 * semaphore is released.
 *
 * @param key data key to look up
 * @return the data value copied from the response, or null when the response did not carry one
 * @throws IOException if the request completed with a non-success status
 */
public String queryTaskData(String key) throws IOException {
  // reset per-call state
  _blockingCallSemaphore = new Semaphore(0);
  _lastIOException = null;

  // request object doubles as the holder for the returned value
  final MapReduceTaskIdAndData query = new MapReduceTaskIdAndData();
  query.setJobId(_jobId);
  query.setTaskId(_taskId);
  query.setDataKey(key);

  final AsyncRequest.Callback<MapReduceTaskIdAndData, MapReduceTaskIdAndData> onComplete =
      new AsyncRequest.Callback<MapReduceTaskIdAndData, MapReduceTaskIdAndData>() {
        @Override
        public void requestComplete(AsyncRequest<MapReduceTaskIdAndData, MapReduceTaskIdAndData> request) {
          if (request.getStatus() == AsyncRequest.Status.Success) {
            // copy the value across only when the response actually set it
            if (request.getOutput().isFieldDirty(MapReduceTaskIdAndData.Field_DATAVALUE)) {
              query.setDataValue(request.getOutput().getDataValue());
            }
          }
          else {
            LOG.error("queryTaskData Request Failed");
            _lastIOException = new IOException("queryTaskData Request Failed");
          }
          // wake up the thread blocked below
          _blockingCallSemaphore.release();
        }
      };

  _asyncStub.queryMapReduceTaskValue(query, onComplete);

  // park until the async request completes ...
  _blockingCallSemaphore.acquireUninterruptibly();

  // surface a failure recorded by the callback
  final IOException failure = _lastIOException;
  if (failure != null) {
    throw failure;
  }
  return query.isFieldDirty(MapReduceTaskIdAndData.Field_DATAVALUE) ? query.getDataValue() : null;
}
/**
 * Used by the map/reduce task to query for duplicate exclusion status.
 * Issues the request through the async stub and blocks the calling thread until the blocking
 * semaphore is released.
 *
 * @param key URL fingerprint to query
 * @return the byte result carried by the response (value semantics are defined by the service)
 * @throws IOException if the request completed with a non-success status
 */
public int queryDuplicateStatus(final URLFPV2 key) throws IOException {
  _blockingCallSemaphore = new Semaphore(0);
  _lastIOException = null;
  final SimpleByteResult result = new SimpleByteResult();
  _asyncStub.queryDuplicateStatus(key, new AsyncRequest.Callback<URLFPV2, SimpleByteResult> () {
    @Override
    public void requestComplete(AsyncRequest<URLFPV2, SimpleByteResult> request) {
      if (request.getStatus() != AsyncRequest.Status.Success) {
        // name the request that actually failed (message used to say "queryTaskData")
        LOG.error("queryDuplicateStatus Request Failed");
        _lastIOException = new IOException("queryDuplicateStatus Request Failed");
      }
      else {
        result.setByteResult(request.getOutput().getByteResult());
      }
      //release blocking semaphore
      _blockingCallSemaphore.release();
    }
  });
  //wait for async request to complete ...
  _blockingCallSemaphore.acquireUninterruptibly();
  // if the callback recorded a failure ... throw the exception now
  if (_lastIOException != null) {
    throw _lastIOException;
  }
  return result.getByteResult();
}
/**
 * Used by the map/reduce task to query for fingerprint status.
 * Issues the request through the async stub and blocks the calling thread until the blocking
 * semaphore is released.
 *
 * @param key URL fingerprint to query
 * @return the byte result carried by the response (value semantics are defined by the service)
 * @throws IOException if the request completed with a non-success status
 */
public int queryFingerprintStatus(final URLFPV2 key) throws IOException {
  _blockingCallSemaphore = new Semaphore(0);
  _lastIOException = null;
  final SimpleByteResult result = new SimpleByteResult();
  _asyncStub.queryFingerprintStatus(key, new AsyncRequest.Callback<URLFPV2, SimpleByteResult> () {
    @Override
    public void requestComplete(AsyncRequest<URLFPV2, SimpleByteResult> request) {
      if (request.getStatus() != AsyncRequest.Status.Success) {
        // name the request that actually failed (message used to say "queryTaskData")
        LOG.error("queryFingerprintStatus Request Failed");
        _lastIOException = new IOException("queryFingerprintStatus Request Failed");
      }
      else {
        result.setByteResult(request.getOutput().getByteResult());
      }
      //release blocking semaphore
      _blockingCallSemaphore.release();
    }
  });
  //wait for async request to complete ...
  _blockingCallSemaphore.acquireUninterruptibly();
  // if the callback recorded a failure ... throw the exception now
  if (_lastIOException != null) {
    throw _lastIOException;
  }
  return result.getByteResult();
}
/**
 * Used by the map/reduce task to purge all data related to the task.
 * Issues the request through the async stub and blocks the calling thread until the blocking
 * semaphore is released.
 *
 * @throws IOException if the request completed with a non-success status
 */
public void purgeTaskData() throws IOException {
  // reset per-call state
  _blockingCallSemaphore = new Semaphore(0);
  _lastIOException = null;

  // the purge request only identifies the job and task
  final MapReduceTaskIdAndData target = new MapReduceTaskIdAndData();
  target.setJobId(_jobId);
  target.setTaskId(_taskId);

  final AsyncRequest.Callback<MapReduceTaskIdAndData, NullMessage> onComplete =
      new AsyncRequest.Callback<MapReduceTaskIdAndData, NullMessage>() {
        @Override
        public void requestComplete(AsyncRequest<MapReduceTaskIdAndData, NullMessage> request) {
          final boolean succeeded = (request.getStatus() == AsyncRequest.Status.Success);
          if (!succeeded) {
            LOG.error("purgeMapReduceTaskValue Request Failed");
            _lastIOException = new IOException("purgeMapReduceTaskValue Request Failed");
          }
          // wake up the thread blocked below
          _blockingCallSemaphore.release();
        }
      };

  _asyncStub.purgeMapReduceTaskValue(target, onComplete);

  // park until the async request completes ...
  _blockingCallSemaphore.acquireUninterruptibly();

  // surface a failure recorded by the callback
  final IOException failure = _lastIOException;
  if (failure != null) {
    throw failure;
  }
}
/**
 * Closes the channel and stops the event loop.
 * Does nothing when the channel has already been released by an earlier call.
 * A failure while closing the channel is logged rather than propagated, and the event loop is
 * stopped regardless of how the close attempt ends.
 */
public void shutdown() {
  if (_channel != null) {
    try {
      _channel.close();
    } catch (IOException e) {
      // best-effort close: report through the class logger instead of stderr
      LOG.error("Failed to close TaskData channel during shutdown", e);
    } finally {
      _channel = null;
      _eventLoop.stop();
    }
  }
}
/**
 * {@link AsyncClientChannel.ConnectionCallback} notification that the channel connected.
 * Releases the blocking semaphore, if one is currently set, so a waiting thread can proceed.
 *
 * @param channel the channel reporting the event (not used)
 */
@Override
public void OutgoingChannelConnected(AsyncClientChannel channel) {
  LOG.info("OutgoingChannelConnected... ");
  // read the field once: the constructor sets it to null from its own thread after waking up,
  // so a check-then-use on the field itself could dereference null
  final Semaphore waiter = _blockingCallSemaphore;
  if (waiter != null) {
    waiter.release();
  }
}
/**
 * {@link AsyncClientChannel.ConnectionCallback} notification that the channel disconnected.
 * Releases the blocking semaphore, if one is currently set, so a waiting thread can proceed
 * and inspect the channel state.
 *
 * @param channel the channel reporting the event (not used)
 * @return always true. NOTE(review): the meaning of the return value is defined by
 *         ConnectionCallback and is not visible here - confirm.
 */
@Override
public boolean OutgoingChannelDisconnected(AsyncClientChannel channel) {
  LOG.info("OutgoingChannelDisconnected... ");
  // read the field once: the constructor sets it to null from its own thread after waking up,
  // so a check-then-use on the field itself could dereference null
  final Semaphore waiter = _blockingCallSemaphore;
  if (waiter != null) {
    waiter.release();
  }
  return true;
}
}
}