/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.channels.SocketChannel;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.io.NIOBufferList;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.io.NIOHttpConnection.DataSource;
import org.commoncrawl.io.NIOHttpConnection.Listener;
import org.commoncrawl.io.NIOHttpConnection.State;
import org.commoncrawl.util.CCStringUtils;
import com.google.common.collect.ImmutableSet;
/**
* A daemon process that monitors the crawler logs and transfers them to a remote location
* @author rana
*
*/
public class DataTransferAgent {
/** File-name prefix of the cache data files handled by this agent. */
public static final String CACHE_DATA_PREFIX = "cacheData-";
/** Logger shared by every static method and worker thread in this class. */
static final Log LOG = LogFactory.getLog(DataTransferAgent.class);
/** Matches a cache data file name; group 1 is the four characters that follow the "cacheData-" prefix. */
static Pattern regEx = Pattern.compile("^cacheData-(.{4}).*$");
/** Matches a name that has at least four characters; group 1 is the first four of them. */
static Pattern oobRegEx = Pattern.compile("^(.{4}).*$");
/** Matches a part file name; group 1 is the five characters that follow the "part-" prefix. */
static Pattern partFileRegEx = Pattern.compile("^part-(.{5}).*$");
/**
 * Derives the local log-file location that corresponds to an HDFS cache data file.
 * <p>
 * The file name must match {@code regEx}; the four characters captured after the
 * "cacheData-" prefix name a sub-directory of {@code baseDir}, which is created if needed.
 *
 * @param baseDir  root directory under which the per-prefix directories live
 * @param hdfsFile status of the HDFS file whose name is mapped
 * @return {@code baseDir/<prefix>/<file name>}, or {@code null} when the name does not match
 * @throws IOException declared for callers; nothing in the body raises it directly
 */
static File hdfsCacheFileToLogFileLocation(File baseDir,FileStatus hdfsFile)throws IOException {
    final String fileName = hdfsFile.getPath().getName();
    final Matcher nameMatcher = regEx.matcher(fileName);
    // guard clause: names that do not look like cache data files are reported and rejected
    if (!nameMatcher.matches()) {
        LOG.error("Failed to Match FileName"+hdfsFile.getPath());
        return null;
    }
    final File bucketDir = new File(baseDir, nameMatcher.group(1));
    bucketDir.mkdirs();
    return new File(bucketDir, fileName);
}
/**
 * Derives the local log-file location for a part file inside an out-of-order transfer directory.
 * <p>
 * The first four characters of the parent directory name select (and create) a sub-directory of
 * {@code baseDir}; the log file itself is named {@code <parent dir name>-<5 chars after "part-">}.
 *
 * @param baseDir  root directory under which the per-prefix directories live
 * @param hdfsFile path of the part file; its parent directory name is used as well
 * @return the log-file location, or {@code null} when either name fails to match its pattern
 * @throws IOException declared for callers; nothing in the body raises it directly
 */
static File outOfOrderFileToLogFileLocation(File baseDir,Path hdfsFile)throws IOException {
    final String parentName = hdfsFile.getParent().getName();
    final Matcher dirMatcher = oobRegEx.matcher(parentName);
    File location = null;
    if (dirMatcher.matches()) {
        // the prefix directory is created as soon as the parent name matches,
        // even when the part-file name turns out not to match below
        final File bucketDir = new File(baseDir, dirMatcher.group(1));
        bucketDir.mkdirs();
        final Matcher partMatcher = partFileRegEx.matcher(hdfsFile.getName());
        if (partMatcher.matches()) {
            location = new File(bucketDir, parentName + "-" + partMatcher.group(1));
        }
    }
    if (location == null) {
        LOG.error("Failed to Match FileName"+hdfsFile);
    }
    return location;
}
/**
 * A unit of work on the transfer queue: the HDFS file to upload, the local log file
 * associated with it, and the name the upload is sent under.
 */
public static class ProxyTransferItem {
    /** HDFS path of the file to upload. */
    Path hdfsFilePath;
    /** Local log file associated with this transfer. */
    File logFilePath;
    /** Name passed to the upload call for this file. */
    String uploadName;

    /** Creates an item with every field left {@code null}. */
    ProxyTransferItem() {
    }

    /**
     * Creates a fully populated item.
     *
     * @param hdfsPath    HDFS path of the file to upload
     * @param logFilePath local log file associated with the transfer
     * @param uploadName  name the upload is sent under
     */
    ProxyTransferItem(Path hdfsPath,File logFilePath,String uploadName) {
        hdfsFilePath = hdfsPath;
        this.logFilePath = logFilePath;
        this.uploadName = uploadName;
    }
}
/**
 * Tries to raise a socket's send or receive buffer to {@code targetSize}, halving the request
 * until the platform grants it or the request drops to {@code minSize}.
 * <p>
 * A request counts as granted when the size reported back is at least the requested size.
 * Some platforms round the request up (Linux, for example, reports double the value that was
 * set), so testing for exact equality would never succeed there and the loop would needlessly
 * shrink the buffer towards {@code minSize}.
 *
 * @param sendSize   {@code true} to tune the send buffer, {@code false} for the receive buffer
 * @param targetSize preferred buffer size in bytes
 * @param minSize    lower bound; sizes at or below this value are not attempted after a halving
 * @param channel    channel whose underlying socket is tuned
 * @throws IOException if reading or setting the socket option fails
 */
private static void probeAndSetSize(boolean sendSize,int targetSize,int minSize,SocketChannel channel)throws IOException {
    final int currentSize = sendSize
        ? channel.socket().getSendBufferSize()
        : channel.socket().getReceiveBufferSize();
    // nothing to do when the buffer is already large enough
    if (currentSize >= targetSize) {
        return;
    }
    do {
        final int sizeOut;
        if (sendSize) {
            channel.socket().setSendBufferSize(targetSize);
            sizeOut = channel.socket().getSendBufferSize();
        }
        else {
            channel.socket().setReceiveBufferSize(targetSize);
            sizeOut = channel.socket().getReceiveBufferSize();
        }
        // accept any size that satisfies the request; the OS may hand back more than was asked for
        if (sizeOut >= targetSize) {
            break;
        }
        targetSize >>= 1;
    }
    while (targetSize > minSize);
}
/** Pairs a chunk of file data with an id; used as the element type of the loader queue. */
static class BufferStruct {
    /** The data chunk. */
    ByteBuffer _buffer;
    /** Identifier assigned by whoever created the chunk. */
    int _id;

    /**
     * @param b  the data chunk to carry
     * @param id identifier to attach to the chunk
     */
    BufferStruct(ByteBuffer b,int id) {
        this._buffer = b;
        this._id = id;
    }
}
/** Associates a bridge server's internal name with its external name. */
static class CCBridgeServerMapping {
    /** Name used for the Host header and in log messages. */
    String _internalName;
    /** Name used to build the URL that is actually connected to. */
    String _externalName;

    /**
     * @param internalIP internal name or address of the server
     * @param externalIP external name or address of the server
     */
    public CCBridgeServerMapping(String internalIP,String externalIP) {
        this._internalName = internalIP;
        this._externalName = externalIP;
    }
}
/**
 * Uploads one HDFS file to a bridge server with an HTTP PUT and blocks until the connection
 * reaches the DONE or ERROR state.
 * <p>
 * The request is sent to port 8090 of {@code mapping._externalName}, while the Host header carries
 * {@code mapping._internalName}. A background loader thread reads the file in 40960-byte chunks
 * into a bounded queue (capacity 20); the connection's {@link DataSource} drains that queue from
 * its read callback, so reading from HDFS never happens on the callback thread.
 *
 * @param mapping      target server (external name for the URL, internal name for Host header and logs)
 * @param fs           file system used to stat and open {@code hdfsFilePath}
 * @param conf         not used by this method
 * @param hdfsFilePath file to upload
 * @param uploadName   value of the {@code src} query parameter of the PUT request
 * @param eventLoop    supplies the selector and resolver for the connection
 * @return the HTTP response code reported by the connection's response headers, or 500 if no
 *         connection object exists at the end
 * @throws IOException if the file cannot be stat'ed or opened, the URL is malformed, or the
 *         connection object cannot be created; failures after that point are masked by the
 *         {@code return} in the {@code finally} block (see the comment there)
 */
static int uploadSingeFile(CCBridgeServerMapping mapping,FileSystem fs,Configuration conf,Path hdfsFilePath,String uploadName,EventLoop eventLoop)throws IOException {
    final FileStatus fileStatus = fs.getFileStatus(hdfsFilePath);
    LOG.info("Uploading:" + uploadName +" size:" + fileStatus.getLen() + " to:" + mapping._internalName);
    {
        // construct url
        URL fePostURL = new URL("http://"+ mapping._externalName+":8090/");
        LOG.info("POST URL IS:" + fePostURL.toString());
        // open input stream
        final FSDataInputStream is = fs.open(hdfsFilePath);
        // released by the connection listener once the connection is DONE or in ERROR
        final Semaphore blockingSemaphore = new Semaphore(0);
        NIOHttpConnection connection = null;
        try {
            // create connection
            connection = new NIOHttpConnection(fePostURL,eventLoop.getSelector(),eventLoop.getResolver(),null);
            // set listener
            connection.setListener(new Listener() {
                @Override
                public void HttpConnectionStateChanged(NIOHttpConnection theConnection, State oldState, State state) {
                    LOG.info("Connection State Changed to:" + state.toString());
                    if (state == State.DONE || state == State.ERROR) {
                        // wake up the uploading thread waiting below
                        blockingSemaphore.release();
                    }
                }
                @Override
                public void HttpContentAvailable(NIOHttpConnection theConnection,
                    NIOBufferList contentBuffer) {
                    // response content is not used
                }
            }
            );
            // set headers
            connection.getRequestHeaders().reset();
            connection.getRequestHeaders().prepend("PUT /put?src="+uploadName+" HTTP/1.1",null);
            connection.getRequestHeaders().set("Host",mapping._internalName+":8090");
            connection.getRequestHeaders().set("Content-Length",Long.toString(fileStatus.getLen()));
            connection.getRequestHeaders().set("Connection", "keep-alive");
            connection.setPopulateDefaultHeaderItems(false);
            final LinkedBlockingDeque<BufferStruct> _loaderQueue = new LinkedBlockingDeque<BufferStruct>(20);
            final AtomicBoolean eof = new AtomicBoolean();
            // end-of-file marker; recognised by identity in the data source below
            final ByteBuffer sentinel = ByteBuffer.allocate(4096);
            sentinel.position(sentinel.position());
            final Thread loaderThread = new Thread(new Runnable() {
                int _id=0;
                @Override
                public void run() {
                    int bytesRead;
                    byte incomingBuffer[] = new byte[4096 * 10];
                    try {
                        while ((bytesRead = is.read(incomingBuffer)) != -1) {
                            ByteBuffer buffer = ByteBuffer.wrap(incomingBuffer, 0,bytesRead);
                            // position is moved to the end of the data before the buffer is queued
                            buffer.position(bytesRead);
                            try {
                                _loaderQueue.put(new BufferStruct(buffer,_id));
                            } catch (InterruptedException e) {
                                LOG.error(CCStringUtils.stringifyException(e));
                                // The interrupt is sent only after the connection has finished, so
                                // nobody drains the queue any more. Falling through to the blocking
                                // put of the sentinel on a full queue would hang this thread (and
                                // the join() in the uploading thread) forever, so stop right here.
                                return;
                            }
                            // the queued buffer wraps the old array, so a fresh one is needed
                            incomingBuffer = new byte[4096 * 10];
                        }
                        try {
                            _loaderQueue.put(new BufferStruct(sentinel,++_id));
                        } catch (InterruptedException e) {
                            // interrupted while signalling EOF: the upload is already over
                        }
                    }
                    catch (IOException e){
                        // NOTE(review): no sentinel is queued on a read failure, so the data source
                        // never reports EOF; completion then depends on the connection itself
                        // reaching DONE or ERROR - confirm that it times out.
                        LOG.error(CCStringUtils.stringifyException(e));
                        return;
                    }
                }
            });
            loaderThread.start();
            // set data source ...
            connection.setDataSource(new DataSource() {
                int bytesTransferred = 0;
                @Override
                public boolean read(NIOHttpConnection connection,NIOBufferList dataBuffer) throws IOException {
                    if (eof.get())
                        return true;
                    // non-blocking: an empty queue simply means "no data yet"
                    BufferStruct buffer = _loaderQueue.poll();
                    if (buffer != null) {
                        if (buffer._buffer != sentinel) {
                            if (buffer._id == 1) {
                                // placeholder kept from the original (debug logging was removed)
                            }
                            dataBuffer.write(buffer._buffer);
                            bytesTransferred += buffer._buffer.limit();
                            return false;
                        }
                        else {
                            // EOF condition
                            dataBuffer.write(sentinel);
                            eof.set(true);
                            return true;
                        }
                    }
                    return false;
                }
                @Override
                public void finsihedWriting(NIOHttpConnection sourceConnection,
                    ByteBuffer thisBuffer) throws IOException {
                    // nothing to do once a buffer has been written
                }
            });
            // open connection
            connection.open();
            // wait for connection to complete ...
            blockingSemaphore.acquireUninterruptibly();
            // kill loader thread
            loaderThread.interrupt();
            try {
                LOG.info("Waiting for Loader Thread");
                loaderThread.join();
                LOG.info("Done Waiting for Loader Thread");
            } catch (InterruptedException e) {
                // give up waiting and fall through to cleanup
            }
        }
        finally {
            is.close();
            if (connection != null) {
                connection.close();
                LOG.info("Response Code for File:" + uploadName + "to Host: " + mapping._internalName + " is:" + connection.getResponseHeaders().getHttpResponseCode());
                // Returning from finally discards any exception thrown in the try block. That is
                // kept on purpose: the caller re-queues an item on any response code it does not
                // recognise, whereas an IOException makes it drop the item.
                return connection.getResponseHeaders().getHttpResponseCode();
            }
        }
    }
    // something went wrong ???
    LOG.error("Failed to upload file:" + uploadName + " unknown response code");
    return 500;
}
/**
 * Starts a worker thread that takes items off {@code itemQueue} and uploads them to the server
 * described by {@code mapping}.
 * <p>
 * The worker stops when {@code shutdownFile} exists, when it dequeues an item whose
 * {@code hdfsFilePath} is {@code null}, or when it is interrupted. The queue is polled with a
 * timeout rather than waited on indefinitely, so the shutdown file is noticed even while the
 * queue is empty. Whatever the reason for stopping, one permit is released on
 * {@code shutdownSemaphore}.
 * <p>
 * An upload that returns 200 or 409 is considered finished; any other response code puts the item
 * back at the head of the queue. An {@link IOException} from the upload is logged and the item is
 * not re-queued.
 *
 * @param threadIndex       index used in log messages only
 * @param mapping           server this worker uploads to
 * @param shutdownFile      file whose existence requests shutdown
 * @param fs                file system the items are read from
 * @param conf              configuration handed through to the upload call
 * @param itemQueue         shared work queue
 * @param eventLoop         event loop handed through to the upload call
 * @param shutdownSemaphore receives one permit when the worker ends
 * @return the started thread
 */
static Thread startTransferThread(final int threadIndex,final CCBridgeServerMapping mapping,final File shutdownFile,final FileSystem fs,final Configuration conf,final LinkedBlockingDeque<ProxyTransferItem> itemQueue,final EventLoop eventLoop,final Semaphore shutdownSemaphore) {
    Thread thread = new Thread(new Runnable() {
        @Override
        public void run() {
            try {
                while (true) {
                    if (shutdownFile.exists()) {
                        LOG.info("Exiting due to shutdown file existense!");
                        break;
                    }
                    // Bounded wait: a plain take() would block forever on an empty queue and the
                    // shutdown file check above would never run again.
                    ProxyTransferItem item = itemQueue.poll(1, java.util.concurrent.TimeUnit.SECONDS);
                    if (item == null) {
                        continue;
                    }
                    if (item.hdfsFilePath == null) {
                        LOG.info("Transfer Thread:" + Thread.currentThread().getId() + " Exiting");
                        // an item without a path is the signal to stop this worker
                        break;
                    }
                    else {
                        try {
                            LOG.info("Transfer Thread:"+ threadIndex+" for Host:" + mapping._internalName + " Transferring File:" + item.hdfsFilePath);
                            int result = uploadSingeFile(mapping,fs,conf,item.hdfsFilePath,item.uploadName,eventLoop);
                            if (result == 200){
                                LOG.info("Transfer Thread:" + threadIndex + "for Host:" + mapping._internalName + " Done Transferring File:" + item.hdfsFilePath);
                            }
                            else if (result == 409) {
                                LOG.info("Transfer Thread:" + threadIndex + "for Host:" + mapping._internalName + " File Already Exists for Path:" + item.hdfsFilePath);
                            }
                            else {
                                LOG.error("Transfer Thread:" + threadIndex + "for Host:" + mapping._internalName + " File Transfer Failed with Error:" +result + " for Path:" + item.hdfsFilePath);
                                // retry: the item goes back to the head of the queue
                                itemQueue.putFirst(item);
                            }
                        } catch (IOException e) {
                            LOG.error("Transfer Failed for Thread:" + threadIndex+ "Host:" + mapping._internalName + " File: " + item.hdfsFilePath);
                            LOG.fatal(CCStringUtils.stringifyException(e));
                        }
                    }
                }
            } catch (InterruptedException e) {
                // interruption ends the worker; the permit is released below
            }
            finally {
                shutdownSemaphore.release();
            }
        }
    });
    thread.start();
    return thread;
}
/** Number of transfer threads that main() starts for each entry of {@code mappingsTable}. */
public static final int TRANSFER_THREADS_PER_HOST = 8;
/**
 * Servers that files are uploaded to. The only entry maps "0.0.0.0" to "0.0.0.0" -
 * NOTE(review): presumably a placeholder for real addresses; confirm before deploying.
 */
static ImmutableSet<CCBridgeServerMapping> mappingsTable = new ImmutableSet.Builder<CCBridgeServerMapping>()
.add(new CCBridgeServerMapping("0.0.0.0","0.0.0.0"))
.build();
/**
 * Entry point of the transfer daemon.
 * <p>
 * Starts the event loop, {@code TRANSFER_THREADS_PER_HOST} transfer threads per entry of
 * {@code mappingsTable}, and a scanner thread. Every 500 ms the scanner lists new
 * {@code cacheData-<id>} files under {@code crawl/proxy/cache/} and new {@code part-*} files under
 * {@code crawl/proxy/dtAgentOutOfOrderTransfers/<id>} and queues those whose local log file does
 * not exist yet. The method then waits for all transfer threads and the scanner thread to end.
 * <p>
 * The event loop is stopped in a {@code finally} block so that it is also shut down when file
 * system setup fails, instead of being left running.
 *
 * @param args not used
 */
public static void main(String[] args) {
    Logger logger = Logger.getLogger("org.commoncrawl");
    logger.setLevel(Level.INFO);
    BasicConfigurator.configure();
    Configuration conf = new Configuration();
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    // set a big io buffer size ...
    conf.setInt("io.file.buffer.size", 4096 * 1024);
    final File transferLogDir = new File("/home/rana/ccprod/data/proxy_xfr_log");
    final Path hdfsCacheDataPath = new Path("crawl/proxy/cache/");
    final File shutdownFile = new File("/home/rana/ccprod/data/shutdown_xfr");
    // create a deque ..
    final LinkedBlockingDeque<ProxyTransferItem> itemQueue = new LinkedBlockingDeque<ProxyTransferItem>();
    final EventLoop eventLoop = new EventLoop();
    eventLoop.start();
    try {
        final DistributedFileSystem fs = (DistributedFileSystem) FileSystem.get(conf);
        Thread transferThreads[] = new Thread[TRANSFER_THREADS_PER_HOST * mappingsTable.size()];
        Semaphore shutdownSemaphore = new Semaphore(0);
        int threadIndex = 0;
        for (int i=0;i<TRANSFER_THREADS_PER_HOST;++i){
            int serverIdx=0;
            for (CCBridgeServerMapping mapping : mappingsTable) {
                transferThreads[(i * mappingsTable.size()) + serverIdx++] = startTransferThread(threadIndex++,mapping,shutdownFile,fs,conf,itemQueue,eventLoop,shutdownSemaphore);
            }
        }
        Thread scannerThread = new Thread(new Runnable() {
            // highest cache file id seen so far; -1 until the first scan completes
            long _lastScanId = -1;
            // highest out-of-order directory id processed so far
            long _lastOutOfOrderDataDirId = -1L;
            static final int SCAN_INTERVAL_MS = 500;
            @Override
            public void run() {
                while (true) {
                    try {
                        if (shutdownFile.exists()) {
                            LOG.info("Shutdown File Detected in ScanTimer Outer Loop. Exiting Scan Thread");
                            return;
                        }
                        LOG.info("Scanning For Files based on filter. Last Known Scan Id is:" + _lastScanId);
                        FileStatus fileList[] = fs.listStatus(hdfsCacheDataPath, new PathFilter() {
                            @Override
                            public boolean accept(Path path) {
                                try {
                                    if (path.getName().startsWith("cacheData-")) {
                                        // extract file id ...
                                        long currentFileId = Long.parseLong(path.getName().substring("cacheData-".length()));
                                        // figure out if we are going to process it ...
                                        if (_lastScanId == -1 || currentFileId > _lastScanId) {
                                            return true;
                                        }
                                    }
                                }
                                catch (Exception e) {
                                    LOG.error("Caught Exception Processing Path Filter:" + CCStringUtils.stringifyException(e));
                                }
                                return false;
                            }
                        });
                        LOG.info("Scan returned:" + fileList.length + " Number of Valid Files");
                        long latestFileId = 0L;
                        for (FileStatus file : fileList) {
                            // extract file id ...
                            long currentFileId = Long.parseLong(file.getPath().getName().substring("cacheData-".length()));
                            // figure out if we are going to process it ...
                            if (_lastScanId == -1 || currentFileId > _lastScanId) {
                                // cache max latest id ..
                                latestFileId = Math.max(latestFileId, currentFileId);
                                File logFile = hdfsCacheFileToLogFileLocation(transferLogDir,file);
                                if (logFile != null) {
                                    if (logFile.exists()) {
                                        LOG.info("Skipping:" + file.getPath().getName());
                                    }
                                    else {
                                        LOG.info("Queueing File:" + file.getPath().getName());
                                        itemQueue.add(new ProxyTransferItem(file.getPath(),logFile,file.getPath().getName()));
                                    }
                                }
                            }
                        }
                        // ok update latest file id
                        _lastScanId = Math.max(_lastScanId,latestFileId);
                        FileStatus outofOrderDataDirs[] = fs.globStatus(new Path("crawl/proxy/dtAgentOutOfOrderTransfers/*"));
                        for (FileStatus outOfOrderDataDir : outofOrderDataDirs) {
                            // directory names are expected to be numeric ids
                            long dataDirId = Long.parseLong(outOfOrderDataDir.getPath().getName());
                            if (dataDirId > _lastOutOfOrderDataDirId) {
                                FileStatus candidates[] = fs.globStatus(new Path(outOfOrderDataDir.getPath(),"part-*"));
                                for (FileStatus candidate : candidates) {
                                    File logFile = outOfOrderFileToLogFileLocation(transferLogDir,candidate.getPath());
                                    if (logFile != null) {
                                        String candidateName =
                                            candidate.getPath().getParent().getName()
                                            + "-"
                                            + candidate.getPath().getName();
                                        if (logFile.exists()) {
                                            LOG.info("Skipping OOB FILE:" + candidateName);
                                        }
                                        else {
                                            LOG.info("Queueing OOB FILE:" + candidateName);
                                            itemQueue.add(new ProxyTransferItem(candidate.getPath(),logFile,candidateName));
                                        }
                                    }
                                }
                                _lastOutOfOrderDataDirId = dataDirId;
                            }
                        }
                        LOG.info("Finish Scan. Last Known Scan Id is now:" + _lastScanId);
                    }
                    catch (Exception e) {
                        // a failed scan is logged and retried on the next interval
                        LOG.error(CCStringUtils.stringifyException(e));
                    }
                    try {
                        Thread.sleep(SCAN_INTERVAL_MS);
                    } catch (InterruptedException e) {
                        // keep scanning; only the shutdown file ends this thread
                    }
                }
            }
        });
        // start scanner thread ...
        scannerThread.start();
        LOG.info("Waiting on Transfer Threads");
        shutdownSemaphore.acquireUninterruptibly(TRANSFER_THREADS_PER_HOST * mappingsTable.size());
        LOG.info("ALL Transfer Threads Dead.");
        // wait for scanner thread to die
        LOG.info("Waiting for Scanner Thread to Die.");
        try {
            scannerThread.join();
        } catch (InterruptedException e) {
            // stop waiting and proceed with shutdown
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    }
    finally {
        // always stop the event loop started above, including on setup failure
        LOG.info("Killing Event Loop");
        eventLoop.stop();
    }
}
}