/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.parser.ec2;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.UUID;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.Tuples.Pair;

import com.google.api.client.http.GenericUrl;
import com.google.api.client.http.HttpRequest;
import com.google.api.client.http.HttpRequestFactory;
import com.google.api.client.http.HttpResponse;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.http.javanet.NetHttpTransport;
import com.google.common.collect.Lists;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.stream.JsonReader;
/**
 * EC2 worker node for the distributed parser.
 *
 * The node repeatedly "checks out" a crawl-log file from a master coordinator
 * over HTTP, streams the log's records out of S3 (via a Hadoop
 * {@link SequenceFile.Reader}), and periodically checkpoints its read position
 * back to the master: an intermediate "ping" while mid-file, a final "checkin"
 * once the file is exhausted. Each checkpoint also drops a marker file in S3.
 *
 * Lifecycle: the constructor spawns the worker thread; {@link #stop()} signals
 * shutdown and joins it.
 */
public class EC2ParserNode implements Runnable, Constants {

  public static final Log LOG = LogFactory.getLog(EC2ParserNode.class);

  Configuration _conf;
  Thread _thread;      // worker thread running run(); null once stopped
  FileSystem _fs;      // S3N filesystem rooted at the aws-publicdatasets bucket
  UUID _uuid;          // unique id for this node, sent with every master request
  String _masterHost;  // master coordinator host
  String _hostName;    // host name this node advertises to the master

  /**
   * Constructs the node, opens the S3N filesystem and starts the worker thread.
   *
   * @param hostName host name reported to the master in checkout/ping/checkin calls
   * @param conf     Hadoop configuration; NOTE: mutated — S3 credentials are injected
   * @throws IOException        if the S3N filesystem cannot be opened
   * @throws URISyntaxException if the (constant) filesystem URI is malformed
   */
  public EC2ParserNode(String hostName, Configuration conf) throws IOException, URISyntaxException {
    _conf = conf;
    // SECURITY(review): hard-coded AWS credentials committed to source control.
    // These must be considered compromised — rotate them and load from
    // configuration or an instance profile instead of embedding them here.
    _conf.set("fs.s3n.awsAccessKeyId", "079HD5ZAQSKEY542V7R2");
    _conf.set("fs.s3n.awsSecretAccessKey", "g4Ow3MSj77mqEw3uf4fZ22QPXuH991YP/rak8FJX");
    _fs = FileSystem.get(new URI("s3n://aws-publicdatasets/"), _conf);
    _uuid = UUID.randomUUID();
    _masterHost = "10.0.20.21"; // TODO(review): make the master host configurable
    _hostName = hostName;
    startThread();
  }

  /** Work item carrying the path of a crawl log to process. */
  private static class QueueItem {
    public QueueItem(Path path) {
      crawlLogPath = path;
    }
    Path crawlLogPath;
  }

  // NOTE(review): run() never consumes this queue — items added via
  // addToQueue() are currently dead weight; work arrives via checkoutFile().
  LinkedBlockingQueue<QueueItem> _queue = new LinkedBlockingQueue<QueueItem>();

  /** Spawns the worker thread executing {@link #run()}. */
  private void startThread() {
    _thread = new Thread(this);
    _thread.start();
  }

  /**
   * Signals the worker thread to shut down and waits for it to exit.
   * Safe to call when already stopped (no-op).
   */
  public void stop() {
    if (_thread != null) {
      try {
        LOG.info("Stopping Thread");
        // FIX: the shutdown flag was never set before, so run() looped forever
        // and the join() below hung indefinitely.
        _shutdownActive.set(true);
        // Null-path sentinel retained for compatibility; the queue is not yet
        // consumed by the worker loop.
        _queue.put(new QueueItem(null));
        LOG.info("Waiting for Thread to Die");
        _thread.join();
        LOG.info("Thread dead");
        _thread = null;
      } catch (InterruptedException e) {
        // FIX: restore the interrupt status instead of silently swallowing it.
        Thread.currentThread().interrupt();
      }
    }
  }

  /**
   * Enqueues a crawl-log path for processing.
   * NOTE(review): the worker loop does not currently drain this queue.
   *
   * @param path path of the crawl log in S3
   * @throws IOException declared for API compatibility; not thrown here
   */
  public void addToQueue(Path path) throws IOException {
    _queue.add(new QueueItem(path));
  }

  static final HttpTransport HTTP_TRANSPORT = new NetHttpTransport();

  /** Builds the master "checkout" URL identifying this node by host and uuid. */
  GenericUrl buildCheckoutURL() {
    GenericUrl url = new GenericUrl();
    url.setScheme("http");
    url.setHost(_masterHost);
    url.setPort(CrawlEnvironment.DEFAULT_EC2MASTER_HTTP_PORT);
    url.setPathParts(Lists.newArrayList("", "checkout"));
    url.put("host", _hostName);
    url.put("uuid", _uuid.toString());
    LOG.info(url.build());
    return url;
  }

  /**
   * Builds the master "ping" URL reporting intermediate progress.
   *
   * @param activeFile log file currently being processed
   * @param pos        current byte position within the file
   */
  GenericUrl buildPingURL(String activeFile, long pos) {
    GenericUrl url = new GenericUrl();
    url.setScheme("http");
    url.setHost(_masterHost);
    url.setPort(CrawlEnvironment.DEFAULT_EC2MASTER_HTTP_PORT);
    url.setPathParts(Lists.newArrayList("", "ping"));
    url.put("host", _hostName);
    url.put("uuid", _uuid.toString());
    url.put("activeFile", activeFile);
    url.put("pos", pos);
    LOG.info(url.build());
    return url;
  }

  /**
   * Builds the master "checkin" URL reporting final completion of a file.
   *
   * @param activeFile log file that was fully processed
   * @param pos        final byte position within the file
   */
  GenericUrl buildCheckInURL(String activeFile, long pos) {
    GenericUrl url = new GenericUrl();
    url.setScheme("http");
    url.setHost(_masterHost);
    url.setPort(CrawlEnvironment.DEFAULT_EC2MASTER_HTTP_PORT);
    url.setPathParts(Lists.newArrayList("", "checkin"));
    url.put("host", _hostName);
    url.put("uuid", _uuid.toString());
    url.put("activeFile", activeFile);
    url.put("pos", pos);
    LOG.info(url.build());
    return url;
  }

  AtomicBoolean _shutdownActive = new AtomicBoolean();
  HttpRequestFactory factory = HTTP_TRANSPORT.createRequestFactory();

  /**
   * Asks the master for a crawl log to process.
   *
   * @return (logName, lastPosition) pair on HTTP 200, or null when the master
   *         has no work for this node
   * @throws IOException on transport failure
   */
  private Pair<String, Long> checkoutFile() throws IOException {
    GenericUrl url = buildCheckoutURL();
    HttpRequest request = factory.buildGetRequest(url);
    HttpResponse response = request.execute();
    if (response.getStatusCode() == 200) {
      JsonParser parser = new JsonParser();
      JsonObject e = parser.parse(
          new JsonReader(new InputStreamReader(response.getContent(), StandardCharsets.UTF_8)))
          .getAsJsonObject();
      String logName = e.get("name").getAsString();
      long lastPos = e.get("lastPos").getAsLong();
      LOG.info("Got Name:" + logName + " Pos:" + lastPos);
      return new Pair<String, Long>(logName, lastPos);
    }
    return null;
  }

  /** Maps a log name to its source path under the crawl-log bucket root. */
  public static final Path buildCrawlLogPath(String logName) {
    return new Path("/" + CC_BUCKET_ROOT + CC_CRAWLLOG_SOURCE + logName);
  }

  /** Maps a (logName, timestamp, position) triple to its checkpoint marker path. */
  public static final Path buildCrawlLogCheckpointPath(String logName, long timestamp, long position) {
    return new Path("/" + CC_BUCKET_ROOT + CC_PARSER_INTERMEDIATE + logName
        + "_" + timestamp + "_" + position + DONE_SUFFIX);
  }

  // Minimum time between intermediate checkpoints (5 seconds).
  private static final int CHECKPOINT_INTERVAL = 1 * 5 * 1000;

  /**
   * Worker loop: check out a log, scan its records from the last position,
   * checkpoint periodically, and check in when done. Exits when
   * {@link #stop()} sets the shutdown flag.
   */
  @Override
  public void run() {
    while (!_shutdownActive.get()) {
      try {
        Pair<String, Long> checkoutInfo = checkoutFile();
        if (checkoutInfo == null) {
          // FIX: no work available — back off briefly instead of hammering
          // the master's HTTP endpoint in a tight loop.
          try {
            Thread.sleep(CHECKPOINT_INTERVAL);
          } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return;
          }
          continue;
        }
        Path logPath = buildCrawlLogPath(checkoutInfo.e0);
        LOG.info("Opening File At LogPath:" + logPath);
        SequenceFile.Reader reader = new SequenceFile.Reader(_fs, logPath, _conf);
        // FIX: seek moved inside try/finally — a failing seek previously
        // leaked the reader.
        try {
          long lastPos = checkoutInfo.e1;
          LOG.info("Seeking to Pos:" + lastPos);
          if (lastPos != 0) {
            reader.seek(lastPos);
          }
          Text key = new Text();
          CrawlURL urlData = new CrawlURL();
          long lastCheckpointTime = System.currentTimeMillis();
          while (reader.next(key, urlData)) {
            // Emit an intermediate checkpoint at most once per interval,
            // and only if we have actually advanced in the file.
            if (reader.getPosition() != lastPos
                && System.currentTimeMillis() - lastCheckpointTime >= CHECKPOINT_INTERVAL) {
              doCheckpoint(checkoutInfo.e0, lastCheckpointTime, reader.getPosition(), false);
              lastPos = reader.getPosition();
              lastCheckpointTime = System.currentTimeMillis();
            }
            LOG.info("Pos:" + reader.getPosition() + " Key:" + key.toString()
                + " ValueLen:" + urlData.getContentRaw().getCount());
          }
          // Final commit: check the file back in to the master.
          doCheckpoint(checkoutInfo.e0, lastCheckpointTime, reader.getPosition(), true);
        } finally {
          reader.close();
        }
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
      }
    }
  }

  /**
   * Writes a one-byte checkpoint marker file to S3, then notifies the master
   * via "ping" (intermediate) or "checkin" (final commit).
   *
   * @param logFileName   log file being checkpointed
   * @param timestamp     time of the previous checkpoint (used in the marker name)
   * @param position      current byte position in the log
   * @param isFinalCommit true to check the file in; false for a progress ping
   * @throws IOException on S3 or HTTP transport failure
   */
  private void doCheckpoint(String logFileName, long timestamp, long position, boolean isFinalCommit)
      throws IOException {
    Path checkpointFile = buildCrawlLogCheckpointPath(logFileName, timestamp, position);
    FSDataOutputStream outputStream = _fs.create(checkpointFile);
    try {
      outputStream.write(1);
      outputStream.flush();
    } finally {
      outputStream.close();
    }
    HttpRequest request = factory.buildGetRequest(
        (isFinalCommit) ? buildCheckInURL(logFileName, position) : buildPingURL(logFileName, position));
    HttpResponse response = request.execute();
    LOG.info("Checkpointing log:" + logFileName + " position:" + position);
    if (response.getStatusCode() == 200) {
      LOG.info("Checkpointing for log:" + logFileName + " position:" + position + " SUCCEEDED");
    } else {
      LOG.error("Checkpoint for log:"
          + logFileName
          + " position:"
          + position
          + " FAILED WITH ERROR: "
          + response.getStatusCode()
          + " "
          + response.getStatusMessage());
    }
  }

  /** Ad-hoc smoke test entry point: start a node, enqueue one path, stop. */
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    try {
      EC2ParserNode parser = new EC2ParserNode("test-host", conf);
      parser.addToQueue(new Path("/common-crawl/crawl-intermediate/CrawlLog_ccc01-01_1328300149459"));
      parser.stop();
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    } catch (URISyntaxException e) {
      // FIX: use the file's logging convention instead of printStackTrace().
      LOG.error(CCStringUtils.stringifyException(e));
    }
  }
}