/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.parser.ec2;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.util.CCStringUtils;
import org.iq80.leveldb.DB;
import com.amazonaws.*;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.google.common.base.Predicates;
import com.google.common.collect.Iterables;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.google.gson.JsonObject;
/**
*
* @author rana
*
*/
public class EC2ParserMaster extends CommonCrawlServer implements Constants {
// Name of the LevelDB database used to persist parse entries.
public static final String ENTRY_DB = "parse_entry_db";
// NOTE(review): AWS credentials are hard-coded as empty strings — presumably
// filled in before deployment; confirm, and consider loading from
// configuration rather than source.
private static final String s3AccessKeyId = "";
private static final String s3SecretKey = "";
private static final Log LOG = LogFactory.getLog(EC2ParserMaster.class);
// LevelDB handle for the entry database (declared but never opened/used in this file).
private DB entryDB;
@Override
protected String getDefaultDataDir() {
  // Use the crawl-wide default data directory.
  final String dataDir = CrawlEnvironment.DEFAULT_DATA_DIR;
  return dataDir;
}
@Override
protected String getDefaultHttpInterface() {
  // NOTE(review): hard-coded private-network address — presumably the EC2
  // master's fixed internal IP; confirm before reusing in other environments.
  final String httpInterface = "10.0.20.21";
  return httpInterface;
}
@Override
protected int getDefaultHttpPort() {
  // HTTP port comes from the shared crawl environment constants.
  final int httpPort = CrawlEnvironment.DEFAULT_EC2MASTER_HTTP_PORT;
  return httpPort;
}
@Override
protected String getDefaultLogFileName() {
// NOTE(review): "historyserver.log" looks copy-pasted from the history server;
// confirm whether the parser master should log to its own file instead.
return "historyserver.log";
}
@Override
protected String getDefaultRPCInterface() {
  // RPC interface comes from the shared crawl environment constants.
  final String rpcInterface = CrawlEnvironment.DEFAULT_RPC_INTERFACE;
  return rpcInterface;
}
@Override
protected int getDefaultRPCPort() {
  // RPC port comes from the shared crawl environment constants.
  final int rpcPort = CrawlEnvironment.DEFAULT_EC2MASTER_RPC_PORT;
  return rpcPort;
}
@Override
protected String getWebAppName() {
  // Webapp name comes from the shared crawl environment constants.
  final String webAppName = CrawlEnvironment.DEFAULT_EC2MASTER_WEBAPP_NAME;
  return webAppName;
}
@Override
protected boolean initServer() {
try {
// Blocking initial scan of the S3 bucket (also scans for completion markers).
doScan(true);
// Wire up the three HTTP endpoints parser slaves use to coordinate work:
// checkout (claim a log), ping (report progress), checkin (return/finish a log).
getWebServer().addServlet("checkout", "/checkout", CheckoutServlet.class);
getWebServer().addServlet("ping", "/ping", PingServlet.class);
getWebServer().addServlet("checkin", "/checkin", CheckInServlet.class);
} catch (IOException e) {
// A failed initial scan aborts server startup.
LOG.error(CCStringUtils.stringifyException(e));
return false;
}
return true;
}
@Override
protected boolean parseArguements(String[] argv) {
// No command-line arguments are consumed; always reports success.
// (The misspelled method name is dictated by the overridden base-class contract.)
return true;
}
@Override
protected void printUsage() {
// No server-specific usage text to print.
}
@Override
protected boolean startDaemons() {
// Launch the background S3 scanner thread; initServer() already performed
// the first (blocking) scan.
startScannerThread();
return true;
}
/**
 * Stops the background scanner thread: sets the shutdown flag, interrupts the
 * thread out of its sleep, and waits for it to terminate.
 */
@Override
protected void stopDaemons() {
  LOG.info("Shutting down scanner thread");
  if (_scannerThread != null) {
    // Signal the scan loop to exit, then wake it from Thread.sleep().
    shutdownFlag.set(true);
    _scannerThread.interrupt();
    try {
      _scannerThread.join();
    } catch (InterruptedException e) {
      // Fix: preserve the interrupt status instead of silently swallowing it,
      // so the thread that called stopDaemons() can still observe the interruption.
      Thread.currentThread().interrupt();
    }
    _scannerThread = null;
  }
}
// Background thread that periodically rescans S3 (see startScannerThread()).
Thread _scannerThread = null;
// Delay between S3 rescans: 5 minutes, in milliseconds.
public static final int SCAN_INTERVAL = 5 * 60 * 1000;
// Crawl log names that have been fully parsed. Guarded by synchronized(this).
private Set<String> _complete = new HashSet<String>();
// Pending parse candidates keyed by embedded timestamp; TreeMultimap keeps
// keys sorted so oldest logs are handed out first. Guarded by synchronized(this).
private Multimap<Long,ParseCandidate> _candidates = TreeMultimap.create();
// Candidates currently checked out, mapped to the host request that owns them.
// Guarded by synchronized(this).
private Map<ParseCandidate,ActiveHostRequest> _active = new TreeMap<ParseCandidate,ActiveHostRequest>();
/**
 * A crawl log in S3 that is a candidate for parsing, identified by the log
 * name and the timestamp embedded in that name.
 */
private static class ParseCandidate implements Comparable<ParseCandidate> {
  public String _crawlLogName;
  public long _timestamp;
  // last byte offset known to have been parsed successfully
  public long _lastValidPos = 0;
  // total size of the log in S3 (filled in during the bucket scan)
  public long _size=0;

  /**
   * Builds a candidate from an S3 bucket key, extracting the crawl log name
   * and the timestamp embedded in it.
   *
   * @return the candidate, or null if the key does not name a valid crawl log
   *         (or any error occurs — errors are logged, not propagated)
   */
  public static ParseCandidate candidateFromBucketEntry(String bucketEntry) throws IOException {
    try {
      Matcher m = crawlLogPattern.matcher(bucketEntry);
      if (m.matches() && m.groupCount() == 1) {
        ParseCandidate candidate = new ParseCandidate();
        candidate._crawlLogName = m.group(1);
        Matcher timesampMatcher = timestampExtractorPattern.matcher(candidate._crawlLogName);
        if (timesampMatcher.matches()) {
          candidate._timestamp = Long.parseLong(timesampMatcher.group(1));
        }
        else {
          throw new IOException("Invalid CrawlLog");
        }
        return candidate;
      }
    }
    catch (Exception e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
    return null;
  }

  @Override
  public String toString() {
    return _crawlLogName + ":" + _timestamp;
  }

  // Fix: equals/hashCode added so equality is consistent with compareTo (both
  // keyed on the crawl log name). Previously lookups relying on equals() —
  // e.g. Predicates.equalTo in scanForCompletions — fell back to Object
  // identity and could never match a logically-equal candidate parsed from a
  // different bucket entry.
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof ParseCandidate)) return false;
    ParseCandidate other = (ParseCandidate) o;
    return _crawlLogName.equals(other._crawlLogName);
  }

  @Override
  public int hashCode() {
    return _crawlLogName.hashCode();
  }

  /** Orders candidates by crawl log name (consistent with equals). */
  @Override
  public int compareTo(ParseCandidate o) {
    return _crawlLogName.compareTo(o._crawlLogName);
  }

  /** Orders candidates by timestamp first, then by crawl log name. */
  public static class Comparator implements java.util.Comparator<ParseCandidate> {
    @Override
    public int compare(ParseCandidate o1, ParseCandidate o2) {
      int result = (o1._timestamp < o2._timestamp) ? -1 : (o1._timestamp > o2._timestamp) ? 1: 0;
      if (result == 0) {
        result = o1._crawlLogName.compareTo(o2._crawlLogName);
      }
      return result;
    }
  }
}
// Matches any S3 key ending in a crawl log name, capturing the name itself.
static Pattern crawlLogPattern = Pattern.compile(".*(CrawlLog_ccc[0-9]{2}-[0-9]{2}_[0-9]*)");
// Extracts the trailing timestamp from a crawl log name.
static Pattern timestampExtractorPattern = Pattern.compile("CrawlLog_ccc[0-9]{2}-[0-9]{2}_([0-9]*)");
// Matches completion-marker keys: logName_partialTimestamp_position + DONE_SUFFIX
// (DONE_SUFFIX comes from the Constants interface this class implements).
static Pattern doneFilePattern = Pattern.compile(".*(CrawlLog_ccc[0-9]{2}-[0-9]{2}_[0-9]*)_([0-9]*)_([0-9]*)"+ DONE_SUFFIX);
// Set to request shutdown of the scanner thread (see stopDaemons()).
AtomicBoolean shutdownFlag = new AtomicBoolean();
/**
 * Scans the crawl-log source prefix in the public S3 bucket and queues any
 * crawl logs that are not already complete, pending, or checked out.
 *
 * @param initialScan when true, also scans for completion markers afterwards
 * @return true on success, false if the scan failed with an IOException
 * @throws IOException declared for callers, though IOExceptions are actually
 *         caught below and converted into a false return
 */
private boolean doScan(boolean initialScan)throws IOException {
try {
LOG.info("Scanner Thread Starting");
// NOTE(review): credentials are the empty constants declared above — confirm
// they are populated before deployment.
AmazonS3Client s3Client = new AmazonS3Client(new BasicAWSCredentials(s3AccessKeyId,s3SecretKey));
ObjectListing response = s3Client.listObjects(new ListObjectsRequest().withBucketName("aws-publicdatasets").withPrefix(CC_BUCKET_ROOT+CC_CRAWLLOG_SOURCE));
// Page through the (possibly truncated) listing until exhausted or shutdown.
do {
LOG.info("Response Key Count:" + response.getObjectSummaries().size());
for (S3ObjectSummary entry : response.getObjectSummaries()) {
Matcher matcher = crawlLogPattern.matcher(entry.getKey());
if (matcher.matches()) {
ParseCandidate candidate = ParseCandidate.candidateFromBucketEntry(entry.getKey());
if (candidate == null) {
LOG.error("Failed to Parse Candidate for:" + entry.getKey());
}
else {
LOG.info("Candidate is:" + candidate);
// _complete/_candidates/_active are shared with the servlet handlers;
// all access is guarded by the server monitor.
synchronized (this) {
if (_complete.contains(candidate._crawlLogName)) {
LOG.info("Skipping completed Candidate:" + candidate);
}
else {
// Only enqueue logs that are neither pending nor checked out.
if (!_candidates.containsEntry(candidate._timestamp, candidate) && !_active.containsKey(candidate)) {
// update candidate size here ...
candidate._size = entry.getSize();
LOG.info("New Candidate:" + candidate._crawlLogName + " Found");
_candidates.put(candidate._timestamp,candidate);
}
else {
LOG.info("Skipping Existing Candidate:" + candidate._crawlLogName);
}
}
}
}
}
}
if (response.isTruncated()) {
response = s3Client.listNextBatchOfObjects(response);
}
else {
break;
}
}
while (!shutdownFlag.get());
if (initialScan) {
// search for completions
synchronized(this) {
scanForCompletions();
}
}
return true;
}
catch (IOException e) {
// NOTE(review): only IOException is handled here; unchecked AWS client
// exceptions (e.g. AmazonClientException) would propagate to the caller.
LOG.error(CCStringUtils.stringifyException(e));
return false;
}
}
/**
 * Records which host/uuid currently has a given crawl log checked out,
 * along with the time of the last activity on that checkout.
 */
private static class ActiveHostRequest implements Comparable<ActiveHostRequest> {
  public String hostName;
  public String uuid;
  public ParseCandidate candidate;
  // set at checkout; refreshed on each ping
  public long startTime;

  public ActiveHostRequest(String hostName,String uuid,ParseCandidate candidate) {
    this.hostName = hostName;
    this.uuid = uuid;
    this.candidate = candidate;
    this.startTime = System.currentTimeMillis();
  }

  /** Orders by host name, then uuid, then candidate. */
  @Override
  public int compareTo(ActiveHostRequest o) {
    int byHost = hostName.compareTo(o.hostName);
    if (byHost != 0) {
      return byHost;
    }
    int byUuid = uuid.compareTo(o.uuid);
    if (byUuid != 0) {
      return byUuid;
    }
    return candidate.compareTo(o.candidate);
  }
}
public static class PingServlet extends HttpServlet {
/**
 * Handles a keep-alive/progress ping from a parser slave. Expects host, uuid,
 * activeFile and pos query parameters; on success updates the matching active
 * request's last valid position and activity timestamp. Responds 200 on
 * success, 500 on any validation or lookup failure.
 */
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp)throws ServletException, IOException {
String hostName = req.getParameter("host");
String uuid = req.getParameter("uuid");
String logName = req.getParameter("activeFile");
String pos = req.getParameter("pos");
if (hostName == null || uuid == null || logName == null || pos == null) {
LOG.error("Invalid Request from Host:" + req.getRemoteAddr());
resp.sendError(500,"Invalid Parameters");
}
else {
LOG.info("Got PING Request from Host:" + hostName + " uuid:" + uuid + " address:" + req.getRemoteAddr());
EC2ParserMaster server = getServer();
// Re-derive the candidate key from the log name supplied by the slave.
ParseCandidate candidate = ParseCandidate.candidateFromBucketEntry(logName);
if (candidate == null) {
LOG.error("Unable to Parse Candidate given Name:" + logName + "from Host:" + hostName + " uuid:" + uuid + " address:" + req.getRemoteAddr());
resp.sendError(500);
}
else {
boolean sendFailure =true;
// Shared candidate/active state is guarded by the server monitor.
synchronized (server) {
ActiveHostRequest request = server._active.get(candidate);
if (request == null) {
LOG.error("Unable to Find ParseCandidate:" + candidate._crawlLogName + " in ActiveList from Host:" + hostName + " uuid:" + uuid + " address:" + req.getRemoteAddr());
}
else {
if (!request.hostName.equals(hostName) || !request.uuid.equals(uuid)) {
// host/uuid mismatch — another host owns this candidate; reject the ping
LOG.error("Host Mismatch for candidate:" + candidate._crawlLogName
+ " We show:" + request.hostName + ":" + request.uuid + " We Got:"
+ hostName + ":" + uuid + " from:" + req.getRemoteAddr());
}
else {
// NOTE(review): a non-numeric pos throws NumberFormatException here and
// escapes the servlet as a container error — consider validating it.
long newPos = Long.parseLong(pos);
LOG.info("Updating Candidate:" + request.candidate._crawlLogName + " with new Pos:"+ newPos);
request.candidate._lastValidPos = newPos;
// refresh the activity timestamp so the checkout is considered live
request.startTime = System.currentTimeMillis();
sendFailure = false;
}
}
}
if (sendFailure) {
resp.sendError(500);
}
else {
resp.setStatus(200);
}
}
}
}
}
public static class CheckInServlet extends HttpServlet {
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp)throws ServletException, IOException {
String hostName = req.getParameter("host");
String uuid = req.getParameter("uuid");
String logName = req.getParameter("activeFile");
String pos = req.getParameter("pos");
if (hostName == null || uuid == null || logName == null || pos == null) {
LOG.error("Invalid Request from Host:" + req.getRemoteAddr());
resp.sendError(500,"Invalid Parameters");
}
else {
LOG.info("Got PING Request from Host:" + hostName + " uuid:" + uuid + " address:" + req.getRemoteAddr());
EC2ParserMaster server = getServer();
ParseCandidate candidate = ParseCandidate.candidateFromBucketEntry(logName);
if (candidate == null) {
LOG.error("Unable to Parse Candidate given Name:" + logName + "from Host:" + hostName + " uuid:" + uuid + " address:" + req.getRemoteAddr());
resp.sendError(500);
}
else {
boolean sendFailure =true;
synchronized (server) {
ActiveHostRequest request = server._active.get(candidate);
if (request == null) {
LOG.error("Unable to Find ParseCandidate:" + candidate._crawlLogName + " in ActiveList from Host:" + hostName + " uuid:" + uuid + " address:" + req.getRemoteAddr());
}
else {
if (!request.hostName.equals(hostName) || !request.uuid.equals(uuid)) {
// ok this is pad
LOG.error("Host Mismatch for candidate:" + candidate._crawlLogName
+ " We show:" + request.hostName + ":" + request.uuid + " We Got:"
+ hostName + ":" + uuid + " from:" + req.getRemoteAddr());
}
else {
long newPos = Long.parseLong(pos);
LOG.info("Updating Candidate:" + request.candidate._crawlLogName + " Pos:"+ newPos);
request.candidate._lastValidPos = newPos;
if (request.candidate._lastValidPos == request.candidate._size) {
LOG.info("MARKING Candidate:" + request.candidate._crawlLogName + " As COMPLETE");
// ok now mark this candidate as complete...
server._active.remove(request.candidate);
server._complete.add(request.candidate._crawlLogName);
}
else {
LOG.info("Making Active Candidate " + request.candidate._crawlLogName + " AVAILABLE");
server._active.remove(request.candidate);
server._candidates.put(request.candidate._timestamp, request.candidate);
}
sendFailure = false;
}
}
}
if (sendFailure) {
resp.sendError(500);
}
else {
resp.setStatus(200);
}
}
}
}
}
public static class CheckoutServlet extends HttpServlet {
/**
 * Assigns the oldest queued parse candidate to the requesting slave.
 * Expects host and uuid query parameters. On success the candidate moves from
 * the pending queue to the active map and a JSON object describing it
 * (name, lastPos, size) is written to the response. Responds 404 when no
 * candidate is available, 500 on missing parameters.
 */
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp)
throws ServletException, IOException {
String hostName = req.getParameter("host");
String uuid = req.getParameter("uuid");
if (hostName == null || uuid == null) {
LOG.error("Invalid Request from Host:" + req.getRemoteAddr());
resp.sendError(500,"Invalid Parameters");
}
else {
LOG.info("Got Request from Host:" + hostName + " uuid:" + uuid + " address:" + req.getRemoteAddr());
EC2ParserMaster server = getServer();
ParseCandidate candidate = null;
// Guard shared state while picking a candidate and moving it to the active map.
synchronized (server) {
if (server._candidates.size() != 0) {
// TreeMultimap keys (timestamps) are sorted, so this is the oldest candidate.
candidate = Iterables.getFirst(server._candidates.values(), null);
if (candidate != null) {
LOG.info("Assigning candidate:" + candidate._crawlLogName + " to Host:" + hostName + " uuid:" + uuid);
server._candidates.remove(candidate._timestamp,candidate);
// create a host request object ...
ActiveHostRequest request = new ActiveHostRequest(hostName, uuid, candidate);
server._active.put(candidate, request);
}
}
}
if (candidate != null) {
// Describe the assignment to the slave as a small JSON payload.
JsonObject objectOut = new JsonObject();
objectOut.addProperty("name",candidate._crawlLogName);
objectOut.addProperty("lastPos",candidate._lastValidPos);
objectOut.addProperty("size",candidate._size);
resp.setContentType("text/plain");
resp.getWriter().append(objectOut.toString());
resp.getWriter().flush();
}
else {
resp.sendError(404,"No Valid Candidate Found");
}
}
}
}
/** Returns the process-wide server singleton, downcast to this master type. */
public static EC2ParserMaster getServer() {
  CommonCrawlServer singleton = CommonCrawlServer.getServerSingleton();
  return (EC2ParserMaster) singleton;
}
/**
 * Starts the background thread that rescans S3 every SCAN_INTERVAL
 * milliseconds until shutdownFlag is set. stopDaemons() interrupts the sleep
 * below so the thread exits promptly on shutdown.
 */
private void startScannerThread() {
_scannerThread = new Thread(new Runnable() {
@Override
public void run() {
while (!shutdownFlag.get()) {
LOG.info("Sleeping.... ");
try {
if (!shutdownFlag.get())
Thread.sleep(SCAN_INTERVAL);
} catch (InterruptedException e) {
// interrupt is the intended wakeup signal from stopDaemons(); the
// shutdown flag is rechecked immediately below, so nothing to do here.
}
if(!shutdownFlag.get()) {
try {
// periodic rescan; failures are logged and retried next interval
doScan(false);
} catch (IOException e1) {
LOG.error(CCStringUtils.stringifyException(e1));
}
}
}
}
});
_scannerThread.start();
}
/**
 * Scans the parser intermediate prefix in S3 for completion-marker files and
 * applies the progress they record to the in-memory candidate list. Candidates
 * whose recorded position has reached their size are moved to the complete set.
 * Callers must hold the server monitor (synchronized(this)) — this mutates
 * _candidates and _complete, as doScan(true) does when invoking it.
 */
public void scanForCompletions() throws IOException {
  AmazonS3Client s3Client = new AmazonS3Client(new BasicAWSCredentials(s3AccessKeyId,s3SecretKey));
  ObjectListing response = s3Client.listObjects(new ListObjectsRequest().withBucketName("aws-publicdatasets").withPrefix(CC_BUCKET_ROOT+CC_PARSER_INTERMEDIATE));
  do {
    LOG.info("Response Key Count:" + response.getObjectSummaries().size());
    for (S3ObjectSummary entry : response.getObjectSummaries()) {
      Matcher matcher = doneFilePattern.matcher(entry.getKey());
      if (matcher.matches()) {
        ParseCandidate candidate = ParseCandidate.candidateFromBucketEntry(entry.getKey());
        if (candidate == null) {
          LOG.error("Failed to Parse Candidate for:" + entry.getKey());
        }
        else {
          // the done-file name encodes a partial timestamp and the byte
          // position the parser reached
          long partialTimestamp = Long.parseLong(matcher.group(2));
          long position = Long.parseLong(matcher.group(3));
          LOG.info("Found completion for Log:" + candidate._crawlLogName + " TS:" + partialTimestamp + " Pos:" + position);
          candidate._lastValidPos = position;
          // Fix: the original used the two-argument Iterables.find with
          // Predicates.equalTo, which (a) throws NoSuchElementException when
          // nothing matches — making the null check below dead and crashing
          // the scan on any unmatched done file — and (b) relied on
          // ParseCandidate.equals(), which was not overridden (identity).
          // Search by compareTo (log name) instead, yielding null when absent.
          ParseCandidate existingCandidate = null;
          for (ParseCandidate c : _candidates.get(candidate._timestamp)) {
            if (c.compareTo(candidate) == 0) {
              existingCandidate = c;
              break;
            }
          }
          if (existingCandidate != null) {
            LOG.info("Found existing candidate with last pos:" + existingCandidate._lastValidPos);
            if (candidate._lastValidPos > existingCandidate._lastValidPos) {
              existingCandidate._lastValidPos = candidate._lastValidPos;
              // Fix: compare against the existing candidate's size — the size
              // of a candidate parsed from a done-file key is never populated
              // (always 0), so the original comparison could not detect completion.
              if (existingCandidate._lastValidPos == existingCandidate._size) {
                LOG.info("Found last pos == size for candidate:" + candidate._crawlLogName + ".REMOVING FROM ACTIVE - MOVING TO COMPLETE");
                _candidates.remove(candidate._timestamp, existingCandidate);
                _complete.add(candidate._crawlLogName);
              }
            }
          }
          else {
            LOG.info("Skipping Completion for CrawlLog:" + candidate._crawlLogName + " because existing candidate was not found.");
          }
        }
      }
    }
    if (response.isTruncated()) {
      response = s3Client.listNextBatchOfObjects(response);
    }
    else {
      break;
    }
  }
  while (true);
}
/**
 * Entry point: converts "--name value" argument pairs into an option map,
 * forces the --server option to this class, and delegates to
 * CommonCrawlServer.main with the normalized argument list.
 */
public static void main(String[] args) throws IOException {
  Multimap<String,String> options = TreeMultimap.create();
  // Collect arguments as name/value pairs; a trailing unpaired name is dropped.
  for (int i=0;i<args.length;++i) {
    String optionName = args[i];
    if (++i != args.length) {
      String optionValue = args[i];
      options.put(optionName, optionValue);
    }
  }
  // Always run as this server class, overriding any caller-supplied value.
  options.removeAll("--server");
  options.put("--server",EC2ParserMaster.class.getName());
  // Flatten the option map back into an argv-style array.
  Collection<Entry<String,String>> entrySet = options.entries();
  String[] finalArgs = new String[entrySet.size() * 2];
  int index = 0;
  // Fix: iterate with the parameterized Entry type instead of the raw type
  // and unchecked casts.
  for (Entry<String,String> entry : entrySet) {
    finalArgs[index++] = entry.getKey();
    finalArgs[index++] = entry.getValue();
  }
  try {
    CommonCrawlServer.main(finalArgs);
  } catch (Exception e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
}
}