/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.commoncrawl.service.crawler;
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.WritableName;
import org.apache.hadoop.io.WritableUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CrawlSegmentDetail;
import org.commoncrawl.protocol.CrawlSegmentDetailFPInfo;
import org.commoncrawl.protocol.CrawlSegmentHost;
import org.commoncrawl.protocol.CrawlSegmentHostFPInfo;
import org.commoncrawl.protocol.CrawlSegmentURL;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.service.crawler.CrawlSegmentLog.CrawlSegmentFPMap;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.TextBytes;
/**
 * Loads crawl segment data from SequenceFiles stored on HDFS, either as a
 * compact host/url fingerprint map ({@link #loadCrawlSegmentFPInfo}) or as a
 * fully populated {@link CrawlSegmentDetail} ({@link #loadCrawlSegment}).
 *
 * @author rana
 *
 */
public class SegmentLoader {
private static final Log LOG = LogFactory.getLog(SegmentLoader.class);
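  /** Result of a cached DNS lookup: resolved ip address, ttl, and optional canonical name. */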
public static class DNSResult {
public int ipAddress;
public long ttl;
public String cname;
}
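  /** Optional cache consulted to resolve a host's ip address up front during segment load. */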
public static interface DNSCache {
public DNSResult resolveName(CrawlSegmentHost host);
}
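  /** Callback invoked once per host as it is loaded; return false to abort the load. */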
public static interface LoadProgressCallback {
public boolean hostAvailable(final CrawlSegmentHost host,final int originalURLCount,final int completedURLCount);
}
public static interface CancelOperationCallback {
// return true to cancel the operation
public boolean cancelOperation();
}
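  /**
   * Reads the segment's SequenceFile from HDFS and builds a {@link CrawlSegmentFPMap}
   * containing the host / url fingerprint pairs for every url target in the segment.
   *
   * @return the populated fingerprint map, or null if the operation was cancelled
   */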
@SuppressWarnings("unchecked")
public static CrawlSegmentFPMap loadCrawlSegmentFPInfo(int listId,int segmentId,String crawlerName,CancelOperationCallback cancelCallback) throws IOException {
CrawlSegmentFPMap fpMap = new CrawlSegmentFPMap();
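    // map the legacy serialized class name to CrawlSegmentHost so older segment files still deserialize ...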
WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost");
if (segmentId == -1 || listId == -1) {
throw new IOException("Invalid Parameters!");
}
// construct hdfs path to segment ...
Path segmentPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + crawlerName+ "/" + CrawlEnvironment.formatListId(listId) + "/" + segmentId);
SequenceFile.Reader reader = null;
try {
FileSystem fs = FileSystem.get(segmentPath.toUri(),CrawlEnvironment.getHadoopConfig());
reader = new SequenceFile.Reader(fs,segmentPath,CrawlEnvironment.getHadoopConfig());
LongWritable hostFP = new LongWritable();
CrawlSegmentHost segmentHost = new CrawlSegmentHost();
DataOutputBuffer outputBuffer = new DataOutputBuffer();
int segmentUrlCount = 0;
while (reader.next(hostFP, segmentHost) && cancelCallback.cancelOperation() == false) {
// and update url count ...
segmentUrlCount += segmentHost.getUrlTargets().size();
        // write <hostFP, urlFP> pairs for each url target into the buffer ...
for (CrawlSegmentURL url : segmentHost.getUrlTargets()) {
WritableUtils.writeVLong(outputBuffer,segmentHost.getHostFP());
WritableUtils.writeVLong(outputBuffer,url.getUrlFP());
}
}
outputBuffer.flush();
// ok set the urlfp stream
fpMap.setURLFPBuffer(segmentUrlCount,outputBuffer.getData(),outputBuffer.getLength());
      // if the load was cancelled mid-read, discard the partially built map ...
if (cancelCallback.cancelOperation()) {
return null;
}
else {
return fpMap;
}
}
finally {
if (reader != null)
reader.close();
}
}
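  /**
   * Per-host load hint entry: flags indicating whether the host's url list is
   * complete, partially crawled, or fully exhausted, plus the partial fp info when applicable.
   */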
public static class CrawlSegmentDetialLoadHintItem {
public static final int Is_Complete = 1 << 0; // completely intact
public static final int Is_Partial = 1 << 1; // partial item ...
public static final int Is_Empty = 1 << 2; // completely exhausted ...
public int _flags = 0;
public CrawlSegmentHostFPInfo _hostInfo = null;
}
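  /** Load hint keyed by host fingerprint, built from a {@link CrawlSegmentDetailFPInfo} snapshot. */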
public static class CrawlSegmentDetailLoadHint {
public Map<Long,CrawlSegmentDetialLoadHintItem> _hostItems = new TreeMap<Long,CrawlSegmentDetialLoadHintItem>();
public static CrawlSegmentDetailLoadHint buildLoadHintFromDetailFPInfo(CrawlSegmentDetailFPInfo info) {
CrawlSegmentDetailLoadHint hintOut = new CrawlSegmentDetailLoadHint();
for (CrawlSegmentHostFPInfo host : info.getHosts()) {
CrawlSegmentDetialLoadHintItem hintItem = new CrawlSegmentDetialLoadHintItem();
if (host.getUrlTargets().size() == 0) {
hintItem._flags = CrawlSegmentDetialLoadHintItem.Is_Empty;
}
else if (host.getUrlTargets().size() == host.getOriginalTargetCount()) {
hintItem._flags = CrawlSegmentDetialLoadHintItem.Is_Complete;
}
else {
hintItem._flags = CrawlSegmentDetialLoadHintItem.Is_Partial;
hintItem._hostInfo = host;
}
hintOut._hostItems.put(host.getHostFP(),hintItem);
}
return hintOut;
}
}
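  /**
   * Reads the segment's SequenceFile from HDFS and builds a {@link CrawlSegmentDetail}.
   * Urls already marked crawled in the optional loadHint are skipped, host ip addresses
   * are pre-resolved via the optional DNSCache, and each host is either handed to the
   * LoadProgressCallback or appended to the returned detail.
   *
   * Illustrative usage (the list / segment ids and crawler name below are hypothetical):
   * <pre>
   *   CrawlSegmentDetail detail =
   *     SegmentLoader.loadCrawlSegment(1, 100, "crawler01", null, null, null, null);
   * </pre>
   *
   * @return the populated segment detail, or null if the load was cancelled or aborted by the callback
   */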
@SuppressWarnings("unchecked")
public static CrawlSegmentDetail loadCrawlSegment(int listId,int segmentId,String crawlerName,CrawlSegmentFPMap loadHint,DNSCache cache,LoadProgressCallback callback,CancelOperationCallback incomingCancelCallback) throws IOException {
final CancelOperationCallback cancelCallback = (incomingCancelCallback != null) ?
incomingCancelCallback : new CancelOperationCallback() {
@Override
public boolean cancelOperation() {
return false;
}
};
WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost");
if (segmentId == -1 || listId == -1) {
throw new IOException("Invalid Parameters!");
}
// construct hdfs path to segment ...
Path segmentPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + crawlerName+ "/" + CrawlEnvironment.formatListId(listId) + "/" + segmentId);
SequenceFile.Reader reader = null;
try {
CrawlSegmentDetail segmentOut = new CrawlSegmentDetail();
// initialize work unit detail ...
segmentOut.setSegmentId(segmentId);
FileSystem fs = FileSystem.get(segmentPath.toUri(),CrawlEnvironment.getHadoopConfig());
reader = new SequenceFile.Reader(fs,segmentPath,CrawlEnvironment.getHadoopConfig());
LongWritable hostFP = new LongWritable();
CrawlSegmentHost segmentHost = new CrawlSegmentHost();
while (reader.next(hostFP, segmentHost) && !cancelCallback.cancelOperation()) {
if (segmentHost.getHostFP() == 0) {
LOG.error("Host FP is Zero during reader.next");
}
//setup the segment id associated with this host (so that the host contains self sufficient context information).
segmentHost.setSegmentId(segmentId);
segmentHost.setListId(listId);
// capture original item count
int originalURLCount = segmentHost.getUrlTargets().size();
int completedURLCount = 0;
// and update url count ...
segmentOut.setUrlCount(segmentOut.getUrlCount() + segmentHost.getUrlTargets().size());
if (loadHint != null) {
// now walk remaining items (in hint)
for (int i=0;i<segmentHost.getUrlTargets().size();++i) {
CrawlSegmentURL segmentURL = segmentHost.getUrlTargets().get(i);
URLFPV2 urlfp = new URLFPV2();
urlfp.setDomainHash(segmentHost.getHostFP());
urlfp.setUrlHash(segmentURL.getUrlFP());
if (loadHint.wasCrawled(urlfp)) {
completedURLCount++;
segmentHost.getUrlTargets().remove(i);
--i;
segmentOut.setUrlsComplete(segmentOut.getUrlsComplete() + 1);
}
}
}
        // if the host still has pending url targets, try to resolve its ip address / ttl ...
if (segmentHost.getUrlTargets().size() != 0) {
if (!segmentHost.isFieldDirty(CrawlSegmentHost.Field_IPADDRESS)) {
if (cache != null) {
// try to resolve the address up front
DNSResult dnsCacheResult = cache.resolveName(segmentHost);
if (dnsCacheResult != null) {
segmentHost.setIpAddress(dnsCacheResult.ipAddress);
segmentHost.setTtl(dnsCacheResult.ttl);
if (dnsCacheResult.cname != null && dnsCacheResult.cname.length() != 0) {
segmentHost.setCname(dnsCacheResult.cname);
}
}
}
}
else {
if (!segmentHost.isFieldDirty(CrawlSegmentHost.Field_TTL)) {
segmentHost.setTtl(0);
}
}
}
// if a progress callback was specified, then call it with the load progress of this host ...
if (callback != null) {
          // and initiate the host available callback
boolean continueLoading = callback.hostAvailable(segmentHost,originalURLCount,completedURLCount);
if (!continueLoading) {
LOG.info("HostAvailable Callback returned false. Aborting Load");
return null;
}
}
// otherwise ... add the host to the segment detail ...
else {
segmentOut.getHosts().add(segmentHost);
}
// and allocate a new segment host for next read
segmentHost = new CrawlSegmentHost();
}
if (!cancelCallback.cancelOperation()) {
return segmentOut;
}
else {
return null;
}
}
finally {
if (reader != null)
reader.close();
}
}
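  /** Simple command line test driver: loads a segment and dumps its hosts and urls to stdout. */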
public static void main(String[] args) {
    if (args.length != 3) {
      System.out.println("Usage: listId segmentId crawlerName");
      return;
    }
int listId = Integer.parseInt(args[0]);
int segmentId = Integer.parseInt(args[1]);
String crawlerName = args[2];
// initialize ...
Configuration conf = new Configuration();
conf.addResource("nutch-default.xml");
conf.addResource("nutch-site.xml");
conf.addResource("hadoop-default.xml");
conf.addResource("hadoop-site.xml");
conf.addResource("commoncrawl-default.xml");
conf.addResource("commoncrawl-site.xml");
CrawlEnvironment.setHadoopConfig(conf);
try {
System.out.println("Loading Crawl Segment Log for Segment:" + segmentId + " Crawler:" + crawlerName);
// now do it one more time ...
CrawlSegmentDetail detail = SegmentLoader.loadCrawlSegment(listId,segmentId, crawlerName, null, null, null,null);
System.out.println("Segment Detail - URL Count:" + detail.getUrlCount() + " Host Count:" + detail.getHosts().size());
for (CrawlSegmentHost host : detail.getHosts()) {
System.out.println(" Host:" + host.getHostName());
for (CrawlSegmentURL url : host.getUrlTargets()) {
System.out.println(" URL2:" + url.getUrl());
}
}
}
catch (IOException e) {
System.out.println(CCStringUtils.stringifyException(e));
}
}
}