/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.queryserver.query;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CrawlDatumAndMetadata;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.service.queryserver.ClientQueryInfo;
import org.commoncrawl.service.queryserver.DomainURLListQueryInfo;
import org.commoncrawl.service.queryserver.ShardIndexHostNameTuple;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.SlaveDatabaseIndex;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2.MasterDatabaseIndex.MetadataOut;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FlexBuffer;
/**
*
* @author rana
*
*/
public class DomainURLListQuery extends Query<DomainURLListQueryInfo,URLFPV2,CrawlDatumAndMetadata> {
// Logger for this query implementation.
private static final Log LOG = LogFactory.getLog(DomainURLListQuery.class);
// Supported sort-by field identifiers, matched against
// ClientQueryInfo.getSortByField(). STATUS and TIME sorts are disabled.
public final static String SORT_BY_NAME = "NAME";
//public final static String SORT_BY_STATUS = "STATUS";
//public final static String SORT_BY_TIME = "TIME";
public final static String SORT_BY_PR = "PR";
/** No-arg constructor. NOTE(review): presumably required for reflective instantiation by the query framework — confirm. */
public DomainURLListQuery() {
}
/** Constructs a query pre-populated with the given domain URL-list query info. */
public DomainURLListQuery(DomainURLListQueryInfo queryInfo) {
setQueryData(queryInfo);
}
/**
 * Maps a client-supplied sort-by field to the base name of the corresponding
 * query results data file. An empty field defaults to the name-sorted output.
 *
 * @param sortByField sort field from the client query (may be empty)
 * @return the data file base name ("DATA_NAME" or "DATA_PR")
 * @throws IOException if the sort field is null or unsupported
 */
private final String getURLOutputFileNameBasedOnSortByField(String sortByField) throws IOException {
  // guard against null explicitly instead of letting length() throw an NPE
  if (sortByField == null) {
    throw new IOException("Invalid (null) Sort By Field");
  }
  if (sortByField.length() == 0 || sortByField.equals(SORT_BY_NAME)) {
    return "DATA_" + SORT_BY_NAME;
  }
  else if (sortByField.equals(SORT_BY_PR)) {
    return "DATA_" + SORT_BY_PR;
  }
  throw new IOException(sortByField +" is an INVALID SORT FIELD");
}
/**
 * Builds the per-shard output file name for the given sort field by
 * appending the slave part name for the shard to the sort-specific
 * data file base name.
 *
 * @param sortByField sort field from the client query
 * @param shardId     target shard identifier
 * @return combined "DATA_<SORT>-<part-name>" file name
 * @throws IOException if the sort field is null or invalid
 */
private final String getSharedOutputFileNameBasedOnSortByAndShardId(String sortByField, int shardId) throws IOException {
  if (sortByField == null) {
    throw new IOException("Invalid Sort By Field");
  }
  String baseName = getURLOutputFileNameBasedOnSortByField(sortByField);
  String partName = getPartNameForSlave(shardId);
  return baseName + "-" + partName;
}
/**
 * Checks whether results for this query were previously materialized on the
 * local file system (keyed by query results prefix + sort-specific file name).
 *
 * @return true if the cached results file exists locally
 */
@Override
public boolean cachedResultsAvailable(FileSystem fileSystem, Configuration conf, QueryRequest<DomainURLListQueryInfo,URLFPV2,CrawlDatumAndMetadata> theClientRequest) throws IOException {
  FileSystem localFS = FileSystem.getLocal(conf);
  String sortField = theClientRequest.getClientQueryInfo().getSortByField();
  Path cachedResultsPath = new Path(
      getLocalQueryResultsPathPrefix(theClientRequest)
      + getURLOutputFileNameBasedOnSortByField(sortField));
  return localFS.exists(cachedResultsPath);
}
/**
 * Executes the query on a slave node: runs the URL list query against the
 * local shard index (sorted by name or pagerank per the client request) and
 * spills the resulting fingerprint buffer to the remote output file.
 *
 * @return the number of fingerprint records written
 * @throws IOException if the shard id count is unexpected, the sort field is
 *                     invalid, or the index/write fails
 */
@Override
protected long executeRemote(
    FileSystem fileSystem, Configuration conf,
    EventLoop eventLoop, SlaveDatabaseIndex instanceIndex, File tempFirDir,
    QueryProgressCallback<DomainURLListQueryInfo,URLFPV2,CrawlDatumAndMetadata> progressCallback
    ) throws IOException {
  // this query is only ever dispatched to the single shard owning the domain
  if (getCommonQueryInfo().getRelevantShardIds().size() != 1) {
    throw new IOException("Invalid Shard Id Count in Remote Dispatch");
  }
  Path remoteURLListPath = getRemoteOutputFilePath(getClientQueryInfo(), getCommonQueryInfo().getRelevantShardIds().get(0));
  LOG.info("ExecuteRemote called for Query:" + getQueryId() + " Creating spill files:" + remoteURLListPath);
  FSDataOutputStream urlListWriter = fileSystem.create(remoteURLListPath);
  try {
    long recordCountOut = 0;
    try {
      LOG.info("Execute Remote for Query:" + getQueryId() + " Calling executeURLListQuery");
      FlexBuffer urlListOut = null;
      String sortByField = getClientQueryInfo().getSortByField();
      if (sortByField.equals(SORT_BY_NAME)) {
        urlListOut = _slaveDatabaseIndex.queryURLListSortedByName(getQueryData().getDomainId());
      }
      else if (sortByField.equals(SORT_BY_PR)) {
        urlListOut = _slaveDatabaseIndex.queryURLListSortedByPR(getQueryData().getDomainId());
      }
      else {
        throw new IOException("Invalid Sort Field:" + sortByField);
      }
      if (urlListOut != null) {
        urlListWriter.write(urlListOut.get(), urlListOut.getOffset(), urlListOut.getCount());
        urlListWriter.flush();
        // the buffer is a packed array of 8-byte URL fingerprints
        recordCountOut = urlListOut.getCount() / (long) FP_RECORD_SIZE;
      }
      LOG.info("Execute Remote for Query:" + getQueryId() + " executeDomainListQuery returned:" + recordCountOut);
      return recordCountOut;
    }
    catch (IOException e) {
      LOG.error("Execute Remote for Query:" + getQueryId() + " executeDomainListQuery failed with error:" + CCStringUtils.stringifyException(e));
      throw e;
    }
  }
  finally {
    // urlListWriter is always non-null here; create() would have thrown otherwise
    urlListWriter.close();
  }
}
/**
 * Called on the master once the remote (slave-side) query completes: copies
 * the single shard's spill file from the remote file system down to the
 * local cached-results location for this request.
 *
 * @throws IOException if more than one shard mapping is present or the copy fails
 */
@Override
public void remoteDispatchComplete(FileSystem fileSystem, Configuration conf, QueryRequest<DomainURLListQueryInfo,URLFPV2,CrawlDatumAndMetadata> request, long resultCount) throws IOException {
  if (getShardIdToHostMapping().size() != 1) {
    // fixed typo in original message ("Excepected")
    throw new IOException("Expected One ShardIdToHostMapping. Got:" + getShardIdToHostMapping().size());
  }
  LOG.info("remoteDispatchComplete Called");
  Path remoteURLListPath = getRemoteOutputFilePath(getClientQueryInfo(), getShardIdToHostMapping().get(0).getShardId());
  if (fileSystem.exists(remoteURLListPath)) {
    LocalFileSystem localFS = FileSystem.getLocal(conf);
    Path localURLListPath = new Path(getLocalQueryResultsPathPrefix(request) + getURLOutputFileNameBasedOnSortByField(request.getClientQueryInfo().getSortByField()));
    // remove any stale cached copy before pulling the fresh results
    localFS.delete(localURLListPath, false);
    LOG.info("Copying " + remoteURLListPath + " to LocalPath:" + localURLListPath);
    fileSystem.copyToLocalFile(remoteURLListPath, localURLListPath);
  }
}
// Size in bytes of one spilled record: a single 8-byte URL fingerprint (long).
static final int FP_RECORD_SIZE = 8;
/**
 * Reads one page of URL fingerprints from the pre-sorted spill file and
 * resolves each fingerprint's URL/metadata via the master index, populating
 * resultOut. For descending order the page is read from the tail of the file
 * and records are prepended so the output appears reversed.
 *
 * @param masterIndex index used to resolve fingerprint -> url/metadata
 * @param domainId    domain hash shared by every fingerprint in the file
 * @param inputStream open stream over the sorted fingerprint file
 * @param length      file length in bytes (FP_RECORD_SIZE per record)
 * @param sortByField sort field the file was written with (PR ordering is flipped)
 * @param sortOrder   ClientQueryInfo.SortOrder value
 * @param pageNumber  zero-based page index
 * @param pageSize    records per page
 * @param resultOut   cleared and filled with the page's records
 */
private static void readPaginatedResults(final DatabaseIndexV2.MasterDatabaseIndex masterIndex, long domainId, FSDataInputStream inputStream, long length, String sortByField, int sortOrder, int pageNumber, int pageSize, QueryResult<URLFPV2,CrawlDatumAndMetadata> resultOut) throws IOException {
  long startPos = 0;
  long endPos = 0;
  // calculate total record count ...
  int totalRecordCount = (int) (length / FP_RECORD_SIZE);
  resultOut.getResults().clear();
  resultOut.setPageNumber(pageNumber);
  resultOut.setTotalRecordCount(totalRecordCount);
  // flip pr due to bug in how we sort pr
  if (sortByField.equals(SORT_BY_PR)) {
    if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING)
      sortOrder = ClientQueryInfo.SortOrder.DESCENDING;
    else
      sortOrder = ClientQueryInfo.SortOrder.ASCENDING;
  }
  // widen page arithmetic to long to avoid int overflow on large page numbers
  if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
    startPos = (long) pageNumber * pageSize;
    endPos = Math.min(startPos + pageSize, totalRecordCount);
  }
  else {
    // descending: read the page counted from the end of the file
    startPos = totalRecordCount - ((long) (pageNumber + 1) * pageSize);
    endPos = startPos + pageSize;
    startPos = Math.max(0, startPos);
  }
  if (startPos < totalRecordCount) {
    inputStream.seek(startPos * FP_RECORD_SIZE);
    for (long i = startPos; i < endPos; ++i) {
      URLFPV2 key = new URLFPV2();
      key.setDomainHash(domainId);
      key.setUrlHash(inputStream.readLong());
      // resolve this fingerprint against the master index ...
      CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();
      MetadataOut metadataOut = masterIndex.queryMetadataAndURLGivenFP(key);
      if (metadataOut == null) {
        // keep a placeholder record so pagination offsets stay consistent
        LOG.error("Failed to Retrieve URL and Metadata for Domain:" + domainId + " FP:" + key.getUrlHash());
        metadataObject.setUrl("NULL-DH(" + key.getDomainHash() + ")-FP(" + key.getUrlHash() + ")");
      }
      else {
        metadataObject.setUrl(metadataOut.url.toString());
        metadataObject.setStatus(metadataOut.fetchStatus);
        if (metadataOut.lastFetchTime > 0) {
          metadataObject.getMetadata().setLastFetchTimestamp(metadataOut.lastFetchTime);
        }
        metadataObject.getMetadata().setPageRank(metadataOut.pageRank);
      }
      if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
        // prepend so the tail-read page comes out in descending order
        resultOut.getResults().add(0, new QueryResultRecord<URLFPV2,CrawlDatumAndMetadata>(key, metadataObject));
      }
      else {
        resultOut.getResults().add(new QueryResultRecord<URLFPV2,CrawlDatumAndMetadata>(key, metadataObject));
      }
    }
  }
}
/**
 * Serves a page of results from the locally cached, pre-sorted results file
 * and delivers it via the completion callback.
 *
 * @throws IOException if the cached file cannot be opened or read
 */
@Override
public void getCachedResults(FileSystem fileSystem, Configuration conf, EventLoop eventLoop, final DatabaseIndexV2.MasterDatabaseIndex masterIndex, QueryRequest<DomainURLListQueryInfo,URLFPV2,CrawlDatumAndMetadata> theClientRequest, QueryCompletionCallback<DomainURLListQueryInfo,URLFPV2,CrawlDatumAndMetadata> callback) throws IOException {
  LOG.info("getCachedResults for Query:" + getQueryId() + " Retrieving Cached Results");
  FileSystem localFileSystem = FileSystem.getLocal(conf);
  Path outputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest) + getURLOutputFileNameBasedOnSortByField(theClientRequest.getClientQueryInfo().getSortByField()));
  FSDataInputStream inputStream = localFileSystem.open(outputFileName);
  try {
    QueryResult<URLFPV2,CrawlDatumAndMetadata> resultOut = new QueryResult<URLFPV2,CrawlDatumAndMetadata>();
    readPaginatedResults(
        masterIndex,
        getQueryData().getDomainId(),
        inputStream, localFileSystem.getFileStatus(outputFileName).getLen(),
        theClientRequest.getClientQueryInfo().getSortByField(),
        theClientRequest.getClientQueryInfo().getSortOrder(),
        theClientRequest.getClientQueryInfo().getPaginationOffset(),
        theClientRequest.getClientQueryInfo().getPageSize(),
        resultOut);
    callback.queryComplete(theClientRequest, resultOut);
  }
  finally {
    inputStream.close();
  }
}
/**
 * Returns a filename-safe canonical identifier for this query, derived from
 * the target domain id.
 */
@Override
public String getCanonicalId() {
  String pattern = "DURLQ:" + getQueryData().getDomainId();
  return encodePatternAsFilename(pattern);
}
/**
 * Resolves the remote (HDFS) output file path for this query's results,
 * combining the query results directory with the sort/shard-specific name.
 *
 * @param queryInfo client query info supplying the sort field
 * @param shardId   target shard identifier
 * @throws IOException if the sort field is invalid
 */
private Path getRemoteOutputFilePath(ClientQueryInfo queryInfo, int shardId) throws IOException {
  Path resultsDir = getHDFSQueryResultsPath();
  String fileName = getSharedOutputFileNameBasedOnSortByAndShardId(queryInfo.getSortByField(), shardId);
  return new Path(resultsDir, fileName);
}
/**
 * Maps a domain id to the database shard that owns it: low 32 bits of the
 * id, sign-stripped, modulo the shard count.
 *
 * NOTE: the original implementation ignored the domainId parameter and read
 * getQueryData().getDomainId() instead; the only visible caller passes that
 * same value, so using the parameter is behaviorally equivalent there.
 */
private int getShardIdGivenDomainId(long domainId) {
  return (((int) domainId) & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS;
}
/**
 * Decides whether this query must be dispatched to a remote slave. Returns
 * false when cached results or the remote output file already exist, or when
 * the sort field is unsupported; otherwise resolves the owning shard's host
 * mapping into shardIdToHostNameMapping and returns true.
 *
 * @throws IOException if no host mapping exists for the target shard
 */
@Override
public boolean requiresRemoteDispatch(FileSystem fileSystem,
    Configuration conf, ShardMapper shardMapper,
    QueryRequest<DomainURLListQueryInfo,URLFPV2,CrawlDatumAndMetadata> theClientRequest,
    ArrayList<ShardIndexHostNameTuple> shardIdToHostNameMapping) throws IOException {
  // cached results on the master make remote dispatch unnecessary
  if (cachedResultsAvailable(fileSystem, conf, theClientRequest)) {
    return false;
  }
  String sortByField = theClientRequest.getClientQueryInfo().getSortByField();
  if (!sortByField.equals(SORT_BY_NAME) && !sortByField.equals(SORT_BY_PR)) {
    return false;
  }
  int targetShardId = getShardIdGivenDomainId(getQueryData().getDomainId());
  // ok construct the final output name based on the shard id
  Path dataOutputPath = getRemoteOutputFilePath(theClientRequest.getClientQueryInfo(), targetShardId);
  // if the remote output already exists there is nothing to dispatch
  if (fileSystem.exists(dataOutputPath)) {
    return false;
  }
  // map index name based on sort order field
  String indexName = sortByField.equals(SORT_BY_NAME)
      ? DatabaseIndexV2.MasterDatabaseIndex.INDEX_NAME_DOMAIN_ID_TO_URLLIST_SORTED_BY_NAME
      : DatabaseIndexV2.MasterDatabaseIndex.INDEX_NAME_DOMAIN_ID_TO_URLLIST_SORTED_BY_PR;
  // now retrieve shard mappings based on index and pick the owning shard
  ArrayList<ShardIndexHostNameTuple> tuples = shardMapper.mapShardIdsForIndex(indexName);
  ShardIndexHostNameTuple targetTuple = null;
  for (ShardIndexHostNameTuple tuple : tuples) {
    if (tuple.getShardId() == targetShardId) {
      targetTuple = tuple;
      break;
    }
  }
  if (targetTuple == null) {
    throw new IOException("Failed to find Mapping for Shard Index:" + targetShardId);
  }
  // add the resolved mapping to the shard mapping list ...
  shardIdToHostNameMapping.add(targetTuple);
  // return true indicating that we need to execute this query remotely
  return true;
}
/*
@Override
protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,EventLoop eventLoop, File tempFirDir, QueryRequest requestObject)throws IOException {
Path mergedURLDataPath = new Path(getLocalQueryResultsPath(requestObject),getURLOutputFileNameBasedOnSortByField(SORT_BY_NAME));
//LOG.info("executeLocal called");
// get a local file system object
FileSystem localFileSystem = FileSystem.getLocal(conf);
if (!localFileSystem.exists(mergedURLDataPath)) {
LOG.info("Execute Local for Query:" + getQueryId() +".Starting URL Data Merge");
//LOG.info("Checking for parts files for url data merge");
FileStatus urlDataStatusArray[] = remoteFileSystem.globStatus(new Path(getHDFSQueryResultsPath(),URL_DATA_PREFIX + "part-*"));
//LOG.info("Found:" + urlDataStatusArray.length + " url data parts");
if (urlDataStatusArray.length == 0) {
LOG.error("Execute Local for Query:" + getQueryId() +" FAILED.No Parts Files Found!");
return 0;
}
Vector<Path> urlDataPaths = new Vector<Path>();
for (FileStatus part : urlDataStatusArray) {
//LOG.info("Found Part:"+ part.getPath());
urlDataPaths.add(part.getPath());
}
LOG.info("Execute Local for Query:" + getQueryId() +".Initializing Merger");
SequenceFileSpillWriter<Text,CrawlDatumAndMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text,CrawlDatumAndMetadata>(
localFileSystem,
conf,
mergedURLDataPath,
Text.class,
CrawlDatumAndMetadata.class,true,true);
SequenceFileMerger<Text,CrawlDatumAndMetadata> merger
= new SequenceFileMerger<Text,CrawlDatumAndMetadata>(
remoteFileSystem,
conf,
urlDataPaths,
mergedFileSpillWriter,
Text.class,
CrawlDatumAndMetadata.class,
new RawValueKeyValueComparator<Text, CrawlDatumAndMetadata>() {
@Override
public int compare(Text key1, CrawlDatumAndMetadata value1, Text key2,CrawlDatumAndMetadata value2) {
return key1.compareTo(key2);
}
@Override
public int compareRaw(byte[] key1Data, int key1Offset,
int key1Length, byte[] key2Data, int key2Offset,
int key2Length, byte[] value1Data, int value1Offset,
int value1Length, byte[] value2Data, int value2Offset,
int value2Length) throws IOException {
return WritableComparator.compareBytes(key1Data,key1Offset,key1Length,key2Data,key2Offset,key2Length);
}
},null
,null);
try {
LOG.info("Execute Local for Query:" + getQueryId() +".Running Merger");
merger.mergeAndSpill();
LOG.info("Execute Local for Query:" + getQueryId() +".Merge Successfull.. Deleting Merge Inputs");
for (FileStatus urlDataPath : urlDataStatusArray) {
remoteFileSystem.delete(urlDataPath.getPath(),false);
}
}
catch (IOException e){
LOG.error("Execute Local for Query:" + getQueryId() +" FAILED during Merge with Exception:" + CCStringUtils.stringifyException(e));
throw e;
}
finally {
merger.close();
}
}
else {
LOG.info("Execute Local for Query:" + getQueryId() +" Merge File NAME URL Data Already Exists.Skipping");
}
// now check for query specific merge file ...
Path queryResultsPath = new Path(getLocalQueryResultsPath(requestObject),getURLOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));
LOG.info("Execute Local for Query:" + getQueryId() +" Checking for QueryResultsPath for DomainDetail Path is:" + queryResultsPath);
if (!localFileSystem.exists(queryResultsPath)) {
LOG.info("Execute Local for Query:" + getQueryId() +" Results File:" + queryResultsPath + " does not exist. Running sort and merge process");
String sortByField = requestObject.getClientQueryInfo().getSortByField();
LOG.info("Execute Local for Query:" + getQueryId() +" Allocating SpillWriter with output to:" + queryResultsPath);
// allocate a spill writer ...
SequenceFileSpillWriter<Text,CrawlDatumAndMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text,CrawlDatumAndMetadata>(localFileSystem,conf,queryResultsPath,Text.class,CrawlDatumAndMetadata.class,true,true);
try {
//LOG.info("Allocating MergeSortSpillWriter");
// and connect it to the merge spill writer ...
MergeSortSpillWriter<Text, CrawlDatumAndMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, CrawlDatumAndMetadata>(
localFileSystem,
conf,
sortedResultsFileSpillWriter,
tempFirDir,
getComparatorForSortField(sortByField),
getKeyGeneratorForSortField(sortByField),
null,
Text.class,
CrawlDatumAndMetadata.class,true);
try {
// create a vector representing the single input segment
Vector<Path> singleInputSegment = new Vector<Path>();
//LOG.info("Adding MergeResultsPath:" + mergedURLDataPath + " as input for Merger for DomainDetail URL Query Id:" + getQueryId());
singleInputSegment.add(mergedURLDataPath);
// create a SequenceFileReader
SequenceFileReader<Text, CrawlDatumAndMetadata> mergeSegmentReader = new SequenceFileReader<Text, CrawlDatumAndMetadata>(
localFileSystem,
conf,
singleInputSegment,
mergeSortSpillWriter,
Text.class,
CrawlDatumAndMetadata.class);
try {
LOG.info("Execute Local for Query:" + getQueryId() +" calling readAndSpill");
mergeSegmentReader.readAndSpill();
LOG.info("Execute Local for Query:" + getQueryId() +" readAndSpill finished");
}
finally {
if (mergeSegmentReader != null) {
mergeSegmentReader.close();
}
}
}
finally {
if (mergeSortSpillWriter != null) {
mergeSortSpillWriter.close();
}
}
}
finally {
if (sortedResultsFileSpillWriter != null) {
sortedResultsFileSpillWriter.close();
}
}
//LOG.info("Allocating SequenceFileIndex object for DomainDetail URL Query Id:" + getQueryId() + " with Path:" + queryResultsPath);
SequenceFileIndex<Text, SubDomainStats> indexFile = new SequenceFileIndex<Text, SubDomainStats>(new File(queryResultsPath.toString()),Text.class,SubDomainStats.class);
//LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());
return indexFile.getRecordCount();
}
return 0;
}
*/
/*
private static OptimizedKeyGenerator<Text, CrawlDatumAndMetadata> getKeyGeneratorForSortField(String sortByField) throws IOException {
if (sortByField.equals(SORT_BY_STATUS)) {
return new OptimizedKeyGenerator<Text, CrawlDatumAndMetadata>() {
@Override
public long generateOptimizedKeyForPair(Text keyType,CrawlDatumAndMetadata value) throws IOException {
return (long)value.getStatus();
}
};
}
else if (sortByField.equals(SORT_BY_TIME)) {
return new OptimizedKeyGenerator<Text, CrawlDatumAndMetadata>() {
@Override
public long generateOptimizedKeyForPair(Text keyType,CrawlDatumAndMetadata value) throws IOException {
return (long)value.getMetadata().getLastFetchTimestamp();
}
};
}
else if (sortByField.equals(SORT_BY_PR)) {
return new OptimizedKeyGenerator<Text, CrawlDatumAndMetadata>() {
@Override
public long generateOptimizedKeyForPair(Text keyType,CrawlDatumAndMetadata value) throws IOException {
long valueOut = (long) value.getMetadata().getPageRank();
valueOut = (valueOut * 1000) + (long)((value.getMetadata().getPageRank() -(float) valueOut) * 1000.00f);
return valueOut;
}
};
}
return null;
}
*/
/*
private static RawValueKeyValueComparator<Text,CrawlDatumAndMetadata> getComparatorForSortField(String sortByField) throws IOException {
RawValueKeyValueComparator<Text,CrawlDatumAndMetadata> comparator = null;
if (sortByField.equals(SORT_BY_STATUS)) {
comparator = new RawValueKeyValueComparator<Text, CrawlDatumAndMetadata>() {
CrawlDatumAndMetadata value1 = new CrawlDatumAndMetadata();
CrawlDatumAndMetadata value2 = new CrawlDatumAndMetadata();
@Override
public int compare(Text key1, CrawlDatumAndMetadata value1, Text key2,CrawlDatumAndMetadata value2) {
return value1.getStatus() - value2.getStatus();
}
@Override
public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
byte[] key2Data, int key2Offset, int key2Length, byte[] value1Data,
int value1Offset, int value1Length, byte[] value2Data,
int value2Offset, int value2Length) throws IOException {
value1.clear();
value2.clear();
value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data,value1Offset,value1Length)));
value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data,value2Offset,value2Length)));
return compare(null, value1, null, value2);
}
};
}
else if (sortByField.equals(SORT_BY_TIME)) {
comparator = new RawValueKeyValueComparator<Text, CrawlDatumAndMetadata>() {
CrawlDatumAndMetadata value1 = new CrawlDatumAndMetadata();
CrawlDatumAndMetadata value2 = new CrawlDatumAndMetadata();
@Override
public int compare(Text key1, CrawlDatumAndMetadata value1, Text key2,CrawlDatumAndMetadata value2) {
if (value1.getMetadata().getLastFetchTimestamp() > value2.getMetadata().getLastFetchTimestamp())
return 1;
else if (value1.getMetadata().getLastFetchTimestamp() < value2.getMetadata().getLastFetchTimestamp())
return -1;
return 0;
}
@Override
public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
byte[] key2Data, int key2Offset, int key2Length, byte[] value1Data,
int value1Offset, int value1Length, byte[] value2Data,
int value2Offset, int value2Length) throws IOException {
value1.clear();
value2.clear();
value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data,value1Offset,value1Length)));
value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data,value2Offset,value2Length)));
return compare(null,value1,null,value2);
}
};
}
else if (sortByField.equals(SORT_BY_PR)) {
comparator = new RawValueKeyValueComparator<Text, CrawlDatumAndMetadata>() {
CrawlDatumAndMetadata value1 = new CrawlDatumAndMetadata();
CrawlDatumAndMetadata value2 = new CrawlDatumAndMetadata();
@Override
public int compare(Text key1, CrawlDatumAndMetadata value1, Text key2,CrawlDatumAndMetadata value2) {
if (value1.getMetadata().getPageRank() > value2.getMetadata().getPageRank())
return 1;
else if (value1.getMetadata().getPageRank() < value2.getMetadata().getPageRank())
return -1;
return 0;
}
@Override
public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
byte[] key2Data, int key2Offset, int key2Length, byte[] value1Data,
int value1Offset, int value1Length, byte[] value2Data,
int value2Offset, int value2Length) throws IOException {
long timeStart = System.currentTimeMillis();
value1.clear();
value2.clear();
value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data,value1Offset,value1Length)));
value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data,value2Offset,value2Length)));
long timeEnd = System.currentTimeMillis();
long timeElapsed =timeEnd - timeStart;
return compare(null,value1,null,value2);
}
};
}
if (comparator == null) {
throw new IOException("Comparator for Field:" + sortByField + " Not Found or Defined!");
}
return comparator;
}
*/
}