/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.commoncrawl.service.queryserver.master;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.InetAddress;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.zip.CRC32;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.async.ConcurrentTask;
import org.commoncrawl.async.ConcurrentTask.CompletionCallback;
import org.commoncrawl.async.Timer;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.db.RecordStore;
import org.commoncrawl.protocol.ArchiveInfo;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.rpc.base.internal.AsyncClientChannel;
import org.commoncrawl.rpc.base.internal.AsyncContext;
import org.commoncrawl.rpc.base.internal.AsyncRequest;
import org.commoncrawl.rpc.base.internal.AsyncServerChannel;
import org.commoncrawl.rpc.base.shared.RPCException;
import org.commoncrawl.rpc.base.shared.RPCStruct;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.service.queryserver.BaseConfig;
import org.commoncrawl.service.queryserver.ClientQueryInfo;
import org.commoncrawl.service.queryserver.Common;
import org.commoncrawl.service.queryserver.ContentQueryRPCInfo;
import org.commoncrawl.service.queryserver.ContentQueryRPCResult;
import org.commoncrawl.service.queryserver.MasterState;
import org.commoncrawl.service.queryserver.PersistentQueryInfo;
import org.commoncrawl.service.queryserver.QueryServerMaster;
import org.commoncrawl.service.queryserver.QueryStatus;
import org.commoncrawl.service.queryserver.ShardIndexHostNameTuple;
import org.commoncrawl.service.queryserver.SlaveStatus;
import org.commoncrawl.service.queryserver.index.DatabaseIndexV2;
import org.commoncrawl.service.queryserver.query.Query;
import org.commoncrawl.service.queryserver.query.QueryCompletionCallback;
import org.commoncrawl.service.queryserver.query.QueryProgressCallback;
import org.commoncrawl.service.queryserver.query.QueryRequest;
import org.commoncrawl.service.queryserver.query.QueryResult;
import org.commoncrawl.service.queryserver.query.RemoteQueryCompletionCallback;
import org.commoncrawl.service.queryserver.query.ShardMapper;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FileUtils;

/**
 * @author rana
 */
public class MasterServer extends CommonCrawlServer
    implements QueryServerMaster, ShardMapper, AsyncServerChannel.ConnectionCallback {

  /////////////////////////////////////////////////////////////////////////////////////////////////

  private static final String MasterDBStateKey = "DBState";
  private static final String CachedQueryIDPrefix = "CQID_";
  private static final String CachedQueryPrefix = "CQ_";

  private String _slavesFile;
  private File _cacheDirs[];
  private File _tempFileDir = null;
  private long _tempFileDirSeed = -1;
  private File _webAppRoot = null;
  private Vector<QueryServerSlaveState> _slaves = new Vector<QueryServerSlaveState>();
  private Map<String,QueryServerSlaveState> _slaveNameToOnlineStateMap = new TreeMap<String,QueryServerSlaveState>();
  private Map<String,SlaveStatus> _slaveStatusMap = new TreeMap<String,SlaveStatus>();
  private long _slavesFileCRC = -1;
  private String _hdfsWorkingDir = "crawl/querydb/temp";
  private String _hdfsResultsDir = "crawl/querydb/results";
  private String _hdfsResultsCacheDir = "crawl/querydb/cache";
  private long _databaseId = -1;
  private Path _localDataDir = null;
  private int _dataDriveCount = -1;
  private DatabaseIndexV2.MasterDatabaseIndex _masterIndex = null;
  private Path _queryDBPath = null;
  /** record store object used to persist state **/
  private RecordStore _recordStore = new RecordStore();
  private MasterState _masterState = null;
  @SuppressWarnings("unchecked")
  private LinkedList<QueryRequest> _queuedClientQueries = new LinkedList<QueryRequest>();
  @SuppressWarnings("unchecked")
  private Map<Long,QueryRequest> _activeRemoteOrLocalQueries = new HashMap<Long,QueryRequest>();
  @SuppressWarnings("unchecked")
  private Set<QueryRequest> _activeClientQueries = new HashSet<QueryRequest>();
  @SuppressWarnings("unused")
  private QueryServerFE _queryServerFE;
  private ExecutorService _s3DownloaderThreadPool = Executors.newCachedThreadPool();

  private File getLocalCacheDirForQuery(long queryId) {
    int partitionId = ((int)queryId) % _cacheDirs.length;
    return _cacheDirs[partitionId];
  }

  public MasterServer() {
    //setAsyncWebDispatch(true);
  }

  public DatabaseIndexV2.MasterDatabaseIndex getDatabaseIndex() {
    return _masterIndex;
  }

  ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // INTERNAL ROUTINES
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  public static class BlockingQueryResult<KeyType,ValueType> {

    public BlockingQueryResult(QueryResult<KeyType,ValueType> resultObject) {
      querySucceeded = true;
      this.resultObject = resultObject;
    }

    public BlockingQueryResult(String failureReason) {
      querySucceeded = false;
      this.errorString = failureReason;
    }

    public boolean querySucceeded = false;
    public QueryResult<KeyType,ValueType> resultObject;
    public String errorString;
  }
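
  /*
   * Usage sketch (hypothetical caller, not part of this class): a synchronous
   * front-end thread, for example in QueryServerFE, can issue a query and block
   * until the event loop delivers the result. The query instance and type
   * parameters below are illustrative only.
   *
   *   ClientQueryInfo info = ... ;                 // populated by the front end
   *   Query<SomeStruct,KeyType,ValueType> query = ... ;
   *   BlockingQueryResult<KeyType,ValueType> result
   *       = server.blockingQueryRequest(query, info);
   *   if (result.querySucceeded) {
   *     // consume result.resultObject ...
   *   } else {
   *     // report result.errorString ...
   *   }
   */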
  <DataType extends RPCStruct,KeyType,ValueType> BlockingQueryResult<KeyType,ValueType> blockingQueryRequest(
      final Query<DataType,KeyType,ValueType> queryObject, final ClientQueryInfo queryInfo) throws IOException {

    final LinkedBlockingQueue<BlockingQueryResult<KeyType,ValueType>> queue
        = new LinkedBlockingQueue<BlockingQueryResult<KeyType,ValueType>>(1);

    getEventLoop().setTimer(new Timer(0,false,new Timer.Callback() {

      @Override
      public void timerFired(Timer timer) {
        try {
          queueClientQueryRequest(queryObject,queryInfo,new QueryCompletionCallback<DataType,KeyType,ValueType>() {

            @Override
            public void queryComplete(QueryRequest<DataType,KeyType,ValueType> request,QueryResult<KeyType,ValueType> queryResult) {
              LOG.info("Received QueryComplete for query:" + request.getSourceQuery().getQueryId());
              BlockingQueryResult<KeyType,ValueType> result = new BlockingQueryResult<KeyType,ValueType>(queryResult);
              try {
                LOG.info("Queuing response for Query:" + request.getSourceQuery().getQueryId());
                queue.put(result);
                LOG.info("Queued response for Query:" + request.getSourceQuery().getQueryId());
              } catch (InterruptedException e) {
                LOG.error(CCStringUtils.stringifyException(e));
              }
            }

            @Override
            public void queryFailed(QueryRequest<DataType,KeyType,ValueType> request, String reason) {
              LOG.info("Received queryFailed for request:" + request.getSourceQuery().getQueryId());
              BlockingQueryResult<KeyType,ValueType> result = new BlockingQueryResult<KeyType,ValueType>(reason);
              try {
                queue.put(result);
              } catch (InterruptedException e) {
                LOG.error(CCStringUtils.stringifyException(e));
              }
            }
          });
        } catch (IOException e) {
          LOG.error(CCStringUtils.stringifyException(e));
        }
      }
    }));

    try {
      return queue.take();
    } catch (InterruptedException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
    return null;
  }

  <DataType extends RPCStruct,KeyType,ValueType> void queueClientQueryRequest(
      Query<DataType,KeyType,ValueType> queryObject, ClientQueryInfo theClientRequest,
      QueryCompletionCallback<DataType,KeyType,ValueType> callback) throws IOException {

    // set query info
    queryObject.setClientQueryInfo(theClientRequest);

    // get the canonical id for this query
    String queryCanonicalId = queryObject.getCanonicalId();

    LOG.info("Received Query Request with CanonicalId:" + queryCanonicalId);

    // now check cache for persistent query cache info ...
    PersistentQueryInfo persistentQueryInfo = getPersistentQueryInfo(queryCanonicalId);

    // ok, cached query found ...
    if (persistentQueryInfo != null) {
      LOG.info("Existing Query Id found:" + persistentQueryInfo.getQueryId() + " for Request with CanonicalId:" + queryCanonicalId);
      // found cached query... set id of source query object
      queryObject.setQueryId(persistentQueryInfo.getQueryId());
    }
    else {
      // assign the query a new id
      queryObject.setQueryId(getNextQueryId());
      LOG.info("Assigning Query Id:" + queryObject.getQueryId() + " for Request with CanonicalId:" + queryCanonicalId);
      // and store the relationship
      persistentQueryInfo = new PersistentQueryInfo();
      persistentQueryInfo.setCannonicalQueryId(queryCanonicalId);
      persistentQueryInfo.setQueryId(queryObject.getQueryId());
      persistentQueryInfo.setCreateTime(System.currentTimeMillis());
      LOG.info("Inserting Persistent Query Record");
      // insert new structure into database ...
      insertUpdatePersistentInfo(persistentQueryInfo,false);
    }

    // establish hdfs working directory
    Path hdfsWorkingDir = new Path(_hdfsWorkingDir,Long.toString(queryObject.getQueryId()));
    // remove existing directory if present ...
    CrawlEnvironment.getDefaultFileSystem().delete(hdfsWorkingDir, true);
    // create the working directory
    CrawlEnvironment.getDefaultFileSystem().mkdirs(hdfsWorkingDir);
    // record the working directory path in the query ...
    queryObject.getCommonQueryInfo().setQueryResultPath(hdfsWorkingDir.toString());

    // establish the query cache directory
    File localQueryDirectory = getLocalCacheDirForQuery(queryObject.getQueryId());
    LOG.info("Query Cache Directory for Query:" + queryObject.getQueryId() + " is:" + localQueryDirectory.getAbsolutePath());
    // make sure it exists ...
    localQueryDirectory.mkdirs();

    // allocate client request object ...
    QueryRequest<DataType,KeyType,ValueType> clientQueryObj
        = new QueryRequest<DataType,KeyType,ValueType>(queryObject,theClientRequest,localQueryDirectory,callback);
    // setup context ...
    queryObject.setContext(clientQueryObj);

    LOG.info("Queuing Client Query Request");
    // add it to queue ...
    _queuedClientQueries.addLast(clientQueryObj);

    // potentially start the next query ...
    potentiallyStartNextQuery();
  }
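
  /*
   * Dispatch policy (summarized from the branches below): a queued query runs in
   * one of three modes, checked in order -- (1) cache query, when results from a
   * prior run of the same canonical query are already materialized; (2) remote
   * query, when the query must fan out to the slaves that host the relevant
   * index shards; (3) local query, run directly against the master's own index.
   */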
  @SuppressWarnings("unchecked")
  private void potentiallyStartNextQuery() throws IOException {

    FileSystem fileSystem = CrawlEnvironment.getDefaultFileSystem();

    LinkedList<QueryRequest> requeueList = new LinkedList<QueryRequest>();

    while (_queuedClientQueries.size() != 0 && _activeClientQueries.size() < Common.MAX_CONCURRENT_QUERIES) {

      QueryRequest request = _queuedClientQueries.removeFirst();

      LOG.info("Processing Query:" + request.getSourceQuery().getQueryId() + " ActiveCount:" + _activeClientQueries.size());

      try {
        // first see if a remote (or local) query is active ...
        if (_activeRemoteOrLocalQueries.get(request.getSourceQuery().getQueryId()) != null) {
          LOG.info("Cannot Dispatch ClientRequest:" + request.getClientQueryInfo().getClientQueryId() + " because an existing query is in progress");
          // fail the query immediately for now ..
          request.getCompletionCallback().queryFailed(request,"A similar query is already running and may take some time to complete. Please try again later.");
        }
        else {
          ArrayList<ShardIndexHostNameTuple> shardIdMapping = new ArrayList<ShardIndexHostNameTuple>();

          // first check to see if cached results are available ...
          if (request.getSourceQuery().cachedResultsAvailable(fileSystem,_configuration,request)) {
            // add to active ...
            _activeClientQueries.add(request);
            LOG.info("Running Cache Query for Query:" + request.getSourceQuery().getQueryId());
            runCacheQuery(request);
          }
          // check to see if remote dispatch is required ..
          else if (request.getSourceQuery().requiresRemoteDispatch(fileSystem,_configuration,this,request,shardIdMapping)) {
            // ok, we need at least one shard to run on ...
            if (shardIdMapping.size() == 0) {
              LOG.error("Query:" + request.getSourceQuery().getQueryId() + " FAILED WITH EMPTY HOSTS(TO RUN ON) LIST");
              throw new IOException("Empty Host List prior to remoteDispatch!");
            }
            // set shard id to host mapping into query
            request.getSourceQuery().setShardIdToHostMapping(shardIdMapping);

            // ok, we're ready for remote dispatch ...
            // add to active ...
            _activeClientQueries.add(request);
            // add to remote dispatch id set
            _activeRemoteOrLocalQueries.put(request.getSourceQuery().getQueryId(),request);
            LOG.info("Running Remote Query for Query:" + request.getSourceQuery().getQueryId());
            // and dispatch request ..
            runRemoteQuery(fileSystem,request);
          }
          // otherwise .. run local request
          else {
            // add to active ...
            _activeClientQueries.add(request);
            // add to remote dispatch id set
            _activeRemoteOrLocalQueries.put(request.getSourceQuery().getQueryId(),request);
            LOG.info("Running Local Query for Query:" + request.getSourceQuery().getQueryId());
            // and dispatch request ..
            runLocalQuery(request);
          }
        }
      } catch (IOException e) {
        LOG.error("Client Request:" + request.getClientQueryInfo().getClientQueryId() + " Failed with Exception:" + CCStringUtils.stringifyException(e));
        request.getCompletionCallback().queryFailed(request,CCStringUtils.stringifyException(e));
      }
    }

    _queuedClientQueries.addAll(requeueList);
  }
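
  /*
   * Lifecycle note: deactivateRequest() tears down a query's local temp state and
   * pumps the queue; requeueRequest() deactivates and then re-queues at the head
   * of the queue. This appears to be how a remote or local run that produced
   * results gets picked up again -- on the next pass cachedResultsAvailable()
   * should now be true, and the cache-query path returns the materialized
   * results to the client.
   */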
  @SuppressWarnings("unchecked")
  private void deactivateRequest(QueryRequest request) {
    LOG.info("Deactivating Query:" + request.getSourceQuery().getQueryId());

    // first things first, delete the query's temp directory !!!
    File queryTempFile = getTempDirForQuery(request.getSourceQuery().getQueryId());
    LOG.info("** Deleting Temp File for Query:" + request.getSourceQuery().getQueryId() + " At:" + queryTempFile.getAbsolutePath());
    FileUtils.recursivelyDeleteFile(queryTempFile);

    _activeClientQueries.remove(request);
    if (request.getRunState() == QueryRequest.RunState.RUNNING_REMOTE
        || request.getRunState() == QueryRequest.RunState.RUNNING_LOCAL) {
      _activeRemoteOrLocalQueries.remove(request.getSourceQuery().getQueryId());
    }
    request.setRunState(QueryRequest.RunState.IDLE);

    try {
      potentiallyStartNextQuery();
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
  }

  @SuppressWarnings("unchecked")
  private void requeueRequest(QueryRequest request) {
    deactivateRequest(request);
    LOG.info("Re-queuing Query:" + request.getSourceQuery().getQueryId());
    _queuedClientQueries.addFirst(request);
    try {
      potentiallyStartNextQuery();
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
  }
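
  // Remote dispatch sends the query to each slave named in the shard-id to
  // host-name mapping established in potentiallyStartNextQuery(); progress and
  // completion flow back through the two callbacks passed to startRemoteQuery().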
  @SuppressWarnings("unchecked")
  private void runRemoteQuery(final FileSystem remoteFileSystem,final QueryRequest request) {
    //LOG.info("runRemoteQuery Called for Query:" + request.getSourceQuery().getQueryId());
    if (!request.setRunState(QueryRequest.RunState.RUNNING_REMOTE)) {
      deactivateRequest((QueryRequest)request.getSourceQuery().getContextObject());
      request.getCompletionCallback().queryFailed(request, "Unable to transition to RUNNING_REMOTE");
      return;
    }

    try {
      request.getSourceQuery().startRemoteQuery(_slaveNameToOnlineStateMap,request.getSourceQuery().getShardIdToHostMapping(),

        new QueryProgressCallback() {

          @Override
          public boolean updateProgress(Query theQueryObject,float percentComplete) {
            LOG.info("Got updateProgress callback for:" + theQueryObject.getQueryId());
            return true;
          }
        },

        new RemoteQueryCompletionCallback() {

          @Override
          public void queryComplete(Query query, long resultCount) {
            LOG.info("Received QueryComplete for Query:" + request.getSourceQuery().getQueryId());
            // call remote dispatch complete
            try {
              query.remoteDispatchComplete(remoteFileSystem,_configuration,request,resultCount);

              if (resultCount > 0) {
                LOG.info("Remote Query:" + request.getSourceQuery().getQueryId() + " returned:" + resultCount + " results");
                // requeue the request (deactivating it first) ...
                requeueRequest((QueryRequest)query.getContextObject());
              }
              else {
                LOG.info("Query:" + request.getSourceQuery().getQueryId() + " returned zero results");
                deactivateRequest(request);
                QueryResult result = new QueryResult();
                result.setTotalRecordCount(0);
                request.getCompletionCallback().queryComplete(request, result);
                LOG.info("Query:" + request.getSourceQuery().getQueryId() + " DONE");
              }
            } catch (IOException e) {
              String error = "Query: " + request.getSourceQuery().getQueryId() + " Failed with Exception:" + CCStringUtils.stringifyException(e);
              LOG.error(error);
              // deactivate the request
              deactivateRequest((QueryRequest)query.getContextObject());
              request.getCompletionCallback().queryFailed(request, error);
            }
          }

          @Override
          public void queryFailed(Query query, final String reason) {
            LOG.info("Received QueryFailed for Query:" + request.getSourceQuery().getQueryId() + " Reason:" + reason);
            // inform query of failure
            query.remoteDispatchFailed(remoteFileSystem);
            // deactivate the request
            deactivateRequest((QueryRequest)query.getContextObject());
            request.getCompletionCallback().queryFailed(request, reason);
          }
        });
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      deactivateRequest(request);
      request.getCompletionCallback().queryFailed(request, CCStringUtils.stringifyException(e));
    }
  }
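
  // Local queries execute against the master's merged index, using a per-query
  // temp directory (see getTempDirForQuery) that is deleted on deactivation.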
  @SuppressWarnings("unchecked")
  private void runLocalQuery(final QueryRequest request) {
    //LOG.info("runLocalQuery Called for Query:" + request.getSourceQuery().getQueryId());
    if (!request.setRunState(QueryRequest.RunState.RUNNING_LOCAL)) {
      deactivateRequest((QueryRequest)request.getSourceQuery().getContextObject());
      request.getCompletionCallback().queryFailed(request, "Unable to transition to RUNNING_LOCAL");
      return;
    }

    try {
      request.getSourceQuery().startLocalQuery(
          CrawlEnvironment.getDefaultFileSystem(),
          _configuration,
          _masterIndex,
          getTempDirForQuery(request.getSourceQuery().getQueryId()),
          getEventLoop(),
          request,

          new RemoteQueryCompletionCallback() {

            @Override
            public void queryComplete(Query query, long resultCount) {
              LOG.info("Received QueryComplete for Query:" + request.getSourceQuery().getQueryId());
              if (resultCount > 0) {
                LOG.info("Local Query:" + request.getSourceQuery().getQueryId() + " returned:" + resultCount + " results");
                // requeue request ...
                requeueRequest((QueryRequest)query.getContextObject());
              }
              else {
                LOG.info("Query:" + request.getSourceQuery().getQueryId() + " returned zero results");
                // deactivate ...
                deactivateRequest((QueryRequest)query.getContextObject());
                // initiate callback
                QueryResult result = new QueryResult();
                result.setTotalRecordCount(0);
                request.getCompletionCallback().queryComplete(request, result);
                LOG.info("Query:" + request.getSourceQuery().getQueryId() + " DONE");
              }
            }

            @Override
            public void queryFailed(Query query, final String reason) {
              LOG.info("Received QueryFailed for Query:" + request.getSourceQuery().getQueryId() + " Reason:" + reason);
              deactivateRequest((QueryRequest)query.getContextObject());
              request.getCompletionCallback().queryFailed(request, reason);
            }
          });
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      deactivateRequest(request);
      request.getCompletionCallback().queryFailed(request, CCStringUtils.stringifyException(e));
    }
  }

  @SuppressWarnings("unchecked")
  private void runCacheQuery(QueryRequest request) {
    //LOG.info("runCacheQuery Called for Query:" + request.getSourceQuery().getQueryId());
    if (!request.setRunState(QueryRequest.RunState.RUNNING_CACHE)) {
      deactivateRequest((QueryRequest)request.getSourceQuery().getContextObject());
      request.getCompletionCallback().queryFailed(request, "Unable to transition to RUNNING_CACHE");
      return;
    }

    try {
      request.getSourceQuery().startCacheQuery(
          _masterIndex,
          CrawlEnvironment.getDefaultFileSystem(),
          _configuration,
          getEventLoop(),
          request,

          new QueryCompletionCallback() {

            @Override
            public void queryComplete(QueryRequest request,QueryResult queryResult) {
              deactivateRequest(request);
              request.getCompletionCallback().queryComplete(request, queryResult);
            }

            @Override
            public void queryFailed(QueryRequest request, String reason) {
              deactivateRequest(request);
              request.getCompletionCallback().queryFailed(request, reason);
            }
          });
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      deactivateRequest(request);
      request.getCompletionCallback().queryFailed(request, CCStringUtils.stringifyException(e));
    }
  }

  ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // COMMONCRAWL SERVER OVERRIDES
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  @Override
  protected String getDefaultDataDir() {
    return CrawlEnvironment.DEFAULT_DATA_DIR;
  }

  @Override
  protected String getDefaultHttpInterface() {
    return CrawlEnvironment.DEFAULT_HTTP_INTERFACE;
  }

  @Override
  protected int getDefaultHttpPort() {
    return CrawlEnvironment.DEFAULT_QUERY_MASTER_HTTP_PORT;
  }

  @Override
  protected String getDefaultLogFileName() {
    return "qmaster";
  }

  @Override
  protected String getDefaultRPCInterface() {
    return CrawlEnvironment.DEFAULT_RPC_INTERFACE;
  }

  @Override
  protected int getDefaultRPCPort() {
    return CrawlEnvironment.DEFAULT_QUERY_MASTER_RPC_PORT;
  }

  @Override
  protected String getWebAppName() {
    return CrawlEnvironment.QUERY_MASTER_WEBAPP_NAME;
  }

  /**
   * Get the pathname to the <code>webapps</code> files.
   * @param path Path to find.
   * @return the pathname as a URL
   */
  private static String getWebAppsPath(final String path) throws IOException {
    URL url = MasterServer.class.getClassLoader().getResource(path);
    if (url == null)
      throw new IOException("webapps not found in CLASSPATH");
    return url.toString();
  }

  private File getDefaultWebAppPath() throws IOException {
    return new File(getWebAppsPath("webapps") + File.separator + getWebAppName());
  }

  @Override
  protected boolean initServer() {

    _tempFileDirSeed = System.currentTimeMillis();

    if (_slavesFile == null) {
      LOG.error("Slaves File not specified. Specify Slaves file via --slaves");
      return false;
    }
    else {
      try {
        // get a pointer to the hdfs file system
        // _fileSystem = CrawlEnvironment.getDefaultFileSystem();
        // parse slaves file ..
        parseSlavesFile();
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
      }
    }

    // initialize database ...
    File databasePath = new File(getDataDirectory().getAbsolutePath() + "/" + CrawlEnvironment.QMASTER_DB);
    LOG.info("Config says QMaster db path is: " + databasePath);

    // validate required configuration before initializing the master index
    if (_databaseId == -1 || _localDataDir == null || _dataDriveCount == -1 || _cacheDirs == null) {
      if (_databaseId == -1)
        LOG.error("Database Id is Not Defined");
      if (_localDataDir == null)
        LOG.error("Local DataDir is NULL");
      if (_dataDriveCount == -1)
        LOG.error("Data Drive Count is not Defined");
      if (_cacheDirs == null)
        LOG.error("CacheDirs is NULL");
      return false;
    }

    try {
      FileSystem remoteFS = CrawlEnvironment.getDefaultFileSystem();

      // fully resolve slave names
      HashSet<String> onlineSlaves = new HashSet<String>();
      for (String slave : _slaveNameToOnlineStateMap.keySet()) {
        LOG.info("Slave:" + slave + " maps to FQN:" + InetAddress.getByName(slave).getCanonicalHostName());
        onlineSlaves.add(InetAddress.getByName(slave).getCanonicalHostName());
      }

      // load master index ..
      _masterIndex = new DatabaseIndexV2.MasterDatabaseIndex(_configuration, remoteFS, _dataDriveCount, _databaseId, onlineSlaves);

      // initialize record store
      _recordStore.initialize(databasePath, null);

      // load db state ...
      _masterState = (MasterState) _recordStore.getRecordByKey(MasterDBStateKey);

      if (_masterState == null) {
        _masterState = new MasterState();
        _masterState.setLastQueryId(0);
        _recordStore.beginTransaction();
        _recordStore.insertRecord("", MasterDBStateKey, _masterState);
        _recordStore.commitTransaction();
      }

      // create server channel ...
      AsyncServerChannel channel = new AsyncServerChannel(this, this.getEventLoop(), this.getServerAddress(), this);
      // register RPC services it supports ...
      registerService(channel,QueryServerMaster.spec);
    } catch (IOException e) {
      LOG.error("Database Initialization Failed with Exception:" + CCStringUtils.stringifyException(e));
      return false;
    }

    if (_tempFileDir == null) {
      _tempFileDir = new File(getDataDirectory(),"qserver_temp");
      _tempFileDir.mkdirs();
      LOG.info("TempFile Dir is null. Setting TempFile Dir to:" + _tempFileDir.getAbsolutePath());
    }

    if (_webAppRoot == null) {
      try {
        _webAppRoot = getDefaultWebAppPath();
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
      }
      LOG.info("WebApp Root not specified. Using default at:" + _webAppRoot.getAbsolutePath());
    }

    try {
      // load database state ...
      // loadState();
      // connect to slaves ...
      connectToSlaves();
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      return false;
    }

    // clear working directory...
    try {
      LOG.info("Clearing working directory:" + _hdfsWorkingDir);
      FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
      // fs.delete does not expand globs, so enumerate children and delete each one
      FileStatus entries[] = fs.globStatus(new Path(_hdfsWorkingDir,"*"));
      if (entries != null) {
        for (FileStatus entry : entries) {
          fs.delete(entry.getPath(), true);
        }
      }
      LOG.info("Cleared working directory:" + _hdfsWorkingDir);
    } catch (IOException e1) {
      LOG.error(CCStringUtils.stringifyException(e1));
    }

    try {
      // locate query db path
      _queryDBPath = locateQueryDBPath();
    } catch (IOException e) {
      LOG.error("Failed to locate QueryDB Path with Exception:" + CCStringUtils.stringifyException(e));
      return false;
    }

    if (_queryDBPath == null) {
      LOG.error("Failed to find queryDB candidate.");
      return false;
    }

    try {
      _queryServerFE = new QueryServerFE(this,_webAppRoot);
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      return false;
    }

    return true;
  }
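
  // The seed (set to server start time in initServer) keeps per-query temp
  // directories from colliding with stale directories left by a prior run.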
try { LOG.info("Clearing working directory:" + _hdfsWorkingDir); CrawlEnvironment.getDefaultFileSystem().delete(new Path(_hdfsWorkingDir,"*"),true); LOG.info("Cleared working directory:" + _hdfsWorkingDir); } catch (IOException e1) { LOG.error(CCStringUtils.stringifyException(e1)); } try { // locate query db path _queryDBPath = locateQueryDBPath(); } catch (IOException e) { LOG.error("Failed to locate QueryDB Path with Exception:" + CCStringUtils.stringifyException(e)); return false; } if (_queryDBPath == null) { LOG.error("Failed to find queryDB candidate."); return false; } try { _queryServerFE = new QueryServerFE(this,_webAppRoot); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); return false; } return true; } private File getTempDirForQuery(long queryId) { return new File(_tempFileDir,Long.toString(queryId) + "-" + _tempFileDirSeed); } private void writeMasterState() throws IOException { _recordStore.beginTransaction(); _recordStore.updateRecordByKey(MasterDBStateKey, _masterState); _recordStore.commitTransaction(); } private long getNextQueryId() throws IOException { long nextQueryId = _masterState.getLastQueryId() + 1; _masterState.setLastQueryId(nextQueryId); writeMasterState(); return nextQueryId; } private void insertUpdatePersistentInfo(PersistentQueryInfo persistentQueryInfo,boolean isUpdate) throws IOException { persistentQueryInfo.setLastAccessTime(System.currentTimeMillis()); _recordStore.beginTransaction(); if (isUpdate) { _recordStore.updateRecordByKey(CachedQueryPrefix + persistentQueryInfo.getCannonicalQueryId(), persistentQueryInfo); } else { _recordStore.insertRecord(CachedQueryIDPrefix + persistentQueryInfo.getQueryId(), CachedQueryPrefix + persistentQueryInfo.getCannonicalQueryId(), persistentQueryInfo); } _recordStore.commitTransaction(); } private PersistentQueryInfo getPersistentQueryInfo(String canonicalId) throws IOException { return (PersistentQueryInfo) _recordStore.getRecordByKey(CachedQueryPrefix+canonicalId); } @Override protected boolean parseArguements(String[] argv) { for(int i=0; i < argv.length;++i) { if (argv[i].equalsIgnoreCase("--slaves")) { _slavesFile = argv[++i]; } else if (argv[i].equalsIgnoreCase("--databaseId")) { _databaseId = Long.parseLong(argv[++i]); } else if (argv[i].equalsIgnoreCase("--localDataDir")) { _localDataDir = new Path(argv[++i]); } else if (argv[i].equalsIgnoreCase("--dataDriveCount")) { _dataDriveCount = Integer.parseInt(argv[++i]); } else if (argv[i].equalsIgnoreCase("--cacheFileDir")) { String paths = argv[++i]; String splitPaths[] = paths.split(","); _cacheDirs = new File[splitPaths.length]; int index=0; for (String path : splitPaths) { _cacheDirs[index] = new File(path); _cacheDirs[index].mkdirs(); if (!_cacheDirs[index].isDirectory()) { LOG.error("Invalid Cache Directory Specified:" + _cacheDirs[index].getAbsolutePath()); return false; } index++; } } else if (argv[i].equalsIgnoreCase("--tempFileDir")) { _tempFileDir = new File(argv[++i]); // delete the directory contents up front FileUtils.recursivelyDeleteFile(_tempFileDir); // and recreate _tempFileDir.mkdirs(); if (!_tempFileDir.isDirectory()) { LOG.error("Invalid Temp Directory Specified:" + _tempFileDir.getAbsolutePath()); return false; } } else if (argv[i].equalsIgnoreCase("--webAppRoot")) { _webAppRoot = new File(argv[++i]); if (!_webAppRoot.isDirectory()) { LOG.error("Invalid Web App Directory Specified:" + _webAppRoot.getAbsolutePath()); return false; } } } return true; } @Override protected void printUsage() { System.out.println( 
"Required Parameters: --domainFile domainFilePath --cacheFileDir cacheFileDirectory"); } @Override protected boolean startDaemons() { return true; } @Override protected void stopDaemons() { } private Path locateQueryDBPath()throws IOException { FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); FileStatus statusArray[] = fs.globStatus(new Path("crawl/querydb/db/*")); Path candidatePath = null; for (FileStatus fileStatus : statusArray) { if (candidatePath == null) { candidatePath = fileStatus.getPath(); } else { long prevTimestamp = Long.parseLong(candidatePath.getName()); long currentTimestamp = Long.parseLong(fileStatus.getPath().getName()); if (currentTimestamp > prevTimestamp) { candidatePath = fileStatus.getPath(); } } } if (candidatePath != null) { LOG.info("Selected Candidate Path:" + candidatePath); } return candidatePath; } public BaseConfig getBaseConfigForSlave(QueryServerSlaveState slave) { BaseConfig baseConfig = new BaseConfig(); baseConfig.setBaseWorkingDir(_hdfsWorkingDir); baseConfig.setQueryResultsDir(_hdfsResultsDir); baseConfig.setQueryCacheDir(_hdfsResultsCacheDir); baseConfig.setQueryDBPath(_queryDBPath.toString()); // baseConfig.setFileSystem(_fileSystem.getUri().toString()); baseConfig.setDatabaseTimestamp(_databaseId); return baseConfig; } void connectToSlaves() throws IOException { LOG.info("Connecting to Slaves"); for (QueryServerSlaveState slave : _slaves) { slave.connect(); } } void parseSlavesFile()throws IOException { LOG.info("Loading Slaves File from:" + _slavesFile); InputStream stream =null; URL resourceURL = CrawlEnvironment.getHadoopConfig().getResource(_slavesFile); if (resourceURL != null) { stream = resourceURL.openStream(); } // try as filename else { LOG.info("Could not load resource as an URL. Trying as an absolute pathname"); stream = new FileInputStream(new File(_slavesFile)); } if (stream == null) { throw new FileNotFoundException(); } BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(stream))); String slaveHostPlusCount = null; LOG.info("Loading slaves file"); while ((slaveHostPlusCount = reader.readLine()) != null) { if (!slaveHostPlusCount.startsWith("#")) { StringTokenizer tokenizer = new StringTokenizer(slaveHostPlusCount,":"); if (tokenizer.countTokens() != 3){ throw new IOException("Invalid Slave Entry:" + slaveHostPlusCount + " in slaves File"); } else { String slaveName = tokenizer.nextToken(); //TODO:INSTANCE COUNT IS IGNORED !!! QueryServerSlaveState state = new QueryServerSlaveState(this,slaveName); LOG.info("Adding slave:" + slaveName); _slaves.add(state); // map host name to onlinestate _slaveNameToOnlineStateMap.put(slaveName,state); // and add SlaveState entry _slaveStatusMap.put(slaveName, new SlaveStatus()); } } } // now close the file and reopen to to compute the crc ... reader.close(); stream.close(); CRC32 fileCRC = new CRC32(); InputStream crcStream = null; if (resourceURL != null) { crcStream = resourceURL.openStream(); } else { LOG.info("Could not load resource as an URL. 
Trying as an absolute pathname"); crcStream = new FileInputStream(new File(_slavesFile)); } byte[] buf = new byte[4096]; int nRead = 0; while ( (nRead = crcStream.read(buf, 0, buf.length)) > 0 ) { fileCRC.update(buf, 0, nRead); } _slavesFileCRC = fileCRC.getValue(); LOG.info("Slaves File CRC is:" + _slavesFileCRC); crcStream.close(); } @SuppressWarnings("unchecked") void slaveStatusChanged(QueryServerSlaveState slave,SlaveStatus slaveStatus) { // LOG.info("Received slaveStatusChanged from slave:" + slave.getFullyQualifiedName()); if (slaveStatus != null && slaveStatus.getQueryStatus().size() != 0) { //LOG.info("Received:" + slaveStatus.getQueryStatus() + " QueryStatus updated from Slave:" + slave.getFullyQualifiedName()); // broadcast all query changes ... for (QueryStatus queryStatus : slaveStatus.getQueryStatus()) { //LOG.info("RCVD Status for Query:" + queryStatus.getQueryId() + " Status:" + QueryStatus.Status.toString(queryStatus.getStatus())); QueryRequest request = _activeRemoteOrLocalQueries.get(queryStatus.getQueryId()); if (request != null) { //LOG.info("FOUND QueryRequestObj:" + request + " for Query:" + queryStatus.getQueryId()); try { request.getSourceQuery().updateQueryStatusForSlave(slave.getHostName(),queryStatus); } catch (IOException e) { LOG.error("Error Updating QueryStatus for Query:" + request.getSourceQuery().getQueryId() + " Slave:" + slave.getFullyQualifiedName() + " Error:" + CCStringUtils.stringifyException(e)); } } else { LOG.error("DID NOT FIND QueryRequestObj for Query:" + queryStatus.getQueryId()); } } // clear query status array ... slaveStatus.getQueryStatus().clear(); } try { if (slaveStatus != null) { _slaveStatusMap.get(slave.getHostName()).merge(slaveStatus); } else { _slaveStatusMap.get(slave.getHostName()).clear(); } } catch (CloneNotSupportedException e) { } try { potentiallyStartNextQuery(); } catch (IOException e) { LOG.error("Error encountered calling startNextQuery. Exception:" + CCStringUtils.stringifyException(e)); } } private void completeContentQuery(AsyncContext<ContentQueryRPCInfo, ContentQueryRPCResult> rpcContext,ArcFileItem item) { if (item != null) { rpcContext.getOutput().setSuccess(true); rpcContext.getOutput().setArcFileResult(item); rpcContext.setStatus(AsyncRequest.Status.Success); try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } else { rpcContext.getOutput().setSuccess(false); rpcContext.setStatus(AsyncRequest.Status.Error_RequestFailed); try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } private void startS3Download(final AsyncContext<ContentQueryRPCInfo, ContentQueryRPCResult> rpcContext,final ArchiveInfo archiveInfo) { _s3DownloaderThreadPool.submit(new ConcurrentTask<ArcFileItem>(_eventLoop,new Callable<ArcFileItem>() { @Override public ArcFileItem call() throws Exception { LOG.info("Starting S3 Download for URL:" + rpcContext.getInput().getUrl()); return S3Helper.retrieveArcFileItem(archiveInfo, _eventLoop); } }, new CompletionCallback<ArcFileItem>() { @Override public void taskComplete(ArcFileItem loadResult) { LOG.info("S3 Download for URL:" + rpcContext.getInput().getUrl() + " Completed with " + ((loadResult == null) ? 
"NULL": "Valid") + "load Result"); completeContentQuery(rpcContext,loadResult); } @Override public void taskFailed(Exception e) { LOG.error("S3 Download for URL:" + rpcContext.getInput().getUrl() + " Failed with Exception:" + CCStringUtils.stringifyException(e)); completeContentQuery(rpcContext,null); } })); } @Override public void doContentQuery(final AsyncContext<ContentQueryRPCInfo, ContentQueryRPCResult> rpcContext) throws RPCException { /* LOG.info("Got ContentQuery RPC for URL:" + rpcContext.getInput().getUrl() + "Sending directly to slaves"); final ContentQueryState queryState = new ContentQueryState(); for (QueryServerSlaveState slaveState : _slaves) { if (slaveState.getRemoteStub() != null) { queryState.totalDispatchCount++; slaveState.getRemoteStub().doMetadataQuery(rpcContext.getInput(),new AsyncRequest.Callback<ContentQueryRPCInfo, CrawlDatumAndMetadata>() { @Override public void requestComplete(final AsyncRequest<ContentQueryRPCInfo, CrawlDatumAndMetadata> request) { queryState.completedCount++; if (request.getStatus() == AsyncRequest.Status.Success && !queryState.done) { queryState.done = true; // found a valid result ... LOG.info("Found Metadata for URL:" + rpcContext.getInput().getUrl()); // check to see if archive information is available for the this url ... ArchiveInfo archiveInfo = null; if (request.getOutput().getMetadata().getArchiveInfo().size() != 0) { Collections.sort(request.getOutput().getMetadata().getArchiveInfo(), new Comparator<ArchiveInfo> () { @Override public int compare(ArchiveInfo o1, ArchiveInfo o2) { return (o1.getArcfileDate() < o2.getArcfileDate()) ? -1 : (o1.getArcfileDate() > o2.getArcfileDate()) ? 1 : 0; } }); archiveInfo = request.getOutput().getMetadata().getArchiveInfo().get(request.getOutput().getMetadata().getArchiveInfo().size()-1); } // if archive info is available ... if (archiveInfo != null) { LOG.info("Archive Info Found for URL:" + rpcContext.getInput().getUrl() + " Starting S3Download"); // start a download thread ... startS3Download(rpcContext,archiveInfo); } // otherwsie ... fail request ... else { LOG.info("Archive Info not Found for URL:" + rpcContext.getInput().getUrl() + " Failing Request"); completeContentQuery(rpcContext,null); } } if (!queryState.done && queryState.completedCount == queryState.totalDispatchCount) { // ok all the queries failed to return results ... fail the request ... LOG.info("All Queries Completed and Failed for MetadataQuery for URL:" + rpcContext.getInput().getUrl()); rpcContext.setStatus(AsyncRequest.Status.Error_RequestFailed); try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } }); } } */ try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } @Override public void IncomingClientConnected(AsyncClientChannel channel) { } @Override public void IncomingClientDisconnected(AsyncClientChannel channel) { } @Override public ArrayList<ShardIndexHostNameTuple> mapShardIdsForIndex(String indexName)throws IOException { ArrayList<ShardIndexHostNameTuple> tupleListOut = _masterIndex.mapShardIdsForIndex(indexName); if (tupleListOut == null) { throw new IOException("Unable to find tupleListMapping for Index:" + indexName); } return tupleListOut; } }