/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.listcrawler; import java.io.File; import java.io.IOException; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.MalformedURLException; import java.net.URL; import java.net.URLClassLoader; import java.util.ArrayList; import java.util.Date; import java.util.Map; import java.util.TreeMap; import java.util.Vector; import java.util.concurrent.Future; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import org.apache.hadoop.fs.Path; import org.apache.log4j.DailyRollingFileAppender; import org.apache.log4j.Layout; import org.apache.log4j.spi.LoggingEvent; import org.commoncrawl.async.Timer; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.db.RecordStore; import org.commoncrawl.io.NIODNSQueryResult; import org.commoncrawl.io.NIODNSQueryClient; import org.commoncrawl.io.NIODNSResolver; import org.commoncrawl.io.NIOHttpConnection; import org.commoncrawl.protocol.CrawlSegmentHost; import org.commoncrawl.protocol.CrawlSegmentURL; import org.commoncrawl.protocol.CrawlURL; import org.commoncrawl.protocol.URLFP; import org.commoncrawl.rpc.base.internal.AsyncClientChannel; import org.commoncrawl.server.ServletLauncher; import org.commoncrawl.service.crawler.CrawlTarget; import org.commoncrawl.service.crawler.CrawlerServer; import org.commoncrawl.service.crawler.filters.URLPatternBlockFilter; import org.commoncrawl.service.crawler.filters.Filter.FilterResult; import org.commoncrawl.service.listcrawler.CrawlListDatabaseRecord; import org.commoncrawl.service.queryserver.QueryServerMaster; import org.commoncrawl.util.URLFingerprint; import org.commoncrawl.util.URLUtils; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.CustomLogger; import org.commoncrawl.util.IPAddressUtils; import org.mortbay.jetty.Handler; import org.mortbay.jetty.servlet.ServletHolder; import com.ibm.icu.text.SimpleDateFormat; /** * An advanced version of the basic crawler that maintains a long term history and cache of * crawled content and that also supports lists based crawling * * @author rana * */ public class ProxyServer extends CrawlerServer implements CrawlQueueLoader { private static final int MAX_QUEUED_DNS_REQUESTS = 1000; static ProxyServer _server = null; private CacheManager _cache; private CrawlHistoryManager _crawlHistoryManager; private int _cacheFlushThreshold = -1; private CustomLogger _requestLog; private InetSocketAddress _queryMasterAddress; private AsyncClientChannel _queryMasterChannel; private QueryServerMaster.AsyncStub _queryMasterStub; private boolean _queryMasterAvailable = false; private URLPatternBlockFilter _urlBlockFilter = null; private int _debugMode = 0; private File _crawlHistoryLogDir = null; private RecordStore _recordStore = new RecordStore(); public ProxyServer() { } public CacheManager getCache() { return _cache; } /** * * @return crawl history manager */ public CrawlHistoryManager getCrawlHistoryManager() { return _crawlHistoryManager; } /** * get reference to the singleton server instance * */ public static ProxyServer getSingleton() { return _server; } /** * * @return */ public File getCrawlHistoryDataDir() { return _crawlHistoryLogDir; } /** * get the request log file name * */ public static String getRequestLogFileName() { return "requestLog.log"; } /** * get at the async stub for the query master service * */ public QueryServerMaster.AsyncStub getQueryMasterStub() { return _queryMasterStub; } /** * get the connected / disconnected status of the query master service * connection * */ public boolean isConnectedToQueryMaster() { return _queryMasterAvailable; } @Override protected String getDefaultDataDir() { return CrawlEnvironment.DEFAULT_DATA_DIR; } @Override protected String getDefaultHttpInterface() { return CrawlEnvironment.DEFAULT_HTTP_INTERFACE; } @Override protected int getDefaultHttpPort() { return CrawlEnvironment.PROXY_SERVICE_HTTP_PORT; } @Override protected String getDefaultLogFileName() { return "proxyServer.log"; } @Override protected String getDefaultRPCInterface() { return CrawlEnvironment.DEFAULT_RPC_INTERFACE; } @Override protected int getDefaultRPCPort() { return CrawlEnvironment.PROXY_SERVICE_RPC_PORT; } @Override protected String getWebAppName() { return "proxy"; }; SimpleDateFormat dateFormat = new SimpleDateFormat("YYYY.MM.dd-HH:mm:ss.SSS"); synchronized void logProxyFailure(int httpResultCode, String failureDesc, String originalURL, String finalURL, long startTime) { StringBuffer sb = new StringBuffer(2048); sb.append(String.format("%1$24.24s ", dateFormat .format(new Date(startTime)))); sb.append(String.format("%1$8.8s ", (System.currentTimeMillis() - startTime))); sb.append(String.format("%1$4.4s ", httpResultCode)); sb.append(String.format("%1$40.40s ", failureDesc)); sb.append(originalURL); sb.append(" "); sb.append(finalURL); _requestLog.error(sb.toString()); } synchronized void logProxySuccess(int httpResultCode, String origin, String originalURL, String finalURL, long startTime) { StringBuffer sb = new StringBuffer(2048); sb.append(String.format("%1$24.24s ", dateFormat .format(new Date(startTime)))); sb.append(String.format("%1$8.8s ", (System.currentTimeMillis() - startTime))); sb.append(String.format("%1$4.4s ", httpResultCode)); sb.append(String.format("%1$10.10s ", origin)); sb.append(originalURL); sb.append(" "); sb.append(finalURL); _requestLog.error(sb.toString()); } private static class CustomLoggerLayout extends Layout { StringBuffer sbuf = new StringBuffer(1024); @Override public String format(LoggingEvent event) { sbuf.setLength(0); sbuf.append(event.getMessage()); sbuf.append(LINE_SEP); return sbuf.toString(); } @Override public boolean ignoresThrowable() { return true; } public void activateOptions() { } } @Override protected boolean initServer() { _server = this; if (super.initServer()) { try { // get database path ... File databasePath = new File(getDataDirectory().getAbsolutePath() + "/" + CrawlEnvironment.PROXY_SERVICE_DB); LOG.info("Config says Proxy db path is: "+databasePath); // initialize record store _recordStore.initialize(databasePath, null); _requestLog = new CustomLogger("RequestLog"); LOG.info("Initializing Proxy Request Log"); _requestLog.addAppender(new DailyRollingFileAppender( new CustomLoggerLayout(), _server.getLogDirectory() + "/requestLog.log", "yyyy-MM-dd")); LOG.info("Constructing CacheManager. HDFSPath:" + CrawlEnvironment.getDefaultFileSystem() + " LocalDataPath:" + getDataDirectory()); _cache = new CacheManager(CrawlEnvironment.getDefaultFileSystem(), getDataDirectory(), getEventLoop()); LOG.info("Initializing CacheManager"); if (_cacheFlushThreshold != -1) { _cache.setCacheFlushThreshold(_cacheFlushThreshold); } int cacheManagerInitFlags = 0; if (_debugMode == 1) { cacheManagerInitFlags |= CacheManager.INIT_FLAG_SKIP_HDFS_WRITER_INIT | CacheManager.INIT_FLAG_SKIP_INDEX_LOAD; } _cache.initialize(cacheManagerInitFlags); LOG.info("Initializing History Manager"); _crawlHistoryLogDir = new File(getDataDirectory(), "historyData"); _crawlHistoryLogDir.mkdir(); // default to no init flags for history manager int historyManagerFlags = 0; // but if in debug mode, disable a whole bunch of things (for now) if (_debugMode == 1) { historyManagerFlags = CrawlHistoryManager.INIT_FLAG_SKIP_LOG_WRITER_THREAD_INIT; } _crawlHistoryManager = new CrawlHistoryManager(CrawlEnvironment .getDefaultFileSystem(), new Path("crawl/proxy/history"), _crawlHistoryLogDir, getEventLoop(), historyManagerFlags); // start queueing thread LOG.info("Starting Communications with Query Master At:" + _queryMasterAddress); _queryMasterChannel = new AsyncClientChannel(_eventLoop, new InetSocketAddress(0), _queryMasterAddress, this); _queryMasterChannel.open(); _queryMasterStub = new QueryServerMaster.AsyncStub(_queryMasterChannel); } catch (IOException e) { LOG.error("Failed to Initialize CacheManager. Exception:" + CCStringUtils.stringifyException(e)); } /** init jersey framework **/ String classesRoot = System.getProperty("commoncrawl.classes.root"); LOG.info("Classes Root is:" + classesRoot); try { ArrayList<URL> urls = new ArrayList<URL>(); urls.add(new File(classesRoot).toURL()); LOG.info("URL is:" + urls.get(0).toString()); URLClassLoader loader = new URLClassLoader(urls.toArray(new URL[0]), Thread.currentThread().getContextClassLoader()); _webServer.getWebAppContext().setClassLoader(loader); } catch (MalformedURLException e) { LOG.error(CCStringUtils.stringifyException(e)); } ServletHolder holder = _webServer.addServlet(null, "/*", ServletLauncher.class); holder.setInitParameter(ServletLauncher.SERVLET_REGISTRY_KEY, ProxyServletRegistry.class.getCanonicalName()); // holder.setInitParameter(ServletContainer.APPLICATION_CONFIG_CLASS, // "org.commoncrawl.crawl.proxy.ProxyServerUIApp"); // holder.setInitParameter("x-hack-nocache","true"); // holder.setInitParameter(ServletContainer.RESOURCE_CONFIG_CLASS,"com.sun.jersey.api.core.ScanningResourceConfig"); getWebServer().setThreads(20, 175, 1); // add list uploader filter getWebServer().getWebAppContext().addFilter(MultiPartFilter.class, "/ListUploader", Handler.ALL); getWebServer().addServlet("proxyRequest", "/proxy", ProxyServlet.class); // getWebServer().addServlet("testProxyRequest", "*", // ProxyServlet2.class); getWebServer().addServlet("logRequest", RequestLogServlet.servletPath, RequestLogServlet.class); // add uploader servlet getWebServer().addServlet("uploader", "/ListUploader", ListUploadServlet.class); getWebServer().addServlet("uploader", "/ListUploaderDirect", ListUploadServlet.class); // add upload form getWebServer().addServlet("uploadForm", "/ListUploadForm", ListUploadServlet.ListUploadForm.class); // view lists form getWebServer().addServlet("viewLists", "/CrawlLists", CrawlListsServlet.class); // hack getWebServer().addServlet("requeueList", "/Requeue", ListUploadServlet.ListRequeueServlet.class); getWebServer().addServlet("requeueBrokenLists", "/RequeueBrokenLists", ListUploadServlet.RequeueBrokenListsServlet.class); // add doc uploader filter and servlet //getWebServer().getWebAppContext().addFilter( // DocUploadMultiPartFilter.class, "/DocUploader", Handler.ALL); //getWebServer().addServlet("docuploader-check", "/DocInCache", // DocUploadServlet.DocInCacheCheck.class); // disable list loader if in debug mode if (_debugMode == 1) { LOG.warn("List Loader Disabled in Debug Mode"); } else { // ok do a delayed list loader initialization getEventLoop().setTimer(new Timer(10000, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { initListLoader(); } })); } } return true; } /** * get database name for this instance * */ @Override public String getDatabaseName() { return CrawlEnvironment.PROXY_SERVICE_DB; } /** * enable the crawl log (yes by default) * */ @Override public boolean enableCrawlLog() { return false; } /** * externally manage crawl segments * */ @Override public boolean externallyManageCrawlSegments() { return true; } /** * crawl completed for the specified crawl target * */ @Override public void crawlComplete(NIOHttpConnection connection, CrawlURL url, CrawlTarget optTargetObj, boolean successOrFailure) { // log it into the history log _crawlHistoryManager.crawlComplete(url); // cache if necessary if (url.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS && optTargetObj != null && optTargetObj.getCompletionCallback() != null) { _server.logProxySuccess(url.getResultCode(), "origin", url.getUrl(), url .getRedirectURL(), optTargetObj.getRequestStartTime()); } else { if (optTargetObj != null && optTargetObj.getCompletionCallback() != null) { // if (url.getLastAttemptFailureReason() != // CrawlURL.FailureReason.RobotsExcluded && // url.getLastAttemptFailureReason() != // CrawlURL.FailureReason.BlackListedURL) { _server.logProxyFailure(url.getResultCode(), CrawlURL.FailureReason .toString(url.getLastAttemptFailureReason()) + " - " + url.getLastAttemptFailureReason(), url.getUrl(), url .getRedirectURL(), (optTargetObj != null && optTargetObj .getRequestStartTime() != -1) ? optTargetObj.getRequestStartTime() : System.currentTimeMillis()); // } } } if (optTargetObj != null && optTargetObj.getCompletionCallback() != null) { // delegate to callback optTargetObj.getCompletionCallback().crawlComplete(connection, url, optTargetObj, successOrFailure); } else { // completion callback is null... we need to handle the caching of this // object direct if (successOrFailure && url.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) { // LOG.info("### CACHING Calling cacheCrawlURL for URL:" + // url.getUrl()); ProxyServlet.cacheCrawlURLResult(url, null); } else { // LOG.info("### CACHING Skipping Write of crawlURL:" + url.getUrl() + // "SuccessOrFailFlag:" + successOrFailure + " LastAttemptResult:" + // url.getLastAttemptResult()); } } } /** * Inject an externally populated crawl url into the proxy server's queues * * @param crawlURL */ public void injectCrawlURL(final CrawlURL crawlURL, final Semaphore completionSemaphore) { getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { LOG.info("Received Injected URL:" + crawlURL.getUrl()); // log it into the history log _crawlHistoryManager.crawlComplete(crawlURL); // log it _server.logProxySuccess(crawlURL.getResultCode(), "injection", crawlURL .getUrl(), crawlURL.getRedirectURL(), 0); // and cache it ProxyServlet.cacheCrawlURLResult(crawlURL, completionSemaphore); } })); } /** * notification that a fetch is starting on the target url * */ @Override public void fetchStarting(CrawlTarget target, NIOHttpConnection connection) { } @Override protected boolean parseArguements(String[] argv) { if (super.parseArguements(argv)) { for (int i = 0; i < argv.length; ++i) { if (argv[i].equalsIgnoreCase("--querymaster")) { if (i + 1 < argv.length) { _queryMasterAddress = new InetSocketAddress(argv[++i], CrawlEnvironment.DEFAULT_QUERY_MASTER_RPC_PORT); } } else if (argv[i].equalsIgnoreCase("--cacheFlushThreshold")) { if (i + 1 < argv.length) { _cacheFlushThreshold = Integer.parseInt(argv[++i]); } } else if (argv[i].equalsIgnoreCase("--debugMode")) { if (i + 1 < argv.length) { _debugMode = Integer.parseInt(argv[++i]); } } } return (_queryMasterAddress != null); } else { return false; } } @Override public void OutgoingChannelConnected(AsyncClientChannel channel) { if (channel == _queryMasterChannel) { LOG.info("Connected to QueryMaster Server"); _queryMasterAvailable = true; } else { super.OutgoingChannelConnected(channel); } } @Override public boolean OutgoingChannelDisconnected(AsyncClientChannel channel) { if (channel == _queryMasterChannel) { // LOG.info("QueryMaster Server Disconnected"); _queryMasterAvailable = false; return false; } else { return super.OutgoingChannelDisconnected(channel); } } private static final int MAX_TARGETS_PER_ITERATION = 100; static final int DEFAULT_DNS_TIMEOUT = 30000; /** * should we use black lists * */ @Override public boolean useGlobalBlockLists() { return false; } /** * check host stats for failures * */ public boolean failHostsOnStats() { return true; } /** reload custom filters on directory service change **/ @Override protected void reloadFilters() { // load crawler's filters ... super.reloadFilters(); // and load our custom filters .. _urlBlockFilter = new URLPatternBlockFilter(); try { LOG.info("### Loading URL Block Filter"); _urlBlockFilter.loadFromPath(getDirectoryServiceAddress(), CrawlEnvironment.PROXY_URL_BLOCK_LIST_FILTER_PATH, false); LOG.info("### IP Address Block Filter "); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } /** * * @return the path to the crawl rate override filter */ @Override public String getCrawlRateOverrideFilterPath() { return CrawlEnvironment.PROXY_CRAWL_RATE_MOD_FILTER_PATH; } /** is url in server block list **/ @Override public boolean isURLInBlockList(URL url) { String rootDomainName = URLUtils.extractRootDomainName(url.getHost()); if (rootDomainName != null) { return _urlBlockFilter.filterItem(rootDomainName, url.getHost(), url .getPath(), null, null) == FilterResult.Filter_Reject; } LOG.warn("Invalid Domain passed to isURLInBlockList via URL:" + url.toString()); return true; } @Override public int getMaxRobotsExlusionsInLoopOverride() { return 20; } /** * get the host idle flush threshold * * the number of milliseconds a host needs to be idle for it to be purged from * memory * **/ @Override public int getHostIdleFlushThreshold() { return 120000; } /** * disable cycle timer * */ @Override public boolean disableCycleTimer() { return true; } /************************************************************************/ // List Loader Support Routines /************************************************************************/ private CrawlSegmentHost _activeLoadHost = null; private long _loaderLastUpdateTime = -1; private Semaphore _loaderDNSSemaphore = new Semaphore( MAX_QUEUED_DNS_REQUESTS); private Semaphore _loaderQueueSemaphore = new Semaphore(1); private Thread _loaderQueuePollThread; private boolean _shutdownPollThread = false; void initListLoader() { _loaderQueuePollThread = new Thread(new Runnable() { @Override public void run() { while (!_shutdownPollThread) { try { CrawlSegmentHostQueueItem queueItem = _loaderQueue.take(); if (queueItem._host != null) { dispatchHost(queueItem._host); } else { return; } } catch (InterruptedException e1) { } } } }); _loaderQueuePollThread.start(); _crawlHistoryManager.startQueueLoaderThread(this); } void shutdownListLoader() { _shutdownPollThread = true; try { _loaderQueue.put(new CrawlSegmentHostQueueItem()); } catch (InterruptedException e1) { } try { _loaderQueuePollThread.join(); } catch (InterruptedException e) { } _loaderQueuePollThread = null; _shutdownPollThread = false; _crawlHistoryManager.stopQueueLoaderThread(); } public static class CrawlSegmentHostQueueItem { public CrawlSegmentHostQueueItem(CrawlSegmentHost host) { _host = host; } public CrawlSegmentHostQueueItem() { _host = null; } CrawlSegmentHost _host; } static final int MAX_QUEUED_HOSTS = 40000; LinkedBlockingQueue<CrawlSegmentHostQueueItem> _loaderQueue = new LinkedBlockingQueue<CrawlSegmentHostQueueItem>(MAX_QUEUED_HOSTS); @Override public void queueURL(URLFP urlfp, String url) { _loaderLastUpdateTime = System.currentTimeMillis(); // LOG.info("Received QueueURL Request for URL:" + url); String hostName = URLUtils.fastGetHostFromURL(url); if (hostName == null) { LOG.error("###queueURL failed for url:" + url + " with null HostName!"); return; } if (hostName.length() != 0) { CrawlSegmentHost dispatchHost = null; try { _loaderQueueSemaphore.acquireUninterruptibly(); // ok is there an active host ... if (_activeLoadHost != null && !_activeLoadHost.getHostName().equals(hostName)) { // ok time to dispatch this guy immediately dispatchHost = _activeLoadHost; _activeLoadHost = null; } if (_activeLoadHost == null) { _activeLoadHost = new CrawlSegmentHost(); _activeLoadHost.setHostName(hostName); _activeLoadHost.setHostFP(URLFingerprint .generate64BitURLFPrint(hostName)); _activeLoadHost.setSegmentId(-1); _activeLoadHost.setListId(ProxyServer.getServer() .getHighPriorityListId()); } // queue the url CrawlSegmentURL urlObject = new CrawlSegmentURL(); urlObject.setUrl(url); urlObject.setUrlFP(urlfp.getUrlHash()); _activeLoadHost.getUrlTargets().add(urlObject); // if target count exceeds threshold ... if (_activeLoadHost.getUrlTargets().size() >= 1000) { dispatchHost = _activeLoadHost; _activeLoadHost = null; } } finally { _loaderQueueSemaphore.release(); } if (dispatchHost != null) { try { _loaderQueue.put(new CrawlSegmentHostQueueItem(dispatchHost)); } catch (InterruptedException e) { } } } } @Override public void flush() { CrawlSegmentHost dispatchHost = null; _loaderQueueSemaphore.acquireUninterruptibly(); try { dispatchHost = _activeLoadHost; _activeLoadHost = dispatchHost; } finally { _loaderQueueSemaphore.release(); } if (dispatchHost != null) { try { _loaderQueue.put(new CrawlSegmentHostQueueItem(dispatchHost)); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } private void dispatchHost(final CrawlSegmentHost host) { // LOG.info("Dispatch Host Called for Host:" + host.getHostName()); // acquire loader semaphore ... try { while (!_loaderDNSSemaphore.tryAcquire(100, TimeUnit.MILLISECONDS)) { LOG.info("###URLLoader Waiting on DNS Queue"); } } catch (InterruptedException e1) { LOG.error(CCStringUtils.stringifyException(e1)); } host.setIpAddress(0); // schedule resolution ... NIODNSQueryClient queryClient = new NIODNSQueryClient() { @Override public void AddressResolutionFailure(NIODNSResolver source,String hostName, Status status, String errorDesc) { LOG.info("queueExternalURL for Failed with:DNS Failed for High Priority Request:" + hostName + " Errror:" + errorDesc); _loaderDNSSemaphore.release(); // fail the urls ... getEngine().processHostIPResolutionResult(host,true,null); } @Override public void AddressResolutionSuccess(NIODNSResolver source, String hostName, String name, InetAddress address, long addressTTL) { int hostAddress = 0; if (address != null && address.getAddress() != null) { byte[] addr = address.getAddress(); if (addr.length == 4) { hostAddress = IPAddressUtils.IPV4AddressToInteger(addr); } } if (hostAddress != 0) { // LOG.info("DNS Success for Host:" + hostName + "Queueing"); // set the address into the host object host.setIpAddress(hostAddress); host.setTtl(addressTTL + 30000000); queueExternalHost(host, null); } else { // LOG.error("DNS Failed for High Priority URL:"+ url + // " with Zero IP"); LOG.info("queueExternalURL Failed with:DNS Failed for Host:" + host.getHostName() + " with Zero IP"); } _loaderDNSSemaphore.release(); } @Override public void DNSResultsAvailable() { } @Override public void done(NIODNSResolver source, Future<NIODNSQueryResult> task) { } }; try { getServer().getDNSServiceResolver().resolve(queryClient, host.getHostName(), false, true, DEFAULT_DNS_TIMEOUT); } catch (IOException e) { // LOG.error("Failed to Dispatch DNS Query for High Priority URL:" + url + // " Exception:" + CCStringUtils.stringifyException(e)); LOG.info("queueExternalURL for Host:" + host.getHostName() + " Failed with:Exception:" + CCStringUtils.stringifyException(e)); _loaderDNSSemaphore.release(); } } @Override public void stop() { LOG.info("ProxyServer Stop Called"); if (_crawlHistoryManager != null) { LOG.info("Shutting Down CrawlHistory List Loader "); shutdownListLoader(); LOG.info("Shutting Down CrawlHistoryManager"); _crawlHistoryManager.shutdown(); } if (_cache != null) { LOG.info("Shutting Down CacheManager"); _cache.shutdown(); } LOG.info("ProxyServer Callign Super Stop"); super.stop(); } private static final String CRAWL_LIST_RECORD_PARENT_ID = "CRAWL_LIST_RECORD_TYPE"; private static final String CRAWL_LIST_RECORD_PREFIX = "CrawlList_"; void requeueList(final long originalListId, final File listDataFile) { final Semaphore blockingSemaphore = new Semaphore(0); getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { try { // get the existing database record CrawlListDatabaseRecord record = (CrawlListDatabaseRecord) _recordStore .getRecordByKey(CRAWL_LIST_RECORD_PREFIX + originalListId); if (record != null) { LOG.info("### Found Record for ListId:" + originalListId); // delete record // tell history manager to load list ... LOG.info("### Reloading List"); long newListId = _crawlHistoryManager.loadList(listDataFile,record.getRefreshInterval()); LOG.info("### Reloaded List Id is:" + newListId); // update list id record.setListId(newListId); // update list filename record.setTempFileName(listDataFile.getName()); LOG.info("### Upading Database Record"); _recordStore.beginTransaction(); _recordStore.deleteRecordById(record.getRecordId()); _recordStore.insertRecord(CRAWL_LIST_RECORD_PARENT_ID, CRAWL_LIST_RECORD_PREFIX + newListId, record); _recordStore.commitTransaction(); LOG.info("### Updated Database Record"); } } catch (IOException e) { } } })); } long queueListImportRequest(final CrawlListDatabaseRecord record) { final Semaphore blockingSemaphore = new Semaphore(0); getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { try { File listDataPath = new File(getCrawlHistoryDataDir(), record .getTempFileName()); long listId = _crawlHistoryManager.loadList(listDataPath,record.getRefreshInterval()); record.setListId(listId); _recordStore.beginTransaction(); _recordStore.insertRecord(CRAWL_LIST_RECORD_PARENT_ID, CRAWL_LIST_RECORD_PREFIX + listId, record); _recordStore.commitTransaction(); } catch (IOException e) { record.setFieldClean(CrawlListDatabaseRecord.Field_LISTID); LOG.error(CCStringUtils.stringifyException(e)); } finally { blockingSemaphore.release(); } } })); blockingSemaphore.acquireUninterruptibly(); return (record.isFieldDirty(CrawlListDatabaseRecord.Field_LISTID)) ? record .getListId() : -1; } private static class MutableBoolean { public boolean result = false; } public boolean doesListBelongToCustomer(final long listId,final String customerId) { final MutableBoolean resultValue = new MutableBoolean(); final Runnable runnable = new Runnable() { @Override public void run() { try { CrawlListDatabaseRecord record = (CrawlListDatabaseRecord) _recordStore .getRecordByKey(CRAWL_LIST_RECORD_PREFIX + listId); if (record != null) { resultValue.result = (record.getCustomerName().equals(customerId)); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } }; if (Thread.currentThread() != getEventLoop().getEventThread()) { final Semaphore blockingSemaphore = new Semaphore(0); getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { runnable.run(); blockingSemaphore.release(); } })); blockingSemaphore.acquireUninterruptibly(); } else { runnable.run(); } return resultValue.result; } /** * get the list ids associated with the specified customer id * * @param customerId * @return Set of list ids */ public Map<Long, CrawlListDatabaseRecord> getListInfoForCustomerId( final String customerId) { final TreeMap<Long, CrawlListDatabaseRecord> listRecords = new TreeMap<Long, CrawlListDatabaseRecord>(); final Runnable runnable = new Runnable() { @Override public void run() { try { Vector<Long> recordIds = _recordStore .getChildRecordsByParentId(CRAWL_LIST_RECORD_PARENT_ID); for (long recordId : recordIds) { CrawlListDatabaseRecord databaseRecord = (CrawlListDatabaseRecord) _recordStore .getRecordById(recordId); if (databaseRecord != null && (customerId.equals("*") || databaseRecord.getCustomerName() .equals(customerId))) { listRecords.put(databaseRecord.getListId(), databaseRecord); } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } }; if (Thread.currentThread() != getEventLoop().getEventThread()) { final Semaphore blockingSemaphore = new Semaphore(0); getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { runnable.run(); blockingSemaphore.release(); } })); blockingSemaphore.acquireUninterruptibly(); } else { runnable.run(); } return listRecords; } }