/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.parser.server; import java.net.URL; import java.util.Collection; import java.util.HashSet; import java.util.Map.Entry; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.Semaphore; import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.commoncrawl.async.Callback; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.rpc.base.internal.AsyncClientChannel; import org.commoncrawl.rpc.base.internal.AsyncContext; import org.commoncrawl.rpc.base.internal.AsyncServerChannel; import org.commoncrawl.rpc.base.internal.NullMessage; import org.commoncrawl.rpc.base.internal.AsyncRequest.Status; import org.commoncrawl.rpc.base.shared.RPCException; import org.commoncrawl.server.CommonCrawlServer; import org.commoncrawl.service.parser.ParseRequest; import org.commoncrawl.service.parser.ParseResult; import org.commoncrawl.service.parser.ParserServiceSlave; import org.commoncrawl.service.parser.SlaveStatus; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.FlexBuffer; import com.google.common.collect.Multimap; import com.google.common.collect.TreeMultimap; /** * * @author rana * */ public class ParserSlaveServer extends CommonCrawlServer implements ParserServiceSlave, AsyncServerChannel.ConnectionCallback{ private static final Log LOG = LogFactory.getLog(ParserSlaveServer.class); private static final int MAX_QUEUE_SIZE_DEFAULT = 10; private static final int DEFAULT_WORKER_THREAD_COUNT = 5; private int max_queue_size = MAX_QUEUE_SIZE_DEFAULT; private int thread_count = DEFAULT_WORKER_THREAD_COUNT; private class Request { public Request(AsyncContext<ParseRequest, ParseResult> request) { requestContext = request; } public AsyncContext<ParseRequest, ParseResult> requestContext; } private LinkedBlockingDeque<Request> requestQueue = new LinkedBlockingDeque<Request>(); private HashSet<Long> _activeChannels = new HashSet<Long>(); private Thread _parserThreads[]; private Semaphore _threadSemaphore = null; private AtomicInteger _activeThreads = new AtomicInteger(); @Override protected String getDefaultDataDir() { return CrawlEnvironment.DEFAULT_DATA_DIR; } @Override protected String getDefaultHttpInterface() { return CrawlEnvironment.DEFAULT_HTTP_INTERFACE; } @Override protected int getDefaultHttpPort() { return CrawlEnvironment.DEFAULT_PARSER_SLAVE_HTTP_PORT; } @Override protected String getDefaultLogFileName() { return "historyserver.log"; } @Override protected String getDefaultRPCInterface() { return CrawlEnvironment.DEFAULT_RPC_INTERFACE; } @Override protected int getDefaultRPCPort() { return CrawlEnvironment.DEFAULT_PARSER_SLAVE_RPC_PORT; } @Override protected String getWebAppName() { return CrawlEnvironment.DEFAULT_PARSER_SLAVE_WEBAPP_NAME; } @Override protected boolean initServer() { try { // create server channel ... AsyncServerChannel channel = new AsyncServerChannel(this, this.getEventLoop(), this.getServerAddress(),this); // register RPC services it supports ... registerService(channel,ParserServiceSlave.spec); } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); return false; } return true; } /** do a clean shutdown (if possible) **/ @Override public void stop() { // ok, wait to grab the checkpoint thread semaphore LOG.info("Server Shutdown Detected."); // ok safe to call super now ... super.stop(); } @Override protected boolean parseArguements(String[] argv) { for(int i=0; i < argv.length;++i) { if (argv[i].equalsIgnoreCase("--queue_size")) { max_queue_size = Integer.parseInt(argv[++i]); if (max_queue_size < 1) { throw new RuntimeException("Invalid Queue Size"); } } else if (argv[i].equalsIgnoreCase("--worker_threads")) { thread_count = Integer.parseInt(argv[++i]); if (thread_count < 1) { throw new RuntimeException("Invalid Thread Count"); } } } return true; } @Override protected void printUsage() { } @Override protected boolean startDaemons() { _parserThreads = new Thread[thread_count]; _threadSemaphore = new Semaphore(-(thread_count - 1)); for (int i=0;i<thread_count;++i) { _parserThreads[i] = new Thread(new Runnable() { @Override public void run() { try { while (true) { try { final Request request = requestQueue.take(); if (request.requestContext == null) { LOG.info("Parser Thread:"+ Thread.currentThread().getId() +" Exiting."); return; } else { ParseRequest parseRequest= request.requestContext.getInput(); ParseResult parseResult = request.requestContext.getOutput(); LOG.info("Parser Thread:" + Thread.currentThread().getId() + " got request for url:"+ parseRequest.getDocURL()); try { _activeThreads.incrementAndGet(); URL url = new URL(parseRequest.getDocURL()); ParseWorker worker = new ParseWorker(); worker.parseDocument( request.requestContext.getOutput(), parseRequest.getDomainId(), parseRequest.getDocId(), url, parseRequest.getDocHeaders(), new FlexBuffer( parseRequest.getDocContent().getReadOnlyBytes(), parseRequest.getDocContent().getOffset(), parseRequest.getDocContent().getCount())); } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); parseResult.setParseSuccessful(false); if (parseResult.getParseFailureReason().length() == 0) { parseResult.setParseFailureReason(CCStringUtils.stringifyException(e)); } } finally { _activeThreads.decrementAndGet(); } getEventLoop().queueAsyncCallback(new Callback() { @Override public void execute() { try { request.requestContext.completeRequest(); } catch (RPCException e) { LOG.error("RPC Exception when processing ParseRequest:" + CCStringUtils.stringifyException(e)); } } }); } } catch (InterruptedException e) { } } } finally { _activeThreads.decrementAndGet(); _threadSemaphore.release(); } } }); _parserThreads[i].start(); } return true; } @Override protected void stopDaemons() { if (_parserThreads != null) { LOG.info("Stop Daemons Called. Sending Threads Shutdown request"); for (int i=0;i<_parserThreads.length;++i) { try { requestQueue.put(new Request(null)); } catch (InterruptedException e) { } } LOG.info("Waiting for threads to die"); // now try to acquire shutdown sempahore _threadSemaphore.acquireUninterruptibly(); LOG.info("Parser Threads are dead"); } } @Override public void IncomingClientConnected(AsyncClientChannel channel) { synchronized(this) { _activeChannels.add(channel.getChannelId()); } } @Override public void IncomingClientDisconnected(AsyncClientChannel channel) { synchronized(this) { _activeChannels.remove(channel.getChannelId()); } } @Override public void queryStatus(AsyncContext<NullMessage, SlaveStatus> rpcContext) throws RPCException { rpcContext.getOutput().setActive(true); //rpcContext.getOutput().setLoad( // ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage()); rpcContext.getOutput().setActiveDocs(_activeThreads.get()); rpcContext.getOutput().setQueuedDocs(requestQueue.size()); rpcContext.setStatus(Status.Success); rpcContext.completeRequest(); } @Override public void parseDocument(AsyncContext<ParseRequest, ParseResult> rpcContext) throws RPCException { if (requestQueue.size() >= max_queue_size) { rpcContext.setErrorDesc("Queue is Full.Failing Request."); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.completeRequest(); } else { requestQueue.addLast(new Request(rpcContext)); } } public static void main(String[] args) { Multimap<String,String> options = TreeMultimap.create(); for (int i=0;i<args.length;++i) { String optionName = args[i]; if (++i != args.length) { String optionValue = args[i]; options.put(optionName, optionValue); } } options.removeAll("--server"); options.put("--server",ParserSlaveServer.class.getName()); Collection<Entry<String,String>> entrySet = options.entries(); String finalArgs[] = new String[entrySet.size() * 2]; int index = 0; for (Entry entry : entrySet) { finalArgs[index++] = (String)entry.getKey(); finalArgs[index++] = (String)entry.getValue(); } try { CommonCrawlServer.main(finalArgs); } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); } } }