/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules; import static org.archive.modules.CoreAttributeConstants.A_HERITABLE_KEYS; import java.io.Serializable; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Map; import java.util.Set; import org.apache.commons.httpclient.URIException; import org.archive.crawler.event.AMQPUrlPublishedEvent; import org.archive.crawler.frontier.AMQPUrlReceiver; import org.archive.modules.fetcher.FetchHTTP; import org.json.JSONObject; import org.springframework.context.ApplicationContext; import org.springframework.context.ApplicationContextAware; import org.springframework.context.ApplicationEvent; import org.springframework.beans.BeansException; import com.rabbitmq.client.AMQP; import com.rabbitmq.client.AMQP.BasicProperties; /** * @author eldondev * @contributor nlevitt */ public class AMQPPublishProcessor extends AMQPProducerProcessor implements Serializable, ApplicationContextAware { private static final long serialVersionUID = 2L; public static final String A_SENT_TO_AMQP = "sentToAMQP"; // annotation protected ApplicationContext appCtx; public void setApplicationContext(ApplicationContext appCtx) throws BeansException { this.appCtx = appCtx; } public AMQPPublishProcessor() { // set default values setExchange("umbra"); setRoutingKey("urls"); } { setClientId("requests"); } public String getClientId() { return (String) kp.get("clientId"); } /** * Client id to include in the json payload. AMQPUrlReceiver queueName * should have the same value, since umbra will route request urls based on * this key. */ public void setClientId(String clientId) { kp.put("clientId", clientId); } @SuppressWarnings("unchecked") public Map<String, Object> getExtraInfo() { return (Map<String, Object>) kp.get("extraInfo"); } /** * Arbitrary additional information to include in the json payload. */ public void setExtraInfo(Map<String, Object> extraInfo) { kp.put("extraInfo", extraInfo); } /** * @return true iff url is http or https, is not robots.txt, was not * received via AMQP */ protected boolean shouldProcess(CrawlURI curi) { try { return !curi.getAnnotations().contains(AMQPUrlReceiver.A_RECEIVED_FROM_AMQP) && !"/robots.txt".equals(curi.getUURI().getPath()) && (curi.getUURI().getScheme().equals(FetchHTTP.HTTP_SCHEME) || curi.getUURI().getScheme().equals(FetchHTTP.HTTPS_SCHEME)); } catch (URIException e) { throw new RuntimeException(e); } } /** * Constructs the json to send via AMQP. This includes the url, and some * metadata from the CrawlURI. The metadata should be passed back to * heritrix with each url discovered from this url. (XXX need context in * class javadoc) * * @return the message to send via AMQP * @see CrawlURI#inheritFrom(CrawlURI) */ protected JSONObject buildJsonMessage(CrawlURI curi) { JSONObject message = new JSONObject().put("url", curi.toString()); if (getClientId() != null) { message.put("clientId", getClientId()); } if (getExtraInfo() != null) { for (String k: getExtraInfo().keySet()) { message.put(k, getExtraInfo().get(k)); } } HashMap<String, Object> metadata = new HashMap<String,Object>(); metadata.put("pathFromSeed", curi.getPathFromSeed()); @SuppressWarnings("unchecked") Set<String> heritableKeys = (Set<String>) curi.getData().get(A_HERITABLE_KEYS); HashMap<String, Object> heritableData = new HashMap<String,Object>(); if (heritableKeys != null) { for (String key: heritableKeys) { heritableData.put(key, curi.getData().get(key)); } } metadata.put("heritableData", heritableData); message.put("metadata", metadata); return message; } @Override protected byte[] buildMessage(CrawlURI curi) { try { return buildJsonMessage(curi).toString().getBytes("UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } @Override protected void success(CrawlURI curi, byte[] message, BasicProperties props) { super.success(curi, message, props); curi.getAnnotations().add(A_SENT_TO_AMQP); appCtx.publishEvent(new AMQPUrlPublishedEvent(AMQPPublishProcessor.this, curi)); } protected BasicProperties props = new AMQP.BasicProperties.Builder(). contentType("application/json").build(); @Override protected BasicProperties amqpMessageProperties() { return props; } }