/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.protocol;

import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Options;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.Config;
import org.apache.storm.utils.Utils;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;

import crawlercommons.robots.BaseRobotRules;

/**
 * Base class for HTTP-based {@link Protocol} implementations: reads the common
 * configuration (robots handling, header storage, cookies), delegates
 * robots.txt checks to {@link HttpRobotRulesParser} and builds the user agent
 * string from the configuration.
 */
public abstract class AbstractHttpProtocol implements Protocol {

    private HttpRobotRulesParser robots;

    /** Whether to bypass robots.txt checks entirely (http.skip.robots) */
    protected boolean skipRobots = false;

    /** Whether to store the HTTP response headers in the metadata (http.store.headers) */
    protected boolean storeHTTPHeaders = false;

    /** Whether to accept and send cookies (http.use.cookies) */
    protected boolean useCookies = false;

    @Override
    public void configure(Config conf) {
        this.skipRobots = ConfUtils.getBoolean(conf, "http.skip.robots", false);
        this.storeHTTPHeaders = ConfUtils.getBoolean(conf, "http.store.headers", false);
        this.useCookies = ConfUtils.getBoolean(conf, "http.use.cookies", false);
        robots = new HttpRobotRulesParser(conf);
    }

    @Override
    public BaseRobotRules getRobotRules(String url) {
        if (this.skipRobots)
            return RobotRulesParser.EMPTY_RULES;
        return robots.getRobotRulesSet(this, url);
    }

    @Override
    public void cleanup() {
    }

    /** Builds the user agent string from the http.agent.* configuration keys */
    public static String getAgentString(Config conf) {
        return getAgentString(ConfUtils.getString(conf, "http.agent.name"),
                ConfUtils.getString(conf, "http.agent.version"),
                ConfUtils.getString(conf, "http.agent.description"),
                ConfUtils.getString(conf, "http.agent.url"),
                ConfUtils.getString(conf, "http.agent.email"));
    }

    protected static String getAgentString(String agentName, String agentVersion,
            String agentDesc, String agentURL, String agentEmail) {
        StringBuilder buf = new StringBuilder();
        buf.append(agentName);
        if (StringUtils.isNotBlank(agentVersion)) {
            buf.append("/").append(agentVersion);
        }
        boolean hasAgentDesc = StringUtils.isNotBlank(agentDesc);
        boolean hasAgentURL = StringUtils.isNotBlank(agentURL);
        boolean hasAgentEmail = StringUtils.isNotBlank(agentEmail);
        // the optional fields are wrapped in parentheses, e.g.
        // name/version (desc; url; email)
        if (hasAgentDesc || hasAgentEmail || hasAgentURL) {
            buf.append(" (");
            if (hasAgentDesc) {
                buf.append(agentDesc);
                if (hasAgentURL || hasAgentEmail)
                    buf.append("; ");
            }
            if (hasAgentURL) {
                buf.append(agentURL);
                if (hasAgentEmail)
                    buf.append("; ");
            }
            if (hasAgentEmail) {
                buf.append(agentEmail);
            }
            buf.append(")");
        }
        return buf.toString();
    }

    /**
     * Called by extensions of this class to fetch the URLs passed as
     * command-line arguments and print the results
     */
    protected static void main(AbstractHttpProtocol protocol, String[] args)
            throws Exception {
        Config conf = new Config();

        // loads the default configuration file
        Map defaultSCConfig = Utils.findAndReadConfigFile("crawler-default.yaml", false);
        conf.putAll(ConfUtils.extractConfigElement(defaultSCConfig));

        Options options = new Options();
        options.addOption("c", true, "configuration file");

        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = parser.parse(options, args);
        if (cmd.hasOption("c")) {
            String confFile = cmd.getOptionValue("c");
            ConfUtils.loadConf(confFile, conf);
        }

        protocol.configure(conf);

        // the set is accessed concurrently: the main thread polls its size
        // while the fetcher threads remove themselves from it on completion,
        // hence the synchronized wrapper
        Set<Runnable> threads = Collections.synchronizedSet(new HashSet<>());

        class Fetchable implements Runnable {
            String url;

            Fetchable(String url) {
                this.url = url;
            }

            public void run() {
                StringBuilder stringB = new StringBuilder();
                stringB.append(url).append("\n");

                if (!protocol.skipRobots) {
                    BaseRobotRules rules = protocol.getRobotRules(url);
                    stringB.append("robots allowed: ").append(rules.isAllowed(url))
                            .append("\n");
                    if (rules instanceof RobotRules) {
                        stringB.append("robots requests: ")
                                .append(((RobotRules) rules).getContentLengthFetched().length)
                                .append("\n");
                    }
                }

                Metadata md = new Metadata();
                long start = System.currentTimeMillis();
                ProtocolResponse response;
                try {
                    response = protocol.getProtocolOutput(url, md);
                    stringB.append(response.getMetadata()).append("\n");
                    stringB.append("status code: ").append(response.getStatusCode())
                            .append("\n");
                    stringB.append("content length: ").append(response.getContent().length)
                            .append("\n");
                    long timeFetching = System.currentTimeMillis() - start;
                    stringB.append("fetched in ").append(timeFetching).append(" msec");
                    System.out.println(stringB);
                } catch (Exception e) {
                    e.printStackTrace();
                } finally {
                    threads.remove(this);
                }
            }
        }

        // fetch each URL passed on the command line in its own thread
        for (String arg : cmd.getArgs()) {
            Fetchable p = new Fetchable(arg);
            threads.add(p);
            new Thread(p).start();
        }

        // wait until all the fetcher threads have finished
        while (threads.size() > 0) {
            Thread.sleep(1000);
        }

        protocol.cleanup();
        System.exit(0);
    }
}
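
// Usage sketch (illustrative, not part of the original class): a concrete
// subclass implements getProtocolOutput() and exposes the static main helper
// above as its own entry point. The class name and method body below are
// hypothetical.
//
//   public class MyHttpProtocol extends AbstractHttpProtocol {
//       @Override
//       public ProtocolResponse getProtocolOutput(String url, Metadata metadata)
//               throws Exception {
//           // perform the actual HTTP request and wrap the result
//           ...
//       }
//
//       public static void main(String[] args) throws Exception {
//           AbstractHttpProtocol.main(new MyHttpProtocol(), args);
//       }
//   }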