package org.atomnuke.source.crawler; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.util.HashMap; import java.util.Map; import javax.xml.bind.JAXBException; import javax.xml.namespace.QName; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.atomnuke.atom.io.AtomReadException; import org.atomnuke.atom.io.AtomReaderFactory; import org.atomnuke.atom.io.ReaderResult; import org.atomnuke.atom.io.reader.sax.SaxAtomReaderFactory; import org.atomnuke.atom.model.Link; import org.atomnuke.source.AtomSource; import org.atomnuke.source.AtomSourceException; import org.atomnuke.source.result.AtomSourceResult; import org.atomnuke.source.result.AtomSourceResultImpl; import org.atomnuke.task.context.AtomTaskContext; import org.atomnuke.lifecycle.InitializationException; import org.atomnuke.service.ServiceUnavailableException; import org.atomnuke.service.introspection.ServicesInterrogator; import org.atomnuke.source.action.ActionType; import org.atomnuke.source.action.AtomSourceActionImpl; import org.atomnuke.source.crawler.auth.AuthenticationHandler; import org.atomnuke.source.crawler.config.model.FeedCrawlerTargets; import org.atomnuke.source.crawler.config.model.FeedTarget; import org.atomnuke.source.crawler.config.model.HttpHeader; import org.atomnuke.util.config.ConfigurationException; import org.atomnuke.util.config.io.ConfigurationManager; import org.atomnuke.util.config.io.file.FileConfigurationManager; import org.atomnuke.util.config.io.marshall.ConfigurationMarshaller; import org.atomnuke.util.config.io.marshall.jaxb.JaxbConfigurationMarhsaller; import org.atomnuke.util.config.update.ConfigurationContext; import org.atomnuke.util.config.update.ConfigurationUpdateService; import org.atomnuke.util.config.update.listener.ConfigurationListener; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * * @author zinic */ public class FeedCrawlerSource implements AtomSource { private static final QName _CrawlerTargets_QNAME = new QName("http://atomnuke.org/configuration/feed-crawler", "crawler-targets"); private static final String CFG_MANAGER_NAME = "FeedCrawlerTargetsConfigurationManager"; private static final Logger LOG = LoggerFactory.getLogger(FeedCrawlerSource.class); private final AtomReaderFactory atomReaderFactory; private AuthenticationHandler authenticationHandler; private ServicesInterrogator availableServices; private String nextLocation, myActorId; private StateManager stateManager; private HttpClient httpClient; public FeedCrawlerSource() { atomReaderFactory = new SaxAtomReaderFactory(); } public ConfigurationContext<FeedCrawlerTargets> getConfigurationContext(ServicesInterrogator interrogator, String configDir) throws JAXBException, ServiceUnavailableException, ConfigurationException { final ConfigurationUpdateService cfgService = interrogator.firstAvailable(ConfigurationUpdateService.class); ConfigurationContext<FeedCrawlerTargets> ctx = cfgService.get(CFG_MANAGER_NAME); if (ctx == null) { final File configurationFile = new File(configDir, "feed-crawler-targets.cfg.xml"); final ConfigurationMarshaller<FeedCrawlerTargets> marshallerInstance = JaxbConfigurationMarhsaller.newJaxConfigurationMarshaller(FeedCrawlerTargets.class, _CrawlerTargets_QNAME); final ConfigurationManager<FeedCrawlerTargets> targetsConfigurationManager = new FileConfigurationManager<FeedCrawlerTargets>(marshallerInstance, configurationFile); ctx = cfgService.register(CFG_MANAGER_NAME, targetsConfigurationManager); } return ctx; } @Override public void init(AtomTaskContext tc) throws InitializationException { myActorId = tc.actorId(); availableServices = tc.services(); try { final ConfigurationContext<FeedCrawlerTargets> configContext = getConfigurationContext(tc.services(), tc.environment().configurationDirectory()); configContext.addListener(new ConfigurationListener<FeedCrawlerTargets>() { @Override public void updated(FeedCrawlerTargets configuration) throws ConfigurationException { configUpdate(configuration); } }); httpClient = availableServices.firstAvailable(HttpClient.class); } catch (Exception sue) { throw new InitializationException(sue); } nextLocation = stateManager.loadState(); } @Override public void destroy() { stateManager.writeState(nextLocation); } @Override public synchronized AtomSourceResult poll() throws AtomSourceException { // If the nextLocation is null then we haven't been configured yet if (nextLocation != null) { try { final ReaderResult readResult = read(nextLocation); if (readResult.isFeed()) { for (Link pageLink : readResult.getFeed().links()) { if (pageLink.rel().equalsIgnoreCase("previous")) { nextLocation = pageLink.href(); stateManager.writeState(nextLocation); break; } } return new AtomSourceResultImpl(new AtomSourceActionImpl(ActionType.HAS_NEXT), readResult.getFeed()); } } catch (Exception ex) { throw new AtomSourceException("Failed to poll ATOM feed: \"" + nextLocation + "\" - Error: " + ex.getMessage(), ex); } } return new AtomSourceResultImpl(new AtomSourceActionImpl(ActionType.SLEEP)); } private synchronized void configUpdate(FeedCrawlerTargets configuration) { for (FeedTarget feedTarget : configuration.getFeed()) { if (myActorId.equals(feedTarget.getActorRef())) { // This is us, let's configure nextLocation = feedTarget.getHref(); // Where should we write state? if (feedTarget.getFsOptions() != null) { final File stateFile = new File(URI.create(feedTarget.getFsOptions().getStateFile())); stateManager = new StateManager(stateFile); } // Is there an auth handler we should use? if (feedTarget.getAuthentication() != null) { final String authenticationHandlerName = feedTarget.getAuthentication().getHandler(); try { authenticationHandler = availableServices.lookup(authenticationHandlerName, AuthenticationHandler.class); } catch (ServiceUnavailableException sue) { LOG.error("Unable to find an authentication handler named: " + authenticationHandlerName + ". While this is not fatal, authentication for this feed crawler will not be enabled."); } } break; } } } private ReaderResult read(String location) throws AtomReadException, IOException { boolean done = false; InputStream inputStream = null; try { while (!done) { final HttpGet httpGet = new HttpGet(location); if (authenticationHandler != null) { for (Map.Entry<String, String> headerToAdd : authenticationHandler.authenticationHeaders().entrySet()) { httpGet.addHeader(headerToAdd.getKey(), headerToAdd.getValue()); } } final HttpResponse response = httpClient.execute(httpGet); final int statusCode = response.getStatusLine().getStatusCode(); switch (statusCode) { case 200: final HttpEntity entity = response.getEntity(); final ReaderResult result = atomReaderFactory.getInstance().read(entity.getContent()); return result; case 401: if (authenticationHandler != null) { authenticationHandler.authenticate(); } break; default: done = true; // TODO: log } } } finally { if (inputStream != null) { inputStream.close(); } } return null; } }