package io.monokkel.actors; import akka.actor.*; import akka.contrib.throttle.Throttler; import akka.contrib.throttle.TimerBasedThrottler; import akka.routing.ActorRefRoutee; import akka.routing.BroadcastRoutingLogic; import akka.routing.Routee; import akka.routing.Router; import com.google.common.collect.Lists; import io.monokkel.actors.domain.ThrottlerConfig; import io.monokkel.core.DataCleaner; import io.monokkel.core.DataValidator; import io.monokkel.core.Indexer; import io.monokkel.core.UrlVisitor; import io.monokkel.core.api.ResponseParser; import io.monokkel.exceptions.IndexDocumentException; import io.monokkel.exceptions.ParseException; import io.monokkel.exceptions.UrlVisitException; import io.monokkel.exceptions.ValidationError; import io.monokkel.messages.*; import io.monokkel.messages.builders.ParseResponseBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.concurrent.duration.Duration; import java.io.IOException; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; /** * Created by Tarjei on 28/05/14. * <p/> * The main actor for crawling urls. When receiving a message it parses it and * distributes it to the different actors. The actors are: * <p/> * urlVisited: A stateful url actor that contain all urls visited by the crawler * urlVisitor: A visitor actor that will visit an url and respond with the content * parserBroadcast: Multiple parsers that all receives a message with the content. Based on each actors content check it will choose how to parse the data * indexActor: An index actor is sending the content to ElasticSearch */ public class CrawlerActor extends UntypedActor { private static final Integer DEFAULT_START_DEPTH = 0; private static Logger log = LoggerFactory.getLogger(CrawlerActor.class); private final ActorRef cleanerActor; private final ActorRef urlVisited; private final ActorRef urlVisitor; private final Router parserBroadcast; private final ActorRef indexActor; private final ActorRef urlThrottler; private final Boolean isThrottlerDisabled; private final ActorRef validatorActor; public CrawlerActor(final UrlVisitor urlVisitor, final Indexer indexer, final DataCleaner dataCleaner, final ThrottlerConfig throttlerConfig, final List<ResponseParser> responseParsers, final DataValidator dataValidator) { this.urlVisited = getContext().actorOf(UrlVisitedActor.props(), "urlVisitedActor"); this.urlVisitor = getContext().actorOf(UrlVisitorActor.props(urlVisitor), "urlVisitorActor"); List<Routee> routes = createParserActorBroadcastRoutes(responseParsers); parserBroadcast = new Router(new BroadcastRoutingLogic(), routes); this.indexActor = getContext().actorOf(IndexActor.props(indexer), "indexActor"); this.cleanerActor = getContext().actorOf(CleanerActor.props(dataCleaner)); this.validatorActor = getContext().actorOf(DataValidatorActor.props(dataValidator)); final Integer seconds = throttlerConfig.getSeconds(); final Integer urlsPer = throttlerConfig.getUrlsPer(); isThrottlerDisabled = throttlerConfig.isDisabled(); this.urlThrottler = getContext().actorOf(Props.create(TimerBasedThrottler.class, new Throttler.Rate(urlsPer, Duration.create(seconds, TimeUnit.SECONDS))), "urlThrottler"); urlThrottler.tell(new Throttler.SetTarget(this.urlVisitor), null); } private List<Routee> createParserActorBroadcastRoutes(List<ResponseParser> responseParsers) { return responseParsers.stream().map(responseParser -> { final ActorRef parserActorRef = getContext().actorOf(ParserActor.props(responseParser)); return new ActorRefRoutee(parserActorRef); }).collect(Collectors.toList()); } public static Props props(final UrlVisitor urlVisitor, final Indexer indexer, final DataCleaner dataCleaner, final ThrottlerConfig throttlerConfig, final List<ResponseParser> responseParser, final DataValidator dataValidator) { return Props.create(CrawlerActor.class, () -> new CrawlerActor(urlVisitor, indexer, dataCleaner, throttlerConfig, responseParser, dataValidator)); } private static SupervisorStrategy strategy = new OneForOneStrategy(10, Duration.create("1 minute"), param -> { if (param instanceof UrlVisitException) { log.info("UrlVisitException is received. No point to restart the actor. It is therefore resumed", param); return SupervisorStrategy.resume(); } else if (param instanceof ValidationError) { ValidationError validationError = (ValidationError)param; log.warn("Failed to validate field \"{}\" with data \"{}\" using expression \"{}\"",validationError.getKey(),validationError.getValue(),validationError.getValidateExpression()); return SupervisorStrategy.resume(); } else if (param instanceof IOException) { log.warn("IOException is received and the actors are restarted", param); return SupervisorStrategy.restart(); } else if (param instanceof ParseException) { log.warn("HtmlParseException is received and actors are resumed", param); return SupervisorStrategy.resume(); } else if (param instanceof IndexDocumentException) { log.warn("IndexDocumentException is received. The IndexActor is restarted", param); return SupervisorStrategy.restart(); } else { log.error("Exception escalated higher in the hierarchy. The actor is stopped", param); return SupervisorStrategy.escalate(); } }); @Override public SupervisorStrategy supervisorStrategy() { return strategy; } @Override public void onReceive(Object message) throws Exception { if (message instanceof SeedUrl) { SeedUrl seedUrl = (SeedUrl) message; urlVisited.tell(new IsUrlVisitedBefore(seedUrl.getUrls(), DEFAULT_START_DEPTH, seedUrl.getMaxDepth()), getSelf()); } else if (message instanceof UrlIsNotVisitedBefore) { UrlIsNotVisitedBefore urlIsNotVisitedBefore = (UrlIsNotVisitedBefore) message; final List<String> urls = urlIsNotVisitedBefore.getUrls(); final ActorRef visitor = isThrottlerDisabled ? urlVisitor : urlThrottler; urls.parallelStream().forEach(url -> visitor.tell(new VisitUrl(url, urlIsNotVisitedBefore.getDepth(), urlIsNotVisitedBefore.getMaxDepth()), getSelf())); } else if (message instanceof UnparsedResponse) { UnparsedResponse unparsedResponse = (UnparsedResponse) message; parserBroadcast.route(new ParseResponseBuilder().setResponse(unparsedResponse.getResponse()).setUrl(unparsedResponse.getUrl()).setTimeStamp(unparsedResponse.getTimeStamp()).setDepth(unparsedResponse.getDepth()).setMaxDepth(unparsedResponse.getMaxDepth()).setTypesFromTheResponseHeader(unparsedResponse.getTypesFromTheResponseHeader()).createParseResponse(), getSelf()); } else if (message instanceof ParserDone) { ParserDone parserDone = (ParserDone) message; cleanerActor.tell(new CleanTransformedContent(parserDone.getPageData(), parserDone.getDepth(), parserDone.getUrlSet(), parserDone.getMaxDepth()), getSelf()); } else if (message instanceof CleanerDone) { CleanerDone cleanerDone = (CleanerDone) message; urlVisited.tell(new IsUrlVisitedBefore(Lists.newArrayList(cleanerDone.getUrlSet()), cleanerDone.getDepth(), cleanerDone.getMaxDepth()), getSelf()); validatorActor.tell(new ValidatePageData(cleanerDone.getPageData()),getSelf()); } else if(message instanceof IndexValidatedDocument){ IndexValidatedDocument indexValidatedDocument = (IndexValidatedDocument) message; indexActor.tell(new IndexResponse(indexValidatedDocument.getPageData()), getSelf()); } else { unhandled(message); } } }