package io.monokkel.actors;
import akka.actor.*;
import akka.contrib.throttle.Throttler;
import akka.contrib.throttle.TimerBasedThrottler;
import akka.routing.ActorRefRoutee;
import akka.routing.BroadcastRoutingLogic;
import akka.routing.Routee;
import akka.routing.Router;
import com.google.common.collect.Lists;
import io.monokkel.actors.domain.ThrottlerConfig;
import io.monokkel.core.DataCleaner;
import io.monokkel.core.DataValidator;
import io.monokkel.core.Indexer;
import io.monokkel.core.UrlVisitor;
import io.monokkel.core.api.ResponseParser;
import io.monokkel.exceptions.IndexDocumentException;
import io.monokkel.exceptions.ParseException;
import io.monokkel.exceptions.UrlVisitException;
import io.monokkel.exceptions.ValidationError;
import io.monokkel.messages.*;
import io.monokkel.messages.builders.ParseResponseBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.concurrent.duration.Duration;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
/**
* Created by Tarjei on 28/05/14.
* <p/>
* The main actor for crawling urls. When it receives a message, it inspects the message and
* distributes the work to the different actors. The actors are:
* <p/>
* urlVisited: A stateful url actor that contains all urls visited by the crawler
* urlVisitor: A visitor actor that visits a url and responds with the content
* parserBroadcast: Multiple parsers that all receive a message with the content. Based on its own content check, each actor chooses how to parse the data
* indexActor: An index actor that sends the content to ElasticSearch
*/
public class CrawlerActor extends UntypedActor {

    /** Depth assigned to the urls of a {@link SeedUrl} message. */
    private static final Integer DEFAULT_START_DEPTH = 0;

    private static final Logger log = LoggerFactory.getLogger(CrawlerActor.class);

    /**
     * Supervision policy for the child actors: at most 10 restarts within one minute.
     * <ul>
     * <li>{@link UrlVisitException}, {@link ValidationError}, {@link ParseException}: resume the child</li>
     * <li>{@link IOException}, {@link IndexDocumentException}: restart the child</li>
     * <li>anything else: escalate to this actor's own supervisor</li>
     * </ul>
     * The checks are evaluated in the order written, so the first matching type wins.
     */
    private static final SupervisorStrategy strategy = new OneForOneStrategy(10, Duration.create("1 minute"), param -> {
        if (param instanceof UrlVisitException) {
            log.info("UrlVisitException is received. No point to restart the actor. It is therefore resumed", param);
            return SupervisorStrategy.resume();
        } else if (param instanceof ValidationError) {
            final ValidationError validationError = (ValidationError) param;
            log.warn("Failed to validate field \"{}\" with data \"{}\" using expression \"{}\"",
                    validationError.getKey(), validationError.getValue(), validationError.getValidateExpression());
            return SupervisorStrategy.resume();
        } else if (param instanceof IOException) {
            log.warn("IOException is received and the actors are restarted", param);
            return SupervisorStrategy.restart();
        } else if (param instanceof ParseException) {
            log.warn("ParseException is received and actors are resumed", param);
            return SupervisorStrategy.resume();
        } else if (param instanceof IndexDocumentException) {
            log.warn("IndexDocumentException is received. The IndexActor is restarted", param);
            return SupervisorStrategy.restart();
        } else {
            log.error("Exception escalated higher in the hierarchy. The actor is stopped", param);
            return SupervisorStrategy.escalate();
        }
    });

    private final ActorRef cleanerActor;
    private final ActorRef urlVisited;
    private final ActorRef urlVisitor;
    private final Router parserBroadcast;
    private final ActorRef indexActor;
    private final ActorRef urlThrottler;
    /** When true, {@link VisitUrl} messages bypass the throttler and go straight to the visitor. */
    private final boolean isThrottlerDisabled;
    private final ActorRef validatorActor;

    /**
     * Creates all child actors and wires the throttler to the url visitor.
     *
     * @param urlVisitor      handed to the {@code UrlVisitorActor} child
     * @param indexer         handed to the {@code IndexActor} child
     * @param dataCleaner     handed to the {@code CleanerActor} child
     * @param throttlerConfig rate (urls per number of seconds) and the disabled flag for the throttler
     * @param responseParsers one {@code ParserActor} child is created per parser
     * @param dataValidator   handed to the {@code DataValidatorActor} child
     */
    public CrawlerActor(final UrlVisitor urlVisitor, final Indexer indexer, final DataCleaner dataCleaner, final ThrottlerConfig throttlerConfig, final List<ResponseParser> responseParsers, final DataValidator dataValidator) {
        // The creation order of the children is kept stable: unnamed children get generated names.
        this.urlVisited = getContext().actorOf(UrlVisitedActor.props(), "urlVisitedActor");
        this.urlVisitor = getContext().actorOf(UrlVisitorActor.props(urlVisitor), "urlVisitorActor");

        final List<Routee> routes = createParserActorBroadcastRoutes(responseParsers);
        this.parserBroadcast = new Router(new BroadcastRoutingLogic(), routes);

        this.indexActor = getContext().actorOf(IndexActor.props(indexer), "indexActor");
        this.cleanerActor = getContext().actorOf(CleanerActor.props(dataCleaner));
        this.validatorActor = getContext().actorOf(DataValidatorActor.props(dataValidator));

        final Integer seconds = throttlerConfig.getSeconds();
        final Integer urlsPer = throttlerConfig.getUrlsPer();
        // Null-safe: a missing flag means "throttling enabled" instead of a later unboxing NPE.
        this.isThrottlerDisabled = Boolean.TRUE.equals(throttlerConfig.isDisabled());

        final Throttler.Rate rate = new Throttler.Rate(urlsPer, Duration.create(seconds, TimeUnit.SECONDS));
        this.urlThrottler = getContext().actorOf(Props.create(TimerBasedThrottler.class, rate), "urlThrottler");
        // The throttler forwards the messages it receives to the url visitor at the configured rate.
        urlThrottler.tell(new Throttler.SetTarget(this.urlVisitor), ActorRef.noSender());
    }

    /**
     * Creates one {@code ParserActor} child per response parser and wraps each in a routee.
     *
     * @param responseParsers the parsers to create actors for
     * @return the routees, in the same order as {@code responseParsers}
     */
    private List<Routee> createParserActorBroadcastRoutes(List<ResponseParser> responseParsers) {
        return responseParsers.stream()
                .map(responseParser -> new ActorRefRoutee(getContext().actorOf(ParserActor.props(responseParser))))
                .collect(Collectors.toList());
    }

    /**
     * Builds the {@link Props} used to create a {@link CrawlerActor}.
     * The parameters are passed unchanged to the constructor.
     */
    public static Props props(final UrlVisitor urlVisitor, final Indexer indexer, final DataCleaner dataCleaner, final ThrottlerConfig throttlerConfig, final List<ResponseParser> responseParser, final DataValidator dataValidator) {
        return Props.create(CrawlerActor.class, () -> new CrawlerActor(urlVisitor, indexer, dataCleaner, throttlerConfig, responseParser, dataValidator));
    }

    @Override
    public SupervisorStrategy supervisorStrategy() {
        return strategy;
    }

    /**
     * Routes each message of the crawl pipeline to the child actor responsible for the next step:
     * <ul>
     * <li>{@link SeedUrl}: asks {@code urlVisited} whether the seed urls were visited before</li>
     * <li>{@link UrlIsNotVisitedBefore}: sends one {@link VisitUrl} per url to the throttler,
     * or directly to the visitor when throttling is disabled</li>
     * <li>{@link UnparsedResponse}: broadcasts a parse request to all parser actors</li>
     * <li>{@link ParserDone}: sends the page data to the cleaner</li>
     * <li>{@link CleanerDone}: checks the extracted urls against {@code urlVisited} and sends the page data to the validator</li>
     * <li>{@link IndexValidatedDocument}: sends the page data to the index actor</li>
     * </ul>
     * Any other message is reported as unhandled.
     */
    @Override
    public void onReceive(Object message) throws Exception {
        if (message instanceof SeedUrl) {
            final SeedUrl seedUrl = (SeedUrl) message;
            urlVisited.tell(new IsUrlVisitedBefore(seedUrl.getUrls(), DEFAULT_START_DEPTH, seedUrl.getMaxDepth()), getSelf());
        } else if (message instanceof UrlIsNotVisitedBefore) {
            final UrlIsNotVisitedBefore urlIsNotVisitedBefore = (UrlIsNotVisitedBefore) message;
            final ActorRef visitor = isThrottlerDisabled ? urlVisitor : urlThrottler;
            final ActorRef self = getSelf();
            // tell() is cheap and non-blocking, so a plain loop on the actor's own thread is enough;
            // it also keeps the dispatch order of the urls deterministic.
            for (final String url : urlIsNotVisitedBefore.getUrls()) {
                visitor.tell(new VisitUrl(url, urlIsNotVisitedBefore.getDepth(), urlIsNotVisitedBefore.getMaxDepth()), self);
            }
        } else if (message instanceof UnparsedResponse) {
            final UnparsedResponse unparsedResponse = (UnparsedResponse) message;
            final Object parseResponse = new ParseResponseBuilder()
                    .setResponse(unparsedResponse.getResponse())
                    .setUrl(unparsedResponse.getUrl())
                    .setTimeStamp(unparsedResponse.getTimeStamp())
                    .setDepth(unparsedResponse.getDepth())
                    .setMaxDepth(unparsedResponse.getMaxDepth())
                    .setTypesFromTheResponseHeader(unparsedResponse.getTypesFromTheResponseHeader())
                    .createParseResponse();
            parserBroadcast.route(parseResponse, getSelf());
        } else if (message instanceof ParserDone) {
            final ParserDone parserDone = (ParserDone) message;
            cleanerActor.tell(new CleanTransformedContent(parserDone.getPageData(), parserDone.getDepth(), parserDone.getUrlSet(), parserDone.getMaxDepth()), getSelf());
        } else if (message instanceof CleanerDone) {
            final CleanerDone cleanerDone = (CleanerDone) message;
            // Newly discovered urls re-enter the crawl loop while the page itself moves on to validation.
            urlVisited.tell(new IsUrlVisitedBefore(Lists.newArrayList(cleanerDone.getUrlSet()), cleanerDone.getDepth(), cleanerDone.getMaxDepth()), getSelf());
            validatorActor.tell(new ValidatePageData(cleanerDone.getPageData()), getSelf());
        } else if (message instanceof IndexValidatedDocument) {
            final IndexValidatedDocument indexValidatedDocument = (IndexValidatedDocument) message;
            indexActor.tell(new IndexResponse(indexValidatedDocument.getPageData()), getSelf());
        } else {
            unhandled(message);
        }
    }
}