package com.widowcrawler.fetch;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.s3.model.PutObjectResult;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.netflix.archaius.Config;
import com.widowcrawler.core.model.FetchInput;
import com.widowcrawler.core.model.PageAttribute;
import com.widowcrawler.core.model.ParseInput;
import com.widowcrawler.core.queue.QueueManager;
import com.widowcrawler.core.siteattr.RobotsTxtManager;
import com.widowcrawler.core.util.DomainUtils;
import com.widowcrawler.core.worker.Worker;
import com.widowcrawler.terminator.model.RobotsTxt;
import com.widowcrawler.terminator.model.RuleType;
import org.apache.commons.lang3.StringUtils;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.client.Invocation;
import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.Response;
import java.io.ByteArrayInputStream;
import java.net.URL;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import static com.widowcrawler.core.retry.Retry.retry;
/**
* @author Scott Mansfield
*/
public class FetchWorker extends Worker {
// TODO: make sure the If-Modified-Since and ETag work
private static final Logger logger = LoggerFactory.getLogger(FetchWorker.class);
private static final String NEXT_QUEUE_CONFIG_KEY = "com.widowcrawler.queue.next";
private static final String BUCKET_NAME_CONFIG_KEY = "com.widowcrawler.bucket.name";
private static final String USE_BASE_DOMAIN_CONFIG_KEY = "com.widowcrawler.use.base.domain";
private static final String BASE_DOMAIN_CONFIG_KEY = "com.widowcrawler.base.domain";
@Inject
QueueManager queueManager;
@Inject
ObjectMapper objectMapper;
@Inject
AmazonS3 amazonS3Client;
private FetchInput input;
public FetchWorker() { }
public FetchWorker withInput(FetchInput input) {
this.input = input;
return this;
}
@Override
public boolean doWork() {
try {
final RobotsTxt robotsTxt = RobotsTxtManager.getByDomain(new URL(input.getUrl()).getHost());
// FIXME: HORRIBLE, HACKY, NO-GOOD BADNESS
boolean allowed = robotsTxt.getRuleSets().get("*").stream().anyMatch(rule -> {
try {
//logger.info("Rule} " + rule.getRuleType() + ": " + rule.getPathMatch());
return rule.getRuleType() == RuleType.DISALLOW &&
StringUtils.equalsIgnoreCase(rule.getPathMatch(), new URL(input.getUrl()).getPath());
} catch (Exception ex) {
logger.error("Exception while evaluating robots.txt allowed-ness");
return false;
}
});
if (!allowed) {
logger.info("URL " + input.getUrl() + " is not allowed by the robots.txt");
return true;
}
// END HORRIBLE, HACKY, NO-GOOD BADNESS
if (config.getBoolean(USE_BASE_DOMAIN_CONFIG_KEY) &&
!DomainUtils.isBaseDomain(config.getString(BASE_DOMAIN_CONFIG_KEY), this.input.getUrl())) {
logger.warn("Rejecting message because it goes not have the right base domain:\n" +
"\tUrl: " + input.getUrl() + "\n" +
"\tReferrer: " + input.getReferrer());
return true;
}
Invocation invocation = ClientBuilder.newClient()
.target(this.input.getUrl())
.request()
.header("User-Agent", "Widow Crawler (http://widowcrawler.com)")
.buildGet();
// TODO: can I get more accurate timing from the response object?
long startTime = System.nanoTime();
Response response = invocation.invoke();
double requestDuration = (System.nanoTime() - startTime) / 1_000_000D;
// Massage the headers into a more usable form
MultivaluedMap<String, String> stringHeaders = response.getStringHeaders();
Map<String, List<String>> headerMap = new HashMap<>(stringHeaders.keySet().size());
stringHeaders.keySet().forEach(key -> headerMap.put(key, stringHeaders.get(key)));
String pageBody = response.readEntity(String.class);
// Use the Content-Length header, with a backup of reading the entity and getting
// the length of that
int responseLength = response.getLength();
if (responseLength == -1) {
responseLength = pageBody.length();
}
String pageContentRef = UUID.randomUUID().toString();
ObjectMetadata objectMetadata = new ObjectMetadata();
objectMetadata.setContentLength(pageBody.getBytes().length);
String bucketName = config.getString(BUCKET_NAME_CONFIG_KEY);
PutObjectRequest putObjectRequest = new PutObjectRequest(
bucketName,
pageContentRef,
new ByteArrayInputStream(pageBody.getBytes()),
objectMetadata);
final PutObjectResult putObjectResult = retry(() -> amazonS3Client.putObject(putObjectRequest));
logger.info("S3 put success. Object ID: " + pageContentRef + " | Content MD5: " + putObjectResult.getContentMd5());
ParseInput parseInput = new ParseInput.Builder()
.withAttribute(PageAttribute.ORIGINAL_URL, this.input.getUrl())
.withAttribute(PageAttribute.REFERRER, this.input.getReferrer())
.withAttribute(PageAttribute.PAGE_CONTENT_REF, pageContentRef)
.withAttribute(PageAttribute.HEADERS, headerMap)
.withAttribute(PageAttribute.STATUS_CODE, response.getStatus())
.withAttribute(PageAttribute.LOCALE, response.getLanguage())
.withAttribute(PageAttribute.TIME_ACCESSED, new DateTime(response.getDate()))
.withAttribute(PageAttribute.LOAD_TIME_MILLIS, requestDuration)
.withAttribute(PageAttribute.RESPONSE_SIZE, responseLength)
.build();
String nextQueue = config.getString(NEXT_QUEUE_CONFIG_KEY);
this.queueManager.enqueue(nextQueue, objectMapper.writeValueAsString(parseInput));
return true;
} catch (Exception ex) {
logger.error("Exception while fetching", ex);
return false;
}
}
}