package com.widowcrawler.analyze.resources;
import com.amazonaws.services.dynamodbv2.AmazonDynamoDB;
import com.amazonaws.services.dynamodbv2.model.*;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.S3Object;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.netflix.archaius.Config;
import com.widowcrawler.analyze.model.GetPageSummaryResponse;
import com.widowcrawler.analyze.model.GetRawContentResponse;
import com.widowcrawler.analyze.model.ListPagesResponse;
import com.widowcrawler.analyze.model.PageVisitInfoResponse;
import com.widowcrawler.core.model.PageAttribute;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.ws.rs.GET;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.Response;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.*;
import java.util.stream.Collectors;
/**
* @author Scott Mansfield
*/
@Path("pages")
public class PageResources {
// Logger used by the endpoints below to record failures and conversion diagnostics.
private static final Logger logger = LoggerFactory.getLogger(PageResources.class);
// Configuration key whose value is the DynamoDB table name scanned/queried by the page endpoints.
private static final String DYNAMO_TABLE_NAME_CONFIG_KEY = "com.widowcrawler.table.name";
// Configuration key whose value is the S3 bucket name read by the raw-content endpoint.
private static final String BUCKET_NAME_CONFIG_KEY = "com.widowcrawler.bucket.name";
// Source of the table and bucket names (looked up per request via getString).
@Inject
Config config;
// DynamoDB client used for the scan and query calls.
@Inject
AmazonDynamoDB dynamoDB;
// JSON mapper used for pagination keys and for HASH/ARRAY attribute values.
@Inject
ObjectMapper objectMapper;
// S3 client used to fetch raw page content.
@Inject
AmazonS3 amazonS3;
/**
 * Lists page URLs together with the access times found in a single DynamoDB scan call.
 *
 * <p>Only one scan call is issued per request. To continue, pass the {@code last} token from
 * the previous response back to this endpoint.
 *
 * @param startKey optional URL-safe Base64 encoding of the JSON-serialized key returned by a
 *                 previous call; when blank the scan starts from the beginning
 * @return 200 with a {@link ListPagesResponse} on success; 500 with a failed
 *         {@link ListPagesResponse} if anything throws
 */
@GET
public Response getAllPages(@QueryParam("last") String startKey) {
    try {
        String tableName = config.getString(DYNAMO_TABLE_NAME_CONFIG_KEY);

        ScanRequest scanRequest = new ScanRequest()
                .withTableName(tableName)
                // Must be requested explicitly, otherwise the result carries no consumed
                // capacity and the response field below is always null.
                .withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL)
                .withAttributesToGet(
                        PageAttribute.ORIGINAL_URL.toString(),
                        PageAttribute.TIME_ACCESSED.toString()
                );

        if (StringUtils.isNotBlank(startKey)) {
            byte[] decodedLEK = Base64.getUrlDecoder().decode(startKey);
            Map<String, AttributeValue> exclusiveStartKey = objectMapper.readValue(decodedLEK,
                    new TypeReference<Map<String, AttributeValue>>() { });
            scanRequest.withExclusiveStartKey(exclusiveStartKey);
        }

        final ScanResult scanResult = dynamoDB.scan(scanRequest);

        // Group the access times of every returned row under its page URL.
        Map<String, List<Long>> pagesAndTimes = new HashMap<>();
        for (Map<String, AttributeValue> row : scanResult.getItems()) {
            String pageURL = row.get(PageAttribute.ORIGINAL_URL.toString()).getS();
            Long timeAccessed = Long.valueOf(row.get(PageAttribute.TIME_ACCESSED.toString()).getN());
            pagesAndTimes.computeIfAbsent(pageURL, url -> new ArrayList<>()).add(timeAccessed);
        }

        Double consumedCapacity = null;
        if (scanResult.getConsumedCapacity() != null) {
            consumedCapacity = scanResult.getConsumedCapacity().getCapacityUnits();
        }

        // No last evaluated key means the scan is finished. Returning a null token (instead of
        // an encoded JSON "null") keeps clients from restarting the scan by passing it back.
        String serializedStartKey = null;
        Map<String, AttributeValue> lastEvaluatedKey = scanResult.getLastEvaluatedKey();
        if (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty()) {
            // Explicit UTF-8 so the token does not depend on the platform default charset.
            byte[] keyJson = objectMapper.writeValueAsString(lastEvaluatedKey).getBytes("UTF-8");
            serializedStartKey = Base64.getUrlEncoder().encodeToString(keyJson);
        }

        return Response.ok(new ListPagesResponse(
                true,
                "Page listing successful",
                consumedCapacity,
                serializedStartKey,
                pagesAndTimes
        )).build();
    } catch (Exception ex) {
        String message = "Getting pages failed. Error: " + ex.getClass().getName() + ": " + ex.getMessage();
        logger.error(message, ex);
        return Response.serverError().entity(new ListPagesResponse(false, message, null, null, null)).build();
    }
}
/**
 * Returns the stored visits of one page, each converted to a typed attribute map.
 *
 * <p>NOTE(review): a single query call is made and its last evaluated key is not followed, so
 * a result that DynamoDB splits across several pages would be truncated — confirm whether
 * that can happen for this table.
 *
 * @param base64Page URL-safe Base64 encoding of the URL-encoded page URL
 * @return 200 with a {@link GetPageSummaryResponse} on success; 500 with a failed
 *         {@link GetPageSummaryResponse} if anything throws
 */
@GET
@Path("{base64Page}")
public Response summarizePage(@PathParam("base64Page") String base64Page) {
    try {
        String decoded = getDecodedURL(base64Page);
        String tableName = config.getString(DYNAMO_TABLE_NAME_CONFIG_KEY);

        // Plain map rather than double-brace initialization, which would create an anonymous
        // HashMap subclass holding a reference to this resource instance.
        Map<String, Condition> conditionMap = new HashMap<>();
        conditionMap.put(PageAttribute.ORIGINAL_URL.toString(),
                new Condition()
                        .withComparisonOperator(ComparisonOperator.EQ)
                        .withAttributeValueList(new AttributeValue().withS(decoded)));

        QueryRequest queryRequest = new QueryRequest()
                .withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL)
                .withTableName(tableName)
                .withSelect(Select.ALL_ATTRIBUTES)
                .withKeyConditions(conditionMap);

        QueryResult queryResult = dynamoDB.query(queryRequest);

        Double capacityConsumed = null;
        if (queryResult.getConsumedCapacity() != null) {
            capacityConsumed = queryResult.getConsumedCapacity().getCapacityUnits();
        }

        final List<Map<PageAttribute, Object>> visits = queryResult.getItems().stream()
                .map(this::getPageAttributeObjectMap)
                .collect(Collectors.toList());

        return Response.ok(new GetPageSummaryResponse(
                true,
                "Page summary successful",
                capacityConsumed,
                visits
        )).build();
    } catch (Exception ex) {
        String message = "Getting page summary failed. Error: " + ex.getMessage();
        logger.error(message, ex);
        return Response.serverError().entity(new GetPageSummaryResponse(false, message, null, null)).build();
    }
}
/**
 * Returns the stored attributes of a single visit, identified by page URL and access time.
 *
 * @param base64Page   URL-safe Base64 encoding of the URL-encoded page URL
 * @param timeAccessed access time of the visit, matched against the TIME_ACCESSED attribute
 * @return 200 with a {@link PageVisitInfoResponse} when the visit exists; 404 with a failed
 *         {@link PageVisitInfoResponse} when the query matches nothing; 500 with a failed
 *         {@link PageVisitInfoResponse} if anything throws
 */
@GET
@Path("{base64Page}/{timeAccessed}")
public Response getPageVisitInfo(
        @PathParam("base64Page") String base64Page,
        @PathParam("timeAccessed") Long timeAccessed) {
    try {
        String decoded = getDecodedURL(base64Page);
        String tableName = config.getString(DYNAMO_TABLE_NAME_CONFIG_KEY);

        // Plain map rather than double-brace initialization, which would create an anonymous
        // HashMap subclass holding a reference to this resource instance.
        Map<String, Condition> conditionMap = new HashMap<>();
        conditionMap.put(PageAttribute.ORIGINAL_URL.toString(),
                new Condition()
                        .withComparisonOperator(ComparisonOperator.EQ)
                        .withAttributeValueList(new AttributeValue().withS(decoded)));
        conditionMap.put(PageAttribute.TIME_ACCESSED.toString(),
                new Condition()
                        .withComparisonOperator(ComparisonOperator.EQ)
                        .withAttributeValueList(new AttributeValue().withN(timeAccessed.toString())));

        QueryRequest queryRequest = new QueryRequest()
                .withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL)
                .withTableName(tableName)
                .withSelect(Select.ALL_ATTRIBUTES)
                .withKeyConditions(conditionMap);

        QueryResult queryResult = dynamoDB.query(queryRequest);

        Double capacityConsumed = null;
        if (queryResult.getConsumedCapacity() != null) {
            capacityConsumed = queryResult.getConsumedCapacity().getCapacityUnits();
        }

        // An unknown page/time pair is a client-side "not found", not a server error; without
        // this guard get(0) would throw and surface as a 500.
        List<Map<String, AttributeValue>> items = queryResult.getItems();
        if (items == null || items.isEmpty()) {
            return Response.status(Response.Status.NOT_FOUND)
                    .entity(new PageVisitInfoResponse(
                            false,
                            "No visit found for the requested page and access time",
                            capacityConsumed,
                            null))
                    .build();
        }

        final Map<PageAttribute, Object> attributeValueMap = getPageAttributeObjectMap(items.get(0));

        return Response.ok(new PageVisitInfoResponse(
                true,
                "Page info get successful",
                capacityConsumed,
                attributeValueMap
        )).build();
    } catch (Exception ex) {
        // Report failures with this endpoint's own response type and wording.
        String message = "Getting page visit info failed. Error: " + ex.getMessage();
        logger.error(message, ex);
        return Response.serverError().entity(new PageVisitInfoResponse(false, message, null, null)).build();
    }
}
/**
 * Fetches the stored raw content of a page from S3 and returns it as a string.
 *
 * @param contentID key of the object inside the configured bucket
 * @return 200 with a {@link GetRawContentResponse} holding the content; 500 with a failed
 *         {@link GetRawContentResponse} if anything throws
 */
@GET
@Path("rawContent/{contentID}")
public Response getRawContent(@PathParam("contentID") String contentID) {
    try {
        String bucketName = config.getString(BUCKET_NAME_CONFIG_KEY);
        GetObjectRequest getObjectRequest = new GetObjectRequest(bucketName, contentID);

        String content;
        // Closing the S3Object releases the underlying HTTP connection; leaving it open leaks
        // a pooled connection on every request.
        try (S3Object s3Object = amazonS3.getObject(getObjectRequest)) {
            // NOTE(review): decoding uses the platform default charset, as before — confirm
            // the encoding the content is stored in and pass it explicitly.
            content = IOUtils.toString(s3Object.getObjectContent());
        }

        return Response.ok(new GetRawContentResponse(true, "Content retrieval successful", content)).build();
    } catch (Exception ex) {
        String message = "Getting page content failed. Error: " + ex.getMessage();
        logger.error(message, ex);
        return Response.serverError().entity(new GetRawContentResponse(false, message, null)).build();
    }
}
/**
 * Recovers a page URL from its path-parameter form: URL-safe Base64 decoding followed by
 * URL (percent) decoding.
 *
 * @param base64Page URL-safe Base64 text wrapping a URL-encoded page URL
 * @return the decoded page URL
 * @throws UnsupportedEncodingException if the "utf-8" charset is not supported
 * @throws IllegalArgumentException if {@code base64Page} is not valid Base64
 */
private String getDecodedURL(String base64Page) throws UnsupportedEncodingException {
    byte[] urlEncodedBytes = Base64.getUrlDecoder().decode(base64Page);
    // Decode the bytes as UTF-8 explicitly instead of relying on the platform default charset,
    // matching the charset used for the percent-decoding step.
    String urlEncoded = new String(urlEncodedBytes, "utf-8");
    return URLDecoder.decode(urlEncoded, "utf-8");
}
/**
 * Converts one DynamoDB row into a map keyed by {@link PageAttribute}, with each value
 * converted according to the attribute's declared type.
 *
 * <p>The map is built with a loop rather than {@code Collectors.toMap}, because that collector
 * throws {@link NullPointerException} when a mapped value is null (which happens for a string
 * attribute with no S value) and would fail the whole request.
 *
 * @param row raw item as returned by DynamoDB, keyed by attribute name
 * @return converted attribute values; a value may be null when the stored string is absent
 * @throws IllegalArgumentException if a row key is not a known {@link PageAttribute} name
 */
private Map<PageAttribute, Object> getPageAttributeObjectMap(Map<String, AttributeValue> row) {
    Map<PageAttribute, Object> converted = new HashMap<>();
    for (Map.Entry<String, AttributeValue> entry : row.entrySet()) {
        PageAttribute key = PageAttribute.valueOf(entry.getKey());
        converted.put(key, convertAttributeValue(key, entry.getValue()));
    }
    return converted;
}

/**
 * Converts a single attribute value according to the type declared by its key. Any failure
 * during conversion is logged and yields an empty string.
 */
private Object convertAttributeValue(PageAttribute key, AttributeValue av) {
    try {
        switch (key.getType()) {
            case LONG:
                return Long.valueOf(av.getN());
            case DOUBLE:
                // TODO: First LOAD_TIME_MILLIS is *always* null, some others are as well
                logger.info("About to parse {} into Double: {}", key, av.getS());
                if (av.getS() == null) {
                    // Sentinel for a missing double value.
                    return -1D;
                }
                return Double.valueOf(av.getS());
            case HASH:
                return objectMapper.readValue(av.getS(), Map.class);
            case ARRAY:
                return objectMapper.readValue(av.getS(), List.class);
            case STRING:
            default:
                return av.getS();
        }
    } catch (Exception ex) {
        logger.error("Error converting", ex);
        return "";
    }
}
}