package com.widowcrawler.index;
import com.amazonaws.services.dynamodbv2.AmazonDynamoDB;
import com.amazonaws.services.dynamodbv2.model.AttributeValue;
import com.amazonaws.services.dynamodbv2.model.PutItemRequest;
import com.amazonaws.services.dynamodbv2.model.PutItemResult;
import com.amazonaws.services.dynamodbv2.model.ReturnConsumedCapacity;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.widowcrawler.core.model.IndexInput;
import com.widowcrawler.core.model.PageAttribute;
import com.widowcrawler.core.retry.RetryFailedException;
import com.widowcrawler.core.worker.Worker;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.sql.DataSource;
import java.security.NoSuchAlgorithmException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Map;
import java.util.stream.Collectors;
import static com.widowcrawler.core.retry.Retry.retry;
/**
* @author Scott Mansfield
*/
public class IndexWorker extends Worker {
private static final Logger logger = LoggerFactory.getLogger(IndexWorker.class);
private static final String TABLE_NAME_CONFIG_KEY = "com.widowcrawler.table.name";
@Inject
AmazonDynamoDB dynamoDBClient;
@Inject
ObjectMapper objectMapper;
@Inject
DataSource dataSource;
private IndexInput indexInput;
public IndexWorker withInput(IndexInput indexInput) {
this.indexInput = indexInput;
return this;
}
@Override
protected boolean doWork() {
try {
logger.info("Received IndexInput: " + indexInput.getAttribute(PageAttribute.ORIGINAL_URL));
// Check key fields to make sure they exist
if (!indexInput.getAttributes().keySet().contains(PageAttribute.ORIGINAL_URL) ||
!indexInput.getAttributes().keySet().contains(PageAttribute.TIME_ACCESSED)) {
throw new IllegalArgumentException("Attributes ORIGINAL_URL and TIME_ACCESSED must exist");
}
writeToDynamo();
//writeToRDBMS();
return true;
} catch (Exception ex) {
logger.error("Error while indexing.", ex);
return false;
}
}
private void writeToDynamo() throws RetryFailedException, InterruptedException {
Map<String, AttributeValue> attributeValueMap = indexInput.getAttributes().entrySet()
.stream()
.filter((entry) -> entry.getValue() != null)
.filter((entry) -> StringUtils.isNotBlank(entry.getValue().toString()))
.collect(Collectors.toMap(
e -> e.getKey().toString(),
e -> {
try {
switch (e.getKey().getType()) {
case LONG:
case DOUBLE:
return new AttributeValue().withN(e.getValue().toString());
default:
String serializedData = null;
// use objectMapper for everything but strings because it
// adds quotes to raw strings
if (e.getValue().getClass() != String.class) {
serializedData = objectMapper.writeValueAsString(e.getValue());
} else {
serializedData = (String) e.getValue();
}
return new AttributeValue().withS(serializedData);
}
} catch (Exception ex) {
logger.error("Couldn't serialize data to index.", ex);
return null;
}
}
));
String tableName = config.getString(TABLE_NAME_CONFIG_KEY);
PutItemRequest putItemRequest = new PutItemRequest()
.withTableName(tableName)
.withItem(attributeValueMap)
.withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL);
final PutItemResult putItemResult = retry(() -> dynamoDBClient.putItem(putItemRequest));
logger.info("Consumed table capacity: " + putItemResult.getConsumedCapacity().getCapacityUnits());
}
private void writeToRDBMS() throws SQLException, NoSuchAlgorithmException {
Connection connection = dataSource.getConnection();
connection.prepareStatement("use widow;").execute();
final PreparedStatement statement = connection.prepareStatement(
"insert into " +
"page_data(time_accessed," +
" original_url_hash," +
" original_url," +
" load_time_millis," +
" status_code," +
" headers," +
" response_size," +
" content_size," +
" page_content_ref," +
" title," +
" out_links," +
" css_links," +
" img_links," +
" js_links," +
" size_with_assets)" +
"values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
statement.setLong( 1 , Long.valueOf(indexInput.getAttribute(PageAttribute.TIME_ACCESSED).toString()));
statement.setString(2 , DigestUtils.shaHex(indexInput.getAttribute(PageAttribute.ORIGINAL_URL).toString()));
statement.setString(3 , indexInput.getAttribute(PageAttribute.ORIGINAL_URL).toString());
statement.setLong( 4 , Double.valueOf(indexInput.getAttribute(PageAttribute.LOAD_TIME_MILLIS).toString()).longValue());
statement.setInt( 5 , Integer.valueOf(indexInput.getAttribute(PageAttribute.STATUS_CODE).toString()));
statement.setString(6 , indexInput.getAttribute(PageAttribute.HEADERS).toString());
statement.setInt( 7 , Integer.valueOf(indexInput.getAttribute(PageAttribute.RESPONSE_SIZE).toString()));
statement.setInt( 8 , Integer.valueOf(indexInput.getAttribute(PageAttribute.CONTENT_SIZE).toString()));
statement.setString(9 , indexInput.getAttribute(PageAttribute.PAGE_CONTENT_REF).toString());
statement.setString(10, indexInput.getAttribute(PageAttribute.TITLE).toString());
statement.setString(11, indexInput.getAttribute(PageAttribute.OUT_LINKS).toString());
statement.setString(12, indexInput.getAttribute(PageAttribute.CSS_LINKS).toString());
statement.setString(13, indexInput.getAttribute(PageAttribute.IMG_LINKS).toString());
statement.setString(14, indexInput.getAttribute(PageAttribute.JS_LINKS).toString());
statement.setInt( 15, Integer.valueOf(indexInput.getAttribute(PageAttribute.SIZE_WITH_ASSETS).toString()));
statement.executeUpdate();
}
}