import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class AmazonParser
{
/* request types: the first byte that main() reads from the core */
private static final byte PRODUCT_LIST_REQUEST = 0;
private static final byte PRODUCT_INFO_REQUEST = 1;
/* message types: the first byte of every message written to the core */
private static final byte MODULE_RESPONSE = 0;
private static final byte MODULE_HTTP_GET_REQUEST = 1;
/* type tags written before each value in a key-value response */
private static final int TYPE_LONG = 0;
private static final int TYPE_STRING = 1;
/* initial capacity of the buffer that accumulates an HTTP response body */
private static final int BUFFER_SIZE = 4096;
/* error code that httpResponse() treats as a successful download */
private static final int DOWNLOAD_OK = 0;
/* ASCII encodes request URLs; UTF-8 is used for everything else */
private static final Charset ASCII = Charset.forName("US-ASCII");
private static final Charset UTF8 = Charset.forName("UTF-8");
/* query-string fragments appended to a category URL by parseCategory() */
private static final String SORT_BY_PRICE = "&sort=price";
private static final String LOW_PRICE = "&low-price=";
private static final String PAGE_NUMBER = "&page=";
/* base URLs to which a product ID is appended */
private static final String PRODUCT_URL = "http://www.amazon.com/gp/product/";
private static final String TECHNICAL_DETAILS_URL = "http://www.amazon.com/dp/tech-data/";
/*
 * Search-result URLs, one per product category. The index into this array
 * is the category number passed to parseCategory() and stored in the
 * encoded state.
 */
private static final String[] CATEGORY_URLS = new String[] {
/* Computer Cases */
"http://www.amazon.com/s/ref=sr_nr_n_0?rh=n%3A572238",
/* CPU Processors */
"http://www.amazon.com/s/ref=sr_nr_n_1?rh=n%3A229189",
/* Fans & Cooling */
"http://www.amazon.com/s/ref=sr_nr_n_3?rh=n%3A3012290011",
/* Graphics Cards */
"http://www.amazon.com/s/ref=sr_nr_n_4?rh=n%3A284822",
/* I/O Port Cards */
"http://www.amazon.com/s/ref=sr_nr_n_4?rh=n%3A3012291011",
/* Internal Hard Drives */
"http://www.amazon.com/s/ref=sr_nr_n_4?rh=n%3A1254762011",
/* Internal Optical Drives */
"http://www.amazon.com/s/ref=sr_nr_n_8?rh=n%3A172282%2Cn%3A!493964%2Cn%3A541966%2Cn%3A193870011%2Cn%3A1292107011",
/* Internal Sound Cards */
"http://www.amazon.com/s/ref=sr_nr_n_9?rh=n%3A172282%2Cn%3A!493964%2Cn%3A541966%2Cn%3A193870011%2Cn%3A284823",
/* Memory */
"http://www.amazon.com/s/ref=sr_nr_n_11?rh=n%3A172282%2Cn%3A!493964%2Cn%3A541966%2Cn%3A193870011%2Cn%3A172500",
/* Motherboards */
"http://www.amazon.com/s/ref=sr_nr_n_12?rh=n%3A172282%2Cn%3A!493964%2Cn%3A541966%2Cn%3A193870011%2Cn%3A1048424",
/* Network Cards */
"http://www.amazon.com/s/ref=sr_nr_n_13?rh=n%3A172282%2Cn%3A!493964%2Cn%3A541966%2Cn%3A193870011%2Cn%3A13983711",
/* Power Supplies */
"http://www.amazon.com/s/ref=sr_nr_n_14?rh=n%3A172282%2Cn%3A!493964%2Cn%3A541966%2Cn%3A193870011%2Cn%3A1161760",
/* Video Capture Cards */
"http://www.amazon.com/s/ref=sr_nr_n_15?rh=n%3A172282%2Cn%3A!493964%2Cn%3A541966%2Cn%3A193870011%2Cn%3A284824",
/* Internal Modems */
"http://www.amazon.com/s/ref=sr_nr_n_16?rh=n%3A172282%2Cn%3A!493964%2Cn%3A541966%2Cn%3A193870011%2Cn%3A172508",
/* Internal Solid State Drives */
"http://www.amazon.com/s/ref=sr_nr_n_17?rh=n%3A172282%2Cn%3A!493964%2Cn%3A541966%2Cn%3A193870011%2Cn%3A1292116011",
/* Internal Memory Card Readers */
"http://www.amazon.com/s/ref=sr_nr_n_18?rh=n%3A172282%2Cn%3A!493964%2Cn%3A541966%2Cn%3A193870011%2Cn%3A3310626011"
};
/* binary channel to the core: every request and response is written to standard output */
private static final DataOutputStream out =
new DataOutputStream(System.out);
/* binary channel from the core: requests and HTTP responses are read from standard input */
private static final DataInputStream in =
new DataInputStream(System.in);
/**
 * Reads one HTTP response from the core.
 *
 * The message consists of a length-prefixed content type (discarded), a
 * sequence of length-prefixed body chunks terminated by a zero length, and
 * a one-byte error code. All lengths are unsigned 16-bit values.
 *
 * @return the concatenated body chunks, or null if the error code is not
 *         DOWNLOAD_OK
 * @throws IOException if reading from the core fails or the stream ends early
 */
private static byte[] httpResponse() throws IOException
{
    /* the content type is read only to be skipped */
    byte[] contentType = new byte[in.readUnsignedShort()];
    in.readFully(contentType);

    /* gather body chunks until the zero-length terminator */
    ByteArrayOutputStream body = new ByteArrayOutputStream(BUFFER_SIZE);
    for (int chunkSize = in.readUnsignedShort(); chunkSize != 0;
            chunkSize = in.readUnsignedShort()) {
        byte[] chunk = new byte[chunkSize];
        in.readFully(chunk);
        body.write(chunk, 0, chunk.length);
    }

    /* the error code follows the terminator */
    if (in.readUnsignedByte() == DOWNLOAD_OK)
        return body.toByteArray();

    System.err.println("AmazonParser.httpResponse ERROR:"
        + " Error occurred during download.");
    return null;
}
/**
 * Asks the core to download a URL and returns the response body.
 *
 * The request is the MODULE_HTTP_GET_REQUEST byte followed by the
 * ASCII-encoded URL, prefixed with its byte length as an unsigned 16-bit
 * value.
 *
 * @param url the URL to download
 * @return the response body, or null if the core reported a download error
 * @throws IOException if the encoded URL does not fit the 16-bit length
 *         prefix, or if communication with the core fails
 */
private static byte[] httpGetRequest(String url) throws IOException
{
    /*
     * The prefix must be the number of bytes actually written, which is
     * not guaranteed to equal url.length() for every string.
     */
    byte[] encoded = url.getBytes(ASCII);
    if (encoded.length > 0xFFFF) {
        throw new IOException("URL is too long for a 16-bit length prefix ("
            + encoded.length + " bytes).");
    }
    out.writeByte(MODULE_HTTP_GET_REQUEST);
    out.writeShort(encoded.length);
    out.write(encoded);
    out.flush();
    return httpResponse();
}
/**
 * Serializes the crawl position as the UTF-8 bytes of
 * "category|page|lowPrice".
 *
 * @param category index of the category being parsed
 * @param page page number within the category
 * @param lowPrice current low-price filter; a null value is written as the
 *        text "null"
 * @return the encoded state
 */
private static byte[] encodeState(int category, int page, String lowPrice)
{
    StringBuilder encoded = new StringBuilder();
    encoded.append(category).append('|');
    encoded.append(page).append('|');
    /* appending a null String yields the text "null" */
    encoded.append(lowPrice);
    return encoded.toString().getBytes(UTF8);
}
/**
 * Decodes a state produced by encodeState().
 *
 * @param state UTF-8 bytes of "category|page|lowPrice"; may be null or empty
 * @return the decoded state, or null if the input is null, empty or
 *         malformed (a malformed state is reported on standard error)
 */
private static State decodeState(byte[] state)
{
    if (state == null || state.length == 0)
        return null;

    /*
     * A limit of 3 keeps a trailing empty low-price field, which an
     * unlimited split would silently drop.
     */
    String[] tokens = new String(state, UTF8).split("\\|", 3);
    if (tokens.length < 3) {
        System.err.println("AmazonParser.decodeState: Could not decode state. "
            + "Expected 3 fields but found " + tokens.length + ".");
        return null;
    }

    try {
        int category = Integer.parseInt(tokens[0]);
        int page = Integer.parseInt(tokens[1]);
        /* encodeState() writes a missing low price as the text "null" */
        String lowPrice = "null".equals(tokens[2]) ? null : tokens[2];
        return new State(category, page, lowPrice);
    } catch (NumberFormatException e) {
        System.err.println("AmazonParser.decodeState: Could not decode state. "
            + e.getClass().getSimpleName() + " thrown. " + e.getMessage());
        return null;
    }
}
/**
 * Sends a page of product IDs to the core together with the crawl state.
 *
 * The message is the MODULE_RESPONSE byte, the length-prefixed encoded
 * state, the number of IDs, and then each ID as length-prefixed UTF-8.
 * All lengths and counts are written as 16-bit values.
 *
 * @param productIds the product IDs found on the page
 * @param category index of the category being parsed
 * @param page page number the IDs were found on
 * @param lowPrice current low-price filter, may be null
 * @throws IOException if writing to the core fails
 */
private static void respond(ArrayList<String> productIds,
        int category, int page, String lowPrice) throws IOException
{
    out.writeByte(MODULE_RESPONSE);

    byte[] encodedState = encodeState(category, page, lowPrice);
    out.writeShort(encodedState.length);
    out.write(encodedState);

    int count = productIds.size();
    out.writeShort(count);
    for (int i = 0; i < count; i++) {
        byte[] id = productIds.get(i).getBytes(UTF8);
        out.writeShort(id.length);
        out.write(id);
    }

    out.flush();
}
/**
 * Sends a set of product attributes to the core.
 *
 * The message is the MODULE_RESPONSE byte, the number of entries, and then
 * for each entry the trimmed, lower-cased key as length-prefixed UTF-8, a
 * type tag, and the value: a length-prefixed UTF-8 string for TYPE_STRING
 * or a 64-bit integer for TYPE_LONG.
 *
 * Entries whose value is neither a String nor a Number cannot be encoded.
 * They are skipped and left out of the entry count, so the count always
 * matches what is actually written.
 *
 * @param keyValues the attributes to send
 * @throws IOException if writing to the core fails
 */
private static void respond(Map<String, Object> keyValues) throws IOException
{
    /* count only the entries that will really be written */
    int encodable = 0;
    for (Object value : keyValues.values()) {
        if (value instanceof String || value instanceof Number)
            encodable++;
    }

    out.writeByte(MODULE_RESPONSE);
    out.writeShort(encodable);
    for (Entry<String, Object> pair : keyValues.entrySet()) {
        Object valueObject = pair.getValue();
        boolean isString = valueObject instanceof String;
        if (!isString && !(valueObject instanceof Number)) {
            System.err.println("AmazonParser.respond: Skipping key '"
                + pair.getKey() + "' with unsupported value type.");
            continue;
        }

        String keyString = pair.getKey().trim().toLowerCase();
        byte[] key = keyString.getBytes(UTF8);
        out.writeShort(key.length);
        out.write(key);

        if (isString) {
            byte[] value = ((String) valueObject).getBytes(UTF8);
            out.writeByte(TYPE_STRING);
            out.writeShort(value.length);
            out.write(value);
        } else {
            out.writeByte(TYPE_LONG);
            out.writeLong(((Number) valueObject).longValue());
        }
    }
    out.flush();
}
/**
 * Walks the price-sorted result pages of one category and sends the product
 * IDs of every page to the core.
 *
 * Pages are requested up to page 400. When that page is reached without
 * exhausting the results, the price of the last listed product becomes the
 * new low-price filter and paging restarts at page 1. If that price equals
 * the current filter, it is raised by one cent so the crawl keeps
 * advancing.
 *
 * Any download, parse or communication error is reported on standard error
 * and ends the crawl of this category.
 *
 * @param category index into CATEGORY_URLS
 * @param startPage first page to request
 * @param lowPrice initial low-price filter, or null for none
 */
private static void parseCategory(int category, int startPage, String lowPrice)
{
    final int lastPage = 400;
    String url = CATEGORY_URLS[category];
    for (int page = startPage; page <= lastPage; page++)
    {
        /* construct the request URL */
        String request = url + PAGE_NUMBER + page
            + SORT_BY_PRICE;
        if (lowPrice != null)
            request += LOW_PRICE + lowPrice;

        byte[] data;
        try {
            data = httpGetRequest(request);
        } catch (IOException e) {
            System.err.println("AmazonParser.parseCategory ERROR:"
                + " Error requesting URL '" + request + "'.");
            return;
        }
        /* httpGetRequest() returns null when the core reports a failed download */
        if (data == null) {
            System.err.println("AmazonParser.parseCategory ERROR:"
                + " Download failed for URL '" + request + "'.");
            return;
        }

        /* get the current result position and total result count */
        Document document = Jsoup.parse(new String(data, UTF8));
        Elements countElements = document.select("#resultCount");
        if (countElements.size() != 1) {
            System.err.println("AmazonParser.parseCategory ERROR:"
                + " Error parsing result count.");
            return;
        }
        /* the 4th and 6th space-separated tokens hold the two numbers */
        String[] results = countElements.get(0).text().split(" ");
        if (results.length < 6) {
            System.err.println("AmazonParser.parseCategory ERROR:"
                + " Error parsing result count.");
            return;
        }
        int currResult;
        int resultCount;
        try {
            currResult = Integer.parseInt(results[3].replace(",", ""));
            resultCount = Integer.parseInt(results[5].replace(",", ""));
        } catch (NumberFormatException e) {
            System.err.println("AmazonParser.parseCategory ERROR:"
                + " Error parsing result count.");
            return;
        }

        /* parse the Amazon product IDs */
        Elements products = document.select(".prod");
        ArrayList<String> productIds = new ArrayList<String>();
        for (Element product : products) {
            productIds.add(product.attr("name"));
        }

        /* send the product IDs to the core */
        try {
            respond(productIds, category, page, lowPrice);
        } catch (IOException e) {
            System.err.println("AmazonParser.parseCategory ERROR:"
                + " Error responding with product ID list.");
            return;
        }

        /* check to see if we are done parsing this category */
        if (currResult == resultCount)
            return;
        if (page != lastPage)
            continue;

        /* parse the price of the last product */
        if (products.isEmpty()) {
            System.err.println("AmazonParser.parseCategory ERROR:"
                + " No products on the last page to continue from.");
            return;
        }
        Element lastProduct = products.last();
        Elements priceElement = lastProduct.select(".price");
        if (priceElement.size() == 0)
            priceElement = lastProduct.select(".newp");
        Elements redElement = priceElement.select(".red");
        String price = (redElement.size() == 0)
            ? priceElement.text() : redElement.text();

        String newLowPrice = price.trim().replaceAll("\\$", "");
        /* without a price there is no filter to restart the crawl from */
        if (newLowPrice.length() == 0) {
            System.err.println("AmazonParser.parseCategory ERROR:"
                + " Error parsing the price of the last product.");
            return;
        }
        if (newLowPrice.equals(lowPrice)) {
            try {
                lowPrice = incrementPrice(newLowPrice);
            } catch (NumberFormatException e) {
                System.err.println("AmazonParser.parseCategory ERROR:"
                    + " Error parsing price '" + newLowPrice + "'.");
                return;
            }
        } else
            lowPrice = newLowPrice;

        /* the loop increment moves this to page 1 */
        page = 0;
    }
}

/**
 * Returns the given price raised by one cent, formatted with exactly two
 * decimal digits. Assumes the input is written with two decimal digits.
 *
 * @param price a price without currency symbol, e.g. "12.04" or "1,299.99"
 * @return the next price, e.g. "12.05" or "1300.00"
 * @throws NumberFormatException if the price is not numeric
 */
private static String incrementPrice(String price)
{
    int cents = Integer.parseInt(
        price.replace(",", "").replace(".", "")) + 1;
    int remainder = cents % 100;
    /* pad the cents so that 1205 becomes "12.05" rather than "12.5" */
    return (cents / 100) + "." + (remainder < 10 ? "0" : "") + remainder;
}
/**
 * Parses the product categories in order, optionally resuming from a saved
 * position.
 *
 * With no state, every category is parsed from page 1 with no low-price
 * filter. With a state, categories before the saved one are skipped, the
 * saved category resumes at the saved page and low price, and the later
 * categories are parsed from the beginning. A saved category index that
 * matches no category results in nothing being parsed.
 *
 * @param state the position to resume from, or null to start over
 */
private static void getProductList(State state)
{
    int firstCategory = 0;
    int page = 1;
    String lowPrice = null;

    if (state != null) {
        firstCategory = state.getCategory();
        /* a negative index matches no category */
        if (firstCategory < 0)
            return;
        page = state.getPage();
        lowPrice = state.getLowPrice();
    }

    for (int category = firstCategory;
            category < CATEGORY_URLS.length; category++) {
        parseCategory(category, page, lowPrice);
        /* only the resumed category uses the saved position */
        page = 1;
        lowPrice = null;
    }
}
/**
 * Converts a displayed price into an integer by removing the currency
 * symbol, thousands separators and the decimal point, so that "$1,299.99"
 * becomes 129999. For a range such as "$10.00 - $20.00" only the part
 * before the first dash is used.
 *
 * NOTE(review): the result is in cents only if the text is written with
 * two decimal digits - confirm that this always holds for the page.
 *
 * @param price the price text; anything that is not a String yields null
 * @return the parsed value, or null if the text is not a number
 */
private static Integer parsePrice(Object price) {
    if (!(price instanceof String))
        return null;

    String parsed = (String) price;
    /*
     * substring() rather than split(): splitting a lone "-" produces an
     * empty array, and indexing it would throw.
     */
    int dash = parsed.indexOf('-');
    if (dash >= 0)
        parsed = parsed.substring(0, dash);

    String digits = parsed.trim()
        .replace("$", "")
        .replace(",", "")
        .replace(".", "");
    try {
        return Integer.parseInt(digits);
    } catch (NumberFormatException e) {
        return null;
    }
}
/**
 * Extracts one "label: value" entry of the product details list and stores
 * it under a normalized key. Entries without a colon and labels that are
 * not recognized are ignored.
 *
 * Recognized labels: "Shipping Weight", "Item Weight", "Item model number",
 * and the dimension labels "Size", "Product Dimensions" and "Size (LWH)".
 * A dimension value of the form "dimensions; weight" is split, and its
 * second part is stored as the weight.
 *
 * @param keyValues map that receives the parsed attributes
 * @param listElement the list item to parse
 */
private static void parseProductDetail(
        Map<String, Object> keyValues, Element listElement)
{
    String[] parts = listElement.text().split(":", 2);
    if (parts.length < 2)
        return;

    String label = parts[0].trim();
    String content = parts[1].trim().replaceAll(
        "\\(View shipping rates and policies\\)", "").trim();

    /* labels that map directly onto a single key */
    if (label.equals("Shipping Weight")) {
        keyValues.put("shipping weight", content);
        return;
    }
    if (label.equals("Item Weight")) {
        keyValues.put("weight", content);
        return;
    }
    if (label.equals("Item model number")) {
        keyValues.put("model", content);
        return;
    }

    boolean isDimensions = label.equals("Size")
        || label.equals("Product Dimensions")
        || label.equals("Size (LWH)");
    if (!isDimensions)
        return;

    /* the text after a semicolon is stored as the weight */
    String[] pieces = content.split(";", 2);
    if (pieces.length > 1)
        keyValues.put("weight", pieces[1].trim());
    keyValues.put("dimensions", pieces[0].trim());
}
/**
 * Extracts one "label: value" entry of the technical details list and
 * stores it under the lower-cased label. Items that contain no "b" element
 * or no colon are ignored. The label "memory storage capacity" is stored
 * as "capacity".
 *
 * @param keyValues map that receives the parsed attribute
 * @param listElement the list item to parse
 */
private static void parseTechnicalDetail(
        Map<String, Object> keyValues, Element listElement)
{
    if (listElement.select("b").isEmpty())
        return;

    String[] parts = listElement.text().split(":", 2);
    if (parts.length < 2)
        return;

    String label = parts[0].trim().toLowerCase();
    String storedKey = label.equals("memory storage capacity")
        ? "capacity" : label;
    keyValues.put(storedKey, parts[1].trim());
}
/**
 * Downloads the separate technical details page of a product and adds its
 * attributes to the map.
 *
 * Every list item in the "Product Features and Technical Details" section
 * is first offered to parseProductDetail(). If that leaves the map size
 * unchanged, the item is passed to parseTechnicalDetail() instead.
 *
 * If the page cannot be downloaded the map is left untouched.
 *
 * @param keyValues map that receives the parsed attributes
 * @param productId Amazon product ID appended to TECHNICAL_DETAILS_URL
 */
private static void parseTechnicalDetails(
        Map<String, Object> keyValues, String productId)
{
    String url = TECHNICAL_DETAILS_URL + productId;
    byte[] data;
    try {
        data = httpGetRequest(url);
    } catch (IOException e) {
        System.err.println("AmazonParser.parseTechnicalDetails ERROR:"
            + " Error requesting URL '" + url + "'.");
        return;
    }
    /* httpGetRequest() returns null when the core reports a failed download */
    if (data == null || data.length == 0)
        return;

    Document document = Jsoup.parse(new String(data, UTF8));
    Elements elements = document.select(".bucket");
    for (Element element : elements) {
        Elements subelements = element.select("h2");
        if (subelements.size() == 0)
            continue;
        String header = subelements.get(0).text().trim();
        if (header.equals("Product Features and Technical Details")) {
            subelements = element.select("li");
            for (Element subelement : subelements) {
                /* fall back to the generic parser if nothing was added */
                int size = keyValues.size();
                parseProductDetail(keyValues, subelement);
                if (keyValues.size() == size)
                    parseTechnicalDetail(keyValues, subelement);
            }
        }
    }
}
/**
 * Downloads a product page, extracts its attributes and sends them to the
 * core.
 *
 * The attributes are the price (when one can be parsed), the page URL, the
 * name, the image URL, the brand, and the entries of the "Product Details"
 * and "Technical Details" sections. If the page cannot be downloaded, an
 * empty attribute set is sent. Communication errors are reported on
 * standard error.
 *
 * @param productId Amazon product ID appended to PRODUCT_URL
 */
private static void parseProductInfo(String productId)
{
    String url = PRODUCT_URL + productId;
    byte[] data;
    HashMap<String, Object> keyValues = new HashMap<String, Object>();
    try {
        data = httpGetRequest(url);
        if (data == null || data.length == 0) {
            respond(keyValues);
            return;
        }
    } catch (IOException e) {
        System.err.println("AmazonParser.parseProductInfo ERROR:"
            + " Error requesting URL '" + url + "'.");
        return;
    }
    Document document = Jsoup.parse(new String(data, UTF8));

    /* parse the price and store the price and URL */
    Integer price = findPrice(document);
    if (price != null)
        keyValues.put("price", price);
    keyValues.put("url", url);

    /* parse the product name */
    Elements elements = document.select("#btAsinTitle");
    if (elements.size() > 0)
        keyValues.put("name", elements.get(0).text().trim());

    /* parse the image; attr() returns an empty string when there is no src */
    elements = document.select("#main-image");
    if (elements.size() > 0) {
        String image = elements.get(0).attr("src");
        if (image.length() > 0)
            keyValues.put("image", image);
    }

    /* parse brand name from the first span of the form "by <brand>" */
    elements = document.select(".buying span");
    for (Element element : elements) {
        String brand = element.text().trim();
        /* the length check keeps substring(3) in bounds */
        if (brand.length() > 3 && brand.startsWith("by")
            && (Character.isWhitespace(brand.charAt(2))
                || Character.isSpaceChar(brand.charAt(2))))
        {
            keyValues.put("brand", brand.substring(3).trim());
            break;
        }
    }

    /* parse product details */
    elements = document.select("td.bucket");
    for (Element element : elements) {
        Elements subelements = element.select("h2");
        if (subelements.size() == 0)
            continue;
        String header = subelements.get(0).text().trim();
        if (header.equals("Product Details")) {
            subelements = element.select("li");
            for (Element subelement : subelements)
                parseProductDetail(keyValues, subelement);
        } else if (header.equals("Technical Details")) {
            /* the full list may live on a separate page */
            subelements = element.select("a");
            if (subelements.size() > 0 &&
                subelements.get(0).text().contains("See more technical details"))
            {
                parseTechnicalDetails(keyValues, productId);
            } else {
                subelements = element.select("li");
                for (Element subelement : subelements)
                    parseTechnicalDetail(keyValues, subelement);
            }
        }
    }

    try {
        respond(keyValues);
    } catch (IOException e) {
        System.err.println("AmazonParser.parseProductInfo ERROR:"
            + " Error responding with product information.");
    }
}

/**
 * Looks up the product price on a product page.
 *
 * Offer links whose text contains "new" are tried first, and the last one
 * with a price element wins. Otherwise the large actual-price element is
 * used, and failing that the first element with class "price".
 *
 * @param document the parsed product page
 * @return the value produced by parsePrice(), or null if no price was found
 */
private static Integer findPrice(Document document)
{
    Integer price = null;
    for (Element offer : document.select("#olpDivId .olpCondLink")) {
        if (offer.text().contains("new")) {
            Elements offerPrices = offer.select(".price");
            if (offerPrices.size() > 0)
                price = parsePrice(offerPrices.get(0).text());
        }
    }
    if (price != null)
        return price;

    Elements candidates = document.select("#actualPriceValue .priceLarge");
    if (candidates.size() == 0)
        candidates = document.select(".price");
    if (candidates.size() == 0)
        return null;
    return parsePrice(candidates.get(0).text());
}
/**
 * Entry point of the module. Reads one request type byte from the core and
 * serves that request.
 *
 * A product list request carries an optional length-prefixed saved state
 * and starts, or resumes, the crawl of all categories. A product info
 * request carries a sequence of length-prefixed product IDs terminated by
 * a zero length, each of which is parsed in turn. Any other request type
 * is ignored.
 *
 * @param args unused
 */
public static void main(String[] args)
{
    try {
        /* wait for the type of request */
        int requestType = in.readUnsignedByte();
        if (requestType == PRODUCT_LIST_REQUEST) {
            State previous = null;
            int stateLength = in.readUnsignedShort();
            if (stateLength > 0) {
                byte[] encoded = new byte[stateLength];
                in.readFully(encoded);
                previous = decodeState(encoded);
            }
            getProductList(previous);
        } else if (requestType == PRODUCT_INFO_REQUEST) {
            /* a zero length marks the end of the product ID list */
            int idLength;
            while ((idLength = in.readUnsignedShort()) > 0) {
                byte[] id = new byte[idLength];
                in.readFully(id);
                parseProductInfo(new String(id, UTF8));
            }
        }
    } catch (IOException e) {
        System.err.println("AmazonParser.main ERROR:"
            + " Error communicating with core.");
    }
}
}
/**
 * Immutable position within the category crawl: which category is being
 * parsed, which result page, and the low-price filter in effect.
 */
class State {
    /* index of the category being parsed */
    private final int category;
    /* result page within the category */
    private final int page;
    /* low-price filter in effect, or null when there is none */
    private final String lowPrice;

    /**
     * @param category index of the category being parsed
     * @param page result page within the category
     * @param lowPrice low-price filter in effect, or null for none
     */
    public State(int category, int page, String lowPrice) {
        this.category = category;
        this.page = page;
        this.lowPrice = lowPrice;
    }

    /** @return the index of the category being parsed */
    public int getCategory() {
        return category;
    }

    /** @return the result page within the category */
    public int getPage() {
        return page;
    }

    /** @return the low-price filter in effect, or null when there is none */
    public String getLowPrice() {
        return lowPrice;
    }
}