package com.hao.spider;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.hao.model.spider.Page;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.selector.Html;
import java.io.IOException;
import java.net.URL;
import java.util.List;
import static com.hao.common.Commons.*;
/**
 * {@link Downloader} that fetches a TMall shop home page together with the widget
 * modules the page loads asynchronously.
 *
 * <p>The main page is downloaded first. Every {@code div} whose class is
 * {@code J_TAsyncModule} contributes a {@code data-widgetid}; each widget is then
 * requested from the {@code widgetAsync.htm} endpoint on the same host and its
 * payload is collected alongside the main page.
 *
 * <p>Created by user on 2016/4/13.
 */
public class TMallHomePageDownload implements Downloader {

    private static final Logger LOGGER = LoggerFactory.getLogger(TMallHomePageDownload.class);

    /** Charset name handed to {@code getDocument} for every request. */
    private static final String CHARSET = "UTF-8";

    /**
     * Widget endpoint template. Arguments, in order: host, widget id, path of the
     * index page, widget id again (suffix of the callback name), site instance id.
     */
    private static final String ASYNC_URL_TEMPLATE = "https://%s/widgetAsync.htm?ids=%s&path=%s&callback=callbackGetMods%s&site_instance_id=%s";

    /**
     * Downloads the page at {@code url} and every asynchronous widget it references.
     *
     * @param url address of the shop home page; must not be {@code null}
     * @return a {@link Page} built from the URL, the main page HTML and the list of
     *         widget payloads, in the order the widget ids appear in the page
     * @throws NullPointerException if {@code url} is {@code null}
     * @throws IOException if {@code url} is malformed, if {@code getDocument} fails,
     *         or if a widget response contains no brace-delimited payload
     */
    @Override
    public Page download(String url) throws IOException {
        Preconditions.checkNotNull(url, "url");
        // Parsed before any network access so a malformed URL fails fast.
        URL indexUrl = new URL(url);

        Html mainHtml = Html.create(getDocument(url, CHARSET).html());
        String siteId = mainHtml.regex("site_instance_id=(\\d+)", 1).get();
        List<String> asyncIdList = mainHtml.xpath("//div[@class='J_TAsyncModule']/@data-widgetid").all();

        if (siteId == null && !asyncIdList.isEmpty()) {
            // NOTE(review): assumes the selector yields null when nothing matches, in which
            // case the literal "null" ends up in the widget URLs below - confirm whether
            // the endpoint tolerates that.
            LOGGER.warn("site_instance_id not found in {}; widget requests may fail", url);
        }

        // Loop-invariant parts of the widget URL.
        String host = indexUrl.getHost();
        String path = indexUrl.getPath();

        List<Html> asyncHtmlList = Lists.newArrayListWithExpectedSize(asyncIdList.size());
        for (String id : asyncIdList) {
            String asyncUrl = String.format(ASYNC_URL_TEMPLATE, host, id, path, id, siteId);
            String payload = extractPayload(getDocument(asyncUrl, CHARSET).html(), asyncUrl);
            LOGGER.info("content is :{}", payload);
            asyncHtmlList.add(Html.create(payload));
            // Pause between widget requests; the bounds are interpreted by Commons.sleep
            // (presumably a randomized delay used for throttling - confirm there).
            sleep(1, 3);
        }
        return Page.create(url, mainHtml, asyncHtmlList);
    }

    /**
     * Returns the part of {@code body} spanning the first <code>{</code> through the
     * last <code>}</code>, both inclusive, i.e. the object wrapped by the callback call.
     *
     * @param body      raw text of the widget response
     * @param sourceUrl URL the response came from; used only in the error message
     * @throws IOException if {@code body} has no opening brace followed by a closing brace
     */
    private static String extractPayload(String body, String sourceUrl) throws IOException {
        int start = body.indexOf('{');
        int end = body.lastIndexOf('}');
        if (start < 0 || end < start) {
            throw new IOException("No brace-delimited payload in widget response from " + sourceUrl);
        }
        // substring's end index is exclusive; +1 keeps the closing brace.
        return body.substring(start, end + 1);
    }
}