package us.codecraft.webmagic.model.samples;

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ComboExtract;
import us.codecraft.webmagic.model.annotation.ComboExtract.OP;
import us.codecraft.webmagic.model.annotation.ConfigInfo;
import us.codecraft.webmagic.model.annotation.ExprType;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;

/**
 * Sample annotation-driven page model for Walmart product pages.
 *
 * <p>The {@link TargetUrl} pattern selects URLs of the form
 * {@code http://www.walmart.com/ip/<digits>}. Each field below carries the
 * extraction expression(s) used to populate it. The fields have no accessors
 * in this class; presumably the framework fills them reflectively and hands
 * the instance to the configured pipeline (confirm against {@code OOSpider}).
 *
 * <p>NOTE(review): the exact semantics of {@code ConfigInfo} options
 * ({@code defaultValue}, {@code isOuterHtml}, {@code isRemoveTag}),
 * {@code ExprType.CONTAINS} and {@code OP.AND}/{@code OP.OR} are defined by
 * the annotation classes, which are not visible here; the field comments
 * describe only what the expressions themselves target.
 */
@TargetUrl("http://www.walmart.com/ip/\\d+")
public class Walmart implements AfterExtractor {

    /**
     * Text of the link in the last breadcrumb list item; configured with an
     * empty default value.
     *
     * <p>NOTE(review): the name looks like a typo for "category". It is left
     * unchanged because code outside this file may refer to the field by
     * name; confirm before renaming.
     */
    @ExtractBy(value = "//ol[@itemprop='breadcrumb']//li[last()]/a/text()",
            configure = @ConfigInfo(defaultValue = ""))
    private String categroy;

    /** Content of the {@code brand} itemprop meta tag. */
    @ExtractBy(value = "//meta[@itemprop='brand']/@content")
    private String manufacturer;

    /** Content of the {@code model} itemprop meta tag. */
    @ExtractBy(value = "//meta[@itemprop='model']/@content")
    private String mpn;

    /**
     * Product name, combined with {@code OP.OR} from three sources: the
     * {@code name} itemprop meta tag, the {@code h1.productTitle} element,
     * and the {@code span} nested inside that heading.
     */
    @ComboExtract(value = {
            @ExtractBy(value = "//meta[@itemprop='name']/@content", type = ExprType.XPATH),
            @ExtractBy(value = "h1.productTitle", type = ExprType.CSS,
                    configure = @ConfigInfo(isOuterHtml = false)),
            @ExtractBy(value = "h1.productTitle p span", type = ExprType.CSS)
    }, op = OP.OR)
    private String productName;

    /**
     * Numeric product id, combined with {@code OP.OR} from three sources: an
     * {@code R3_ITEM.setId('...')} script call, a {@code DefaultItem} script
     * object's {@code itemId}, and the {@code product_id} form input.
     */
    @ComboExtract(value = {
            @ExtractBy(value = "R3_ITEM\\.setId\\(['\"](\\d+)['\"]\\)", type = ExprType.REGEX),
            @ExtractBy(value = "var\\s+DefaultItem\\s*=\\s*\\{\\s*itemId\\s*:\\s*(\\d+)\\s*,",
                    type = ExprType.REGEX),
            @ExtractBy(value = "//input[@name='product_id']/@value")
    }, op = OP.OR)
    private String productId;

    /**
     * Walmart item number. Combines, with {@code OP.AND}, a CSS selection of
     * {@code table.SpecTable} and a regex capturing the digits in the cell
     * that follows the "Walmart No.:" label.
     */
    @ComboExtract(value = {
            @ExtractBy(value = "table.SpecTable", type = ExprType.CSS),
            @ExtractBy(value = "Walmart No\\.:</td>\\s*<td.+?>(\\d+)</td>", type = ExprType.REGEX)
    }, op = OP.AND)
    private String channelSKU;

    /** UPC taken from the {@code strong#UPC_CODE} element inside {@code div#UPC_MESSAGE}. */
    @ExtractBy(value = "div#UPC_MESSAGE strong#UPC_CODE", type = ExprType.CSS,
            configure = @ConfigInfo(isOuterHtml = false))
    private String upc;

    /**
     * Result of a {@code CONTAINS} expression for the Webcollage
     * content-player script URL; configured with the default value
     * {@code "false"}.
     */
    @ExtractBy(value = "http://content.webcollage.net/walmart/resources/content-player/v2/content-player.min.js",
            type = ExprType.CONTAINS, configure = @ConfigInfo(defaultValue = "false"))
    private String wcPlayer;

    /**
     * Combines, with {@code OP.OR}, a {@code CONTAINS} expression for the
     * Webcollage {@code ppp.min.js} script URL (default value
     * {@code "false"}) and a CSS selection of {@code div#wc-aplus}.
     */
    @ComboExtract(value = {
            @ExtractBy(value = "http://content.webcollage.net/walmart/resources/content-player/v2/ppp.min.js",
                    type = ExprType.CONTAINS, configure = @ConfigInfo(defaultValue = "false")),
            @ExtractBy(value = "div#wc-aplus", type = ExprType.CSS)
    }, op = OP.OR)
    private String wcEmc;

    /** Multi-valued extraction over every {@code a} element, with {@code isRemoveTag} enabled. */
    @ExtractBy(value = "a", type = ExprType.CSS, configure = @ConfigInfo(isRemoveTag = true), multi = true)
    private List<String> test;

    /** Creates an empty model; no initialization is performed. */
    public Walmart() {
        // Intentionally empty.
    }

    /**
     * Hook required by {@link AfterExtractor}. This model needs no
     * post-processing, so the method deliberately does nothing.
     *
     * @param page the page this model was extracted from; not used
     */
    @Override
    public void afterProcess(Page page) {
        // Intentionally empty: there is nothing to adjust after extraction.
    }

    /**
     * Runs a one-off test crawl of a single Walmart product page with this
     * model and {@code WalmartPageModelPipeline}.
     *
     * @param args command-line arguments; ignored
     */
    public static void main(String[] args) {
        // Status codes registered with the site as acceptable: 200 and 503.
        Set<Integer> acceptStatCode = new HashSet<Integer>();
        acceptStatCode.add(200);
        acceptStatCode.add(503);

        String domain = "www.walmart.com";
        String userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:22.0) Gecko/20100101 Firefox/22.0";
        int sleepTime = 1000;
        int retryTimes = 3;

        Site walmartSite = Site.me()
                .setDomain(domain)
                .setAcceptStatCode(acceptStatCode)
                .setUserAgent(userAgent)
                .setSleepTime(sleepTime)
                .setRetryTimes(retryTimes);

        OOSpider.create(walmartSite, new WalmartPageModelPipeline(), Walmart.class)
                .test("http://www.walmart.com/ip/9886285");
    }
}