package com.hao.fetcher; import com.google.common.collect.Lists; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.List; import static com.hao.common.Commons.*; /** * *爬取此网站 http://autocat.gates.cn/App/CarSearch * Created by user on 2016/4/5. */ public class FetcherAutoCat { /** * 从http://autocat.gates.cn/App/CarSearch去爬取到所有的品牌 * @param url * @return */ public static List<String> fetchBrand(String url) { Document content = null; try { content = getDocument(url, "UTF-8"); } catch (Exception e) { //如果报异常证明抓取有问题 重新连接 fetchBrand(url); } Elements optElems = content.select("#CarBrand").select("option"); List<String> brandList = Lists.newArrayList(); optElems.stream().filter(optElem -> !optElem.text().isEmpty()).forEach(optElem -> brandList.add(optElem.attr("value"))); return brandList; } /** * 抓取到所有的品牌连接 拼接出 品牌的url 为下一步做准备 * @param url http://autocat.gates.cn/App/CarSearch * @return */ public static List<String> fetchBrandLinks(String url) { List<String> brandList = fetchBrand(url); List<String> links = Lists.newArrayList(); brandList.forEach(brand -> { String specLink = "http://autocat.gates.cn/App/CarSearch?brand=" + brand; links.add(specLink); }); return links; } /** * 传入的值为#{fetchBrandLinks(url)} 抓取的链接 * @param link * @return */ public static List<String> parseBrandLink(String link) { Document content = null; try { content = getDocument(link, "UTF-8"); } catch (Exception e) { parseBrandLink(link); } Elements trElems = content.select(".CarListBox"); List<String> needUrls = Lists.newArrayList(); trElems.forEach(trElem -> { String onclick = trElem.attr("onclick"); String number = onclick.substring(onclick.indexOf("'")+1,onclick.lastIndexOf("'")); String needUrl = "http://autocat.gates.cn/App/CarProducts?GatesNumber=" + number; needUrls.add(needUrl); }); return needUrls; } /** * 分析详情页参数 * @param specLink */ public static void parseSpecPage(String specLink,String filePath) { StringBuilder builder = new StringBuilder(); Document content = null; try { content = getDocument(specLink, "UTF-8"); } catch (Exception e) { parseSpecPage(specLink,filePath); } while (content == null) { parseSpecPage(specLink,filePath); } Elements tableElems = content.select("table"); Elements tdElems = null; if (tableElems != null) { tdElems = tableElems.get(1).select("tr").get(1).select("td"); } if (tdElems != null) { for (Element tdElem : tdElems) { builder.append("\"").append(tdElem.text()).append("\"").append(","); } } System.out.println("now downloading url is:" + specLink); Elements productTrElems = null; if (tableElems != null) { productTrElems = tableElems.get(2).select("tr"); } if(productTrElems == null) { System.out.println(content); } else { productTrElems.stream().filter(productTrElem -> productTrElem.text().contains("滤清器")).forEach(productTrElem -> { Elements productTdElems = productTrElem.select("td"); productTdElems.forEach(productTdElem -> { builder.append("\"").append(productTdElem.text()).append("\"").append(","); }); }); } try { writeStringtoFile(filePath, builder.toString() + "\n", true); } catch (IOException e) { e.printStackTrace(); } System.out.println(builder.toString()); } public static void execute(String url,String storePath) throws Exception { List<String> brandUrls = fetchBrandLinks(url); for (String brandUrl : brandUrls) { List<String> specUrls = parseBrandLink(brandUrl); for (String specUrl : specUrls) { parseSpecPage(specUrl, storePath); } } } public static void main(String[] args) { try { execute("http://autocat.gates.cn/App/CarSearch","D:/tmp/autocattest.txt"); } catch (Exception e) { e.printStackTrace(); } } }