package com.hao.fetcher; import com.google.common.base.Preconditions; import com.google.common.collect.*; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; import static com.hao.common.Commons.*; /** * * 爬取http://www.varta-automotive.cn/zh-cn/search-batteries的蓄电池数据 * @author donghao * Created by user on 2016/2/24. */ public class FetcherBatteries { /** * * @param url 网页的url * @param charset 网页编码 */ public static List<String> parseRawPage(String url, String charset){ Preconditions.checkNotNull(url,"url can not be null"); ArrayList<String> brandList = Lists.newArrayList(); Document document = getDocument(url, charset); Elements optElems = document.select("select > option"); for (Element optElem : optElems) { String text = optElem.val(); if(!text.contains("Select One")){ brandList.add(text); } } return brandList; } /** 获得数据格式{三菱=三菱, 一汽海马=海马, 通用别克=别克 ...} * 解析获取生产厂商的数据 * @param brandList * @return */ public static Map<String, String> getFirmLink(List<String> brandList){ Map<String,String> map = Maps.newLinkedHashMap(); for (String brand : brandList) { String url = "http://www.varta-automotive.cn/index.php/tools/blocks/find_a_battery/lookup.php?make=" + brand + "&placeHolder=%E8%AF%B7%E9%80%89%E6%8B%A9%E6%82%A8%E7%9A%84%E6%B1%BD%E8%BD%A6%E7%94%9F%E4%BA%A7%E5%8E%82%E5%95%86&"; Document document = getDocument(url, "UTF-8"); Elements optElems = document.select("option"); for (Element optElem : optElems) { String firm = optElem.val(); if(!firm.contains("Select One")){ map.put(firm,brand); } } } return map; } /** * 传入的map 格式 {firm=brand,''''''''} * 将链接写入Linkpath 将数据写入dataPath 是为了防止请求过于频繁被封 * @param map * @return */ public static void getTypeLink(Map<String,String> map,String linkPath,String dataPath){ StringBuilder builder = new StringBuilder(); map.forEach( (firm,brand) ->{ String url = "http://www.varta-automotive.cn/index.php/tools/blocks/find_a_battery/lookup.php?make=" + brand +"&oem=" + firm +"&placeHolder=%E8%AF%B7%E9%80%89%E6%8B%A9%E6%82%A8%E7%9A%84%E6%B1%BD%E8%BD%A6%E5%9E%8B%E5%8F%B7&"; Document document = getDocument(url, "UTF-8"); Elements optElems = document.select("option"); for (Element optElem : optElems) { String type = optElem.val(); if(!type.contains("Select One")){ builder.append("\"").append(brand).append("\"").append(",").append("\"").append(firm).append("\"").append(",").append("\"").append(type).append("\"").append("\n"); String searchLink = "http://www.varta-automotive.cn/zh-cn/search-batteries?make=" + brand + "&oem=" + firm +"&model=" + type; try { writeStringtoFile(linkPath,searchLink + "\n",true); } catch (IOException e) {e.printStackTrace();} } } }); try { writeStringtoFile(dataPath,builder.toString(),true); } catch (IOException e) {e.printStackTrace(); } } public static List<String> parseSearchLink(String link) { Document document = getDocument(link, "UTF-8"); Elements resultElems = document.select(".searchResult"); List<String> values = Lists.newArrayListWithCapacity(resultElems.size()); for (Element resultElem : resultElems) { String name = resultElem.select(".details > h3 > a").text(); String detailName = resultElem.select(".details > h5").text(); String value = "," + "\"" + name + "\"" + "," + "\"" + detailName + "\""; values.add(value); } return values; } /** * 解析的数据格式 * 雷诺,雷诺,科雷傲 雷诺,雷诺,梅佳娜 雷诺,雷诺,梅佳娜CC *@param linkPath 传入链接的文件路径 * @param dataPath 传入数据的文件路径 * @throws IOException */ public static void combineData(String linkPath,String dataPath,String writePath) throws IOException { List<String> linkList = readLink(linkPath); List<String> dataList = readLink(dataPath); for (int i = 0; i < linkList.size(); i++) { String link = linkList.get(i); link = link.replaceAll(" ","%20"); List<String> nameList = parseSearchLink(link); for (String name : nameList) { String val = dataList.get(i) + name; writeStringtoFile(writePath,val + "\n",true); } } } public static void main(String[] args) { //// List<String> strings = parseRawPage("http://www.varta-automotive.cn/zh-cn/search-batteries", "UTF-8"); //// Map<String, String> firmLink = getFirmLink(strings); //// getTypeLink(firmLink,"D:/tmp/batteries_link.txt","D:/tmp/batteries_data.txt"); // List<String> s = parseSearchLink("http://www.varta-automotive.cn/zh-cn/search-batteries?make=中华&oem=华晨中华&model=骏捷"); // System.out.println(s); try { combineData("D:/tmp/batteries_link.txt","D:/tmp/batteries_data.txt","all_batteries_data.txt"); } catch (IOException e) { e.printStackTrace(); } } }