package com.hao.fetcher;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.hao.util.HttpClientUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.hao.common.Commons.*;
/**
*
* 抓取此网站:http://qichechaxun15.cw503.4everdns.com/query.aspx
* Created by user on 2016/4/18.
*/
public class FetcherBrakes {
private static final String url = "http://qichechaxun15.cw503.4everdns.com/query.aspx";
private static LinkedHashMap<String,String> generalForm = new LinkedHashMap();
static {
generalForm.put("__EVENTTARGET","");
generalForm.put("__EVENTARGUMENT","");
generalForm.put("__LASTFOCUS","");
generalForm.put("__VIEWSTATE","/wEPDwULLTIwODk1MTAzOTMPZBYCAgMPZBYGAgEPEA8WBh4NRGF0YVRleHRGaWVsZAUIdXNlcl9wd2QeDkRhdGFWYWx1ZUZpZWxkBQh1c2VyX3B3ZB4LXyFEYXRhQm91bmRnZBAVUw3pgInmi6kt5ZOB54mMAAJEUwNHTUMITUflkI3niLUETUlOSQVTbWFydAblpaXov6oG5a6d6aqPBuWunemprAnkv53ml7bmjbcM5YyX5rG95Yi26YCgBuWllOmpsAblpZTohb4G5pys55SwCeavlOS6mui/qgbmoIfoh7QG5Yir5YWLBumVv+WuiQbplb/ln44M6ZW/5Liw54yO6LG5BuS8oOelugblpKfkvJcG6YGT5aWHBuS4nOWNlwnoj7LkuprnibkG5Liw55SwBumjjuW6pgbpo47npZ4G6aOO6KGMBuemj+eJuQbnpo/nlLAG5ZOI6aOeBuWTiOW8lwbmtbfpqawG5oKN6amsBuWNjuazsAblkInliKkG5ZCJ5pmuBuaxn+a3rgbmjbfosbkG6YeR5p2vDOWHr+i/quaLieWFiwzlhYvojrHmlq/li5IM6Zu35YWL6JCo5pavBumbt+ivugbnkIblv7UG5Yqb5biGBuiOsuiKsQbmnpfogq8G6ZOD5pyoBumZhumjjgbot6/omY4M6ams6JCo5ouJ6JKCCemprOiHqui+vgnnurPmmbrmjbcG6K605q2MBuasp+WunQblpYfnkZ4G5ZCv6L6wBui1t+S6mgblhajpoboG5pel5LqnBuiNo+WogQbnkZ7pupIG6JCo5Y2aBuS4ieiPsQzkuIrmsb3lpKfpgJoG5Y+M6b6ZCeaWr+W3tOmygQnmlq/mn6/ovr4J54m55pav5ouJBuWogem6nwnmsoPlsJTmsoMG5LqU6I+xBueOsOS7ownpm6rkvZvlhbAJ6Zuq6ZOB6b6ZBuS4gOaxvQnkvp3nu7Tmn68M6Iux6I+y5bC86L+qBuS4reWNjgbkvJfms7AVUw3pgInmi6kt5ZOB54mMAAJEUwNHTUMITUflkI3niLUETUlOSQVTbWFydAblpaXov6oG5a6d6aqPBuWunemprAnkv53ml7bmjbcM5YyX5rG95Yi26YCgBuWllOmpsAblpZTohb4G5pys55SwCeavlOS6mui/qgbmoIfoh7QG5Yir5YWLBumVv+WuiQbplb/ln44M6ZW/5Liw54yO6LG5BuS8oOelugblpKfkvJcG6YGT5aWHBuS4nOWNlwnoj7LkuprnibkG5Liw55SwBumjjuW6pgbpo47npZ4G6aOO6KGMBuemj+eJuQbnpo/nlLAG5ZOI6aOeBuWTiOW8lwbmtbfpqawG5oKN6amsBuWNjuazsAblkInliKkG5ZCJ5pmuBuaxn+a3rgbmjbfosbkG6YeR5p2vDOWHr+i/quaLieWFiwzlhYvojrHmlq/li5IM6Zu35YWL6JCo5pavBumbt+ivugbnkIblv7UG5Yqb5biGBuiOsuiKsQbmnpfogq8G6ZOD5pyoBumZhumjjgbot6/omY4M6ams6JCo5ouJ6JKCCemprOiHqui+vgnnurPmmbrmjbcG6K605q2MBuasp+WunQblpYfnkZ4G5ZCv6L6wBui1t+S6mgblhajpoboG5pel5LqnBuiNo+WogQbnkZ7pupIG6JCo5Y2aBuS4ieiPsQzkuIrmsb3lpKfpgJoG5Y+M6b6ZCeaWr+W3tOmygQnmlq/mn6/ovr4J54m55pav5ouJBuWogem6nwnmsoPlsJTmsoMG5LqU6I+xBueOsOS7ownpm6rkvZvlhbAJ6Zuq6ZOB6b6ZBuS4gOaxvQnkvp3nu7Tmn68M6Iux6I+y5bC86L+qBuS4reWNjgbkvJfms7AUKwNTZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2dnZ2cWAWZkAgMPEGQQFQEQ6YCJ5oupLeWItumAoOWVhhUBEOmAieaLqS3liLbpgKDllYYUKwMBZxYBZmQCBQ8QZBAVAQ3pgInmi6kt6L2m5Z6LFQEN6YCJ5oupLei9puWeixQrAwFnZGQYAQUeX19Db250cm9sc1JlcXVpcmVQb3N0QmFja0tleV9fFgEFDEltYWdlQnV0dG9uMdwIq89Pj1G+oWKFvHGUgzdFZ2L5");
generalForm.put("__VIEWSTATEGENERATOR","EDD8C9AE");
}
public static List<String> getBrands() {
Document content = getDocument(url, "UTF-8");
if (content == null) {
getBrands();
}
Elements viewState = content.select("#__VIEWSTATE");
String value = viewState.attr("value");
generalForm.put("__VIEWSTATE",value);
Elements optElems = content.select("#DropDownList1 > option");
List<String> brands = Lists.newArrayList();
optElems.stream().filter(optElem -> !optElem.text().equals("选择-品牌")).forEach(optElem -> brands.add(optElem.text()));
brands.remove(0); //去除第一个空格
return brands;
}
public static List<String> getManufacturer(String brand) throws Exception {
Map<String,String> form = Maps.newLinkedHashMap();
form.putAll(generalForm);
form.put("DropDownList1",brand);
String html = HttpClientUtil.sendHttpPost(url, form);
if (html == null) {
getManufacturer(brand);
}
Document content = Jsoup.parse(html);
Elements viewState = content.select("#__VIEWSTATE");
String value = viewState.attr("value");
generalForm.put("__VIEWSTATE",value);// 这个值必须替换成正确的值 否则取不到数据
System.out.println(generalForm);
Elements optElems = content.select("#DropDownList2 > option");
List<String> manus = Lists.newArrayList();
optElems.stream().filter(optElem -> !optElem.text().equals("选择-制造商")).forEach(optElem -> manus.add(optElem.text()));
return manus;
}
public static List<List<String>> getType(String brand,String manu) throws Exception {
Map<String,String> form = Maps.newLinkedHashMap();
form.putAll(generalForm);
form.put("DropDownList1",brand);
form.put("DropDownList2",manu);
String html = HttpClientUtil.sendHttpPost(url, form);
if (html == null) {
getType(brand,manu);
}
Document content = Jsoup.parse(html);
Elements viewState = content.select("#__VIEWSTATE");
String value = viewState.attr("value");
generalForm.put("__VIEWSTATE",value);
Elements optElems1 = content.select("#DropDownList3 > option");
List<String> types = Lists.newArrayList(); // 车型
optElems1.stream().filter(optElem -> !optElem.text().equals("选择-车型")).forEach(optElem -> types.add(optElem.text()));
List<String> axis = Lists.newArrayList(); // 轴
Elements optElems2 = content.select("#TextBox1 > option");
optElems2.stream().filter(optElem -> !optElem.text().equals("选择-类型")).forEach(optElem -> axis.add(optElem.text()));
List<String> productLines = Lists.newArrayList(); //产品线
Elements optElems3 = content.select("#DropDownList4 > option");
optElems3.stream().filter(optElem -> !optElem.text().equals("选择-类型")).forEach(optElem -> productLines.add(optElem.text()));
List<List<String>> lists = Lists.newArrayList();
lists.add(types);
lists.add(axis);
lists.add(productLines);
return lists;
}
public static List<List<String>> getSpecData(String brand,String manu,String type,String axis,String productLine) throws Exception {
Map<String,String> form = Maps.newLinkedHashMap();
form.putAll(generalForm);
form.put("DropDownList1",brand);
form.put("DropDownList2",manu);
form.put("DropDownList3",type);
form.put("TextBox1",axis);
form.put("DropDownList4",productLine);
form.put("TextBox2","");
form.put("ImageButton1.x","35"); //最好给个随机值 必须加上 否则取不到值
form.put("ImageButton1.y","6"); //最好给个随机值 必须加上 否则取不到值 很操蛋的问题
String html = HttpClientUtil.sendHttpPost(url, form);
if (html == null) {
getSpecData(brand, manu, type, axis, productLine);
}
Document content = Jsoup.parse(html);
Elements viewState = content.select("#__VIEWSTATE");
String value = viewState.attr("value");
generalForm.put("__VIEWSTATE",value);
Elements trElems = content.select(".query01 > table > tbody > tr");
List<List<String>> dataList = Lists.newArrayListWithCapacity(trElems.size());
for (Element trElem : trElems) {
String href = trElem.select("td > a").attr("href");
href = href.replace(" ","%20");
String specUrl = "http://qichechaxun15.cw503.4everdns.com/" + href;
System.out.println("now downloading spec url:" + specUrl);
List<String> param = getParam(specUrl);
Elements tdElems = trElem.select("td");
List<String> data = Lists.newArrayList();
for (Element tdElem : tdElems) {
data.add(tdElem.text());
}
data.addAll(param);
dataList.add(data);
}
System.out.println(dataList);
return dataList;
}
public static List<String> getParam(String specUrl) {
if (Objects.equals(specUrl,"http://qichechaxun15.cw503.4everdns.com/")) {
return Collections.emptyList();
}
if (!specUrl.startsWith("http://qichechaxun15.cw503.4everdns.com/query_detail.aspx?user_email=&")) {
return Collections.emptyList();
}
Document content = getDocument(specUrl, "UTF-8");
if (content == null) {
getParam(specUrl);
}
List<String> params = Lists.newArrayList();
String param = content.select(".query_right02").get(0).text();
String imgHref = content.select(".bd > ul > img").attr("src");
if (Objects.equals(imgHref, "")) {
imgHref = content.select(".bd > ul > li > img").attr("src");
}
try {
imgHref = URLEncoder.encode(imgHref, "UTF-8");
imgHref.replace("%2F",".");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
String imgUrl = "http://qichechaxun15.cw503.4everdns.com/" + imgHref;
params.add(param);
params.add(imgUrl);
return params;
}
public static void execute(String storePath) throws Exception {
List<String> brands = getBrands();
for (String brand : brands) {
List<String> manufacturers = getManufacturer(brand);
for (String manufacturer : manufacturers) {
List<List<String>> lists = getType(brand, manufacturer);
List<String> types = lists.get(0); //个数不确定
List<String> axis = lists.get(1); // 个数为2
List<String> productLines = lists.get(2); //个数为2
for (String type : types) {
for (String axi : axis) {
for (String productLine : productLines) {
List<List<String>> specDataList = getSpecData(brand, manufacturer, type, axi, productLine);
for (List<String> specData : specDataList) {
String result = "";
for (String s : specData) {
result += "\"" + s + "\"" + ",";
}
System.out.println(result);
writeStringtoFile(storePath,result + "\n",true);
}
}
}
}
}
}
}
public static void main(String[] args) throws Exception {
// getBrands();
// getManufacturer("奔腾");
// getType("本田","本田");
getSpecData("保时捷","保时捷","Macan","后轴","刹车片");
// getParam("http://qichechaxun15.cw503.4everdns.com/query_detail.aspx?user_email=17");
// execute("D:/tmp/brakes_0419_test.txt");
}
}