package reptile.zhihu; import java.io.File; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import utils.HttpUtils; import utils.OfficeUtils; import com.google.gson.Gson; public class ZhiHuReptile { private static String hostUrl = "http://www.zhihu.com"; private static String allTopicsUrl = hostUrl + "/topics"; public static void main(String[] args) throws Exception { List<Topic> topics = getAllTopics(); List<Answer> allAnswers = new ArrayList<Answer>(); for(int i=0; i<topics.size(); i++) { String topicUrl = getTopicUrl(topics.get(i)); List<Answer> ansersOfPage = getAnsersOfPage(topicUrl); allAnswers.addAll(ansersOfPage); } File file = new File("temp" + File.separator + "reptile" + File.separator + "zhihu_topics_answers.txt"); OfficeUtils.saveCVS(allAnswers, file); } private static List<Answer> getAnsersOfPage(String topicUrl) throws Exception { // http://www.zhihu.com/topic/19550517/top-answers String topAnswersOfTopic = topicUrl + "/top-answers"; // <div class="content"> String response = HttpUtils.getString(topAnswersOfTopic); Document parse = Jsoup.parse(response); Elements elements = parse.getElementsByAttributeValue("class", "content"); List<Answer> answers = new ArrayList<Answer>(); for(Element element : elements) { Answer answer = parseAnswer(element); answers.add(answer); } return answers; } public static Answer parseAnswer(Element answerRootElement) throws Exception { // <a class="question_link" target="_blank" href="/question/25504353">经常上知乎会带来什么错觉?</a> Element questionElement = answerRootElement.getElementsByAttributeValue("class", "question_link").get(0); String questionName = questionElement.text(); String questionLink = hostUrl + questionElement.attr("href"); // <div class="zm-item-vote-info " data-votecount="40947"> Element voteCountElement = answerRootElement.getElementsByAttributeValueContaining("class", "zm-item-vote-info").get(0); String voteCount = voteCountElement.attr("data-votecount"); // <span class="answer-date-link-wrap"> // <a class="answer-date-link last_updated meta-item" data-tip="s$t$发布于 2014-09-24" target="_blank" href="/question/25504353/answer/30949097">编辑于 2014-09-24</a> // </span> String answerLink = ""; Elements answerLinkElements = answerRootElement.getElementsByAttributeValueContaining("href", "answer"); if(answerLinkElements != null && answerLinkElements.size() > 0) { Element answerLinkElement = answerLinkElements.get(0); answerLink = hostUrl + answerLinkElement.attr("href"); } else { System.out.println(answerRootElement + " ... no answer href"); } // <div class="zh-summary summary clearfix"> // 211 985 高考就是纸老虎gpa 3.8 托福雅思都是渣研究生 博士后 本科毕业像条狗北上广 英美欧 要想成功出亚洲 白瘦美 高富帅 满街都是官二代设计师 程序猿 就我一人还没钱大长腿 一八零 六块腹肌才算赢健身房 瑜伽馆 二十开练都算晚ipad mbp 4k才能玩游戏flym… // <a href="/question/25504353/answer/30949097" class="toggle-expand">显示全部</a> // </div> Element summaryElement = answerRootElement.getElementsByAttributeValue("class", "zh-summary summary clearfix").get(0); String summary = summaryElement.text(); // remove 显示全部 summary = summary.substring(0, summary.length() - 4); Answer answer = new Answer(); answer.questionName = questionName; answer.questionLink = questionLink; answer.voteCount = voteCount; answer.answerLink = answerLink; answer.summary = summary; return answer; } static class Answer { String questionName; String questionLink; String voteCount; String answerLink; String summary; @Override public String toString() { return "Answer [questionName=" + questionName + ", questionLink=" + questionLink + ", voteCount=" + voteCount + ", answerLink=" + answerLink + ", summary=" + summary + "]"; } @Override public boolean equals(Object obj) { if(obj instanceof Answer) { Answer answer = (Answer) obj; return this.answerLink.equals(answer.answerLink); } return super.equals(obj); } } public static List<Topic> getAllTopics() throws Exception { // <li data-id="99" class="current"><a href="#互联网">互联网</a></li> List<Topic> topicses = new ArrayList<Topic>(); String response = HttpUtils.getString(allTopicsUrl); Document parse = Jsoup.parse(response); Elements elements = parse.getElementsByAttributeValueStarting("href", "#"); for (Element element : elements) { Topic topic = new Topic(); topic.name = element.text(); topic.topic_id = element.parent().attr("data-id"); topicses.add(topic); } return topicses; } static class Topic { String name; String topic_id; @Override public String toString() { return "Topic [name=" + name + ", topic_id=" + topic_id + "]"; } } public static String getTopicUrl(Topic topic) throws Exception { // AJAX // Request URL:http://www.zhihu.com/node/TopicsPlazzaListV2 // Request Method:POST // Status Code:200 OK // Request Headersview source // Accept:*/* // Accept-Encoding:gzip,deflate,sdch // Accept-Language:zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2 // Connection:keep-alive // Content-Length:127 // Content-Type:application/x-www-form-urlencoded; charset=UTF-8 // Cookie:_za=fd370992-b3c6-40c1-8878-fe6a16e1a9d3; // _xsrf=cef4e8af1bed27a4bf55bc9208519455; // tc=AQAAAMEmW164TAAA+stBMYtLPmY+q7Ig; // q_c1=c893fb9042e04fea9535bad53e1bcc01|1441596419000|1438925628000; // cap_id="OGQwZTBjMThjMmU3NDU4NmE2MTBmZTllMjcwNzZjYjE=|1441603641|ae140c8417ace5f4f2417cd6d1b87c75946b3d3e"; // __utmt=1; // __utma=51854390.1029615844.1441599927.1441599927.1441600830.2; // __utmb=51854390.50.9.1441602466036; __utmc=51854390; // __utmz=51854390.1441600830.2.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; // __utmv=51854390.000--|2=registration_date=20130923=1^3=entry_date=20150807=1; // n_c=1 // Host:www.zhihu.com // Origin:http://www.zhihu.com // Referer:http://www.zhihu.com/topics // User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 // (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36 // X-Requested-With:XMLHttpRequest // Form Dataview sourceview URL encoded // method:next // params:{"topic_id":99,"offset":0,"hash_id":""} // _xsrf:cef4e8af1bed27a4bf55bc9208519455 PostBean postBean = new PostBean(); postBean.topic_id = topic.topic_id; postBean.offset = 0; postBean.hash_id = ""; Map<String, String> postParams = new HashMap<String, String>(); postParams.put("method", "next"); postParams.put("params", new Gson().toJson(postBean)); postParams.put("_xsrf", "cef4e8af1bed27a4bf55bc9208519455"); String response = HttpUtils.postString( "http://www.zhihu.com/node/TopicsPlazzaListV2", postParams, null); ResponseBean responseBean = new Gson().fromJson(response, ResponseBean.class); Document parse = Jsoup.parse(responseBean.msg.get(0)); // <a target="_blank" href="/topic/19550517"> Elements elements = parse.getElementsByAttributeValueStarting("href", "/topic"); return hostUrl + elements.get(0).attr("href"); } static class PostBean { String topic_id; int offset; String hash_id; } static class ResponseBean { String r; List<String> msg; } }