package com.brianway.webporter.collector.zhihu.download; import com.brianway.webporter.collector.zhihu.ZhihuConfiguration; import com.brianway.webporter.collector.zhihu.processor.MemberURLTokenGenerator; import com.brianway.webporter.util.StringHelper; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; /** * Created by brian on 16/12/19. * * 爬取每个用户的详细信息 * step 2: 运行该类爬去用户详细数据 */ public class ZhihuMemberPageProcessor implements PageProcessor { private Site site = new ZhihuConfiguration().getSite(); public void process(Page page) { page.putField(ZhihuPipeline.URL, page.getUrl()); page.putField(ZhihuPipeline.RESPONSE, page.getRawText()); } public Site getSite() { return site; } private static String generateMemberUrl(String urlToken) { final String URL_TEMPLATE = "https://www.zhihu.com/api/v4/members/%s"; final String QUERY_PARAMS = "?include=locations%2Cemployments%2Cgender%2Ceducations%2Cbusiness%2Cvoteup_count%2Cthanked_Count%2Cfollower_count%2Cfollowing_count%2Ccover_url%2Cfollowing_topic_count%2Cfollowing_question_count%2Cfollowing_favlists_count%2Cfollowing_columns_count%2Canswer_count%2Carticles_count%2Cpins_count%2Cquestion_count%2Cfavorite_count%2Cfavorited_count%2Clogs_count%2Cmarked_answers_count%2Cmarked_answers_text%2Cmessage_thread_token%2Caccount_status%2Cis_active%2Cis_force_renamed%2Cis_bind_sina%2Csina_weibo_url%2Csina_weibo_name%2Cshow_sina_weibo%2Cis_blocking%2Cis_blocked%2Cmutual_followees_count%2Cvote_to_count%2Cvote_from_count%2Cthank_to_count%2Cthank_from_count%2Cthanked_count%2Cdescription%2Chosted_live_count%2Cparticipated_live_count%2Callow_message%2Cindustry_category%2Corg_name%2Corg_homepage%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics"; String encoded = StringHelper.urlEncode(urlToken); return String.format(URL_TEMPLATE, encoded) + QUERY_PARAMS; } /** * 根据提取的 url_token 逐个下载用户的完整信息 * @param args 无须其他参数 */ public static void main(String[] args) { ZhihuConfiguration configuration = new ZhihuConfiguration(); String pipelinePath = configuration.getMemberPath(); Spider spider = Spider.create(new ZhihuMemberPageProcessor()) .setScheduler(new FileCacheQueueScheduler(pipelinePath)) .addPipeline(new ZhihuPipeline(pipelinePath)) .thread(20); MemberURLTokenGenerator generator = new MemberURLTokenGenerator(); generator.generateURLTokens().stream() .map(ZhihuMemberPageProcessor::generateMemberUrl) .forEach(spider::addUrl); spider.run(); } }