package com.brianway.webporter.collector.zhihu.download;
import com.brianway.webporter.collector.zhihu.ZhihuConfiguration;
import com.brianway.webporter.util.StringHelper;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.selector.Json;
import java.util.ArrayList;
import java.util.List;
/**
* Created by brian on 16/11/24.
*
* 爬取知乎用户的关注者
* step 1: 运行该类的 main 方法开始爬取
*/
public class ZhihuFolloweePageProcessor implements PageProcessor {
private Site site = new ZhihuConfiguration().getSite();
public void process(Page page) {
Json json = page.getJson();
//System.out.println(json);
page.putField(ZhihuPipeline.URL, page.getUrl());
page.putField(ZhihuPipeline.RESPONSE, json);
String isEnd = json.jsonPath("$.paging.is_end").get();
if (!Boolean.parseBoolean(isEnd)) {
page.addTargetRequest(json.jsonPath("$.paging.next").get());
}
List<String> urlTokens = json.jsonPath("$.data[*].url_token").all();
List<String> urls = generateFolloweeUrls(urlTokens);
page.addTargetRequests(urls);
}
public Site getSite() {
return site;
}
public static String generateFolloweeUrl(String urlToken) {
final String URL_TEMPLATE = "https://www.zhihu.com/api/v4/members/%s/followees";
final String QUERY_PARAMS = "?include=data%5B*%5D.url_token&offset=0&per_page=30&limit=30";
String encoded = StringHelper.urlEncode(urlToken);
return String.format(URL_TEMPLATE, encoded) + QUERY_PARAMS;
}
public static List<String> generateFolloweeUrls(List<String> urlTokens) {
List<String> urls = new ArrayList<>(20);
urlTokens.stream().map(ZhihuFolloweePageProcessor::generateFolloweeUrl).forEach(urls::add);
return urls;
}
/**
* 下载关注列表的用户数据,用于提取 url_tokens
* @param args 无须其他参数
*/
public static void main(String[] args) {
String pipelinePath = new ZhihuConfiguration().getFolloweePath();
int crawlSize = 100_0000;
Spider.create(new ZhihuFolloweePageProcessor())
.setScheduler(//new QueueScheduler()
new FileCacheQueueScheduler(pipelinePath)
.setDuplicateRemover(new BloomFilterDuplicateRemover(crawlSize)))
.addPipeline(new ZhihuPipeline(pipelinePath))
.addUrl(generateFolloweeUrl("hydro-ding"))
.thread(20)
.run();
}
}