package com.geccocrawler.gecco.demo; import com.geccocrawler.gecco.GeccoEngine; import com.geccocrawler.gecco.annotation.Gecco; import com.geccocrawler.gecco.annotation.Href; import com.geccocrawler.gecco.annotation.HtmlField; import com.geccocrawler.gecco.annotation.Request; import com.geccocrawler.gecco.annotation.RequestParameter; import com.geccocrawler.gecco.annotation.Text; import com.geccocrawler.gecco.request.HttpRequest; import com.geccocrawler.gecco.spider.HtmlBean; @Gecco(matchUrl="https://github.com/{user}/{project}", pipelines="consolePipeline", timeout=1000) public class MyGithub implements HtmlBean { private static final long serialVersionUID = -7127412585200687225L; @Request private HttpRequest request; @RequestParameter("user") private String user; @RequestParameter("project") private String project; @Text(own=false) @HtmlField(cssPath=".repository-meta-content") private String title; @Text(own=false) @HtmlField(cssPath=".pagehead-actions li:nth-child(2) .social-count") private int star; @Text @HtmlField(cssPath=".pagehead-actions li:nth-child(3) .social-count") private int fork; @Href @HtmlField(cssPath="ul.numbers-summary > li:nth-child(4) > a") private String contributors; @HtmlField(cssPath=".entry-content") private String readme; public HttpRequest getRequest() { return request; } public void setRequest(HttpRequest request) { this.request = request; } public String getReadme() { return readme; } public void setReadme(String readme) { this.readme = readme; } public String getUser() { return user; } public void setUser(String user) { this.user = user; } public String getProject() { return project; } public void setProject(String project) { this.project = project; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public int getStar() { return star; } public void setStar(int star) { this.star = star; } public int getFork() { return fork; } public void setFork(int fork) { this.fork = fork; } public String getContributors() { return contributors; } public void setContributors(String contributors) { this.contributors = contributors; } public static void main(String[] args) { GeccoEngine.create() .classpath("com.geccocrawler.gecco.demo") //开始抓取的页面地址 .start("https://github.com/xtuhcy/gecco") .start("https://github.com/xtuhcy/gecco-spring") //开启几个爬虫线程,线程数量最好不要大于start request数量 .thread(2) //单个爬虫每次抓取完一个请求后的间隔时间 .interval(2000) //循环抓取 .loop(true) //采用pc端userAgent .mobile(false) //是否开启debug模式,跟踪页面元素抽取 .debug(false) //非阻塞方式运行 .start(); } }