package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.*;
import us.codecraft.webmagic.model.annotation.ExprType;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractBy2;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.Collection;
import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-4 <br>
* Time: 下午8:17 <br>
*/
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements PagedModel {
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
private String pageKey;
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
private String page;
@ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true,notNull = false)
@ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExprType.REGEX)
private List<String> otherPage;
@ExtractBy("//h1[@id=\"h1title\"]/text()")
private String title;
@ExtractBy("//div[@id=\"epContentLeft\"]")
private String content;
@Override
public String getPageKey() {
return pageKey;
}
@Override
public Collection<String> getOtherPages() {
return otherPage;
}
@Override
public String getPage() {
if (page == null) {
return "1";
}
return page;
}
@Override
public PagedModel combine(PagedModel pagedModel) {
News163 news163 = new News163();
news163.title = this.title;
News163 pagedModel1 = (News163) pagedModel;
news163.content = this.content + pagedModel1.content;
return news163;
}
@Override
public String toString() {
return "News163{" +
"content='" + content + '\'' +
", title='" + title + '\'' +
", otherPage=" + otherPage +
'}';
}
public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
.scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
}
}