package us.codecraft.webmagic;

import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;

import java.util.ArrayList;
import java.util.List;

/**
 * Stores the extracted results of a fetched page and the urls to be crawled next.
 *
 * <pre>
 * Main methods:
 * {@link #getUrl()} get url of current page
 * {@link #getHtml()} get content of current page
 * {@link #putField(String, Object)} save extracted result
 * {@link #getResultItems()} get extracted results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
 * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
 * </pre>
 *
 * <p>The {@code addTargetRequest*} methods synchronize on the internal request list while
 * appending to it; {@link #getTargetRequests()} returns that same list without copying.
 *
 * @author code4crafter@gmail.com <br>
 */
public class Page {

    private Request request;

    private final ResultItems resultItems = new ResultItems();

    private Selectable html;

    private Selectable url;

    /** Requests queued by this page; also used as the monitor guarding additions. */
    private final List<Request> targetRequests = new ArrayList<Request>();

    public Page() {
    }

    /**
     * Forwards the skip flag to the underlying {@link ResultItems}.
     *
     * @param skip value passed to {@code ResultItems.setSkip}
     * @return this page, to allow call chaining
     */
    public Page setSkip(boolean skip) {
        resultItems.setSkip(skip);
        return this;
    }

    /**
     * Saves an extracted result under the given key.
     *
     * @param key   key of the result
     * @param field value of the result
     */
    public void putField(String key, Object field) {
        resultItems.put(key, field);
    }

    /**
     * Gets the html content of the page.
     *
     * @return html content of the page
     */
    public Selectable getHtml() {
        return html;
    }

    /**
     * Sets the html content of the page.
     *
     * @param html html content of the page
     */
    public void setHtml(Selectable html) {
        this.html = html;
    }

    /**
     * Gets the requests that have been added to this page so far.
     *
     * @return the internal (live, mutable) list of target requests
     */
    public List<Request> getTargetRequests() {
        return targetRequests;
    }

    /**
     * Adds links to be crawled. Each link is canonicalized against the url of this page.
     * Links that are blank, equal to {@code "#"} or start with {@code "javascript:"} are
     * skipped; the remaining links are still added.
     *
     * @param requests links to be crawled
     */
    public void addTargetRequests(List<String> requests) {
        synchronized (targetRequests) {
            for (String link : requests) {
                if (isIgnoredLink(link)) {
                    // Skip only this entry; a single unusable link must not drop the rest.
                    continue;
                }
                String canonical = UrlUtils.canonicalizeUrl(link, url.toString());
                targetRequests.add(new Request(canonical));
            }
        }
    }

    /**
     * Adds a link to be crawled. The link is canonicalized against the url of this page.
     * A link that is blank, equal to {@code "#"} or starts with {@code "javascript:"} is
     * ignored, matching the filtering done by {@link #addTargetRequests(java.util.List)}.
     *
     * @param requestString link to be crawled
     */
    public void addTargetRequest(String requestString) {
        if (isIgnoredLink(requestString)) {
            return;
        }
        synchronized (targetRequests) {
            String canonical = UrlUtils.canonicalizeUrl(requestString, url.toString());
            targetRequests.add(new Request(canonical));
        }
    }

    /**
     * Adds a request to be crawled; use this when extra information has to be passed
     * along with the request. The request is added as-is, without canonicalization.
     *
     * @param request request to be crawled
     */
    public void addTargetRequest(Request request) {
        synchronized (targetRequests) {
            targetRequests.add(request);
        }
    }

    /**
     * Gets the url of the page.
     *
     * @return url of the current page, usable for extraction
     */
    public Selectable getUrl() {
        return url;
    }

    /**
     * Sets the url of the page. It is also the base used to canonicalize links passed to
     * {@link #addTargetRequest(String)} and {@link #addTargetRequests(java.util.List)}.
     *
     * @param url url of the current page
     */
    public void setUrl(Selectable url) {
        this.url = url;
    }

    /**
     * Gets the request that produced this page.
     *
     * @return the request of this page
     */
    public Request getRequest() {
        return request;
    }

    /**
     * Sets the request of this page and forwards it to the underlying {@link ResultItems}.
     *
     * @param request the request of this page
     */
    public void setRequest(Request request) {
        this.request = request;
        this.resultItems.setRequest(request);
    }

    /**
     * Gets the extracted results.
     *
     * @return the result container of this page
     */
    public ResultItems getResultItems() {
        return resultItems;
    }

    /**
     * Forwards a key/value pair to {@code ResultItems.putHttpHeaderResponse}.
     *
     * @param key   key passed through to the result items
     * @param value value passed through to the result items
     */
    public void putHttpResponse(String key, String value) {
        this.resultItems.putHttpHeaderResponse(key, value);
    }

    /** Tells whether a link is blank, a bare {@code "#"} anchor or a {@code javascript:} link. */
    private static boolean isIgnoredLink(String link) {
        return StringUtils.isBlank(link) || link.equals("#") || link.startsWith("javascript:");
    }

    @Override
    public String toString() {
        return "Page{" +
                "request=" + request +
                ", resultItems=" + resultItems +
                ", html=" + html +
                ", url=" + url +
                ", targetRequests=" + targetRequests +
                '}';
    }
}