package us.codecraft.webmagic; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; import java.util.List; import java.util.Map; /** * Object storing extracted result and urls to fetch.<br> * Not thread safe.<br> * Main method: <br> * {@link #getUrl()} get url of current page <br> * {@link #getHtml()} get content of current page <br> * {@link #putField(String, Object)} save extracted result <br> * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br> * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br> * * @author code4crafter@gmail.com <br> * @see us.codecraft.webmagic.downloader.Downloader * @see us.codecraft.webmagic.processor.PageProcessor * @since 0.1.0 */ public class Page { private Request request; private ResultItems resultItems = new ResultItems(); private Html html; private Json json; private String rawText; private Selectable url; private Map<String,List<String>> headers; private int statusCode = HttpConstant.StatusCode.CODE_200; private boolean downloadSuccess = true; private List<Request> targetRequests = new ArrayList<Request>(); public Page() { } public static Page fail(){ Page page = new Page(); page.setDownloadSuccess(false); return page; } public Page setSkip(boolean skip) { resultItems.setSkip(skip); return this; } /** * store extract results * * @param key key * @param field field */ public void putField(String key, Object field) { resultItems.put(key, field); } /** * get html content of page * * @return html */ public Html getHtml() { if (html == null) { html = new Html(rawText, request.getUrl()); } return html; } /** * get json content of page * * @return json * @since 0.5.0 */ public Json getJson() { if (json == null) { json = new Json(rawText); } return json; } /** * @param html html * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ public void setHtml(Html html) { this.html = html; } public List<Request> getTargetRequests() { return targetRequests; } /** * add urls to fetch * * @param requests requests */ public void addTargetRequests(List<String> requests) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; } s = UrlUtils.canonicalizeUrl(s, url.toString()); targetRequests.add(new Request(s)); } } /** * add urls to fetch * * @param requests requests * @param priority priority */ public void addTargetRequests(List<String> requests, long priority) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; } s = UrlUtils.canonicalizeUrl(s, url.toString()); targetRequests.add(new Request(s).setPriority(priority)); } } /** * add url to fetch * * @param requestString requestString */ public void addTargetRequest(String requestString) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) { return; } requestString = UrlUtils.canonicalizeUrl(requestString, url.toString()); targetRequests.add(new Request(requestString)); } /** * add requests to fetch * * @param request request */ public void addTargetRequest(Request request) { targetRequests.add(request); } /** * get url of current page * * @return url of current page */ public Selectable getUrl() { return url; } public void setUrl(Selectable url) { this.url = url; } /** * get request of current page * * @return request */ public Request getRequest() { return request; } public void setRequest(Request request) { this.request = request; this.resultItems.setRequest(request); } public ResultItems getResultItems() { return resultItems; } public int getStatusCode() { return statusCode; } public void setStatusCode(int statusCode) { this.statusCode = statusCode; } public String getRawText() { return rawText; } public Page setRawText(String rawText) { this.rawText = rawText; return this; } public Map<String, List<String>> getHeaders() { return headers; } public void setHeaders(Map<String, List<String>> headers) { this.headers = headers; } public boolean isDownloadSuccess() { return downloadSuccess; } public void setDownloadSuccess(boolean downloadSuccess) { this.downloadSuccess = downloadSuccess; } @Override public String toString() { return "Page{" + "request=" + request + ", resultItems=" + resultItems + ", html=" + html + ", json=" + json + ", rawText='" + rawText + '\'' + ", url=" + url + ", headers=" + headers + ", statusCode=" + statusCode + ", success=" + downloadSuccess + ", targetRequests=" + targetRequests + '}'; } }