package com.github.xjtushilei.model;
import com.github.xjtushilei.utils.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * A fetched page in the crawler pipeline: the seed it was fetched from, its parsed
 * jsoup document, the new seeds discovered while processing it, and the items
 * extracted for storage.
 *
 * <p>Created by shilei on 2017/4/10.
 */
public class Page {
    private static final Logger logger = LoggerFactory.getLogger(Page.class);

    /** Priority given to seeds that are created without an explicit priority. */
    private static final int DEFAULT_PRIORITY = 5;

    /** URL information of this page. */
    private UrlSeed urlSeed;
    /** jsoup document of this page; parsed with the page URL as base URI when built from HTML. */
    private Document document;
    /** New seeds discovered on this page. */
    private List<UrlSeed> newUrlSeed;
    /** Key/value items to be stored (serialized as JSON downstream). */
    private Map<Object, Object> items;

    /**
     * Creates an empty page; populate it through the fluent setters.
     *
     * @return a new page with no seed and no document
     */
    public static Page create() {
        return new Page();
    }

    /**
     * Creates a page from raw HTML, using the seed's URL as base URI.
     *
     * @param urlSeed seed the page was fetched from
     * @param html    raw HTML of the page
     */
    public Page(UrlSeed urlSeed, String html) {
        this();
        this.urlSeed = urlSeed;
        this.document = Jsoup.parse(html, urlSeed.getUrl());
    }

    /** Creates an empty page with no seed and no document. */
    public Page() {
        newUrlSeed = new ArrayList<>();
        items = new HashMap<>();
    }

    /**
     * Collects the targets of all {@code <a>} elements in the document as seeds
     * with the default priority.
     *
     * <p>When an element has a base URI the link is resolved to an absolute URL,
     * otherwise the raw {@code href} value is used. Anchors without an {@code href},
     * or whose link cannot be resolved, are skipped instead of producing a seed
     * with an empty URL.
     *
     * @return the discovered seeds in document order; empty if no document is set
     */
    public List<UrlSeed> links() {
        List<UrlSeed> result = new ArrayList<>();
        if (document == null) {
            // A page built via create() may not have a document yet.
            logger.debug("links() called before a document was set; returning no links");
            return result;
        }
        for (Element anchor : document.select("a")) {
            String href = StringUtil.isBlank(anchor.baseUri())
                    ? anchor.attr("href")
                    : anchor.attr("abs:href");
            // attr() yields "" for a missing href and for an unresolvable abs:href.
            if (StringUtil.isBlank(href)) {
                continue;
            }
            result.add(new UrlSeed(DEFAULT_PRIORITY, href));
        }
        return result;
    }

    /**
     * Replaces the list of newly discovered seeds.
     *
     * @param newUrlSeed the new list, stored by reference; {@code null} resets to an
     *                   empty list so later {@code addNewUrlSeed} calls keep working
     * @return this page
     */
    public Page setNewUrlSeed(List<UrlSeed> newUrlSeed) {
        this.newUrlSeed = (newUrlSeed != null) ? newUrlSeed : new ArrayList<>();
        return this;
    }

    /**
     * Adds a newly discovered URL with an explicit priority.
     *
     * @param url      URL of the new seed
     * @param priority priority of the new seed
     */
    public void addNewUrlSeed(String url, long priority) {
        newUrlSeed.add(new UrlSeed(priority, url));
    }

    /**
     * Adds an already constructed seed.
     *
     * @param urlSeed seed to add
     */
    public void addNewUrlSeed(UrlSeed urlSeed) {
        newUrlSeed.add(urlSeed);
    }

    /**
     * Adds a newly discovered URL with the default priority.
     *
     * @param url URL of the new seed
     */
    public void addNewUrlSeed(String url) {
        newUrlSeed.add(new UrlSeed(DEFAULT_PRIORITY, url));
    }

    /** @return the seed this page was fetched from, or {@code null} if not set */
    public UrlSeed getUrlSeed() {
        return urlSeed;
    }

    /**
     * Sets the seed this page was fetched from.
     *
     * @param urlSeed the seed
     * @return this page
     */
    public Page setUrlSeed(UrlSeed urlSeed) {
        this.urlSeed = urlSeed;
        return this;
    }

    /** @return the parsed document, or {@code null} if not set */
    public Document getDocument() {
        return document;
    }

    /**
     * Sets the parsed document of this page.
     *
     * @param document the jsoup document
     * @return this page
     */
    public Page setDocument(Document document) {
        this.document = document;
        return this;
    }

    /** @return the items collected for storage */
    public Map<Object, Object> getItems() {
        return items;
    }

    /**
     * Replaces the item map.
     *
     * @param items the new map, stored by reference; {@code null} resets to an empty map
     * @return this page
     */
    // The raw parameter type is kept so existing callers passing any Map still compile.
    @SuppressWarnings({"unchecked", "rawtypes"})
    public Page setItems(Map items) {
        this.items = (items != null) ? items : new HashMap<>();
        return this;
    }

    /** @return the seeds discovered on this page */
    public List<UrlSeed> getNewUrlSeed() {
        return newUrlSeed;
    }

    /**
     * Manual smoke test: downloads a news index page through {@link HttpUtils},
     * parses it and prints every link found on it.
     *
     * @param args unused
     * @throws IOException declared for compatibility with earlier versions of this demo
     */
    public static void main(String[] args) throws IOException {
        String url = "http://news.xjtu.edu.cn/index.htm";
        // The page is fetched once; an earlier, unused Jsoup.connect(url).get() was removed.
        Page page = Page.create()
                .setUrlSeed(new UrlSeed(url))
                .setDocument(Jsoup.parse(HttpUtils.getInstance().get(url), url));
        for (UrlSeed seed : page.links()) {
            System.out.println(seed.getUrl());
        }
    }
}