package xyz.anduo.crawler;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Utility for pulling outgoing link URLs out of an HTML page.
 *
 * <p>The page is downloaded and parsed with jsoup; URLs are taken from
 * {@code <a href>}, {@code <frame src>} and {@code <iframe src>} elements.
 */
public class HtmlParserTool {

    /** Timeout, in milliseconds, passed to the jsoup connection when fetching a page. */
    private static final int FETCH_TIMEOUT_MILLIS = 5000;

    /**
     * Fetches the page at {@code url} and returns the absolute URLs it links to
     * that are accepted by {@code filter}.
     *
     * <p>Attribute values that cannot be resolved to an absolute URL are skipped
     * and never offered to the filter. If the page cannot be fetched (an
     * {@link IOException} is raised), the failure is reported on standard error
     * and an empty set is returned, so one bad page does not abort the caller.
     *
     * @param url    address of the page to download and scan
     * @param filter decides which of the discovered URLs are kept
     * @return a mutable set of accepted absolute URLs; empty if none were found
     *         or the page could not be fetched
     */
    public static Set<String> extracLinks(String url, LinkFilter filter) {
        Set<String> result = new HashSet<String>();
        try {
            Document doc = Jsoup.connect(url).timeout(FETCH_TIMEOUT_MILLIS).get();
            collectAbsoluteUrls(doc.select("a[href]"), "href", filter, result);
            collectAbsoluteUrls(doc.select("frame[src]"), "src", filter, result);
            collectAbsoluteUrls(doc.select("iframe[src]"), "src", filter, result);
        } catch (IOException e) {
            // Best effort: report which page failed and fall through to return the empty set.
            System.err.println("Failed to fetch links from " + url);
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Adds to {@code sink} the absolute form of {@code attribute} for every
     * element whose resolved URL is non-empty and accepted by {@code filter}.
     */
    private static void collectAbsoluteUrls(
            Elements elements, String attribute, LinkFilter filter, Set<String> sink) {
        for (Element element : elements) {
            // Resolve once; absUrl yields "" when no absolute URL can be built.
            String absolute = element.absUrl(attribute);
            if (!absolute.isEmpty() && filter.accept(absolute)) {
                sink.add(absolute);
            }
        }
    }
}