package com.bmk.crawler.processer; import java.io.IOException; import java.util.HashSet; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.bmk.crawler.Filter; import com.bmk.crawler.HttpConnnectionManager; import com.bmk.crawler.LinkFilter; import com.bmk.crawler.PropertiesUtils; /** * @Intro 处理list书籍数据 * @author Lee * @Date 2013-8-7 */ public class Processer2 implements Runnable{ public static boolean isRunning = false; @Override public void run() { //当还有连接未处理,并且没有结束 while (!Step2Link.unVisitedUrlsEmpty() || !isRunning) { //System.out.println(isRunning); if (Step2Link.unVisitedUrlsEmpty()){ try { TimeUnit.SECONDS.sleep(2); } catch (InterruptedException e) { e.printStackTrace(); } continue; } //从未访问的URL队列取出第一个连接 String visitUrl = Step2Link.unVisitedUrlDeQueue(); Set<String> links = this.extracLinks(visitUrl); for(String link : links){ Step2Link.addUnvisitedUrl(link); } } } /** * 抽取满足第二级条件的连接,返回给调用函数,即抽取下一页数据 * * 抽取满足第三级级条件的加入到第二级队列,即抽取书籍详细数据 * * @param temp */ private Set<String> extracLinks(String visitUrl) { //获取当前文档对象 Document doc = Jsoup.parse(HttpConnnectionManager.getHtml(visitUrl));//Jsoup.connect(visitUrl).get(); //获取分页数据 Set<String> links = new HashSet<String>(); //获取当页详细书本数据的连接 setBookDetail(doc,Filter.filter3); //获取list页面 links = getBehindPaging(doc,Filter.filter2); return links; } /** * 获取当页详细书本数据的连接 * 将获取到的连接插入到:step3link中 */ public static void setBookDetail(Document doc,LinkFilter filter) { //解析左边的 Elements cas = doc.select(PropertiesUtils.getProperties().getProperty("setBookDetail")); if(cas.size() > 0){ for(Element e : cas){ String url = e.attr("href"); if(filter.accept(url)){ Step3Link.addUnvisitedUrl(url); } } } } /** * 获取分下下面的链接 * @return */ public static Set<String> getBehindPaging(Document doc,LinkFilter filter) { Set<String> links = new HashSet<String>(); //解析左边的 Elements cas = doc.select(PropertiesUtils.getProperties().getProperty("getBehindPaging")); if(cas.size() > 0){ for(Element e : cas){ String url = e.attr("href"); if(filter.accept(url)){ links.add(PropertiesUtils.getProperties().getProperty("base") + url); } } } return links; } public static void start(int threadCount){ // 创建一个可重用固定线程数的线程池 ExecutorService pool = Executors.newFixedThreadPool(threadCount); Processer2 processer2 = new Processer2(); pool.execute(processer2); } public static void main(String[] args) { //获取当前文档对象 Document doc = Jsoup.parse(HttpConnnectionManager.getHtml("http://category.dangdang.com/all/?category_path=01.38.05.00.00.00"));//Jsoup.connect(visitUrl).get(); //获取当页详细书本数据的连接 getBehindPaging(doc,Filter.filter2); } }