package com.bmk.crawler.processer; import java.io.IOException; import java.util.HashSet; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.bmk.crawler.Filter; import com.bmk.crawler.HttpConnnectionManager; import com.bmk.crawler.LinkFilter; import com.bmk.crawler.PropertiesUtils; /** * @Intro 处理书籍分类的信息 * @author Lee * @Date 2013-8-7 */ public class Processer1 implements Runnable{ public Processer1(){ Step1Link.addUnvisitedUrl("http://category.dangdang.com/all/?category_path=01.00.00.00.00.00"); } @Override public void run() { //当还有连接未处理,并且没有结束 while (!Step1Link.unVisitedUrlsEmpty() && Step1Link.getVisitedUrlNum() <= 1000) { //从未访问的URL队列取出第一个连接 if (Step1Link.unVisitedUrlsEmpty()) break; String visitUrl = Step1Link.unVisitedUrlDeQueue(); Set<String> links = getLeftCategroy(visitUrl,Filter.filter1); for(String link : links){ //当向分类集合中插入一则数据,则会向向list集合中插入一则数据,从而想p2发起任务!!! Step1Link.addUnvisitedUrl(link); } //休息几秒钟 try { TimeUnit.SECONDS.sleep(2); } catch (InterruptedException e) { e.printStackTrace(); } } } /** * 获取左边分类栏的链接 * @return */ public static Set<String> getLeftCategroy(String visitUrl,LinkFilter filter) { Set<String> links = new HashSet<String>(); Document doc = Jsoup.parse(HttpConnnectionManager.getHtml(visitUrl));//Jsoup.connect(visitUrl).get(); //解析左边的 Elements cas = doc.select(PropertiesUtils.getProperties().getProperty("getLeftCategroy")); if(cas.size() > 0){ for(Element e : cas){ links.add(e.attr("href")); } } return links; } public static void start(int threadCount){ // 创建一个可重用固定线程数的线程池 ExecutorService pool = Executors.newFixedThreadPool(threadCount); Processer1 processer1 = new Processer1(); // new Thread(processer1).start(); pool.execute(processer1); // 关闭启动线程 // if(Step1Link.unVisitedUrlsEmpty()){ // pool.shutdown(); // System.out.println("关闭启动线程p2"); // } // 等待子线程结束,再继续执行下面的代码 // try { // pool.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS); // } catch (InterruptedException e) { // e.printStackTrace(); // } // System.out.println("all thread complete"); } public static void main(String[] args) throws InterruptedException { Processer1.start(1); Processer2.start(2); Processer3.start(5); } }