package com.bmk.crawler.processer;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import com.bimoku.common.bean.BookDD;
import com.bimoku.integrate.DDIntegrated;
import com.bmk.crawler.HttpConnnectionManager;
import com.bmk.crawler.PropertiesUtils;
/**
 * Crawler worker that processes the detail page of a single book.
 *
 * <p>{@link #run()} repeatedly takes a URL from {@code Step3Link}'s unvisited
 * queue and hands it to {@link #process(String)}, which downloads the page,
 * extracts the book fields using the CSS selectors configured in
 * {@code PropertiesUtils}, and prints a summary line.
 *
 * @author Lee
 * @Date 2013-8-7
 */
public class Processer3 implements Runnable{

    /**
     * Bean looked up from the Spring context. It is initialised by the
     * constructor but not yet used by the parsing code (see the TODO at the
     * end of {@link #process(String)}).
     */
    private static DDIntegrated ddIntegraed;

    /**
     * Flag consulted by {@link #run()}: the worker keeps looping while the
     * queue is non-empty OR this flag is {@code false}, i.e. it stops once the
     * flag is {@code true} and the queue has been drained.
     * NOTE(review): presumably set to true by the producer of the queue when it
     * has finished - confirm against the code that writes it.
     * Declared volatile so a write from another thread is seen by the worker.
     */
    public static volatile boolean isRunning = false;

    /**
     * Creates a worker and makes sure the shared {@code DDIntegrated} bean has
     * been looked up. The lookup is done only once, so constructing several
     * workers does not build a new Spring context each time.
     */
    public Processer3(){
        synchronized (Processer3.class) {
            if (ddIntegraed == null) {
                ddIntegraed = getIntegraed();
            }
        }
    }

    /**
     * Worker loop: polls the unvisited-URL queue every two seconds while it is
     * empty and processes each URL as it becomes available. Returns when the
     * queue is empty and {@link #isRunning} is {@code true}, or when the thread
     * is interrupted.
     */
    @Override
    public void run() {
        // Keep going while there are links left to crawl or the flag is still false.
        while (!Step3Link.unVisitedUrlsEmpty() || !isRunning) {
            if (Step3Link.unVisitedUrlsEmpty()) {
                try {
                    TimeUnit.SECONDS.sleep(2);
                } catch (InterruptedException e) {
                    // Restore the interrupt flag and stop, so the worker can be cancelled.
                    Thread.currentThread().interrupt();
                    return;
                }
                continue;
            }
            // Take the first link from the unvisited URL queue.
            String visitUrl = Step3Link.unVisitedUrlDeQueue();
            try {
                process(visitUrl);
            } catch (RuntimeException e) {
                // One bad page must not terminate the whole worker.
                System.err.println("Failed to process " + visitUrl + ": " + e);
            }
        }
    }

    /**
     * Downloads a book detail page, extracts its fields into a {@code BookDD}
     * and prints a summary line to standard output.
     *
     * <p>The outline is truncated to at most 2000 characters. If the price text
     * cannot be parsed, the price is left unset and a message is written to
     * standard error.
     *
     * @param visitUrl URL of the detail page to process
     */
    public static void process(String visitUrl){
        // Download the page.
        String html = HttpConnnectionManager.getHtml(visitUrl);//Jsoup.connect(visitUrl).get();
        if (html == null) {
            // NOTE(review): whether getHtml can return null is not visible here;
            // guard anyway because Jsoup.parse rejects a null document.
            System.err.println("No HTML returned for " + visitUrl);
            return;
        }
        Document doc = Jsoup.parse(html);
        BookDD book = new BookDD();

        // Parse the data.
        book.setAuthor(doc.select(selector("author")).text());
        book.setIsbn(doc.select(selector("isbn")).text());
        applyPrice(book, doc.select(selector("price")).text(), visitUrl);
        book.setBookName(doc.select(selector("bookName")).text());
        book.setCover_pic(doc.select(selector("pic")).attr("wsrc").trim());

        String outline = doc.select(selector("outline")).text().trim();
        if (outline.startsWith("<p>")) {
            // The extracted text itself begins with a <p> tag: parse it again
            // and keep only the paragraph text.
            outline = Jsoup.parse(outline).select("p").text();
        }
        book.setOutLine(outline.length() < 2000 ? outline : outline.substring(0, 2000));

        // Data is now wrapped in the entity.
        System.out.println("bookName---->"+book.getBookName()+"<-->"+book.getAuthor()+"<-->"+book.getOutLine()+"<-->"+book.getIsbn());
        //TODO
    }

    /** Returns the configured CSS selector stored under the given property key. */
    private static String selector(String key) {
        return PropertiesUtils.getProperties().getProperty(key);
    }

    /**
     * Parses the price text (its first character is dropped before parsing,
     * presumably a currency symbol) and stores it on the book. Leaves the
     * price unset and reports on standard error when the text is too short or
     * not a number.
     */
    private static void applyPrice(BookDD book, String priceText, String visitUrl) {
        if (priceText.length() < 2) {
            System.err.println("No price found for " + visitUrl);
            return;
        }
        try {
            book.setPrice(Double.parseDouble(priceText.substring(1)));
        } catch (NumberFormatException e) {
            System.err.println("Unparseable price '" + priceText + "' for " + visitUrl);
        }
    }

    /**
     * Starts a worker on a fixed-size thread pool.
     *
     * <p>The pool is shut down right after submission: the submitted worker
     * still runs to completion, and the pool's thread is released afterwards
     * instead of staying alive indefinitely.
     * NOTE(review): only one worker is submitted even though the pool has
     * {@code threadCount} threads; submitting more requires Step3Link's queue
     * to be thread-safe - confirm before changing.
     *
     * @param threadCount number of threads in the pool
     */
    public static void start(int threadCount){
        // Create a reusable thread pool with a fixed number of threads.
        ExecutorService pool = Executors.newFixedThreadPool(threadCount);
        try {
            pool.execute(new Processer3());
        } finally {
            pool.shutdown();
        }
    }

    /** Manual entry point: processes one sample product page. */
    public static void main(String[] args) {
        process("http://product.dangdang.com/product.aspx?product_id=22544222#ddclick?act=click&pos=22544222_27_1_p&cat=01.19.00.00.00.00&key=&qinfo=&pinfo=8824_1_48&minfo=&ninfo=&custid=&permid=20130808112126035747584195810198296&ref=&rcount=&type=&t=1375932112000");
    }

    /**
     * Loads the Spring context from {@code classpath:/beans.xml} and returns
     * the bean named {@code ddIntegraed}. Every call builds a new context,
     * which this method never closes.
     *
     * @return the {@code DDIntegrated} bean from a freshly created context
     */
    public static DDIntegrated getIntegraed(){
        ApplicationContext ctx = new ClassPathXmlApplicationContext("classpath:/beans.xml");
        return (DDIntegrated) ctx.getBean("ddIntegraed");
    }
}