package bimoku.extract.parser; import java.io.File; import java.util.Map; import org.apache.commons.collections.map.HashedMap; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import bimoku.extract.common.PropertyUtil; import com.bimoku.common.bean.BookDB; import com.bimoku.common.bean.BookDetail; import com.bimoku.integrate.DBIntegrated; import com.bimoku.integrate.Integrated; @Component("parserDouban") public class ParserDouban extends Parser{ @Autowired DBIntegrated doubanIntegrated; @Override protected Integrated getIntegratedDao() { if(doubanIntegrated == null) throw new RuntimeException("spring bean 实例化出错"); return doubanIntegrated; } @Override protected BookDetail fieldFilter(Map<String, String> map) { BookDB bookdouban = new BookDB(); //String author_trans = map.get(PropertyUtil.AUTHOR_TRANSLATOR); //System.out.println(author_trans); //String authorIntro = map.get(PropertyUtil.AUTHOR_INTRO); String bookName = map.get(PropertyUtil.BOOKNAME); String cover_pic = map.get(PropertyUtil.COVER_PIC); //String directory = map.get(PropertyUtil.DIRECTORY); //String isbn = map.get(PropertyUtil.ISBN); //String press = map.get(PropertyUtil.PRESS); //String price = map.get(PropertyUtil.PRICE); //String PUBLISHED_PRICE = map.get(PropertyUtil.PUBLISHED_PRICE); //String translator = map.get(PropertyUtil.TRANSLATOR); //String version = map.get(PropertyUtil.VERSION); String intro_clearfix = map.get(PropertyUtil.intro_clearfix); String EXTRACT = map.get(PropertyUtil.EXTRACT); //TODO 这个过程多处理 System.out.println(intro_clearfix); String author = ""; String isbn = ""; String press = ""; String translator= ""; String version = ""; Double pric = 0.0; Double pub_pric = 0.0; // System.out.println(intro_clearfix); String[] infoparam = Patternmatch_Douban.patternmatchContent(intro_clearfix); try{ author = infoparam[0].trim(); }catch(NullPointerException e){ author = ""; } try{ press = infoparam[1].trim(); }catch(NullPointerException e){ press = ""; } try{ pric = Double.valueOf(infoparam[2]); }catch(NumberFormatException e){ pric = 0.0; }catch(NullPointerException e){ pric = 0.0; } try{ isbn = infoparam[3].trim(); }catch(NullPointerException e){ isbn = ""; } try{ translator = infoparam[4].trim(); }catch(NullPointerException e){ translator = ""; } try{ cover_pic = cover_pic.substring(0, cover_pic.length()>45?45:cover_pic.length()); }catch(StringIndexOutOfBoundsException e){ cover_pic = ""; } try{ bookName = bookName.substring(0, bookName.length()>45?45:bookName.length()); }catch(StringIndexOutOfBoundsException e){ bookName = ""; } EXTRACT = EXTRACT.substring(0, EXTRACT.length()>2000?2000:EXTRACT.length()); //TODO /*String[] infoparam_auth = Patternmatch_Amazon.patternmatchAUT_TRANS(author_trans); author = infoparam_auth[0]==null?"": infoparam_auth[0].substring(0, infoparam_auth[0].length()>45?45:infoparam_auth[0].length());; */ bookdouban.setAuthor(author); //bookamazon.setAuthorIntro(authorIntro); bookdouban.setBookName(bookName); bookdouban.setCover_pic(cover_pic); //bookamazon.setDirectory(directory); bookdouban.setIsbn(isbn); bookdouban.setPress(press); bookdouban.setPrice(pric); bookdouban.setTranslator(translator); //bookdouban.setVersion(version); // bookamazon.setPub_price(pub_pric); bookdouban.setOutLine(EXTRACT); System.out.println(bookdouban.toString()); return bookdouban; } @Override protected Map<String, String> getElementsInfo(String filepath) throws Exception{ Map<String, String> map = new HashedMap(); File input = new File(filepath); Document doc = Jsoup.parse(input, "UTF-8"); map.put(PropertyUtil.BOOKNAME, doc.select(PropertyUtil.readProperty(PropertyUtil.BOOKNAME)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.BOOKNAME)).first().text()); //map.put(PropertyUtil.AUTHOR_TRANSLATOR, doc.select(PropertyUtil.readProperty(PropertyUtil.AUTHOR_TRANSLATOR)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.AUTHOR_TRANSLATOR)).first().text()); //map.put(PropertyUtil.BOOK_DESCIPTION, doc.select(PropertyUtil.readProperty(PropertyUtil.BOOK_DESCIPTION)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.BOOK_DESCIPTION)).first().text()); //map.put(PropertyUtil.PRESS, doc.select(PropertyUtil.readProperty(PropertyUtil.PRESS)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.PRESS)).first().text()); //map.put(PropertyUtil.VERSION, doc.select(PropertyUtil.readProperty(PropertyUtil.VERSION)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.VERSION)).first().text()); //TODO map.put(PropertyUtil.ITEM_ID, doc.select(PropertyUtil.readProperty(PropertyUtil.ITEM_ID)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.ITEM_ID)).first().text()); //map.put(PropertyUtil.ISBN, doc.select(PropertyUtil.readProperty(PropertyUtil.ISBN)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.ISBN)).first().text()); map.put(PropertyUtil.intro_clearfix, doc.select(PropertyUtil.readProperty(PropertyUtil.intro_clearfix)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.intro_clearfix)).first().text()); //map.put(PropertyUtil.PRICE, doc.select(PropertyUtil.readProperty(PropertyUtil.PRICE)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.PRICE)).first().text()); //map.put(PropertyUtil.PUBLISHED_PRICE, doc.select(PropertyUtil.readProperty(PropertyUtil.PUBLISHED_PRICE)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.PUBLISHED_PRICE)).first().text()); // Elements linksElements = doc.select(PropertyUtil.readProperty(PropertyUtil.CLASSFY)); // String CLASSFY = ""; // for (Element ele : linksElements) { // CLASSFY += ele.text() + ">"; // } // map.put(PropertyUtil.CLASSFY, CLASSFY); map.put(PropertyUtil.COVER_PIC, doc.select(PropertyUtil.readProperty(PropertyUtil.COVER_PIC)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.COVER_PIC)).first().attr("src")); // map.put(PropertyUtil.EDITOR_CHOICE, doc.select(PropertyUtil.readProperty(PropertyUtil.EDITOR_CHOICE)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.EDITOR_CHOICE)).first().text()); // map.put(PropertyUtil.CONTENT_CHOICE, doc.select(PropertyUtil.readProperty(PropertyUtil.CONTENT_CHOICE)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.CONTENT_CHOICE)).first().text()); // map.put(PropertyUtil.AUTHOR_INTRO, doc.select(PropertyUtil.readProperty(PropertyUtil.AUTHOR_INTRO)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.AUTHOR_INTRO)).first().text()); // map.put(PropertyUtil.DIRECTORY, doc.select(PropertyUtil.readProperty(PropertyUtil.DIRECTORY)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.DIRECTORY)).first().text()); // map.put(PropertyUtil.MEDIA_REVIEWS, doc.select(PropertyUtil.readProperty(PropertyUtil.MEDIA_REVIEWS)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.MEDIA_REVIEWS)).first().text()); if(doc.select(PropertyUtil.readProperty(PropertyUtil.EXTRACT)).last()!=null){ map.put(PropertyUtil.EXTRACT, doc.select(PropertyUtil.readProperty(PropertyUtil.EXTRACT)).last().text()); }else{ map.put(PropertyUtil.EXTRACT, doc.select("div.related_info>div#link-report.indent>div>div.intro").last().text()); } //map.put(PropertyUtil.ATTACH_IMAGE_SHOW, doc.select(PropertyUtil.readProperty(PropertyUtil.ATTACH_IMAGE_SHOW)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.ATTACH_IMAGE_SHOW)).first().text()); //map.put(PropertyUtil.COMMENTURL, doc.select(PropertyUtil.readProperty(PropertyUtil.COMMENTURL)).first()==null?"":doc.select(PropertyUtil.readProperty(PropertyUtil.COMMENTURL)).first().text()); return map; } }