package bimoku.extract.parser;

import java.io.File;
import java.util.Map;

import org.apache.commons.collections.map.HashedMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import bimoku.extract.common.PropertyUtil;

import com.bimoku.common.bean.BookAmazon;
import com.bimoku.common.bean.BookDetail;
import com.bimoku.integrate.AmazonIntegrated;
import com.bimoku.integrate.Integrated;

/**
 * Parser registered as the Spring component {@code "parserAmazon"}.
 *
 * <p>{@link #getElementsInfo(String)} reads an HTML file with jsoup and collects the
 * raw text of selected elements into a map; the CSS selectors are looked up through
 * {@code PropertyUtil.readProperty}. {@link #fieldFilter(Map)} then cleans those raw
 * values and copies them into a {@link BookAmazon}.
 */
@Component("parserAmazon")
public class ParserAmazon extends Parser {

    // Maximum lengths applied before the values are stored on the bean.
    // NOTE(review): presumably these mirror database column widths - confirm.
    private static final int MAX_COVER_PIC_LENGTH = 252;
    private static final int MAX_BOOK_NAME_LENGTH = 252;
    private static final int MAX_DESCRIPTION_LENGTH = 2000;
    private static final int MAX_AUTHOR_LENGTH = 45;

    /** Matches a literal '?' or the sequence "¥ " (yen sign followed by a space). */
    private static final String PRICE_NOISE_REGEX = "\\?|¥ ";

    @Autowired
    AmazonIntegrated amazonIntegrated;

    /**
     * Returns the injected {@link AmazonIntegrated} bean.
     *
     * @return the injected bean, never {@code null}
     * @throws IllegalStateException if the bean has not been injected
     */
    @Override
    protected Integrated getIntegratedDao() {
        if (amazonIntegrated == null) {
            throw new IllegalStateException("spring bean 实例化出错");
        }
        return amazonIntegrated;
    }

    /**
     * Converts the raw values produced by {@link #getElementsInfo(String)} into a
     * {@link BookAmazon}.
     *
     * <p>Text fields are truncated to their maximum length (a missing value becomes
     * an empty string) and the price is parsed to a number, falling back to
     * {@code 0.0} when it is missing or not numeric.
     *
     * @param map raw field values keyed by the {@code PropertyUtil} constants
     * @return the populated book bean
     */
    @Override
    protected BookDetail fieldFilter(Map<String, String> map) {
        String authorTranslator = map.get(PropertyUtil.AUTHOR_TRANSLATOR);
        System.out.println(authorTranslator);

        // TODO: this step needs more processing.
        // Indexes 0, 1 and 3 of the result are used as press, version and ISBN.
        String[] publication =
                Patternmatch_Amazon.patternmatchContent(map.get(PropertyUtil.intro_clearfix));
        String press = publication[0];
        String version = publication[1];
        String rawIsbn = publication[3];
        String isbn = rawIsbn == null ? "" : rawIsbn.trim();

        Double price = parsePrice(map.get(PropertyUtil.PRICE));
        // NOTE: PropertyUtil.PUBLISHED_PRICE is present in the map but is not stored;
        // the setPub_price call was already disabled, so the value is no longer parsed.

        String coverPic = truncate(map.get(PropertyUtil.COVER_PIC), MAX_COVER_PIC_LENGTH);
        String bookName = truncate(map.get(PropertyUtil.BOOKNAME), MAX_BOOK_NAME_LENGTH);
        String description =
                truncate(map.get(PropertyUtil.BOOK_DESCIPTION), MAX_DESCRIPTION_LENGTH);

        // TODO: only index 0 (the author) of the result is used.
        String[] authorParts = Patternmatch_Amazon.patternmatchAUT_TRANS(authorTranslator);
        String author = truncate(authorParts[0], MAX_AUTHOR_LENGTH);
        // The translator is not extracted yet and is always stored as an empty string.
        String translator = "";

        BookAmazon book = new BookAmazon();
        book.setAuthor(author);
        book.setBookName(bookName);
        book.setCover_pic(coverPic);
        book.setIsbn(isbn);
        book.setPress(press);
        book.setPrice(price);
        book.setTranslator(translator);
        book.setVersion(version);
        book.setOutLine(description);

        System.out.println(book.toString());
        return book;
    }

    /**
     * Parses the HTML file at {@code filepath} as UTF-8 and collects the raw values
     * of the configured elements.
     *
     * <p>For each key the first element matching the configured selector is used;
     * when nothing matches, the value is an empty string. The cover picture is read
     * from the element's {@code src} attribute, every other field from its text.
     *
     * <p>The keys PRESS, VERSION, ISBN, CLASSFY, EDITOR_CHOICE, CONTENT_CHOICE,
     * AUTHOR_INTRO, DIRECTORY, MEDIA_REVIEWS, EXTRACT, ATTACH_IMAGE_SHOW and
     * COMMENTURL were disabled in the original implementation and are not extracted.
     *
     * @param filepath path of the HTML file to parse
     * @return raw field values keyed by the {@code PropertyUtil} constants
     * @throws Exception if the file cannot be read or parsed
     */
    @Override
    protected Map<String, String> getElementsInfo(String filepath) throws Exception {
        // HashedMap is a raw (non-generic) type; it is only filled with String pairs here.
        @SuppressWarnings("unchecked")
        Map<String, String> fields = new HashedMap();

        Document doc = Jsoup.parse(new File(filepath), "UTF-8");

        fields.put(PropertyUtil.BOOKNAME, firstText(doc, PropertyUtil.BOOKNAME));
        fields.put(PropertyUtil.AUTHOR_TRANSLATOR, firstText(doc, PropertyUtil.AUTHOR_TRANSLATOR));
        fields.put(PropertyUtil.BOOK_DESCIPTION, firstText(doc, PropertyUtil.BOOK_DESCIPTION));
        // TODO (carried over from the original, which gave no further detail).
        fields.put(PropertyUtil.ITEM_ID, firstText(doc, PropertyUtil.ITEM_ID));
        fields.put(PropertyUtil.intro_clearfix, firstText(doc, PropertyUtil.intro_clearfix));
        fields.put(PropertyUtil.PRICE, firstText(doc, PropertyUtil.PRICE));
        fields.put(PropertyUtil.PUBLISHED_PRICE, firstText(doc, PropertyUtil.PUBLISHED_PRICE));
        fields.put(PropertyUtil.COVER_PIC, firstAttr(doc, PropertyUtil.COVER_PIC, "src"));

        return fields;
    }

    /** Returns the first element matching the selector configured under {@code propertyKey}, or null. */
    private static Element firstMatch(Document doc, String propertyKey) {
        return doc.select(PropertyUtil.readProperty(propertyKey)).first();
    }

    /** Returns the text of the first matching element, or "" when nothing matches. */
    private static String firstText(Document doc, String propertyKey) {
        Element match = firstMatch(doc, propertyKey);
        return match == null ? "" : match.text();
    }

    /** Returns the given attribute of the first matching element, or "" when nothing matches. */
    private static String firstAttr(Document doc, String propertyKey, String attribute) {
        Element match = firstMatch(doc, propertyKey);
        return match == null ? "" : match.attr(attribute);
    }

    /**
     * Returns {@code value} cut to at most {@code maxLength} characters.
     * A {@code null} value yields an empty string; shorter values are returned whole.
     */
    private static String truncate(String value, int maxLength) {
        if (value == null) {
            return "";
        }
        return value.length() > maxLength ? value.substring(0, maxLength) : value;
    }

    /**
     * Parses a price after removing '?' characters and "¥ " prefixes.
     * Returns {@code 0.0} when the value is {@code null} or not a valid number.
     */
    private static Double parsePrice(String rawPrice) {
        if (rawPrice == null) {
            return 0.0;
        }
        String cleaned = rawPrice.replaceAll(PRICE_NOISE_REGEX, "");
        try {
            return Double.valueOf(cleaned);
        } catch (NumberFormatException ignored) {
            // Missing or unparsable prices are stored as 0.0.
            return 0.0;
        }
    }
}