package bimoku.extract.parser;

import java.io.File;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.collections.map.HashedMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import bimoku.extract.common.PropertyUtil;

import com.bimoku.common.bean.BookDD;
import com.bimoku.common.bean.BookDetail;
import com.bimoku.integrate.DDIntegrated;
import com.bimoku.integrate.Integrated;

/**
 * Parser that reads a saved HTML page with jsoup, extracts the raw book fields
 * selected by the CSS queries configured in {@link PropertyUtil}, and maps them
 * onto a {@link BookDD} bean.
 */
@Component("parserDD")
public class ParserDD extends Parser {

    /** Matches a complete markup tag such as {@code <p>} or {@code </div>}. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /** Charset used to decode the saved page. */
    private static final String PAGE_CHARSET = "gb2312";

    /** Separator appended after every category name when building the category path. */
    private static final String CATEGORY_SEPARATOR = ">";

    // Maximum lengths applied before the values are stored on the bean
    // (presumably they mirror the storage column sizes -- TODO confirm).
    private static final int MAX_LONG_TEXT_LENGTH = 2000;
    private static final int MAX_COVER_PIC_LENGTH = 250;
    private static final int MAX_SHORT_TEXT_LENGTH = 252;

    /** Price used when the page carries no price text. */
    private static final double DEFAULT_PRICE = 0.0;

    /** Keys whose value is the text of the first element matched by the configured selector. */
    private static final String[] TEXT_FIELD_KEYS = {
        PropertyUtil.BOOKNAME,
        PropertyUtil.ITEM_ID,
        PropertyUtil.intro_clearfix,
        PropertyUtil.PRICE,
        PropertyUtil.PUBLISHED_PRICE,
        PropertyUtil.AUTHOR_INTRO,
        PropertyUtil.DIRECTORY
    };

    /** Integration DAO injected by Spring. */
    @Autowired
    DDIntegrated ddIntegrated;

    /**
     * Returns the injected integration DAO.
     *
     * @return the {@link DDIntegrated} bean
     * @throws RuntimeException if the bean was not injected
     */
    @Override
    protected Integrated getIntegratedDao() {
        if (ddIntegrated == null) {
            throw new RuntimeException("spring bean 实例化出错");
        }
        return ddIntegrated;
    }

    /**
     * Converts the raw field map produced by {@link #getElementsInfo(String)} into a
     * {@link BookDD}.
     *
     * <p>Author, translator, press, version and ISBN are not read from the map directly;
     * they are taken, in that order, from the array returned by
     * {@code PatternmatchDD.patternmatch} for the basic-info text. Text fields are
     * stripped of markup where applicable and cut to their maximum length. Missing
     * ({@code null}) text values become empty strings and a missing or blank price
     * becomes {@code 0.0}.
     *
     * @param map raw field values keyed by the {@link PropertyUtil} constants
     * @return the populated {@link BookDD}
     * @throws NumberFormatException if the price text is non-blank but not a number
     */
    @Override
    protected BookDetail fieldFilter(Map<String, String> map) {
        // TODO: the basic-info extraction step needs more processing.
        // The basic-info block is parsed with pattern matching.
        String[] infoparam = PatternmatchDD.patternmatch(map.get(PropertyUtil.intro_clearfix));
        String author = infoparam[0];
        String translator = infoparam[1];
        String press = infoparam[2];
        String version = infoparam[3];
        String isbn = infoparam[4] == null ? "" : infoparam[4].trim();

        Double price = parsePrice(map.get(PropertyUtil.PRICE));

        // Limit field lengths.
        String authorIntro = truncate(trimTag(map.get(PropertyUtil.AUTHOR_INTRO)), MAX_LONG_TEXT_LENGTH);
        String directory = truncate(trimTag(map.get(PropertyUtil.DIRECTORY)), MAX_LONG_TEXT_LENGTH);
        String coverPic = truncate(map.get(PropertyUtil.COVER_PIC), MAX_COVER_PIC_LENGTH);
        String bookName = truncate(map.get(PropertyUtil.BOOKNAME), MAX_SHORT_TEXT_LENGTH);
        String catelog = truncate(
                stripTrailingSeparator(map.get(PropertyUtil.CLASSFY)), MAX_SHORT_TEXT_LENGTH);

        BookDD bookDD = new BookDD();
        bookDD.setAuthor(author);
        bookDD.setAuthorIntro(authorIntro);
        bookDD.setBookName(bookName);
        bookDD.setCatelog(catelog);
        bookDD.setCover_pic(coverPic);
        bookDD.setDirectory(directory);
        bookDD.setIsbn(isbn);
        bookDD.setPress(press);
        bookDD.setPrice(price);
        bookDD.setTranslator(translator);
        bookDD.setVersion(version);
        return bookDD;
    }

    /**
     * Parses the saved page at {@code filepath} and collects the raw field values.
     *
     * <p>Every text field holds the text of the first element matched by its configured
     * selector, or an empty string when nothing matches. The category value is the text
     * of every matched element, each followed by {@code ">"}. The cover picture value is
     * the {@code src} attribute of the first matched element.
     *
     * @param filepath path of the saved HTML file
     * @return raw field values keyed by the {@link PropertyUtil} constants
     * @throws Exception if the file cannot be read or parsed
     */
    @Override
    protected Map<String, String> getElementsInfo(String filepath) throws Exception {
        // HashedMap is a raw (pre-generics) type, so this assignment is unchecked;
        // only String keys and values are ever stored in it here.
        @SuppressWarnings("unchecked")
        Map<String, String> map = new HashedMap();

        Document doc = Jsoup.parse(new File(filepath), PAGE_CHARSET);

        for (String key : TEXT_FIELD_KEYS) {
            // Run each selector once instead of once for the null check and once for the value.
            Element first = doc.select(PropertyUtil.readProperty(key)).first();
            map.put(key, first == null ? "" : first.text());
        }

        Elements categoryLinks = doc.select(PropertyUtil.readProperty(PropertyUtil.CLASSFY));
        StringBuilder categoryPath = new StringBuilder();
        for (Element link : categoryLinks) {
            categoryPath.append(link.text()).append(CATEGORY_SEPARATOR);
        }
        map.put(PropertyUtil.CLASSFY, categoryPath.toString());

        Element cover = doc.select(PropertyUtil.readProperty(PropertyUtil.COVER_PIC)).first();
        map.put(PropertyUtil.COVER_PIC, cover == null ? "" : cover.attr("src"));

        return map;
    }

    /**
     * Strips markup tags from a large text field, then removes any leftover
     * {@code <} or {@code >} characters.
     *
     * @param content text that may contain tags; may be {@code null}
     * @return the text without tags or angle brackets; empty string for {@code null}
     */
    public static String trimTag(String content) {
        if (content == null) {
            return "";
        }
        Matcher tagMatcher = TAG_PATTERN.matcher(content);
        String withoutTags = tagMatcher.replaceAll("");
        return withoutTags.replace("<", "").replace(">", "");
    }

    /**
     * Cuts {@code value} to at most {@code maxLength} characters without dropping
     * anything from values that already fit.
     *
     * @return the possibly shortened value; empty string for {@code null}
     */
    private static String truncate(String value, int maxLength) {
        if (value == null) {
            return "";
        }
        return value.length() > maxLength ? value.substring(0, maxLength) : value;
    }

    /**
     * Removes one trailing category separator, if present, so the stored category
     * path does not end with a dangling {@code ">"}.
     *
     * @return the value without a trailing separator; {@code null} stays {@code null}
     */
    private static String stripTrailingSeparator(String categoryPath) {
        if (categoryPath != null && categoryPath.endsWith(CATEGORY_SEPARATOR)) {
            return categoryPath.substring(0, categoryPath.length() - CATEGORY_SEPARATOR.length());
        }
        return categoryPath;
    }

    /**
     * Parses the price text after removing the currency sign and the {@code ?} that
     * takes its place when the sign was mis-decoded.
     *
     * @return the parsed price, or {@code 0.0} when the text is {@code null} or blank
     * @throws NumberFormatException if the remaining text is not a number
     */
    private static Double parsePrice(String rawPrice) {
        if (rawPrice == null) {
            return DEFAULT_PRICE;
        }
        String digits = rawPrice.replaceAll("\\?|¥", "").trim();
        if (digits.length() == 0) {
            return DEFAULT_PRICE;
        }
        return Double.valueOf(digits);
    }
}