package us.codecraft.webmagic.model; import java.lang.annotation.Annotation; import java.lang.reflect.Field; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.ComboExtract; import us.codecraft.webmagic.model.annotation.ConfigInfo; import us.codecraft.webmagic.model.annotation.ExprType; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy2; import us.codecraft.webmagic.model.annotation.ExtractBy3; import us.codecraft.webmagic.model.annotation.ExtractByRaw; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.selector.AbstractedSelector; import us.codecraft.webmagic.selector.AndSelector; import us.codecraft.webmagic.selector.ContainSelector; import us.codecraft.webmagic.selector.CssSelector; import us.codecraft.webmagic.selector.OrSelector; import us.codecraft.webmagic.selector.RegexSelector; import us.codecraft.webmagic.selector.Selector; import us.codecraft.webmagic.selector.XpathSelector; /** * Model主要逻辑类。将一个带注解的POJO转换为一个PageModelExtractor。<br> * * @author code4crafter@gmail.com <br> * Date: 13-8-1 <br> * Time: 下午9:33 <br> */ class PageModelExtractor { private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>(); private Selector targetUrlRegionSelector; private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>(); private Selector helpUrlRegionSelector; private Class<?> clazz; private List<FieldExtractor> fieldExtractors; private Extractor extractor; public static PageModelExtractor create(Class<?> clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); pageModelExtractor.init(clazz); return pageModelExtractor; } private void init(Class<?> clazz) { this.clazz = clazz; initClassExtractors(); fieldExtractors = new ArrayList<FieldExtractor>(); for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); FieldExtractor fieldExtractor = null; boolean hasComboExtract = false; fieldExtractor = getAnnotationExtract(clazz, field); if (fieldExtractor == null) { fieldExtractor = getAnnotationExtractBy(clazz, field); } else { hasComboExtract = true; } FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field); if (fieldExtractor != null && fieldExtractorTmp != null) { throw new IllegalStateException( "Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); } else if (fieldExtractor == null && fieldExtractorTmp != null) { fieldExtractor = fieldExtractorTmp; } if (!hasComboExtract) { // ExtractBy2 & ExtractBy3 if (fieldExtractor != null) { addAnnotationExtractBy2(fieldExtractor); addAnnotationExtractBy3(fieldExtractor); } } fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field); if (fieldExtractor != null && fieldExtractorTmp != null) { throw new IllegalStateException( "Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); } else if (fieldExtractor == null && fieldExtractorTmp != null) { fieldExtractor = fieldExtractorTmp; } if (fieldExtractor != null) { if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { throw new IllegalStateException("Field " + field.getName() + " must be string"); } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) { throw new IllegalStateException("Field " + field.getName() + " must be list"); } fieldExtractors.add(fieldExtractor); } } } private FieldExtractor getAnnotationExtractByUrl(Class<?> clazz, Field field) { FieldExtractor fieldExtractor = null; ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); if (extractByUrl != null) { String regexPattern = extractByUrl.value(); if (regexPattern.trim().equals("")) { regexPattern = ".*"; } fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } } return fieldExtractor; } private FieldExtractor getAnnotationExtract(Class<?> clazz, Field field) { FieldExtractor fieldExtractor = null; ComboExtract extract = field.getAnnotation(ComboExtract.class); if (extract != null) { ExtractBy[] extractBys = extract.value(); ComboExtract.OP op = extract.op(); for (int i = 0; i < extractBys.length; i++) { ExtractBy extractBy = extractBys[i]; if (i == 0) { fieldExtractor = getAnnotationExtractBy(extractBy, clazz, field); } else { ConfigInfo configInfo = extractBy.configure(); boolean isOuterHtml = configInfo.isOuterHtml(); String attrName = configInfo.attrName(); String defaultValue = configInfo.defaultValue(); boolean isTrim = configInfo.isTrim(); boolean isRemoveTag = configInfo.isRemoveTag(); ConfigInfoObj configInfoObj = new ConfigInfoObj(); configInfoObj.setOuterHtml(isOuterHtml); configInfoObj.setAttrName(attrName); configInfoObj.setDefaultValue(defaultValue); configInfoObj.setTrim(isTrim); configInfoObj.setRemoveTag(isRemoveTag); fieldExtractor = addExtractBy(fieldExtractor, extractBy.type(), extractBy.value(), op, configInfoObj); } } } return fieldExtractor; } private Selector getSelector(ExprType type, String expr) { ConfigInfoObj configInfoObj = new ConfigInfoObj(); configInfoObj.setOuterHtml(true); return getSelector(type, expr, configInfoObj); } private Selector getSelector(ExprType type, String expr, ConfigInfoObj configInfoObj) { if (configInfoObj == null) configInfoObj = new ConfigInfoObj(); boolean isOuterHtml = configInfoObj.isOuterHtml(); String attrName = configInfoObj.getAttrName(); String defaultValue = configInfoObj.getDefaultValue(); boolean isTrim = configInfoObj.isTrim(); boolean isRemoveTag = configInfoObj.isRemoveTag(); AbstractedSelector.Temp tempObj = new AbstractedSelector.Temp(); tempObj.setDefaultValue(defaultValue); tempObj.setTrim(isTrim); tempObj.setRemoveTag(isRemoveTag); Selector selector = null; switch (type) { case CSS: selector = new CssSelector(expr, isOuterHtml, attrName, tempObj); break; case REGEX: selector = new RegexSelector(expr, tempObj); break; case XPATH: selector = new XpathSelector(expr, tempObj); break; case CONTAINS: selector = new ContainSelector(expr, tempObj); break; default: selector = new XpathSelector(expr, tempObj); } return selector; } private FieldExtractor addExtractBy(FieldExtractor fieldExtractor, ExprType type, String expr, ComboExtract.OP op, ConfigInfoObj configInfoObj) { if (fieldExtractor == null) return null; Selector selector = getSelector(type, expr, configInfoObj); if (ComboExtract.OP.AND.equals(op)) { fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); } else { fieldExtractor.setSelector(new OrSelector(fieldExtractor.getSelector(), selector)); } return fieldExtractor; } private FieldExtractor addExtractBy(FieldExtractor fieldExtractor, ExprType type, String expr, ComboExtract.OP op) { ConfigInfoObj configInfoObj = new ConfigInfoObj(); configInfoObj.setOuterHtml(true); return addExtractBy(fieldExtractor, type, expr, op, configInfoObj); } private FieldExtractor getAnnotationExtractBy(Class<?> clazz, Field field) { ExtractBy extractBy = field.getAnnotation(ExtractBy.class); FieldExtractor fieldExtractor = getAnnotationExtractBy(extractBy, clazz, field); return fieldExtractor; } private FieldExtractor getAnnotationExtractBy(ExtractBy extractBy, Class<?> clazz, Field field) { FieldExtractor fieldExtractor = null; if (extractBy != null) { String value = extractBy.value(); ConfigInfo configInfo = extractBy.configure(); boolean isOuterHtml = configInfo.isOuterHtml(); String attrName = configInfo.attrName(); String defaultValue = configInfo.defaultValue(); boolean isTrim = configInfo.isTrim(); boolean isRemoveTag = configInfo.isRemoveTag(); ConfigInfoObj configInfoObj = new ConfigInfoObj(); configInfoObj.setOuterHtml(isOuterHtml); configInfoObj.setAttrName(attrName); configInfoObj.setDefaultValue(defaultValue); configInfoObj.setTrim(isTrim); configInfoObj.setRemoveTag(isRemoveTag); Selector selector = getSelector(extractBy.type(), value, configInfoObj); fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } } return fieldExtractor; } @SuppressWarnings("deprecation") private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) { ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class); if (extractBy != null) { String expr = extractBy.value(); ExprType type = extractBy.type(); addExtractBy(fieldExtractor, type, expr, ComboExtract.OP.AND); } } @SuppressWarnings("deprecation") private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) { ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class); if (extractBy != null) { String expr = extractBy.value(); ExprType type = extractBy.type(); addExtractBy(fieldExtractor, type, expr, ComboExtract.OP.AND); } } private FieldExtractor getAnnotationExtractByRaw(Class<?> clazz, Field field) { FieldExtractor fieldExtractor = null; ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class); if (extractByRaw != null) { String value = extractByRaw.value(); Selector selector = getSelector(extractByRaw.type(), value); fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); } } return fieldExtractor; } public static Method getSetterMethod(Class<?> clazz, Field field) { String name = "set" + StringUtils.capitalize(field.getName()); try { Method declaredMethod = clazz.getDeclaredMethod(name, field.getType()); declaredMethod.setAccessible(true); return declaredMethod; } catch (NoSuchMethodException e) { return null; } } private void initClassExtractors() { Annotation annotation = clazz.getAnnotation(TargetUrl.class); if (annotation == null) { targetUrlPatterns.add(Pattern.compile(".*")); } else { TargetUrl targetUrl = (TargetUrl) annotation; String[] value = targetUrl.value(); for (String s : value) { Pattern pattern = normalizeRegex(s); if (pattern != null) targetUrlPatterns.add(pattern); } if (!targetUrl.sourceRegion().equals("")) { targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion()); } } annotation = clazz.getAnnotation(HelpUrl.class); if (annotation != null) { HelpUrl helpUrl = (HelpUrl) annotation; String[] value = helpUrl.value(); for (String s : value) { Pattern pattern = normalizeRegex(s); if (pattern != null) helpUrlPatterns.add(pattern); } if (!helpUrl.sourceRegion().equals("")) { helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion()); } } annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); } } private Pattern normalizeRegex(String s) { Pattern pattern = null; if (StringUtils.isEmpty(s)) return pattern; pattern = Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"); return pattern; } public Object process(Page page) { if (extractor == null) { return processSingle(page, page.getHtml().toString()); } else { if (extractor.multi) { List<Object> os = new ArrayList<Object>(); List<String> list = extractor.getSelector().selectList(page.getHtml().toString()); for (String s : list) { Object o = processSingle(page, s); if (o != null) { os.add(o); } } return os; } else { String select = extractor.getSelector().select(page.getHtml().toString()); Object o = processSingle(page, select); return o; } } } private Object processSingle(Page page, String html) { Object o = null; try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { if (fieldExtractor.isMulti()) { List<String> value; switch (fieldExtractor.getSource()) { case RawHtml: value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); break; case Html: value = fieldExtractor.getSelector().selectList(html); break; case Url: value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); break; default: value = fieldExtractor.getSelector().selectList(html); } if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { return null; } setField(o, fieldExtractor, value); } else { String value; switch (fieldExtractor.getSource()) { case RawHtml: value = fieldExtractor.getSelector().select(page.getHtml().toString()); break; case Html: value = fieldExtractor.getSelector().select(html); break; case Url: value = fieldExtractor.getSelector().select(page.getUrl().toString()); break; default: value = fieldExtractor.getSelector().select(html); } if (StringUtils.isEmpty(value) && fieldExtractor.isNotNull()) { return null; } setField(o, fieldExtractor, value); } } if (AfterExtractor.class.isAssignableFrom(clazz)) { ((AfterExtractor) o).afterProcess(page); } } catch (InstantiationException e) { e.printStackTrace(); } catch (IllegalAccessException e) { e.printStackTrace(); } catch (InvocationTargetException e) { e.printStackTrace(); } return o; } private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { if (fieldExtractor.getSetterMethod() != null) { fieldExtractor.getSetterMethod().invoke(o, value); } fieldExtractor.getField().set(o, value); } Class<?> getClazz() { return clazz; } List<Pattern> getTargetUrlPatterns() { return targetUrlPatterns; } List<Pattern> getHelpUrlPatterns() { return helpUrlPatterns; } Selector getTargetUrlRegionSelector() { return targetUrlRegionSelector; } Selector getHelpUrlRegionSelector() { return helpUrlRegionSelector; } }