package com.brucezee.jspider.parser;
import com.alibaba.fastjson.util.TypeUtils;
import com.brucezee.jspider.parser.define.FieldDefine;
import com.brucezee.jspider.parser.define.FieldTypeEnum;
import com.brucezee.jspider.parser.expression.JsoupExpression;
import com.brucezee.jspider.parser.expression.RegexExpression;
import com.brucezee.jspider.parser.expression.SegmentExpression;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Parser for HTML content that fills a result map according to a tree of
 * {@link FieldDefine} definitions. Each definition's selector is evaluated as a
 * regular expression, a segment expression, or a jsoup CSS query.
 * Created by zhoubing on 2016/11/29.
 */
public class JsoupParser extends Parser {
    /**
     * Parses the given HTML text with jsoup and extracts the fields described by
     * {@code fieldDefine} into {@code result}.
     *
     * @param html        raw HTML text to parse
     * @param fieldDefine root definition whose child defines are evaluated
     * @param result      map that receives the extracted values, keyed by define name
     */
    public void parseData(String html, FieldDefine fieldDefine, Map result) {
        parseHtml(Jsoup.parse(html), fieldDefine, result);
    }

    /**
     * Extracts the fields described by {@code fieldDefine} from an already parsed
     * document. The document itself is used as the starting element for relative
     * selectors.
     *
     * @param document    parsed HTML document
     * @param fieldDefine root definition whose child defines are evaluated
     * @param result      map that receives the extracted values, keyed by define name
     */
    public void parseHtml(Document document, FieldDefine fieldDefine, Map result) {
        parseHtml(document, document, fieldDefine, result);
    }

    /**
     * Evaluates every child define of {@code fieldDefine} and stores the outcome in
     * {@code result} under the define's name.
     *
     * <p>Relative CSS selectors are resolved against {@code element}; all other CSS
     * selectors are resolved against the whole {@code document}. A CSS selector that
     * matches nothing leaves {@code result} untouched for that define.
     *
     * @param document    the whole parsed document
     * @param element     the element that relative selectors are resolved against
     * @param fieldDefine definition whose child defines are evaluated
     * @param result      map that receives the extracted values
     * @throws IllegalArgumentException if a define has a blank selector, is not an
     *                                  Object/Map type and has no processor
     */
    // The raw Map type is kept so the signatures stay identical to the existing public API.
    @SuppressWarnings({"unchecked", "rawtypes"})
    private void parseHtml(Document document, Element element, FieldDefine fieldDefine, Map result) {
        FieldDefine[] defines = fieldDefine.getDefines();
        if (defines == null || defines.length == 0) {
            return;
        }

        // One expression object is reused for every CSS selector handled by this call.
        JsoupExpression jsoupExpression = new JsoupExpression();
        for (FieldDefine define : defines) {
            String type = define.getType();
            String selector = define.getSelector();

            if (StringUtils.isBlank(selector)) {
                if (isObjectType(type)) {
                    Map child = existingOrNewMap(result, define.getName());
                    // Keep the current element as context so relative selectors of the
                    // nested defines still resolve against it instead of the whole document.
                    parseHtml(document, element, define, child);
                    result.put(define.getName(), child);
                    continue;
                }
                if (define.getProcessor() != null) {
                    Object value = define.getProcessor().process(document, document.html());
                    result.put(define.getName(), TypeUtils.cast(value, typeToClass(define.getType()), null));
                    continue;
                }
                throw new IllegalArgumentException("unhandled html parser define type ["+type+"] when selector is empty.");
            }

            if (RegexExpression.isRegexExpression(selector)) {
                List<String> list = RegexExpression.matcher(document.html(), element.html(), selector);
                result.put(define.getName(), getValue(list, document, define));
                continue;
            }

            if (SegmentExpression.isSegmentExpression(selector)) {
                String value = SegmentExpression.matcher(document.html(), element.html(), selector);
                result.put(define.getName(), castValue(define, document, value));
                continue;
            }

            jsoupExpression.parse(selector);
            Elements elements = jsoupExpression.isRelative()
                    ? element.select(jsoupExpression.getCssQuery())
                    : document.select(jsoupExpression.getCssQuery());
            if (elements == null || elements.isEmpty()) {
                continue;
            }

            if (isObjectType(type)) {
                Map child = existingOrNewMap(result, define.getName());
                parseHtml(document, elements.first(), define, child);
                result.put(define.getName(), child);
                continue;
            }

            if (FieldTypeEnum.Array.isEqual(type) || FieldTypeEnum.List.isEqual(type)) {
                // The first `skip` matched elements are not parsed, so only the
                // remaining ones need a slot in the result array.
                int skip = Math.max(0, jsoupExpression.getSkip());
                int available = Math.max(0, elements.size() - skip);
                Object existing = result.get(define.getName());
                Object[] array = existing != null ? (Object[]) existing : new Object[available];
                // An array already stored under this name is reused; never write past its end.
                int count = Math.min(available, array.length);
                for (int j = 0; j < count; j++) {
                    Map item = array[j] != null ? (Map) array[j] : new HashMap();
                    parseHtml(document, elements.get(skip + j), define.firstDefine(), item);
                    array[j] = item;
                }
                result.put(define.getName(), array);
                continue;
            }

            result.put(define.getName(), getElementValue(document, elements, jsoupExpression, define));
        }
    }

    /**
     * Tells whether {@code type} denotes a nested object (Object or Map).
     */
    private static boolean isObjectType(String type) {
        return FieldTypeEnum.Object.isEqual(type) || FieldTypeEnum.Map.isEqual(type);
    }

    /**
     * Returns the map already stored in {@code result} under {@code name}, or a new
     * empty map when nothing (or {@code null}) is stored there.
     */
    @SuppressWarnings("rawtypes")
    private static Map existingOrNewMap(Map result, Object name) {
        Object existing = result.get(name);
        return existing != null ? (Map) existing : new HashMap();
    }

    /**
     * Reads the text value of the first matched element and converts it with
     * {@code castValue} for the given define.
     */
    private Object getElementValue(Document document, Elements elements, JsoupExpression jsoupExpression, FieldDefine define) {
        return castValue(define, document, getValueText(elements, jsoupExpression));
    }

    /**
     * Extracts a trimmed string from the first element of {@code elements}, using the
     * accessor selected by {@code jsoupExpression} (text, val, attr, outerHtml,
     * ownText or html). Falls back to the element text when no accessor matches.
     *
     * @return the trimmed value, or {@code null} when {@code elements} is null or empty
     */
    private Object getValueText(Elements elements, JsoupExpression jsoupExpression) {
        if (elements == null || elements.isEmpty()) {
            return null;
        }

        Element element = elements.get(0);
        String raw;
        if (jsoupExpression.isTextMethod()) {
            raw = element.text();
        } else if (jsoupExpression.isValMethod()) {
            raw = element.val();
        } else if (jsoupExpression.isAttrMethod()) {
            raw = element.attr(jsoupExpression.getParameter());
        } else if (jsoupExpression.isOuterHtmlMethod()) {
            raw = element.outerHtml();
        } else if (jsoupExpression.isOwnTextMethod()) {
            raw = element.ownText();
        } else if (jsoupExpression.isHtmlMethod()) {
            raw = element.html();
        } else {
            raw = element.text();
        }
        return StringUtils.trim(raw);
    }
}