package com.brucezee.jspider.parser;
import com.alibaba.fastjson.util.TypeUtils;
import com.brucezee.jspider.parser.define.FieldDefine;
import com.brucezee.jspider.parser.define.FieldTypeEnum;
import com.brucezee.jspider.parser.expression.RegexExpression;
import com.brucezee.jspider.parser.expression.SegmentExpression;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import org.dom4j.tree.DefaultAttribute;
import org.dom4j.tree.DefaultElement;
import org.dom4j.tree.DefaultText;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* 基于xpath解析xml类型数据的解析器
* Created by zhoubing on 2016/11/28.
*/
public class XmlParser extends Parser {
public void parseData(String xml, FieldDefine fieldDefine, Map result) {
parseXml(parseDocument(xml), fieldDefine, result);
}
public void parseXml(Document document, FieldDefine fieldDefine, Map result) {
parseXml(document, document, fieldDefine, result);
}
public Document parseDocument(String xml) {
Document document = null;
try {
document = DocumentHelper.parseText(xml);
} catch (DocumentException e) {
throw new IllegalArgumentException(e);
}
return document;
}
private void parseXml(Document document, Node node, FieldDefine fieldDefine, Map result) {
FieldDefine[] defines = fieldDefine.getDefines();
if (defines == null || defines.length == 0) {
return;
}
for (FieldDefine define : defines) {
String type = define.getType();
String selector = define.getSelector();
if (StringUtils.isBlank(selector)) {
if (FieldTypeEnum.Object.isEqual(type) || FieldTypeEnum.Map.isEqual(type)) {
Map child = result.containsKey(define.getName()) ? (Map) result.get(define.getName()) : new HashMap();
parseXml(document, document, define, child);
result.put(define.getName(), child);
continue;
}
if (define.getProcessor() != null) {
Object value = define.getProcessor().process(document, document.asXML());
result.put(define.getName(), TypeUtils.cast(value, typeToClass(define.getType()), null));
continue;
}
throw new IllegalArgumentException("unhandled xml parser define type ["+type+"] when selector is empty.");
}
if (RegexExpression.isRegexExpression(selector)) {
List<String> list = RegexExpression.matcher(document.asXML(), node.asXML(), selector);
result.put(define.getName(), getValue(list, document, define));
continue;
}
if (SegmentExpression.isSegmentExpression(selector)) {
String value = SegmentExpression.matcher(document.asXML(), node.asXML(), selector);
result.put(define.getName(), castValue(define, document, value));
continue;
}
List nodes = null;
if (isRelativeSelector(selector)) {
nodes = node.selectNodes(selector);
} else if (isAbsoluteSelector(selector)) {
nodes = document.selectNodes(selector);
} else {
throw new IllegalArgumentException("unsupported xml selector ["+selector+"].");
}
if (nodes == null || nodes.isEmpty()) {
continue;
}
if (FieldTypeEnum.Object.isEqual(type) || FieldTypeEnum.Map.isEqual(type)) {
Map child = result.containsKey(define.getName()) ? (Map) result.get(define.getName()) : new HashMap();
parseXml(document, (Node) nodes.get(0), define, child);
result.put(define.getName(), child);
continue;
}
if (FieldTypeEnum.Array.isEqual(type) || FieldTypeEnum.List.isEqual(type)) {
int size = nodes.size();
Object[] array = result.containsKey(define.getName()) ? (Object[]) result.get(define.getName()) : new Object[size];
int min = Math.min(size, array.length);
for (int i = 0; i < min; i++) {
Map item = array[i] != null ? (Map) array[i] : new HashMap();
parseXml(document, (Node) nodes.get(i), define.firstDefine(), item);
array[i] = item;
}
result.put(define.getName(), array);
continue;
}
result.put(define.getName(), getXmlValue(document, nodes, define));
}
}
private Object getXmlValue(Document document, List nodes, FieldDefine define) {
return castValue(define, document, getXmlValueText(nodes));
}
private Object getXmlValueText(List nodes) {
if (nodes == null || nodes.isEmpty()) {
return null;
}
Object node = nodes.get(0);
if (node instanceof DefaultText) {
return StringUtils.trim(((DefaultText) node).getText());
}
if (node instanceof DefaultAttribute) {
return StringUtils.trim(((DefaultAttribute) node).getText());
}
if (node instanceof DefaultElement) {
return StringUtils.trim(((DefaultElement) node).getText());
}
throw new IllegalArgumentException("unsupported node type ["+node+"].");
}
private boolean isAbsoluteSelector(String selector) {
return selector != null && !isRelativeSelector(selector) && selector.startsWith("/");
}
private boolean isRelativeSelector(String selector) {
return selector != null && selector.startsWith(".");
}
}