/*
* Copyright 2007 T-Rank AS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package no.trank.openpipe.step;
import java.io.StringReader;
import java.util.*;
import java.util.regex.Pattern;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import javax.xml.transform.dom.DOMSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import no.trank.openpipe.api.BasePipelineStep;
import no.trank.openpipe.api.PipelineException;
import no.trank.openpipe.api.PipelineStepStatus;
import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.api.document.DomRawData;
import no.trank.openpipe.config.annotation.NotNull;
/**
 * Pipeline step that reads XML and copies its attribute values and character data into fields of
 * the document being processed.
 * <p>
 * The XML is taken from every value of the field named by {@link #setFieldName(String)}; when no
 * field name is configured, the document's raw data is used instead and is cast to
 * {@link DomRawData}.
 * <p>
 * Each attribute value is added to the field named after the attribute's local name, and each run
 * of character data is added to the field named after the local name of the innermost open
 * element. Names listed in {@link #setIgnoredTags(Set)} are skipped, names present as keys in
 * {@link #setTagToFieldName(Map)} are replaced by the mapped field name, and empty or
 * whitespace-only data is never added.
 *
 * @version $Revision$
 */
public class ParseXML extends BasePipelineStep {
    private static final Logger log = LoggerFactory.getLogger(ParseXML.class);
    // Matches the empty string as well as whitespace-only strings, so that empty attribute
    // values and empty character data are treated as blank too.
    private static final Pattern WS_PATTERN = Pattern.compile("\\s*");
    private XMLInputFactory factory;
    private String fieldName;
    @NotNull
    private Set<String> ignoredTags = Collections.emptySet();
    @NotNull
    private Map<String, String> tagToFieldName = Collections.emptyMap();
    private boolean failOnXMLError = true;

    /**
     * Parses the configured field when a field name is set, otherwise the document's raw data.
     *
     * @param doc the document to read XML from and add field values to
     * @return {@link PipelineStepStatus#DEFAULT}
     * @throws PipelineException if the XML cannot be parsed and {@code failOnXMLError} is set
     */
    @Override
    public PipelineStepStatus execute(Document doc) throws PipelineException {
        if (fieldName != null) {
            return execute_field(doc);
        } else {
            return execute_Dom(doc);
        }
    }

    /**
     * Parses every value of the configured field as XML and adds the extracted data to
     * {@code doc}.
     *
     * @param doc the document to read XML from and add field values to
     * @return {@link PipelineStepStatus#DEFAULT}
     * @throws PipelineException if a value cannot be parsed and {@code failOnXMLError} is set;
     *         otherwise the failure is logged as a warning and the next value is processed
     */
    public PipelineStepStatus execute_field(Document doc) throws PipelineException {
        final List<String> values = doc.getFieldValues(fieldName);
        if (values == null) {
            return PipelineStepStatus.DEFAULT;
        }
        // Iterate over a snapshot: parsing adds values to the document, possibly to this very
        // field, which must not disturb the iteration should the returned list be a live view.
        for (String text : new ArrayList<String>(values)) {
            if (text == null) {
                continue;
            }
            XMLEventReader reader = null;
            try {
                reader = factory.createXMLEventReader(new StringReader(text));
                parseXML(doc, reader);
            } catch (XMLStreamException e) {
                if (failOnXMLError) {
                    throw new PipelineException("Could not parse XML in field '" + fieldName + "'", e);
                } else {
                    log.warn("Could not parse XML in field '" + fieldName + "'", e);
                }
            } finally {
                closeQuietly(reader);
            }
        }
        return PipelineStepStatus.DEFAULT;
    }

    //TODO streamline this
    /**
     * Parses the DOM held by the document's raw data and adds the extracted data to {@code doc}.
     * The raw data is cast to {@link DomRawData} without a prior type check.
     *
     * @param doc the document whose raw data is parsed
     * @return {@link PipelineStepStatus#DEFAULT}
     * @throws PipelineException if the XML cannot be parsed and {@code failOnXMLError} is set
     */
    private PipelineStepStatus execute_Dom(Document doc) throws PipelineException {
        final DomRawData domRawData = (DomRawData) doc.getRawData();
        XMLEventReader reader = null;
        try {
            reader = factory.createXMLEventReader(new DOMSource(domRawData.getDom()));
            parseXML(doc, reader);
        } catch (XMLStreamException e) {
            if (failOnXMLError) {
                throw new PipelineException("Could not parse XML", e);
            } else {
                log.warn("Could not parse XML", e);
            }
        } finally {
            closeQuietly(reader);
        }
        return PipelineStepStatus.DEFAULT;
    }

    /**
     * Closes {@code reader}, logging instead of propagating a failure so that closing can neither
     * mask an exception from parsing nor be reported as a parse error.
     *
     * @param reader the reader to close, may be {@code null}
     */
    private static void closeQuietly(XMLEventReader reader) {
        if (reader != null) {
            try {
                reader.close();
            } catch (XMLStreamException e) {
                log.debug("Could not close XML reader", e);
            }
        }
    }

    /**
     * Walks the event stream and adds wanted attribute values and character data to {@code doc}.
     *
     * @param doc    the document receiving the field values
     * @param reader the event source; it is not closed by this method
     * @throws XMLStreamException if the reader fails to deliver the next event
     */
    private void parseXML(Document doc, XMLEventReader reader) throws XMLStreamException {
        // Local names of the currently open elements, innermost first.
        final Deque<String> stack = new ArrayDeque<String>();
        while (reader.hasNext()) {
            final XMLEvent event = reader.nextEvent();
            if (event.isStartElement()) {
                final StartElement elem = event.asStartElement();
                stack.push(elem.getName().getLocalPart());
                final Iterator<?> it = elem.getAttributes();
                while (it.hasNext()) {
                    final Attribute a = (Attribute) it.next();
                    final String tag = a.getName().getLocalPart();
                    final String data = a.getValue();
                    if (isWanted(tag, data)) {
                        doc.addFieldValue(findToFieldName(tag), data);
                    }
                }
            } else if (event.isEndElement()) {
                stack.pop();
            } else if (event.isCharacters()) {
                // peek() yields null when no element is open; isWanted() rejects that case.
                final String tag = stack.peek();
                final String data = event.asCharacters().getData();
                if (isWanted(tag, data)) {
                    doc.addFieldValue(findToFieldName(tag), data);
                }
            }
        }
    }

    /**
     * Tells whether {@code data} found under {@code tag} should be added to the document.
     *
     * @param tag  local name of the attribute or enclosing element, {@code null} if there is none
     * @param data the attribute value or character data
     * @return {@code true} if {@code tag} is non-null and not ignored and {@code data} is not blank
     */
    private boolean isWanted(String tag, String data) {
        return tag != null && !ignoredTags.contains(tag) && !isBlank(data);
    }

    /**
     * @param data the text to test
     * @return {@code true} if {@code data} is {@code null}, empty or consists of whitespace only
     */
    private static boolean isBlank(String data) {
        return data == null || WS_PATTERN.matcher(data).matches();
    }

    /**
     * @param tag local name of an attribute or element
     * @return the field name mapped to {@code tag}, or {@code tag} itself if there is no mapping
     */
    private String findToFieldName(String tag) {
        final String mapped = tagToFieldName.get(tag);
        return mapped == null ? tag : mapped;
    }

    /**
     * Creates a default {@link XMLInputFactory} unless one has been set, and enables coalescing on
     * the factory when it supports that property; a warning is logged when it does not.
     *
     * @throws PipelineException if the superclass preparation fails
     */
    @Override
    public void prepare() throws PipelineException {
        super.prepare();
        if (factory == null) {
            // NOTE(review): the factory is used with its default DTD / external entity settings.
            // If the parsed XML can come from untrusted sources, consider supplying a factory
            // with XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES disabled via setFactory().
            factory = XMLInputFactory.newInstance();
        }
        if (factory.isPropertySupported(XMLInputFactory.IS_COALESCING)) {
            factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        } else {
            log.warn("XMLInputFactory: {} does not support coalescing", factory.getClass().getName());
        }
    }

    @Override
    public String getRevision() {
        return "$Revision$";
    }

    /**
     * @return the name of the field whose values are parsed as XML, or {@code null} if the
     *         document's raw data is parsed instead
     */
    public String getFieldName() {
        return fieldName;
    }

    /**
     * @param fieldName the name of the field whose values are parsed as XML; {@code null} makes
     *                  the step parse the document's raw data instead
     */
    public void setFieldName(String fieldName) {
        this.fieldName = fieldName;
    }

    /**
     * @return the local names of elements and attributes whose data is not added to the document
     */
    public Set<String> getIgnoredTags() {
        return ignoredTags;
    }

    /**
     * @param ignoredTags the local names of elements and attributes whose data is to be skipped
     */
    public void setIgnoredTags(Set<String> ignoredTags) {
        this.ignoredTags = ignoredTags;
    }

    /**
     * @return the mapping from element/attribute local name to the field name used for its data
     */
    public Map<String, String> getTagToFieldName() {
        return tagToFieldName;
    }

    /**
     * @param tagToFieldName mapping from element/attribute local name to the field name to use;
     *                       names without a mapping are used as field names unchanged
     */
    public void setTagToFieldName(Map<String, String> tagToFieldName) {
        this.tagToFieldName = tagToFieldName;
    }

    /**
     * @return {@code true} if an XML parse error fails the step, {@code false} if it is only
     *         logged as a warning
     */
    public boolean isFailOnXMLError() {
        return failOnXMLError;
    }

    /**
     * @param failOnXMLError {@code true} to throw a {@link PipelineException} on XML parse errors,
     *                       {@code false} to log a warning and carry on
     */
    public void setFailOnXMLError(boolean failOnXMLError) {
        this.failOnXMLError = failOnXMLError;
    }

    /**
     * @return the factory used to create XML event readers, or {@code null} if none has been set
     *         and {@link #prepare()} has not been called yet
     */
    public XMLInputFactory getFactory() {
        return factory;
    }

    /**
     * @param factory the factory to create XML event readers with; if left unset,
     *                {@link #prepare()} creates a default one
     */
    public void setFactory(XMLInputFactory factory) {
        this.factory = factory;
    }
}