/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.xml;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import javax.xml.stream.XMLStreamException;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.CasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.codehaus.stax2.XMLStreamReader2;
import com.ctc.wstx.stax.WstxInputFactory;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field;
/**
* Reader for XML files.
*/
@MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.TEXT_XML})
@TypeCapability(
outputs={
"de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field",
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"})
public class XmlReader extends CasCollectionReader_ImplBase {
/**
* Location from which the input is read.
*/
public static final String PARAM_SOURCE_LOCATION = ComponentParameters.PARAM_SOURCE_LOCATION;
@ConfigurationParameter(name=PARAM_SOURCE_LOCATION, mandatory=true)
private String inputDirectory;
/**
* Set this as the language of the produced documents.
*/
public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
@ConfigurationParameter(name=PARAM_LANGUAGE, mandatory=false)
private String language;
/**
* optional, tags those should be worked on (if empty, then all tags
* except those ExcludeTags will be worked on)
*/
public static final String PARAM_INCLUDE_TAG = "IncludeTag";
@ConfigurationParameter(name=PARAM_INCLUDE_TAG, mandatory=true, defaultValue={})
private Set<String> includeTags;
/**
* optional, tags those should not be worked on. Out them should no
* text be extracted and also no Annotations be produced.
*/
public static final String PARAM_EXCLUDE_TAG = "ExcludeTag";
@ConfigurationParameter(name=PARAM_EXCLUDE_TAG, mandatory=true, defaultValue={})
private Set<String> excludeTags;
/**
* tag which contains the docId
*/
public static final String PARAM_DOC_ID_TAG = "DocIdTag";
@ConfigurationParameter(name=PARAM_DOC_ID_TAG, mandatory=false)
private String docIdTag;
/**
* The collection ID to set in the {@link DocumentMetaData}.
*/
public static final String PARAM_COLLECTION_ID = "collectionId";
@ConfigurationParameter(name=PARAM_COLLECTION_ID, mandatory=false)
private String collectionId;
private static final String MESSAGE_DIGEST = "de.tudarmstadt.ukp.dkpro.core.io.xml.XmlReader_Messages";
private static final String INVALID_PATH_EXCEPTION = "invalid_path_error";
private static final String EMPTY_DIRECTORY_EXCEPTION = "empty_directory_error";
private static final String MISSING_DOC_ID_EXCEPTION = "missing_doc_id_error";
private static final String EMPTY_DOC_ID_EXCEPTION = "empty_doc_id_error";
private static final String MULTIPLE_DOC_ID_EXCEPTION = "multiple_doc_id_error";
private static final String SUBSTITUTE_EXCEPTION = "substitute_error";
// mandatory, list of xml files to be readed in
private final ArrayList<File> xmlFiles = new ArrayList<File>();
// Xml stream reader
private XMLStreamReader2 xmlReader;
// current be parsed file index
private int currentParsedFile;
private int iDoc;
private boolean useSubstitution;
private Map<String,String> substitution;
private String docIdElementLocalName;
private String docIdAttributeName;
@Override
public void initialize(UimaContext aContext)
throws ResourceInitializationException
{
super.initialize(aContext);
// mandatory, directory where that those be parsed XML files are
File inDir = new File(inputDirectory);
// get all xml files from the input directory (ignore the
// subdirectories)
if (inDir.isDirectory()) {
File[] files = inDir.listFiles();
for (File file : files) {
if (file.isFile() && (file.toString().endsWith(".xml") || file.toString().endsWith(".sgml"))) {
xmlFiles.add(file);
}
}
Collections.sort(xmlFiles);
}
else {
throw new ResourceInitializationException(
MESSAGE_DIGEST,
INVALID_PATH_EXCEPTION,
new Object[] {inDir});
}
// if xmlFiles is not empty, then initialize the Stax Reader
if (xmlFiles.isEmpty()) {
throw new ResourceInitializationException(
MESSAGE_DIGEST,
EMPTY_DIRECTORY_EXCEPTION,
new Object[] {inDir});
}
currentParsedFile = 0;
if (docIdTag != null && docIdTag.contains("/@")) {
int split = docIdTag.indexOf("/@");
docIdElementLocalName = docIdTag.substring(0, split);
docIdAttributeName = docIdTag.substring(split+2);
}
else {
docIdElementLocalName = docIdTag;
}
}
@Override
public void getNext(CAS aCAS)
throws IOException, CollectionException
{
JCas jcas;
try {
jcas = aCAS.getJCas();
}
catch (CASException e) {
throw new CollectionException(e);
}
// parse the xml file
try {
// if the last file is already done, then work on the next file
if (xmlReader == null) {
WstxInputFactory factory = new WstxInputFactory();
xmlReader = factory.createXMLStreamReader(xmlFiles
.get(currentParsedFile));
iDoc = 0;
}
// ignore the root element of the file
// parse the second layer element, suppose they are all documents
// read in all elements under second layer
parseSubDocument(jcas);
iDoc++;
if (xmlReader.getDepth() < 2) {
xmlReader.closeCompletely();
xmlReader = null;
currentParsedFile++;
}
} catch (XMLStreamException e) {
e.printStackTrace();
throw new CollectionException(e);
} catch (Exception e) {
e.printStackTrace();
throw new CollectionException(e);
}
}
@Override
public Progress[] getProgress()
{
return new Progress[] { new ProgressImpl(currentParsedFile, xmlFiles
.size(), Progress.ENTITIES) };
}
@Override
public boolean hasNext()
throws IOException, CollectionException
{
if (xmlReader != null) {
// There is still more to parse in the current file
return true;
}
if (currentParsedFile >= 0 && currentParsedFile < xmlFiles.size()) {
// There are additional files to parse
return true;
}
else {
// There is nothing more
return false;
}
}
@Override
public void close()
throws IOException
{
// Nothing to do
}
private void parseSubDocument(JCas jcas)
throws XMLStreamException, IOException, CollectionException
{
// set the jcas document language if the parameter exists
if (language != null) {
jcas.setDocumentLanguage(language);
}
LinkedList<String> openTagStack = new LinkedList<String>();
// get document tag
String docTag = seekSubDocumentRoot();
StringBuilder documentText = new StringBuilder();
String docId = null;
while (xmlReader.hasNext() && xmlReader.getDepth() > 1) {
if (xmlReader.isStartElement()) {
String tagName = xmlReader.getName().getLocalPart();
openTagStack.push(tagName);
// If the docId is an attribute, try to fetch it now
String id = null;
if (isDocIdElement(tagName) && docIdAttributeName != null) {
id = xmlReader.getAttributeValue(null, docIdAttributeName);
}
xmlReader.next();
String elementText = collectText();
if (elementText.length() > 0) {
// If the docId is an element value, we may capture it now
if (isDocIdElement(tagName) && docIdAttributeName == null) {
id = elementText;
}
// Process the current span of text
processText(jcas, tagName, elementText, documentText);
}
// If a docId has been captured, check if it valid and unique
if (id != null) {
if (docId != null) {
throw new CollectionException(
MULTIPLE_DOC_ID_EXCEPTION,
new Object[] { docIdTag });
}
if (id.length() == 0) {
throw new CollectionException(EMPTY_DOC_ID_EXCEPTION,
new Object[] { docIdTag });
}
docId = id;
}
}
else if(xmlReader.isCharacters()) {
String tagName = openTagStack.peek();
String elementText = collectText();
if(elementText.length()==0) {
continue;
}
// Process the current span of text
processText(jcas, tagName, elementText, documentText);
}
else if (xmlReader.isEndElement()) {
String tagName = xmlReader.getName().getLocalPart();
// if it is end of document then stop processing
if (docTag.equals(tagName)) {
xmlReader.nextTag();
break;
}
openTagStack.poll();
xmlReader.next();
}
}
jcas.setDocumentText(documentText.toString());
// Add Document MetaData
String fileName = xmlFiles.get(currentParsedFile).getName();
// String fileExtension = "";
int dotPlace = fileName.lastIndexOf ( '.' );
if(docIdTag!=null) {
if(docId==null) {
throw new CollectionException(
MESSAGE_DIGEST,
MISSING_DOC_ID_EXCEPTION,
new Object[] {docIdTag});
}
} else {
if ( dotPlace >= 0 ) {
// fileExtension = fileName.substring( dotPlace + 1 );
docId = fileName.substring(0, dotPlace)+"-"+iDoc;
}
}
String docUri = xmlFiles.get(currentParsedFile).toURI().toString();
DocumentMetaData docMetaData = DocumentMetaData.create(jcas);
docMetaData.setDocumentId(docId);
docMetaData.setDocumentUri(docUri+"#"+docId);
docMetaData.setCollectionId(collectionId);
// System.out.println("Fetched document: "+docUri+"#"+docId);
}
/**
* Create a field annotation for the given element name at the given location.
* If substitutions are used, the field is created using the substituted name.
*
* @param jcas the JCas.
* @param localName the local name of the current XML element.
* @param begin the start offset.
* @param end the end offset.
*/
private void createFieldAnnotation(JCas jcas, String localName, int begin, int end)
{
String fieldName = null;
if (useSubstitution) {
fieldName = substitution.get(localName);
if (fieldName == null) {
fieldName = localName;
}
}
else {
fieldName = localName;
}
Field field = new Field(jcas, begin, end);
field.setName(fieldName);
field.addToIndexes();
}
private boolean isIncluded(final String tagName)
{
boolean needToBeParsed = (includeTags.size() == 0) || includeTags.contains(tagName);
if (excludeTags.size() > 0 && excludeTags.contains(tagName)) {
needToBeParsed = false;
}
return needToBeParsed;
}
/**
* Process the text found within the given element. If text from the given
* element should be included in the document, then it is added and a proper
* {@link Field} annotation is created.
*
* @param jcas the JCas.
* @param localName the element in which the text was found
* @param elementText the text
* @param documentText the document text buffer
*/
private void processText(JCas jcas, String localName, String elementText,
StringBuilder documentText)
{
if (isIncluded(localName)) {
int begin = documentText.length();
documentText = documentText.append(elementText);
documentText = documentText.append("\n\n");
int end = documentText.length()-1;
createFieldAnnotation(jcas, localName, begin, end);
}
}
/**
* Collect all consecutive text starting at the current point.
*
* @return the concatenated consecutive text.
*/
private String collectText() throws XMLStreamException
{
StringBuilder elementText = new StringBuilder();
while(xmlReader.isCharacters()) {
elementText.append(xmlReader.getText().replaceAll("\r", "").trim());
xmlReader.next();
}
return elementText.toString();
}
/**
* Seek to the root element of the next sub-document and return its local name.
*
* @return the local name of the sub-document root element.
*/
private String seekSubDocumentRoot()
throws XMLStreamException, IOException
{
// if this is not the first document in the file then the current
// element is the docTag
String docTag = null;
if (xmlReader.isStartElement() && xmlReader.getDepth() > 1) {
docTag = xmlReader.getName().getLocalPart();
}
else {
while (xmlReader.hasNext() && xmlReader.getDepth() < 2) {
xmlReader.next();
}
while (xmlReader.hasNext() && !xmlReader.isStartElement()) {
xmlReader.next();
}
if (xmlReader.getDepth() == 2 && xmlReader.isStartElement()) {
docTag = xmlReader.getName().getLocalPart();
}
else {
throw new IOException("file is empty: "
+ xmlFiles.get(currentParsedFile));
}
}
return docTag;
}
private boolean isDocIdElement(String localName)
{
return docIdElementLocalName != null && docIdElementLocalName.equals(localName);
}
}