/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.tei; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_FUNCTION; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_LEMMA; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_POS; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.ATTR_TYPE; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_CHARACTER; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_MULTIWORD; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_PARAGRAPH; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_PHRASE; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_RS; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_SUNIT; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_TEI_DOC; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_TEXT; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_TITLE; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_U; import static de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants.TAG_WORD; import static java.util.Arrays.asList; 
import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.commons.lang.StringUtils.isNotBlank; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Stack; import java.util.zip.GZIPInputStream; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.cas.Type; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.FSCollectionFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Logger; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; import org.dom4j.io.SAXWriter; import org.jaxen.JaxenException; import org.jaxen.XPath; import org.jaxen.dom4j.Dom4jXPath; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import 
de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT;

/**
 * Reader for the TEI XML.
 *
 * <p>Every {@code tei:TEI} element found in an input resource is emitted as a separate CAS.
 * Resources whose path ends in {@code .gz} are transparently decompressed.</p>
 */
@MimeTypeCapability({MimeTypes.APPLICATION_TEI_XML})
@TypeCapability(
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
            "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
            "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent",
            "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"})
public class TeiReader extends ResourceCollectionReaderBase {
    /**
     * Write token annotations to the CAS.
     */
    public static final String PARAM_READ_TOKEN = ComponentParameters.PARAM_READ_TOKEN;
    @ConfigurationParameter(name = PARAM_READ_TOKEN, mandatory = true, defaultValue = "true")
    private boolean readToken;

    /**
     * Write part-of-speech annotations to the CAS.
     */
    public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS;
    @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true")
    private boolean readPOS;

    /**
     * Write lemma annotations to the CAS.
     */
    public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA;
    @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true")
    private boolean readLemma;

    /**
     * Write sentence annotations to the CAS.
     */
    public static final String PARAM_READ_SENTENCE = ComponentParameters.PARAM_READ_SENTENCE;
    @ConfigurationParameter(name = PARAM_READ_SENTENCE, mandatory = true, defaultValue = "true")
    private boolean readSentence;

    /**
     * Write constituent annotations to the CAS.
     */
    public static final String PARAM_READ_CONSTITUENT = ComponentParameters.PARAM_READ_CONSTITUENT;
    @ConfigurationParameter(name = PARAM_READ_CONSTITUENT, mandatory = true, defaultValue = "true")
    private boolean readConstituent;

    /**
     * Write named entity annotations to the CAS.
     */
    public static final String PARAM_READ_NAMED_ENTITY = ComponentParameters.PARAM_READ_NAMED_ENTITY;
    @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true")
    private boolean readNamedEntity;

    /**
     * Write paragraph annotations to the CAS.
     */
    public static final String PARAM_READ_PARAGRAPH = "readParagraph";
    @ConfigurationParameter(name = PARAM_READ_PARAGRAPH, mandatory = true, defaultValue = "true")
    private boolean readParagraph;

    /**
     * Use the xml:id attribute on the TEI elements as document ID. Mind that many TEI files
     * may not have this attribute on all TEI elements and you may end up with no document ID
     * at all. Also mind that the IDs should be unique.
     */
    public static final String PARAM_USE_XML_ID = "useXmlId";
    @ConfigurationParameter(name = PARAM_USE_XML_ID, mandatory = true, defaultValue = "false")
    private boolean useXmlId;

    /**
     * When not using the XML ID, use only the filename instead of the whole URL as ID. Mind that
     * the filenames should be unique in this case.
     */
    public static final String PARAM_USE_FILENAME_ID = "useFilenameId";
    @ConfigurationParameter(name = PARAM_USE_FILENAME_ID, mandatory = true, defaultValue = "false")
    private boolean useFilenameId;

    /**
     * Do not write <em>ignorable whitespace</em> from the XML file to the CAS.
     */
    // REC: This does not seem to work. Maybe because SAXWriter does not generate this event?
    public static final String PARAM_OMIT_IGNORABLE_WHITESPACE = "omitIgnorableWhitespace";
    @ConfigurationParameter(name = PARAM_OMIT_IGNORABLE_WHITESPACE, mandatory = true, defaultValue = "false")
    private boolean omitIgnorableWhitespace;

    /**
     * Location of the mapping file for part-of-speech tags to UIMA types.
     */
    public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String mappingPosLocation;

    /**
     * Use this part-of-speech tag set to resolve the tag set mapping instead of using the
     * tag set defined as part of the model meta data. This can be useful if a custom model is
     * specified which does not have such meta data, or it can be used in readers.
     */
    public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET;
    @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false)
    protected String posTagset;

    /**
     * Interpret utterances "u" as sentences "s". (EXPERIMENTAL)
     */
    public static final String PARAM_UTTERANCES_AS_SENTENCES = "utterancesAsSentences";
    @ConfigurationParameter(name = PARAM_UTTERANCES_AS_SENTENCES, mandatory = true, defaultValue = "false")
    private boolean utterancesAsSentences;

    /**
     * Iterator over the TEI elements of the resource currently being read; {@code null} once
     * all resources and all of their TEI elements have been consumed.
     */
    private Iterator<Element> teiElementIterator;

    /** The TEI element that the next call to {@link #getNext(CAS)} will convert. */
    private Element currentTeiElement;

    /** The resource from which {@link #currentTeiElement} was read. */
    private Resource currentResource;

    /** 1-based position of {@link #currentTeiElement} within {@link #currentResource}. */
    private int currentTeiElementNumber;

    private MappingProvider posMappingProvider;

    /**
     * Validates the parameter combination, pre-loads the first TEI element and sets up the
     * part-of-speech mapping provider.
     *
     * @param aContext
     *            the UIMA context.
     * @throws ResourceInitializationException
     *             if POS reading is enabled while token reading is disabled, or if the first
     *             resource cannot be read.
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);

        if (readPOS && !readToken) {
            throw new ResourceInitializationException(new IllegalArgumentException(
                    "Setting readPOS to 'true' requires readToken to be 'true' too."));
        }

        try {
            // Init with an empty iterator
            teiElementIterator = asList(new Element[0]).iterator();

            // Make sure we know about the first element
            nextTeiElement();
        }
        catch (CollectionException | IOException e) {
            throw new ResourceInitializationException(e);
        }

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation,
                posTagset, getLanguage());
    }

    /**
     * Advances {@link #currentTeiElement} to the next TEI element, opening further resources
     * as needed. When nothing is left, {@link #currentTeiElement} becomes {@code null} and
     * {@link #teiElementIterator} is set to {@code null} to mark the end of processing.
     *
     * @throws CollectionException
     *             if the next resource cannot be obtained.
     * @throws IOException
     *             if a resource cannot be read or parsed.
     */
    private void nextTeiElement() throws CollectionException, IOException {
        if (teiElementIterator == null) {
            currentTeiElement = null;
            return;
        }

        // Skip over resources that do not contain any TEI element at all.
        while (!teiElementIterator.hasNext() && super.hasNext()) {
            currentResource = nextFile();

            InputStream is = null;
            try {
                is = currentResource.getInputStream();

                if (currentResource.getPath().endsWith(".gz")) {
                    is = new GZIPInputStream(is);
                }

                InputSource source = new InputSource(is);
                source.setPublicId(currentResource.getLocation());
                source.setSystemId(currentResource.getLocation());

                // NOTE(review): the reader is used with its default parser configuration. If
                // inputs may come from untrusted sources, consider disabling DTD/external
                // entity processing here.
                SAXReader reader = new SAXReader();
                Document xml = reader.read(source);

                final XPath teiPath = new Dom4jXPath("//tei:TEI");
                teiPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");

                // The XPath only selects elements, so the element-typed view is safe.
                @SuppressWarnings("unchecked")
                List<Element> teiElements = teiPath.selectNodes(xml);

                teiElementIterator = teiElements.iterator();
                currentTeiElementNumber = 0;
            }
            catch (DocumentException | JaxenException e) {
                throw new IOException(e);
            }
            finally {
                closeQuietly(is);
            }
        }

        currentTeiElement = teiElementIterator.hasNext() ? teiElementIterator.next() : null;
        currentTeiElementNumber++;

        if (!super.hasNext() && !teiElementIterator.hasNext()) {
            // Mark end of processing.
            teiElementIterator = null;
        }
    }

    /**
     * @return {@code true} while there is a pending TEI element or more input to look at.
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return teiElementIterator != null || currentTeiElement != null;
    }

    /**
     * Converts the current TEI element into the given CAS and advances to the next element.
     *
     * @param aCAS
     *            the CAS to fill.
     * @throws IOException
     *             if the POS mapping cannot be configured or SAX processing fails.
     * @throws CollectionException
     *             if the JCas cannot be obtained from the CAS.
     */
    @Override
    public void getNext(CAS aCAS) throws IOException, CollectionException {
        initCas(aCAS, currentResource);

        // Set up language
        if (getConfigParameterValue(PARAM_LANGUAGE) != null) {
            aCAS.setDocumentLanguage((String) getConfigParameterValue(PARAM_LANGUAGE));
        }

        // Configure mapping only now, because now the language is set in the CAS
        try {
            posMappingProvider.configure(aCAS);
        }
        catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }

        try {
            JCas jcas = aCAS.getJCas();

            // Create handler
            Handler handler = newSaxHandler();
            handler.setJCas(jcas);
            handler.setLogger(getLogger());

            // Replay the already parsed TEI element as SAX events into the handler.
            SAXWriter writer = new SAXWriter(handler);
            writer.write(currentTeiElement);
            // Invoked explicitly so that the handler commits the collected document text.
            handler.endDocument();
        }
        catch (CASException e) {
            throw new CollectionException(e);
        }
        catch (SAXException e) {
            throw new IOException(e);
        }

        // Move currentTeiElement to the next text
        nextTeiElement();
    }

    /**
     * Creates the SAX handler that converts TEI events into annotations. Subclasses may
     * override this to supply a custom handler.
     *
     * @return a new handler instance.
     */
    protected Handler newSaxHandler() {
        return new TeiHandler();
    }

    /**
     * Base class for SAX handlers used by this reader; carries the target JCas and a logger.
     */
    protected abstract static class Handler extends DefaultHandler {
        private JCas jcas;
        private Logger logger;

        public void setJCas(final JCas aJCas) {
            jcas = aJCas;
        }

        protected JCas getJCas() {
            return jcas;
        }

        public void setLogger(Logger aLogger) {
            logger = aLogger;
        }

        public Logger getLogger() {
            return logger;
        }
    }

    /**
     * SAX handler that accumulates the text content of a TEI element in a buffer and creates
     * annotations using the buffer length at element start/end as offsets.
     */
    public class TeiHandler extends Handler {
        private String documentId = null;
        private boolean titleSet = false;
        private boolean inTextElement = false;
        private boolean captureText = false;
        private int paragraphStart = -1;
        private int sentenceStart = -1;
        private int tokenStart = -1;
        private String posTag = null;
        private String lemma = null;
        private final Stack<ConstituentWrapper> constituents = new Stack<>();
        private final Stack<NamedEntity> namedEntities = new Stack<>();

        private final StringBuilder buffer = new StringBuilder();

        /**
         * Stores the text collected so far as the document text of the JCas.
         */
        @Override
        public void endDocument() throws SAXException {
            getJCas().setDocumentText(buffer.toString());
        }

        protected StringBuilder getBuffer() {
            return buffer;
        }

        /**
         * Records the start offset (and attributes) of the element being opened, depending on
         * its tag and on whether we are currently inside the text element.
         */
        @Override
        public void startElement(String aUri, String aLocalName, String aName,
                Attributes aAttributes) throws SAXException {
            if (!inTextElement && TAG_TEI_DOC.equals(aName)) {
                if (useXmlId) {
                    documentId = aAttributes.getValue("xml:id");
                }
                else if (useFilenameId) {
                    documentId = FilenameUtils.getName(currentResource.getPath()) + "#"
                            + currentTeiElementNumber;
                }
                else {
                    documentId = currentResource.getPath() + "#" + currentTeiElementNumber;
                }
            }
            else if (!inTextElement && TAG_TITLE.equals(aName)) {
                captureText = true;
            }
            else if (TAG_TEXT.equals(aName)) {
                captureText = true;
                inTextElement = true;
            }
            else if (inTextElement && isSentenceTag(aName)) {
                sentenceStart = getBuffer().length();
            }
            else if (inTextElement && TAG_PARAGRAPH.equals(aName)) {
                paragraphStart = getBuffer().length();
            }
            else if (readNamedEntity && inTextElement && TAG_RS.equals(aName)) {
                NamedEntity ne = new NamedEntity(getJCas());
                ne.setBegin(getBuffer().length());
                ne.setValue(aAttributes.getValue(ATTR_TYPE));
                namedEntities.push(ne);
            }
            else if (readConstituent && inTextElement && TAG_PHRASE.equals(aName)) {
                // The outermost phrase gets an artificial ROOT as its parent.
                if (constituents.isEmpty()) {
                    ROOT root = new ROOT(getJCas());
                    root.setBegin(getBuffer().length());
                    root.setConstituentType("ROOT");
                    constituents.push(new ConstituentWrapper(root));
                }

                Constituent constituent = new Constituent(getJCas());
                constituent.setBegin(getBuffer().length());
                constituent.setConstituentType(aAttributes.getValue(ATTR_TYPE));
                constituent.setSyntacticFunction(aAttributes.getValue(ATTR_FUNCTION));
                constituents.push(new ConstituentWrapper(constituent));
            }
            else if (inTextElement && isTokenTag(aName)) {
                tokenStart = getBuffer().length();
                // Fall back to the type attribute when no POS attribute is present.
                if (StringUtils.isNotEmpty(aAttributes.getValue(ATTR_POS))) {
                    posTag = aAttributes.getValue(ATTR_POS);
                }
                else {
                    posTag = aAttributes.getValue(ATTR_TYPE);
                }
                lemma = aAttributes.getValue(ATTR_LEMMA);
            }
        }

        /**
         * Creates the annotation belonging to the element being closed, spanning from the
         * offset recorded at its start to the current end of the buffer.
         */
        @Override
        public void endElement(String aUri, String aLocalName, String aName)
                throws SAXException {
            if (!inTextElement && TAG_TITLE.equals(aName)) {
                DocumentMetaData meta = DocumentMetaData.get(getJCas());
                // Read only the first title and hope it is the main title
                if (!titleSet) {
                    meta.setDocumentTitle(getBuffer().toString().trim());
                    titleSet = true;
                }
                meta.setDocumentId(documentId);
                // Title text must not become part of the document text.
                getBuffer().setLength(0);
                captureText = false;
            }
            else if (TAG_TEXT.equals(aName)) {
                captureText = false;
                inTextElement = false;
            }
            else if (inTextElement && isSentenceTag(aName)) {
                if (readSentence) {
                    new Sentence(getJCas(), sentenceStart, getBuffer().length()).addToIndexes();
                }
                sentenceStart = -1;
            }
            else if (inTextElement && TAG_PARAGRAPH.equals(aName)) {
                if (readParagraph) {
                    new Paragraph(getJCas(), paragraphStart, getBuffer().length())
                            .addToIndexes();
                }
                paragraphStart = -1;
            }
            else if (readNamedEntity && inTextElement && TAG_RS.equals(aName)) {
                NamedEntity ne = namedEntities.pop();
                ne.setEnd(getBuffer().length());
                ne.addToIndexes();
            }
            else if (readConstituent && inTextElement && TAG_PHRASE.equals(aName)) {
                ConstituentWrapper wrapper = constituents.pop();
                wrapper.constituent.setEnd(getBuffer().length());
                if (!constituents.isEmpty()) {
                    ConstituentWrapper parent = constituents.peek();
                    wrapper.constituent.setParent(parent.constituent);
                    parent.children.add(wrapper.constituent);
                }
                wrapper.constituent.setChildren(FSCollectionFactory.createFSArray(getJCas(),
                        wrapper.children));
                wrapper.constituent.addToIndexes();

                // Close off the ROOT
                if (!constituents.isEmpty() && constituents.peek().constituent instanceof ROOT) {
                    ConstituentWrapper rootWrapper = constituents.pop();
                    rootWrapper.constituent.setEnd(getBuffer().length());
                    rootWrapper.constituent.setChildren(FSCollectionFactory.createFSArray(
                            getJCas(), rootWrapper.children));
                    rootWrapper.constituent.addToIndexes();
                }
            }
            else if (inTextElement && isTokenTag(aName)) {
                if (isNotBlank(getBuffer().substring(tokenStart, getBuffer().length()))) {
                    finishToken();
                }
                tokenStart = -1;
            }
        }

        /**
         * Appends character data to the buffer while text capturing is active.
         */
        @Override
        public void characters(char[] aCh, int aStart, int aLength) throws SAXException {
            if (captureText) {
                buffer.append(aCh, aStart, aLength);
            }
        }

        /**
         * Appends ignorable whitespace to the buffer while text capturing is active, unless
         * the reader is configured to omit it.
         */
        @Override
        public void ignorableWhitespace(char[] aCh, int aStart, int aLength)
                throws SAXException {
            if (captureText && !omitIgnorableWhitespace) {
                buffer.append(aCh, aStart, aLength);
            }
        }

        /** Tells whether the tag opens/closes a sentence under the current configuration. */
        private boolean isSentenceTag(String aName) {
            return TAG_SUNIT.equals(aName) || (utterancesAsSentences && TAG_U.equals(aName));
        }

        /** Tells whether the tag is one of the token-level tags. */
        private boolean isTokenTag(String aName) {
            return TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName)
                    || TAG_MULTIWORD.equals(aName);
        }

        /**
         * Creates the token spanning from {@code tokenStart} to the end of the buffer together
         * with its POS and lemma annotations, as far as these are enabled and available.
         */
        private void finishToken() {
            Token token = new Token(getJCas(), tokenStart, getBuffer().length());
            trim(token);

            if (posTag != null && readPOS) {
                Type posTagType = posMappingProvider.getTagType(posTag);
                POS pos = (POS) getJCas().getCas().createAnnotation(posTagType,
                        token.getBegin(), token.getEnd());
                pos.setPosValue(posTag);
                pos.setCoarseValue(pos.getClass().equals(POS.class) ? null
                        : pos.getType().getShortName().intern());
                pos.addToIndexes();
                token.setPos(pos);
            }

            if (lemma != null && readLemma) {
                Lemma l = new Lemma(getJCas(), token.getBegin(), token.getEnd());
                l.setValue(lemma);
                l.addToIndexes();
                token.setLemma(l);
            }

            // FIXME: if readToken is disabled, the JCas wrapper should not be generated
            // at all!
            if (readToken) {
                if (!constituents.isEmpty()) {
                    ConstituentWrapper parent = constituents.peek();
                    token.setParent(parent.constituent);
                    parent.children.add(token);
                }
                token.addToIndexes();
            }
        }

        /**
         * Shrinks the annotation so that it neither starts nor ends with whitespace. The scan
         * for the new begin never runs past the annotation end.
         */
        private void trim(Annotation aAnnotation) {
            StringBuilder text = getBuffer();
            int s = aAnnotation.getBegin();
            int e = aAnnotation.getEnd();
            while (s < e && Character.isWhitespace(text.charAt(s))) {
                s++;
            }
            while ((e > s + 1) && Character.isWhitespace(text.charAt(e - 1))) {
                e--;
            }
            aAnnotation.setBegin(s);
            aAnnotation.setEnd(e);
        }
    }

    /**
     * Pairs a constituent under construction with the children collected for it so far.
     */
    private static class ConstituentWrapper {
        public final Constituent constituent;
        public final List<Annotation> children = new ArrayList<>();

        public ConstituentWrapper(Constituent aConstituent) {
            constituent = aConstituent;
        }
    }
}