/* * Copyright 2014 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.clarin.webanno.tei; import static java.util.Arrays.asList; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.commons.lang3.StringUtils.isNotBlank; import java.io.IOException; import java.io.InputStream; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.zip.GZIPInputStream; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Logger; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; import org.dom4j.io.SAXWriter; import org.jaxen.JaxenException; import org.jaxen.XPath; import org.jaxen.dom4j.Dom4jXPath; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * Reader for the TEI XML. */ @TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) public class TeiReader extends ResourceCollectionReaderBase { /** * Write token annotations to the CAS. */ public static final String PARAM_WRITE_TOKEN = ComponentParameters.PARAM_WRITE_TOKEN; @ConfigurationParameter(name = PARAM_WRITE_TOKEN, mandatory = true, defaultValue = "true") private boolean writeTokens; /** * Write part-of-speech annotations to the CAS. */ public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") private boolean writePOS; /** * Write lemma annotations to the CAS. */ public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") private boolean writeLemma; /** * Write sentence annotations to the CAS. */ public static final String PARAM_WRITE_SENTENCE = ComponentParameters.PARAM_WRITE_SENTENCE; @ConfigurationParameter(name = PARAM_WRITE_SENTENCE, mandatory = true, defaultValue = "true") private boolean writeSentences; /** * Use the xml:id attribute on the TEI elements as document ID. Mind that many TEI files may not * have this attribute on all TEI elements and you may end up with no document ID at all. Also * mind that the IDs should be unique. */ public static final String PARAM_USE_XML_ID = "useXmlId"; @ConfigurationParameter(name = PARAM_USE_XML_ID, mandatory = true, defaultValue = "false") private boolean useXmlId; /** * When not using the XML ID, use only the filename instead of the whole URL as ID. Mind that * the filenames should be unique in this case. */ public static final String PARAM_USE_FILENAME_ID = "useFilenameId"; @ConfigurationParameter(name = PARAM_USE_FILENAME_ID, mandatory = true, defaultValue = "false") private boolean useFilenameId; /** * Do not write <em>ignoreable whitespace</em> from the XML file to the CAS. */ // REC: This does not seem to work. Maybe because SAXWriter does not generate this event? public static final String PARAM_OMIT_IGNORABLE_WHITESPACE = "omitIgnorableWhitespace"; @ConfigurationParameter(name = PARAM_OMIT_IGNORABLE_WHITESPACE, mandatory = true, defaultValue = "false") private boolean omitIgnorableWhitespace; /** * Location of the mapping file for part-of-speech tags to UIMA types. */ public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String mappingPosLocation; /** * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the * tag set defined as part of the model meta data. This can be useful if a custom model is * specified which does not have such meta data, or it can be used in readers. */ public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; /** * (character) contains a significant punctuation mark identified with an attribute type of p * for non space characters and s for space character. */ private static final String TAG_CHARACTER = "c"; private static final String TAG_LANG = "language"; private static final String SPACE_CHAR = " "; private static final String ANA = "ana"; private static final String FROM = "from"; private static final String INDENT = "ident"; /** * (word) represents a grammatical (not necessarily orthographic) word. */ private static final String TAG_WORD = "w"; /** * (s-unit) contains a sentence-like division of a text. */ private static final String TAG_SUNIT = "s"; /** * A tag for a group of annotations such as lemm, pos and sense (Named Enity layer used here) * annotations */ private static final String TAG_SPAN_GRP = "spanGrp"; /** * An annotation which comprises of actual annotations together with the id of the token( * TAG_WORD/TAG_CHARACTER) */ private static final String TAG_SPAN = "span"; private Iterator<Element> teiElementIterator; private Element currentTeiElement; private Resource currentResource; @SuppressWarnings("unused") private int currentTeiElementNumber; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); if (writePOS && !writeTokens) { throw new ResourceInitializationException(new IllegalArgumentException( "Setting writePOS to 'true' requires writeToken to be 'true' too.")); } try { // Init with an empty iterator teiElementIterator = asList(new Element[0]).iterator(); // Make sure we know about the first element; nextTeiElement(); } catch (CollectionException e) { new ResourceInitializationException(e); } catch (IOException e) { new ResourceInitializationException(e); } } private void nextTeiElement() throws CollectionException, IOException { if (teiElementIterator == null) { currentTeiElement = null; return; } while (!teiElementIterator.hasNext() && super.hasNext()) { currentResource = nextFile(); InputStream is = null; try { is = currentResource.getInputStream(); if (currentResource.getPath().endsWith(".gz")) { is = new GZIPInputStream(is); } InputSource source = new InputSource(is); source.setPublicId(currentResource.getLocation()); source.setSystemId(currentResource.getLocation()); SAXReader reader = new SAXReader(); Document xml = reader.read(source); final XPath teiPath = new Dom4jXPath("//tei:TEI"); teiPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0"); @SuppressWarnings("unchecked") List<Element> teiElements = teiPath.selectNodes(xml); teiElementIterator = teiElements.iterator(); currentTeiElementNumber = 0; } catch (DocumentException e) { throw new IOException(e); } catch (JaxenException e) { throw new IOException(e); } finally { closeQuietly(is); } } currentTeiElement = teiElementIterator.hasNext() ? teiElementIterator.next() : null; currentTeiElementNumber++; if (!super.hasNext() && !teiElementIterator.hasNext()) { // Mark end of processing. teiElementIterator = null; } } @Override public boolean hasNext() throws IOException, CollectionException { return teiElementIterator != null || currentTeiElement != null; } @Override public void getNext(CAS aCAS) throws IOException, CollectionException { initCas(aCAS, currentResource); InputStream is = null; try { JCas jcas = aCAS.getJCas(); // Create handler Handler handler = newSaxHandler(); handler.setJCas(jcas); handler.setLogger(getLogger()); // Parse TEI text SAXWriter writer = new SAXWriter(handler); writer.write(currentTeiElement); handler.endDocument(); } catch (CASException e) { throw new CollectionException(e); } catch (SAXException e) { throw new IOException(e); } catch (Exception e) { throw new IOException("This is not a valid WebAnno CPH TEI file"); } finally { closeQuietly(is); } // Move currentTeiElement to the next text nextTeiElement(); } protected Handler newSaxHandler() { return new TeiHandler(); } /** */ protected abstract static class Handler extends DefaultHandler { private JCas jcas; private Logger logger; public void setJCas(final JCas aJCas) { jcas = aJCas; } protected JCas getJCas() { return jcas; } public void setLogger(Logger aLogger) { logger = aLogger; } public Logger getLogger() { return logger; } } public class TeiHandler extends Handler { private boolean isSpaceChar = false; private boolean addLemma = false; private boolean addPos = false; private boolean addNe = false; private boolean captureText = false; private int sentenceStart = -1; private int tokenStart = -1; String tokenId = null; private String lemma = null; private String posTag = null; private String neTag = null; private String language = null; Map<String, Token> tokenIds = new LinkedHashMap<String, Token>(); private final StringBuilder buffer = new StringBuilder(); @Override public void endDocument() throws SAXException { getJCas().setDocumentText(buffer.toString()); // Set up language if (language != null) { getJCas().setDocumentLanguage(language); } else if (getConfigParameterValue(PARAM_LANGUAGE) != null) { getJCas().setDocumentLanguage((String) getConfigParameterValue(PARAM_LANGUAGE)); } } protected StringBuilder getBuffer() { return buffer; } @Override public void startElement(String aUri, String aLocalName, String aName, Attributes aAttributes) throws SAXException { if (TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName)) { if (aAttributes.getValue("type") != null && aAttributes.getValue("type").equals("s")) { isSpaceChar = true; } else { isSpaceChar = false; } tokenId = aAttributes.getValue("xml:id"); captureText = true; tokenStart = getBuffer().length(); } else if (TAG_SUNIT.equals(aName)) { captureText = false; sentenceStart = getBuffer().length(); } else if (TAG_SPAN_GRP.equals(aName)) { if (aAttributes.getValue(ANA).equals("#ePOSlemmatizer")) { addLemma = true; addPos = false; addNe = false; } else if (aAttributes.getValue(ANA).equals("#ePOStagger")) { addLemma = false; addPos = true; addNe = false; } else if (aAttributes.getValue(ANA).equals("#automatic-supersense-from-dannet")) { addLemma = false; addPos = false; addNe = true; } captureText = false; } else if (TAG_SPAN.equals(aName)) { captureText = true; tokenId = aAttributes.getValue(FROM); } else if (TAG_LANG.equals(aName)) { captureText = false; language = aAttributes.getValue(INDENT); } else { captureText = false; } } @Override public void endElement(String aUri, String aLocalName, String aName) throws SAXException { if (TAG_SUNIT.equals(aName)) { if (writeSentences) { new Sentence(getJCas(), sentenceStart, getBuffer().length()).addToIndexes(); } sentenceStart = -1; } else if (TAG_WORD.equals(aName) || TAG_CHARACTER.equals(aName)) { if (isNotBlank(getBuffer().substring(tokenStart, getBuffer().length()))) { Token token = new Token(getJCas(), tokenStart, getBuffer().length()); tokenIds.put(tokenId, token); if (writeTokens) { token.addToIndexes(); } } tokenStart = -1; } else if (TAG_SPAN.equals(aName)) { Token token = tokenIds.get(tokenId.substring(1)); if (addPos) { boolean duplicate = false; for (POS pos : JCasUtil.selectCovered(getJCas(), POS.class, token.getBegin(), token.getEnd())) { if (pos.getBegin() == token.getBegin() && pos.getEnd() == token.getEnd()) { if (pos.getPosValue().equals(this.posTag)) { duplicate = true; break; } } } if (!duplicate) { POS pos = new POS(getJCas(), token.getBegin(), token.getEnd()); pos.setPosValue(this.posTag); pos.addToIndexes(); token.setPos(pos); token.addToIndexes(); } } else if (addLemma) { boolean duplicate = false; for (Lemma lemma : JCasUtil.selectCovered(getJCas(), Lemma.class, token.getBegin(), token.getEnd())) { if (lemma.getBegin() == token.getBegin() && lemma.getEnd() == token.getEnd()) { if (lemma.getValue().equals(this.lemma)) { duplicate = true; break; } } } if (!duplicate) { Lemma lemma = new Lemma(getJCas(), token.getBegin(), token.getEnd()); lemma.setValue(this.lemma); lemma.addToIndexes(); token.setLemma(lemma); token.addToIndexes(); } } else if (addNe) { boolean duplicate = false; for (NamedEntity ne : JCasUtil.selectCovered(getJCas(), NamedEntity.class, token.getBegin(), token.getEnd())) { if (ne.getBegin() == token.getBegin() && ne.getEnd() == token.getEnd()) { if (ne.getValue().equals(this.neTag)) { duplicate = true; break; } } } if (!duplicate) { NamedEntity ne = new NamedEntity(getJCas(), token.getBegin(), token.getEnd()); ne.setValue(this.neTag); ne.addToIndexes(); } } } } @Override public void characters(char[] aCh, int aStart, int aLength) throws SAXException { StringBuffer sb = new StringBuffer(); sb.append(aCh, aStart, aLength); if (captureText) { if (isSpaceChar && !buffer.toString().isEmpty()) { buffer.append(SPACE_CHAR); } else if (addLemma) { lemma = sb.toString().trim(); } else if (addPos) { posTag = sb.toString().trim(); } else if (addNe) { neTag = sb.toString().trim(); } else { buffer.append(sb.toString().trim()); } } } } }