/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.ancora; import static de.tudarmstadt.ukp.dkpro.core.io.ancora.internal.AncoraConstants.ATTR_LEMMA; import static de.tudarmstadt.ukp.dkpro.core.io.ancora.internal.AncoraConstants.ATTR_POS; import static de.tudarmstadt.ukp.dkpro.core.io.ancora.internal.AncoraConstants.ATTR_WORD; import static de.tudarmstadt.ukp.dkpro.core.io.ancora.internal.AncoraConstants.TAG_SENTENCE; import static java.util.Arrays.asList; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.Type; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.internal.ExtendedLogger; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * Read AnCora XML format. */ @MimeTypeCapability({MimeTypes.APPLICATION_XML, MimeTypes.APPLICATION_X_ANCORA_XML}) @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}) public class AncoraReader extends JCasResourceCollectionReader_ImplBase { /** * Write token annotations to the CAS. */ public static final String PARAM_READ_TOKEN = ComponentParameters.PARAM_READ_TOKEN; @ConfigurationParameter(name = PARAM_READ_TOKEN, mandatory = true, defaultValue = "true") private boolean readToken; /** * Write part-of-speech annotations to the CAS. */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") private boolean readPOS; /** * Write lemma annotations to the CAS. */ public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") private boolean readLemma; /** * Write sentence annotations to the CAS. */ public static final String PARAM_READ_SENTENCE = ComponentParameters.PARAM_READ_SENTENCE; @ConfigurationParameter(name = PARAM_READ_SENTENCE, mandatory = true, defaultValue = "true") private boolean readSentence; /** * Location of the mapping file for part-of-speech tags to UIMA types. */ public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String mappingPosLocation; /** * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the * tag set defined as part of the model meta data. This can be useful if a custom model is * specified which does not have such meta data, or it can be used in readers. */ public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; public static final String PARAM_SPLIT_MULTI_WORD_TOKENS = "splitMultiWordTokens"; @ConfigurationParameter(name = PARAM_SPLIT_MULTI_WORD_TOKENS, mandatory = true, defaultValue="true") protected boolean splitMultiWordTokens; public static final String PARAM_DROP_SENTENCES_WITH_MISSING_POS = "dropSentencesMissingPosTags"; @ConfigurationParameter(name = PARAM_DROP_SENTENCES_WITH_MISSING_POS, mandatory = true, defaultValue="false") protected boolean dropSentencesMissingPosTags; private MappingProvider posMappingProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation, posTagset, getLanguage()); } @Override public void getNext(JCas aJCas) throws IOException, CollectionException { Resource res = nextFile(); initCas(aJCas, res); // Set up language if (getLanguage() != null) { aJCas.setDocumentLanguage(getLanguage()); } // Configure mapping only now, because now the language is set in the CAS try { posMappingProvider.configure(aJCas.getCas()); } catch (AnalysisEngineProcessException e1) { throw new IOException(e1); } InputStream is = null; try { is = CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()); // Create handler AncoraHandler handler = new AncoraHandler(); handler.setJCas(aJCas); handler.setLogger(getLogger()); // Parse XML SAXParserFactory pf = SAXParserFactory.newInstance(); SAXParser parser = pf.newSAXParser(); InputSource source = new InputSource(is); source.setPublicId(res.getLocation()); source.setSystemId(res.getLocation()); parser.parse(source, handler); } catch (ParserConfigurationException | SAXException e) { throw new IOException(e); } finally { closeQuietly(is); } if (dropSentencesMissingPosTags) { List<FeatureStructure> toRemove = new ArrayList<>(); // Remove sentences without pos TAGs for (Sentence s : select(aJCas, Sentence.class)) { boolean remove = false; for (Token t : selectCovered(Token.class, s)) { if (t.getPos() == null) { toRemove.add(s); remove = true; break; } } if (remove) { for (Token t : selectCovered(Token.class, s)) { toRemove.add(t); if (t.getLemma() != null) { toRemove.add(t.getLemma()); } if (t.getPos() != null) { toRemove.add(t.getPos()); } } } } for (FeatureStructure fs : toRemove) { aJCas.getCas().removeFsFromIndexes(fs); } // Remove tokens without pos tags that are located *BETWEEN* sentences! toRemove.clear(); for (Token t : select(aJCas, Token.class)) { if (t.getPos() == null) { toRemove.add(t); if (t.getLemma() != null) { toRemove.add(t.getLemma()); } if (t.getPos() != null) { toRemove.add(t.getPos()); } } } for (FeatureStructure fs : toRemove) { aJCas.getCas().removeFsFromIndexes(fs); } } } public class AncoraHandler extends DefaultHandler { private int sentenceStart = -1; private final StringBuilder buffer = new StringBuilder(); private JCas jcas; private ExtendedLogger logger; public void setJCas(final JCas aJCas) { jcas = aJCas; } protected JCas getJCas() { return jcas; } public void setLogger(ExtendedLogger aLogger) { logger = aLogger; } public ExtendedLogger getLogger() { return logger; } @Override public void endDocument() throws SAXException { getJCas().setDocumentText(buffer.toString()); } protected StringBuilder getBuffer() { return buffer; } private void addToken(String aWord, String aLemma, String aPos) { // Add spacing to previous token (if present) if (buffer.length() > 0) { buffer.append(' '); } // Add current token int start = getBuffer().length(); buffer.append(aWord); int end = getBuffer().length(); Token token = null; if (readToken) { token = new Token(getJCas(), start, end); } if (aPos != null && readPOS) { Type posTagType = posMappingProvider.getTagType(aPos); POS pos = (POS) getJCas().getCas().createAnnotation(posTagType, start, end); pos.setPosValue(aPos.intern()); pos.setCoarseValue(pos.getClass().equals(POS.class) ? null : posTagType.getShortName().intern()); pos.addToIndexes(); if (token != null) { token.setPos(pos); } } if (aLemma != null && readLemma) { Lemma l = new Lemma(getJCas(), start, end); l.setValue(aLemma); l.addToIndexes(); if (token != null) { token.setLemma(l); } } if (token != null) { token.addToIndexes(); } } @Override public void startElement(String aUri, String aLocalName, String aName, Attributes aAttributes) throws SAXException { String wd = aAttributes.getValue(ATTR_WORD); if (TAG_SENTENCE.equals(aName)) { sentenceStart = getBuffer().length(); } else if (wd != null && sentenceStart == -1) { getLogger().info("Ignoring token outside sentence boundaries: ["+wd+"]"); } else if (wd != null && sentenceStart != -1) { String posTag = aAttributes.getValue(ATTR_POS); String lemma = aAttributes.getValue(ATTR_LEMMA); // Default case without multiword splitting List<String> words = asList(wd); List<String> lemmas = asList(lemma); // Override default case if multiword splitting is enabled if (splitMultiWordTokens && wd.contains("_")) { words = asList(wd.split("_")); lemmas = asList(wd.split("_")); // If the numbers of words do not match the numbers of lemmas after separation // then something is fishy! assert words.size() == lemmas.size(); } for (int i = 0; i < words.size(); i++) { addToken(words.get(i), lemmas.get(i), posTag); } } } @Override public void endElement(String aUri, String aLocalName, String aName) throws SAXException { if (TAG_SENTENCE.equals(aName)) { // AnCora contains some empty/missing sentences if (sentenceStart < getBuffer().length()) { if (readSentence) { new Sentence(getJCas(), sentenceStart, getBuffer().length()).addToIndexes(); } buffer.append("\n"); } sentenceStart = -1; } } @Override public void characters(char[] aCh, int aStart, int aLength) throws SAXException { // AnCora format exclusively uses attribute values } @Override public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException { // AnCora format exclusively uses attribute values } } }