/*
 * Copyright 2011
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.imscwb;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.internal.util.XMLUtils;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.io.imscwb.util.CorpusSentence;
import de.tudarmstadt.ukp.dkpro.core.io.imscwb.util.CorpusText;
import de.tudarmstadt.ukp.dkpro.core.io.imscwb.util.TextIterable;

/**
 * Reads the IMS Open Corpus Workbench (CWB) verticalized format, a tab-separated, token-per-line
 * format interspersed with pseudo-XML structural tags (e.g. for texts and sentences).
 */
@MimeTypeCapability({MimeTypes.TEXT_X_IMSCWB})
@TypeCapability(
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
                "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class ImsCwbReader
    extends ResourceCollectionReaderBase
{
    /**
     * Character encoding of the input data.
     */
    public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    /**
     * @deprecated Use {@link #PARAM_SOURCE_ENCODING} instead.
     */
    @Deprecated
    public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;

    /**
     * Location of the mapping file for part-of-speech tags to UIMA types.
     */
    public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String mappingPosLocation;

    /**
     * Specify which tag set should be used to locate the mapping file.
     */
    public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET;
    @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false)
    protected String posTagset;

    /**
     * Read tokens and generate {@link Token} annotations.
     *
     * Default: {@code true}
     */
    public static final String PARAM_READ_TOKEN = ComponentParameters.PARAM_READ_TOKEN;
    @ConfigurationParameter(name = PARAM_READ_TOKEN, mandatory = true, defaultValue = "true")
    private boolean readTokens;

    /**
     * Read part-of-speech tags and generate {@link POS} annotations or subclasses thereof if a
     * {@link #PARAM_POS_TAG_SET tag set} or {@link #PARAM_POS_MAPPING_LOCATION mapping file} is
     * used.
     *
     * Default: {@code true}
     */
    public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS;
    @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true")
    private boolean readPos;

    /**
     * Read sentences and generate {@link Sentence} annotations.
     *
     * Default: {@code true}
     */
    public static final String PARAM_READ_SENTENCES = ComponentParameters.PARAM_READ_SENTENCE;
    @ConfigurationParameter(name = PARAM_READ_SENTENCES, mandatory = true, defaultValue = "true")
    private boolean readSentences;

    /**
     * Read lemmas and generate {@link Lemma} annotations.
     *
     * Default: {@code true}
     */
    public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA;
    @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true")
    private boolean readLemmas;

    /**
     * If true, the unit IDs are used only to detect if a new document (CAS) needs to be created,
     * but for the purpose of setting the document ID, a new ID is generated.
     *
     * (Default: false)
     */
    public static final String PARAM_GENERATE_NEW_IDS = "generateNewIds";
    @ConfigurationParameter(name = PARAM_GENERATE_NEW_IDS, mandatory = true, defaultValue = "false")
    private boolean generateNewIds;

    /**
     * If true, the unit text ID encoded in the corpus file is stored as the URI in the document
     * meta data. This setting is not affected by {@link #PARAM_GENERATE_NEW_IDS}.
     *
     * (Default: false)
     */
    public static final String PARAM_ID_IS_URL = "idIsUrl";
    @ConfigurationParameter(name = PARAM_ID_IS_URL, mandatory = true, defaultValue = "false")
    private boolean idIsUrl;

    /**
     * Replace non-XML characters with spaces.
     *
     * (Default: true)
     */
    public static final String PARAM_REPLACE_NON_XML = "replaceNonXml";
    @ConfigurationParameter(name = PARAM_REPLACE_NON_XML, mandatory = true, defaultValue = "true")
    private boolean replaceNonXml;

    private Type tokenType;
    private Type lemmaType;
    private Type sentenceType;

    private TextIterable wackyIterator;

    private int completed;

    private MappingProvider posMappingProvider;

    private int documentCount;
    private int qualifier;
    private Resource lastResource;

    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        wackyIterator = new TextIterable(getResources(), encoding);

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation,
                posTagset, getLanguage());

        documentCount = 0;
        qualifier = 0;
        lastResource = null;
    }

    @Override
    public boolean hasNext()
        throws IOException, CollectionException
    {
        return wackyIterator.hasNext();
    }

    @Override
    public void getNext(CAS aCAS)
        throws IOException, CollectionException
    {
        Resource res = wackyIterator.getCurrentResource();
        CorpusText text = wackyIterator.next();

        // Reset counter when a new file is read.
        if (!res.equals(lastResource)) {
            qualifier = 0;
            lastResource = res;
        }

        String documentId;
        if (generateNewIds) {
            documentId = String.valueOf(documentCount);
        }
        else {
            documentId = text.getDocumentTitle();
        }

        initCas(aCAS, res, String.valueOf(qualifier));

        DocumentMetaData meta = DocumentMetaData.get(aCAS);
        meta.setDocumentTitle(text.getDocumentTitle());
        meta.setDocumentId(documentId);
        if (idIsUrl) {
            meta.setDocumentBaseUri(null);
            meta.setDocumentUri(text.getDocumentTitle());
        }

        try {
            posMappingProvider.configure(aCAS);
        }
        catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }

        List<AnnotationFS> tokenAnnotations = new ArrayList<AnnotationFS>();
        List<AnnotationFS> lemmaAnnotations = new ArrayList<AnnotationFS>();
        List<AnnotationFS> posAnnotations = new ArrayList<AnnotationFS>();
        List<AnnotationFS> sentenceAnnotations = new ArrayList<AnnotationFS>();

        TypeSystem typeSystem = aCAS.getTypeSystem();
        tokenType = typeSystem.getType(Token.class.getName());
        lemmaType = typeSystem.getType(Lemma.class.getName());
        sentenceType = typeSystem.getType(Sentence.class.getName());

        StringBuilder sb = new StringBuilder();
        int offset = 0;

        for (CorpusSentence sentence : text.getSentences()) {
            int savedOffset = offset;
            for (int i = 0; i < sentence.getTokens().size(); i++) {
                String token = doReplaceNonXml(sentence.getTokens().get(i));
                String lemma = doReplaceNonXml(sentence.getLemmas().get(i));
                String pos = doReplaceNonXml(sentence.getPOS().get(i));
                int len = token.length();

                if (readPos) {
                    Type posType = posMappingProvider.getTagType(pos);
                    AnnotationFS posAnno = aCAS.createAnnotation(posType, offset, offset + len);
                    posAnno.setStringValue(posType.getFeatureByBaseName("PosValue"), pos);
                    posAnnotations.add(posAnno);
                }

                if (readLemmas) {
                    AnnotationFS lemmaAnno = aCAS.createAnnotation(lemmaType, offset, offset + len);
                    lemmaAnno.setStringValue(lemmaType.getFeatureByBaseName("value"), lemma);
                    lemmaAnnotations.add(lemmaAnno);
                }

                if (readTokens) {
                    AnnotationFS tokenAnno = aCAS.createAnnotation(tokenType, offset, offset + len);
                    if (readPos) {
                        tokenAnno.setFeatureValue(tokenType.getFeatureByBaseName("pos"),
                                posAnnotations.get(posAnnotations.size() - 1));
                    }
                    if (readLemmas) {
                        tokenAnno.setFeatureValue(tokenType.getFeatureByBaseName("lemma"),
                                lemmaAnnotations.get(lemmaAnnotations.size() - 1));
                    }
                    tokenAnnotations.add(tokenAnno);
                }

                sb.append(token);
                sb.append(" ");

                // increase offset by size of token + 1 for the space
                offset += len + 1;
            }
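
            // Note: the sentence annotation created below spans from the offset recorded before
            // the first token of the sentence up to the current offset, i.e. it also covers the
            // space appended after the last token.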

            if (readSentences) {
                AnnotationFS sentenceAnno = aCAS.createAnnotation(sentenceType, savedOffset,
                        offset);
                sentenceAnnotations.add(sentenceAnno);
            }
        }

        String sText = sb.toString();
        aCAS.setDocumentText(sText);

        // finally add the annotations to the CAS indexes
        for (AnnotationFS t : tokenAnnotations) {
            aCAS.addFsToIndexes(t);
        }
        for (AnnotationFS l : lemmaAnnotations) {
            aCAS.addFsToIndexes(l);
        }
        for (AnnotationFS p : posAnnotations) {
            aCAS.addFsToIndexes(p);
        }
        for (AnnotationFS s : sentenceAnnotations) {
            aCAS.addFsToIndexes(s);
        }

        completed++;
        documentCount++;
        qualifier++;
    }

    @Override
    public Progress[] getProgress()
    {
        return new Progress[] { new ProgressImpl(completed, 0, "text") };
    }

    private String doReplaceNonXml(String aString)
    {
        if (!replaceNonXml) {
            return aString;
        }

        char[] buf = aString.toCharArray();

        int pos = XMLUtils.checkForNonXmlCharacters(buf, 0, buf.length, false);
        if (pos == -1) {
            return aString;
        }

        // Replace every offending character with a space and continue scanning from that
        // position until no further non-XML characters are found.
        while (pos != -1) {
            buf[pos] = ' ';
            pos = XMLUtils.checkForNonXmlCharacters(buf, pos, buf.length - pos, false);
        }

        return String.valueOf(buf);
    }
}
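
// A minimal usage sketch (not part of the original source), assuming the reader is created via
// uimaFIT. The corpus location and file pattern below are hypothetical; PARAM_SOURCE_LOCATION,
// PARAM_PATTERNS and PARAM_LANGUAGE are inherited from ResourceCollectionReaderBase.
//
//     CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
//             ImsCwbReader.class,
//             ImsCwbReader.PARAM_LANGUAGE, "en",
//             ImsCwbReader.PARAM_SOURCE_LOCATION, "corpus",
//             ImsCwbReader.PARAM_PATTERNS, "[+]*.txt");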