/* * Copyright 2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.penntree; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.Type; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk; /** * Penn Treebank chunked format reader. */ @MimeTypeCapability({MimeTypes.TEXT_X_PTB_CHUNKED}) @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk" }) public class PennTreebankChunkedReader extends JCasResourceCollectionReader_ImplBase { /** * Location of the mapping file for part-of-speech tags to UIMA types. */ public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; /** * Write token annotations to the CAS. */ public static final String PARAM_READ_TOKEN = ComponentParameters.PARAM_READ_TOKEN; @ConfigurationParameter(name = PARAM_READ_TOKEN, mandatory = true, defaultValue = "true") private boolean readToken; /** * Write part-of-speech annotations to the CAS. */ public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") private boolean readPOS; /** * Write sentence annotations to the CAS. */ public static final String PARAM_READ_SENTENCE = ComponentParameters.PARAM_READ_SENTENCE; @ConfigurationParameter(name = PARAM_READ_SENTENCE, mandatory = true, defaultValue = "true") private boolean readSentence; /** * Write chunk annotations to the CAS. */ public static final String PARAM_READ_CHUNK = ComponentParameters.PARAM_READ_CHUNK; @ConfigurationParameter(name = PARAM_READ_CHUNK, mandatory = true, defaultValue = "true") private boolean readChunk; /** * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the * tag set defined as part of the model meta data. This can be useful if a custom model is * specified which does not have such meta data, or it can be used in readers. */ public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; /** * Character encoding of the input data. */ public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = "UTF-8") protected String encoding; public static final String ENCODING_AUTO = "auto"; private MappingProvider posMappingProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, posTagset, getLanguage()); } @Override public void getNext(JCas aJCas) throws IOException, CollectionException { Resource res = nextFile(); initCas(aJCas, res); aJCas.setDocumentLanguage((String) getConfigParameterValue(PARAM_LANGUAGE)); try { posMappingProvider.configure(aJCas.getCas()); } catch (AnalysisEngineProcessException e) { throw new IOException(e); } String readLine = null; List<String> tokens = new ArrayList<String>(); List<String> tags = new ArrayList<String>(); List<int[]> chunkStartEndIdx = new ArrayList<int[]>(); BufferedReader br = null; try { br = new BufferedReader(new InputStreamReader(res.getInputStream(), encoding)); while ((readLine = br.readLine()) != null) { if (lineIsTrash(readLine)) { continue; } readLine = readLine.trim(); // enforce that all tokens are separated by exactly one blank readLine = readLine.replaceAll("[ ]{2,}", " "); // if the line starts and ends with brackets, it is a chunk int[] chunkIdx = null; if (readLine.startsWith("[") && readLine.endsWith("]")) { chunkIdx = new int[2]; chunkIdx[0] = tokens.size(); // we detected the chunk, we can delete the brackets as they // will cause problems later on if they stay in the text readLine = readLine.replaceAll("\\[", ""); readLine = readLine.replaceAll("\\]", ""); readLine = readLine.trim(); } String[] tokenWithTags = readLine.split(" "); for (String twt : tokenWithTags) { String[] token_tag; // two words might be joined by a forward slash, the same symbol // which separates token from part of speech tag. The word-join // forward slash is escaped if (wordsAreConnectedByForwardSlash(twt)) { token_tag = splitWordsAndTagAndNormalizeEscapedSlash(twt); } else { token_tag = twt.split("/"); } // This should not happen, skip these cases if (token_tag == null) { getLogger() .error("After splitting token from tag value became NULL, skipping this token"); continue; } else if (token_tag.length < 2) { // There are empty lines with nothing to split in it continue; } String token = token_tag[0]; String tag = token_tag[1]; // in ambiguous cases a token might have two or more part of // speech tags. We take the first one named and ignore the other // ones tag = selectFirstTagIfTokenIsAmbiguousInContextAndSeveralAcceptableOnesExist(tag); // A corpus might contain two pos tags for a word if it is // misspelled in the source material. 'The students dormitory' // should have used an apostrophe to mark a possessive case for // the word <code>students'</code>. The // misspelling lead to a plural noun pos-tag although the // possessive // tag would have been correct from the view point of intention. // We chose the incorrect(!) part of speech tag here to avoid // confusion why a misspelled word was tagged correctly. tag = ifWordIsMisspelledSelectTagThatFitsTheMisspelledWord(tag); tokens.add(token); tags.add(tag); } if (chunkIdx != null) { chunkIdx[1] = tokens.size() - 1; chunkStartEndIdx.add(chunkIdx); } } } finally { IOUtils.closeQuietly(br); } String documentText = annotateSenenceTokenPosTypes(aJCas, tokens, tags); aJCas.setDocumentText(documentText); if (readChunk) { annotateChunks(aJCas, chunkStartEndIdx); } } private void annotateChunks(JCas aJCas, List<int[]> aChunkStartEndIdx) { if (readToken) { List<Token> tokens = new ArrayList<Token>(JCasUtil.select(aJCas, Token.class)); for (int[] chunks : aChunkStartEndIdx) { int begin = tokens.get(chunks[0]).getBegin(); int end = tokens.get(chunks[1]).getEnd(); Chunk c = new Chunk(aJCas, begin, end); c.addToIndexes(); } } } private String ifWordIsMisspelledSelectTagThatFitsTheMisspelledWord(String aTag) { // replace by whitespace and trim the one at the beginning away, the remaining one are our // split points if (aTag.contains("^")) { aTag = aTag.replaceAll("\\^", " ").trim(); String[] split = aTag.split(" "); return split[0]; } return aTag; } private boolean lineIsTrash(String aLine) { boolean t3 = aLine.isEmpty(); boolean t1 = aLine.startsWith("========="); boolean t2 = aLine.startsWith("*x*"); return t1 || t2 || t3; } private String selectFirstTagIfTokenIsAmbiguousInContextAndSeveralAcceptableOnesExist( String aTag) { String[] tags = aTag.split("\\|"); return tags[0]; } private String[] splitWordsAndTagAndNormalizeEscapedSlash(String aTwt) { int idx = aTwt.lastIndexOf("/"); if (idx < 0) { return null; } String[] token_tag = new String[2]; token_tag[0] = aTwt.substring(0, idx); token_tag[0] = token_tag[0].replaceAll("\\\\/", "/"); token_tag[1] = aTwt.substring(idx + 1); return token_tag; } private boolean wordsAreConnectedByForwardSlash(String aTwt) { return aTwt.contains("\\/"); } private String annotateSenenceTokenPosTypes(JCas aJCas, List<String> aTokens, List<String> aTags) { StringBuilder textString = new StringBuilder(); int sentStart = 0; for (int i = 0; i < aTokens.size(); i++) { String token = aTokens.get(i); String tag = aTags.get(i); annotateTokenWithTag(aJCas, token, tag, textString.length()); textString.append(token); textString.append(" "); if (tag.equals(".")) { String text = textString.toString().trim(); if (readSentence) { annotateSentence(aJCas, sentStart, text.length()); } sentStart = textString.length(); } } return textString.toString().trim(); } private void annotateSentence(JCas aJCas, int aBegin, int aEnd) { new Sentence(aJCas, aBegin, aEnd).addToIndexes(); } private void annotateTokenWithTag(JCas aJCas, String aToken, String aTag, int aCurrPosInText) { if (readToken) { // Token Token token = new Token(aJCas, aCurrPosInText, aToken.length() + aCurrPosInText); token.addToIndexes(); if (readPOS) { // Tag Type posTag = posMappingProvider.getTagType(aTag); POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); pos.setPosValue(aTag); pos.setCoarseValue(pos.getClass().equals(POS.class) ? null : posTag.getShortName().intern()); pos.addToIndexes(); // Set the POS for the Token token.setPos(pos); } } } }