/*
 * Copyright 2011
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.negra;

import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.commons.lang.StringUtils.startsWith;
import static org.apache.uima.fit.util.JCasUtil.select;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Type;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT;
import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeNode;
import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils;

/**
 * This CollectionReader reads a file which is formatted in the NEGRA export format. The texts and
 * add. information like constituent structure is reproduced in CASes, one CAS per text (article).
 */
@MimeTypeCapability({ MimeTypes.APPLICATION_X_NEGRA3, MimeTypes.APPLICATION_X_NEGRA4 })
@TypeCapability(
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
            "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
            "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent" })
public class NegraExportReader
    extends JCasCollectionReader_ImplBase
{
    /**
     * Strategy for deciding when a new CAS (document) starts while reading consecutive
     * sentences: by origin ID, by origin name (resolved via the ORIGIN table), or one CAS per
     * sentence.
     */
    public static enum DocumentUnit
    {
        ORIGIN_ID, ORIGIN_NAME, SENTENCE_ID
    }

    /**
     * Location from which the input is read.
     */
    public static final String PARAM_SOURCE_LOCATION = ComponentParameters.PARAM_SOURCE_LOCATION;
    @ConfigurationParameter(name = PARAM_SOURCE_LOCATION, mandatory = true)
    private File inputFile;

    /**
     * The language.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    private String language;

    /**
     * Character encoding of the input data.
     */
    public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    /**
     * Write part-of-speech information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS;
    @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true")
    private boolean posEnabled;

    /**
     * Write lemma information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA;
    @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true")
    private boolean lemmaEnabled;

    /**
     * Write Penn Treebank bracketed structure information. Mind this may not work with all
     * tagsets, in particular not with such that contain "(" or ")" in their tags. The tree is
     * generated using the original tag set in the corpus, not using the mapped tagset!
     *
     * Default: {@code false}
     */
    public static final String PARAM_READ_PENN_TREE = ComponentParameters.PARAM_READ_PENN_TREE;
    @ConfigurationParameter(name = PARAM_READ_PENN_TREE, mandatory = true, defaultValue = "false")
    private boolean pennTreeEnabled;

    /**
     * Location of the mapping file for part-of-speech tags to UIMA types.
     */
    public static final String PARAM_POS_MAPPING_LOCATION =
            ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String mappingPosLocation;

    /**
     * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the
     * tag set defined as part of the model meta data. This can be useful if a custom model is
     * specified which does not have such meta data, or it can be used in readers.
     */
    public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET;
    @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false)
    protected String posTagset;

    /**
     * The collection ID to the written to the document meta data. (Default: none)
     */
    public static final String PARAM_COLLECTION_ID = "collectionId";
    @ConfigurationParameter(name = PARAM_COLLECTION_ID, mandatory = false)
    private String collectionId;

    /**
     * If true, the unit IDs are used only to detect if a new document (CAS) needs to be created,
     * but for the purpose of setting the document ID, a new ID is generated. (Default: false)
     */
    public static final String PARAM_GENERATE_NEW_IDS = "generateNewIds";
    @ConfigurationParameter(name = PARAM_GENERATE_NEW_IDS, mandatory = true,
            defaultValue = "false")
    private boolean generateNewIds;

    /**
     * What indicates if a new CAS should be started. E.g., if set to
     * {@link DocumentUnit#ORIGIN_NAME ORIGIN_NAME}, a new CAS is generated whenever the origin
     * name of the current sentence differs from the origin name of the last sentence.
     * (Default: ORIGIN_NAME)
     */
    public static final String PARAM_DOCUMENT_UNIT = "documentUnit";
    @ConfigurationParameter(name = PARAM_DOCUMENT_UNIT, mandatory = true,
            defaultValue = "ORIGIN_NAME")
    private DocumentUnit documentUnit;

    // Minimum number of tab-separated fields expected on a token/constituent line.
    private static final int LINE_ARGUMENT_COUNT = 5;

    // Fields for a token in a sentence. These are instance variables (not constants) because
    // the column layout differs between export format 3 and 4 and is adjusted in readFormat().
    // A value of -1 means "column not present in this format" (e.g. no lemma in format 3).
    private int TOKEN_TEXT = 0;
    private int TOKEN_LEMMA = -1;
    private int TOKEN_POS_TAG = 1;
    private int TOKEN_MORPH = 2;
    private int TOKEN_EDGE = 3;
    private int TOKEN_PARENT_ID = 4;
    private int TOKEN_SECEDGE = 5;
    private int TOKEN_COMMENT = 6;

    // Fields for a constituent in a sentence (format-dependent, see readFormat()).
    private int CONSTITUENT_ID = 0;
    private int CONSTITUENT_TYPE = 1;
    private int CONSTITUENT_FUNCTION = 3;

    // #FORMAT fields
    private static final int FORMAT_FIELD_NUM = 1;

    // #BOT fields
    private static final int BOT_FIELD_NAME = 1;

    // #BOS fields
    private static final int BOS_FIELD_NUM = 1;
    private static final int BOS_FIELD_EDITOR_ID = 2;
    private static final int BOS_FIELD_DATE = 3;
    private static final int BOS_FIELD_ORIGIN_ID = 4;

    // ORIGIN table fields
    private static final int ORIGIN_ID = 0;
    private static final int ORIGIN_NAME = 1;

    // Markers used by the NEGRA export format.
    private static final String FORMAT = "#FORMAT";
    private static final String BEGIN_OF_SENTENCE = "#BOS";
    private static final String END_OF_SENTENCE = "#EOS";
    private static final String BEGIN_OF_TABLE = "#BOT";
    private static final String END_OF_TABLE = "#EOT";
    private static final String TABLE_ORIGIN = "ORIGIN";

    private int format;
    private int sentenceCount;
    private int documentCount;
    private int documentsTotal;
    private BufferedReader br;
    // Maps origin IDs (from the ORIGIN table) to origin names.
    private Map<String, String> idxOriginName;
    private MappingProvider posMappingProvider;

    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        documentsTotal = 0;
        idxOriginName = new HashMap<String, String>();

        try {
            // Detect if the file is compressed
            InputStream fileStream = new FileInputStream(inputFile);
            InputStream resolvedStream = CompressionUtils.getInputStream(inputFile.getName(),
                    fileStream);

            br = new BufferedReader(new InputStreamReader(resolvedStream, encoding));

            // Consume #FORMAT and table headers up to (but not including) the first #BOS line.
            readHeaders();

            sentenceCount = 0;
            documentCount = 0;
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation,
                posTagset, language);
    }

    @Override
    public void getNext(JCas aJCas)
        throws IOException
    {
        JCasBuilder casBuilder = new JCasBuilder(aJCas);

        // Peek at the next sentence header to determine which document (CAS) it belongs to.
        String originId = readOriginId(true);
        String sentenceId = readSentenceHeader(BOS_FIELD_NUM, true);
        String casId = originId2casId(originId, sentenceId);

        String documentId;
        if (generateNewIds) {
            documentId = String.valueOf(documentCount);
        }
        else {
            documentId = casId;
        }

        // Set meta data
        DocumentMetaData meta = DocumentMetaData.create(aJCas);
        meta.setDocumentUri(inputFile.toURI() + "#" + documentId);
        meta.setCollectionId(collectionId);
        meta.setDocumentId(documentId);
        aJCas.setDocumentLanguage(language);

        // Configure mapping only now, because now the language is set in the CAS
        try {
            posMappingProvider.configure(aJCas.getCas());
        }
        catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }

        // Fill CAS: keep reading sentences as long as they map to the same CAS id.
        Map<String, Annotation> idMap = new LinkedHashMap<>();
        String lastCasId = casId;
        while (casId != null) {
            if (!casId.equals(lastCasId)) {
                // if a new origin ID is encountered, stop this jcas creation
                break;
            }

            // otherwise consume the line
            readOriginId(false);

            // read the next sentence
            readSentence(aJCas, casBuilder, sentenceId, idMap);

            originId = readOriginId(true);
            sentenceId = readSentenceHeader(BOS_FIELD_NUM, true);
            lastCasId = casId;
            casId = originId2casId(originId, sentenceId);
        }

        casBuilder.close();

        // for (Entry<String, Annotation> e : idMap.entrySet()) {
        //     System.out.printf("%s - %s%n", e.getKey(), e.getValue().getCoveredText());
        // }

        // Can only do that after the builder is closed, otherwise the text is not yet set in the
        // CAS and we get "null" for all token strings.
        if (pennTreeEnabled) {
            for (ROOT root : select(aJCas, ROOT.class)) {
                PennTree pt = new PennTree(aJCas, root.getBegin(), root.getEnd());
                PennTreeNode rootNode = PennTreeUtils.convertPennTree(root);
                pt.setPennTree(PennTreeUtils.toPennTree(rootNode));
                pt.addToIndexes();
            }
        }

        documentCount++;
    }

    /**
     * Map a sentence's origin/sentence IDs to the ID of the CAS it belongs to, according to the
     * configured {@link DocumentUnit}. For ORIGIN_NAME, falls back to the origin ID if the
     * origin name is not found in the ORIGIN table.
     */
    private String originId2casId(String aOriginId, String aSentenceId)
    {
        switch (documentUnit) {
        case SENTENCE_ID:
            return aSentenceId;
        case ORIGIN_ID:
            return aOriginId;
        case ORIGIN_NAME:
            String originName = idxOriginName.get(aOriginId);
            if (originName != null) {
                return originName;
            }
            return aOriginId;
        default:
            throw new IllegalStateException("Unknown document unit [" + documentUnit + "]");
        }
    }

    @Override
    public boolean hasNext()
        throws IOException, CollectionException
    {
        if (br != null) {
            // Peek: there is another document iff another #BOS line can be found.
            return readOriginId(true) != null;
        }
        else {
            // Has already been closed.
            return false;
        }
    }

    @Override
    public Progress[] getProgress()
    {
        return new Progress[] { new ProgressImpl(documentCount, documentsTotal, "document") };
    }

    @Override
    public void close()
        throws IOException
    {
        closeQuietly(br);
        br = null;
    }

    /**
     * Read the originId from the #BOS line that is expected to follow.
     *
     * @param aPeek
     *            if true, stream will not advance
     * @return the next origin id or null if there is none
     */
    private String readOriginId(boolean aPeek)
        throws IOException
    {
        return readSentenceHeader(BOS_FIELD_ORIGIN_ID, aPeek);
    }

    /**
     * Read the given field from the #BOS sentence header line that is expected to follow,
     * skipping over comment lines ("%%").
     *
     * @param aField
     *            index of the whitespace-separated field to return
     * @param aPeek
     *            if true, stream will not advance
     * @return the requested field of the next header line or null if there is none
     */
    private String readSentenceHeader(int aField, boolean aPeek)
        throws IOException
    {
        if (aPeek) {
            // NOTE(review): 16000 chars is the read-ahead limit for reset(); lines longer than
            // this would break peeking.
            br.mark(16000);
        }
        String line = br.readLine();
        while (line != null) {
            if (!line.startsWith("%%")) {
                String[] parts = line.split("\\s+");
                if (aPeek) {
                    br.reset();
                }
                return parts[aField];
            }
            line = br.readLine();
        }

        return null;
    }

    /**
     * Consume header lines (#FORMAT, #BOT tables) up to the first #BOS line, which is pushed
     * back so that getNext() sees it.
     */
    private void readHeaders()
        throws IOException
    {
        br.mark(16000);
        String line = br.readLine();
        while (line != null) {
            if (!line.startsWith("%%")) {
                if (readHeaderLine(line.split("\\s+"))) {
                    // First #BOS reached - rewind so the sentence is not lost.
                    br.reset();
                    return;
                }
            }
            br.mark(16000);
            line = br.readLine();
        }
        throw new IOException("Unexpected end of file");
    }

    /**
     * @return true if all header data has been parsed.
     */
    private boolean readHeaderLine(String[] aLine)
        throws IOException
    {
        // System.out.printf("Parsing line [%s]%n", StringUtils.join(aLine, "\t"));
        if (FORMAT.equals(aLine[0])) {
            readFormat(aLine);
            return false;
        }
        else if (BEGIN_OF_TABLE.equals(aLine[0])) {
            readTable(aLine);
            return false;
        }
        else if (BEGIN_OF_SENTENCE.equals(aLine[0])) {
            return true;
        }
        else {
            throw new IOException("Illegal file format: [" + StringUtils.join(aLine, "\t") + "]");
        }
    }

    /**
     * Parse the #FORMAT line and configure the column indices for tokens and constituents
     * accordingly. Only export format versions 3 (no lemma column) and 4 are supported.
     */
    private void readFormat(String[] aLine)
        throws IOException
    {
        format = Integer.parseInt(aLine[FORMAT_FIELD_NUM]);
        switch (format) {
        case 3:
            TOKEN_TEXT = 0;
            TOKEN_LEMMA = -1;
            TOKEN_POS_TAG = 1;
            TOKEN_MORPH = 2;
            TOKEN_EDGE = 3;
            TOKEN_PARENT_ID = 4;
            TOKEN_SECEDGE = 5;
            TOKEN_COMMENT = 6;
            CONSTITUENT_ID = 0;
            CONSTITUENT_TYPE = 1;
            CONSTITUENT_FUNCTION = 3;
            getLogger().log(Level.INFO, "Corpus format 3 detected - no lemmas");
            break;
        case 4:
            TOKEN_TEXT = 0;
            TOKEN_LEMMA = 1;
            TOKEN_POS_TAG = 2;
            TOKEN_MORPH = 3;
            TOKEN_EDGE = 4;
            TOKEN_PARENT_ID = 5;
            TOKEN_SECEDGE = 6;
            TOKEN_COMMENT = 7;
            CONSTITUENT_ID = 0;
            CONSTITUENT_TYPE = 2;
            CONSTITUENT_FUNCTION = 4;
            getLogger().log(Level.INFO, "Corpus format 4 detected");
            break;
        default:
            throw new IOException("Format version [" + format + "] not supported");
        }
        // System.out.printf("Reading format [%d]%n", format);
    }

    /**
     * Dispatch on a #BOT line: the ORIGIN table is indexed, all other tables are skipped.
     */
    private void readTable(String[] aLine)
        throws IOException
    {
        String tableName = aLine[BOT_FIELD_NAME];
        // System.out.printf("Reading table [%s]%n", tableName);
        if (TABLE_ORIGIN.equals(tableName)) {
            readOriginTable();
        }
        else {
            skipTable();
        }
    }

    /**
     * Read the ORIGIN table into {@link #idxOriginName} and count the documents it lists.
     */
    private void readOriginTable()
        throws IOException
    {
        String line = br.readLine();
        // startsWith(null, ...) is false (commons-lang), so a null line enters the loop and is
        // reported as an unexpected EOF below.
        while (!startsWith(line, END_OF_TABLE)) {
            if (line == null) {
                throw new IOException("Unexpected end of file");
            }
            String[] parts = line.split("\\s+");
            idxOriginName.put(parts[ORIGIN_ID], parts[ORIGIN_NAME]);
            documentsTotal++;
            line = br.readLine();
        }
        // System.out.printf("Documents [%d]%n", documentsTotal);
    }

    /**
     * Read one sentence (token lines followed by constituent lines, terminated by #EOS) into the
     * CAS. Tokens and constituents are registered in aIdMap under "sentenceId:localId" keys.
     */
    private void readSentence(JCas aJCas, JCasBuilder aBuilder, String aSentenceId,
            Map<String, Annotation> aIdMap)
        throws IOException
    {
        sentenceCount++;

        // Initialize root node
        ROOT root = new ROOT(aJCas);
        // Begin is grown downwards/upwards as children are attached below.
        root.setBegin(Integer.MAX_VALUE);
        root.setConstituentType("ROOT");

        // Initialize constituents; constituent "0" is by convention the sentence root.
        Map<String, Constituent> constituents = new HashMap<String, Constituent>();
        constituents.put("0", root);

        // Initialize dependency relations
        Map<Constituent, List<Annotation>> relations =
                new LinkedHashMap<Constituent, List<Annotation>>();

        // handle tokens (all lines up to the first line starting with "#")
        String line;
        int id = 1;
        int sentBegin = aBuilder.getPosition();
        int sentEnd = -1;
        for (line = br.readLine(); startsNotWith(line, "#"); line = br.readLine()) {
            String[] parts = splitLine(line, "\t+");
            // create token
            Token token = aBuilder.add(parts[TOKEN_TEXT], Token.class);
            sentEnd = token.getEnd();
            aBuilder.add(" ");
            aIdMap.put(aSentenceId + ":" + id, token);
            // get/create parent (may be created here before its own line is seen)
            Constituent parent = constituents.get(parts[TOKEN_PARENT_ID]);
            if (parent == null) {
                parent = new Constituent(aJCas);
                parent.setBegin(Integer.MAX_VALUE);
                constituents.put(parts[TOKEN_PARENT_ID], parent);
            }
            // update begin/end markers of parent
            if (token.getBegin() < parent.getBegin()) {
                parent.setBegin(token.getBegin());
            }
            if (token.getEnd() > parent.getEnd()) {
                parent.setEnd(token.getEnd());
            }
            token.setParent(parent);
            addChild(relations, parent, token);
            // create pos (TOKEN_POS_TAG is -1 when the format has no POS column)
            if (posEnabled && (TOKEN_POS_TAG >= 0)) {
                Type posTag = posMappingProvider.getTagType(parts[TOKEN_POS_TAG]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                        token.getEnd());
                pos.setPosValue(parts[TOKEN_POS_TAG].intern());
                pos.setCoarseValue(pos.getClass().equals(POS.class) ? null
                        : pos.getType().getShortName().intern());
                pos.addToIndexes();
                token.setPos(pos);
            }
            // create lemma (TOKEN_LEMMA is -1 in format 3, which has no lemma column)
            if (lemmaEnabled && (TOKEN_LEMMA >= 0)) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(parts[TOKEN_LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            id++;
        }

        // handle constituent relations (lines from the first "#"-line up to #EOS)
        Constituent constituent;
        for (; startsNotWith(line, END_OF_SENTENCE); line = br.readLine()) {
            // Ignore trailing coreferential information in Tüba D/Z
            line = StringUtils.substringBefore(line, "%%").trim();
            // substring(1) to get rid of leading #
            String[] parts = splitLine(line.substring(1), "\t+");
            // get/create constituent, set type, function
            constituent = constituents.get(parts[CONSTITUENT_ID]);
            if (constituent == null) {
                constituent = new Constituent(aJCas);
                constituents.put(parts[CONSTITUENT_ID], constituent);
            }
            constituent.setConstituentType(parts[CONSTITUENT_TYPE]);
            constituent.setSyntacticFunction(parts[CONSTITUENT_FUNCTION]);
            // get/create parent
            // NOTE(review): the parent column of constituent lines happens to share the index
            // TOKEN_PARENT_ID in both supported formats.
            Constituent parent = constituents.get(parts[TOKEN_PARENT_ID]);
            if (parent == null) {
                parent = new Constituent(aJCas);
                parent.setBegin(Integer.MAX_VALUE);
                constituents.put(parts[TOKEN_PARENT_ID], parent);
            }
            // update begin/end markers of parent
            if (constituent.getBegin() < parent.getBegin()) {
                parent.setBegin(constituent.getBegin());
            }
            if (constituent.getEnd() > parent.getEnd()) {
                parent.setEnd(constituent.getEnd());
            }
            // set parent, add child
            constituent.setParent(parent);
            addChild(relations, parent, constituent);
        }

        // set all children at the end of the sentence
        setChildren(aJCas, relations);

        // Sanity check
        assert root.getBegin() == sentBegin;
        assert root.getEnd() == sentEnd;

        // set sentence annotation
        Sentence sentence = new Sentence(aJCas, root.getBegin(), root.getEnd());
        sentence.addToIndexes(aJCas);

        // add constituents at the end of the sentence
        for (Entry<String, Constituent> e : constituents.entrySet()) {
            e.getValue().addToIndexes(aJCas);
            aIdMap.put(aSentenceId + ":" + e.getKey(), e.getValue());
        }
    }

    /**
     * Skip a #BOT table the reader is not interested in, up to the matching #EOT line.
     */
    private void skipTable()
        throws IOException
    {
        String line = br.readLine();
        while (!startsWith(line, END_OF_TABLE)) {
            if (line == null) {
                throw new IOException("Unexpected end of file");
            }
            line = br.readLine();
        }
    }

    /**
     * Record child under parent in the relations map, creating the child list on first use.
     */
    private void addChild(Map<Constituent, List<Annotation>> relations, Constituent parent,
            Annotation child)
    {
        List<Annotation> children = relations.get(parent);
        if (children == null) {
            children = new ArrayList<Annotation>();
            relations.put(parent, children);
        }
        children.add(child);
    }

    /**
     * Materialize the collected child lists as FSArrays on the respective parent constituents.
     */
    private void setChildren(JCas jcas, Map<Constituent, List<Annotation>> relations)
    {
        for (Entry<Constituent, List<Annotation>> entry : relations.entrySet()) {
            Constituent parent = entry.getKey();
            List<Annotation> children = entry.getValue();

            FSArray fsa = new FSArray(jcas, children.size());
            for (int i = 0; i < children.size(); i++) {
                fsa.set(i, children.get(i));
            }
            parent.setChildren(fsa);
        }
    }

    /**
     * Split a data line on the given delimiter and verify the minimum field count.
     *
     * @throws IOException
     *             if the line has fewer than {@link #LINE_ARGUMENT_COUNT} fields
     */
    private String[] splitLine(String str, String delimiter)
        throws IOException
    {
        String[] parts = str.split(delimiter);
        if (parts.length < LINE_ARGUMENT_COUNT) {
            throw new IOException("Illegal file format: expected [" + LINE_ARGUMENT_COUNT
                    + "] fields, but found [" + parts.length + "] in [" + str + "]");
        }
        return parts;
    }

    /**
     * Null-safe negated startsWith: returns false for null (so loops over lines terminate at
     * EOF), otherwise true iff str does NOT start with pre.
     */
    private boolean startsNotWith(String str, String pre)
    {
        if (str == null) {
            return false;
        }
        else {
            return !startsWith(str, pre);
        }
    }
}