/*
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.tcf;

import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.JCasUtil.exists;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor;
import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamedWithReplaceableLayers;
import eu.clarin.weblicht.wlfxb.io.WLDObjector;
import eu.clarin.weblicht.wlfxb.io.WLFormatException;
import eu.clarin.weblicht.wlfxb.tc.api.DependencyParsingLayer;
import eu.clarin.weblicht.wlfxb.tc.api.LemmasLayer;
import eu.clarin.weblicht.wlfxb.tc.api.NamedEntitiesLayer;
import eu.clarin.weblicht.wlfxb.tc.api.PosTagsLayer;
import eu.clarin.weblicht.wlfxb.tc.api.Reference;
import eu.clarin.weblicht.wlfxb.tc.api.ReferencesLayer;
import eu.clarin.weblicht.wlfxb.tc.api.SentencesLayer;
import eu.clarin.weblicht.wlfxb.tc.api.TextCorpus;
import eu.clarin.weblicht.wlfxb.tc.api.TokensLayer;
import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusLayerTag;
import eu.clarin.weblicht.wlfxb.tc.xb.TextCorpusStored;
import eu.clarin.weblicht.wlfxb.xb.WLData;

/**
 * Writer for the WebLicht TCF format.
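 * <p>
 * A minimal usage sketch (an illustration only: it assumes the writer is wired up via the
 * standard uimaFIT factory and that {@code PARAM_TARGET_LOCATION} is inherited from
 * {@link JCasFileWriter_ImplBase}):
 * </p>
 *
 * <pre>{@code
 * // Sketch: create a writer description that writes TCF files to a target folder
 * AnalysisEngineDescription writer = AnalysisEngineFactory.createEngineDescription(
 *         TcfWriter.class,
 *         TcfWriter.PARAM_TARGET_LOCATION, "target/tcf",
 *         TcfWriter.PARAM_MERGE, false);
 * }</pre>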
 */
@MimeTypeCapability({ MimeTypes.TEXT_TCF })
@TypeCapability(
        inputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
                "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity",
                "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
                "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain",
                "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink",
                "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" })
public class TcfWriter
    extends JCasFileWriter_ImplBase
{
    private static final String REL_TYPE_EXPLETIVE = "expletive";

    /**
     * Specify the suffix of output files. Default value <code>.tcf</code>. If the suffix is not
     * needed, provide an empty string as value.
     */
    public static final String PARAM_FILENAME_EXTENSION =
            ComponentParameters.PARAM_FILENAME_EXTENSION;
    @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".tcf")
    private String filenameSuffix;

    /**
     * If there are no annotations for a particular layer in the CAS, preserve any potentially
     * existing annotations in the original TCF.<br>
     * Default: {@code false}
     */
    public static final String PARAM_PRESERVE_IF_EMPTY = "preserveIfEmpty";
    @ConfigurationParameter(name = PARAM_PRESERVE_IF_EMPTY, mandatory = true, defaultValue = "false")
    private boolean preserveIfEmpty;

    /**
     * Merge with the source TCF file if one is available.<br>
     * Default: {@code true}
     */
    public static final String PARAM_MERGE = "merge";
    @ConfigurationParameter(name = PARAM_MERGE, mandatory = true, defaultValue = "true")
    private boolean merge;

    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        // #670 - TcfWriter currently cannot properly write to ZIP files because of the
        // trial-and-error approach that we take when trying to merge with an existing file. In
        // particular, if the attempt fails and we go on without merging, we cannot delete the
        // broken entry from the ZIP file.
        if (StringUtils.startsWith(getTargetLocation(), JAR_PREFIX)) {
            throw new ResourceInitializationException(new IllegalStateException(
                    "TcfWriter cannot write to ZIP files."));
        }
    }

    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        InputStream docIS = null;
        try {
            boolean writeWithoutMerging = true;
            if (merge) {
                NamedOutputStream docOS = null;
                try {
                    docOS = getOutputStream(aJCas, filenameSuffix);

                    // Get the original TCF file and preserve it
                    DocumentMetaData documentMetadata = DocumentMetaData.get(aJCas);
                    URL filePathUrl = new URL(documentMetadata.getDocumentUri());
                    try {
                        docIS = filePathUrl.openStream();

                        try {
                            getLogger().debug(
                                    "Merging with [" + documentMetadata.getDocumentUri() + "]");
                            casToTcfWriter(docIS, aJCas, docOS);
                            writeWithoutMerging = false;
                        }
                        // See https://github.com/weblicht/wlfxb/issues/7
                        // catch (WLFormatException ex) {
                        //     getLogger().debug("No source file to merge with: "
                        //             + ex.getMessage());
                        // }
                        // Workaround: catch all exceptions
                        catch (Exception ex) {
                            getLogger().debug("Source file is not TCF: " + ex.getMessage());
                        }
                    }
                    catch (IOException e) {
                        getLogger().debug(
                                "Cannot open source file to merge with: " + e.getMessage());
                    }
                }
                finally {
                    if (writeWithoutMerging) {
                        // Have to delete the output file from this attempt; we will try again
                        // without merging. Deleting is necessary so as not to trigger the
                        // overwrite safeguard in JCasFileWriter_ImplBase.
                        if ((docOS != null) && (docOS.getName() != null)) {
                            FileUtils.deleteQuietly(new File(docOS.getName()));
                        }
                    }
                    closeQuietly(docOS);
                }
            }
            else {
                getLogger().debug("Merging disabled");
            }

            // If merging failed or is disabled, go on without merging
            if (writeWithoutMerging) {
                OutputStream docOS = null;
                try {
                    docOS = getOutputStream(aJCas, filenameSuffix);
                    casToTcfWriter(aJCas, docOS);
                }
                finally {
                    closeQuietly(docOS);
                }
            }
        }
        catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }
        finally {
            closeQuietly(docIS);
        }
    }
    /**
     * Create a TCF file from scratch.
     *
     * @param aJCas
     *            the JCas.
     * @param aOs
     *            the output stream.
     * @throws WLFormatException
     *             if a TCF problem occurs.
     */
    public void casToTcfWriter(JCas aJCas, OutputStream aOs)
        throws WLFormatException
    {
        // Create the TextCorpus object, specifying its language from the aJCas object
        TextCorpusStored textCorpus = new TextCorpusStored(aJCas.getDocumentLanguage());

        // Create the text annotation layer and add the document text to it
        textCorpus.createTextLayer().addText(aJCas.getDocumentText());

        write(aJCas, textCorpus);

        // Write the annotated data object into the output stream
        WLData wldata = new WLData(textCorpus);
        WLDObjector.write(wldata, aOs);
    }

    /**
     * Merge annotations from the CAS into an existing TCF file.
     *
     * @param aIs
     *            the TCF file with existing annotation layers
     * @param aJCas
     *            an annotated CAS object
     * @param aOs
     *            the output stream.
     * @throws WLFormatException
     *             if a TCF problem occurs.
     */
    public void casToTcfWriter(InputStream aIs, JCas aJCas, OutputStream aOs)
        throws WLFormatException
    {
        // If these layers are present in the TCF file, we use them from there, otherwise we
        // generate them
        EnumSet<TextCorpusLayerTag> layersToRead = EnumSet.of(
                TextCorpusLayerTag.TOKENS,
                TextCorpusLayerTag.SENTENCES);

        // If we have annotations for these layers in the CAS, we rewrite those layers
        List<TextCorpusLayerTag> layersToReplace = new ArrayList<>();
        if (exists(aJCas, POS.class) || !preserveIfEmpty) {
            layersToReplace.add(TextCorpusLayerTag.POSTAGS);
        }
        if (exists(aJCas, Lemma.class) || !preserveIfEmpty) {
            layersToReplace.add(TextCorpusLayerTag.LEMMAS);
        }
        if (exists(aJCas, NamedEntity.class) || !preserveIfEmpty) {
            layersToReplace.add(TextCorpusLayerTag.NAMED_ENTITIES);
        }
        if (exists(aJCas, Dependency.class) || !preserveIfEmpty) {
            layersToReplace.add(TextCorpusLayerTag.PARSING_DEPENDENCY);
        }
        if (exists(aJCas, CoreferenceChain.class) || !preserveIfEmpty) {
            layersToReplace.add(TextCorpusLayerTag.REFERENCES);
        }

        TextCorpusStreamedWithReplaceableLayers textCorpus = null;
        try {
            textCorpus = new TextCorpusStreamedWithReplaceableLayers(
                    aIs, layersToRead, EnumSet.copyOf(layersToReplace), aOs);
            write(aJCas, textCorpus);
        }
        finally {
            if (textCorpus != null) {
                try {
                    textCorpus.close();
                }
                catch (IOException e) {
                    // Ignore exception while closing
                }
            }
        }
    }
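    /**
     * Write all layers supported by this writer from the CAS to the given corpus. Tokens must be
     * written first: all other layers reference TCF tokens by looking them up in the
     * begin-position map that {@code writeTokens} returns.
     */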
    private void write(JCas aJCas, TextCorpus aTextCorpus)
    {
        Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> tokensBeginPositionMap;
        tokensBeginPositionMap = writeTokens(aJCas, aTextCorpus);
        writeSentence(aJCas, aTextCorpus, tokensBeginPositionMap);
        writePosTags(aJCas, aTextCorpus, tokensBeginPositionMap);
        writeLemmas(aJCas, aTextCorpus, tokensBeginPositionMap);
        writeDependency(aJCas, aTextCorpus, tokensBeginPositionMap);
        writeNamedEntity(aJCas, aTextCorpus, tokensBeginPositionMap);
        writeCoreference(aJCas, aTextCorpus, tokensBeginPositionMap);
    }

    private Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> writeTokens(JCas aJCas,
            TextCorpus aTextCorpus)
    {
        boolean tokensLayerCreated = false;

        // Create the tokens layer if it does not exist
        TokensLayer tokensLayer = aTextCorpus.getTokensLayer();
        if (tokensLayer == null) {
            tokensLayer = aTextCorpus.createTokensLayer();
            tokensLayerCreated = true;
            getLogger().debug("Layer [" + TextCorpusLayerTag.TOKENS.getXmlName() + "]: created");
        }
        else {
            getLogger().debug("Layer [" + TextCorpusLayerTag.TOKENS.getXmlName() + "]: found");
        }

        Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> tokensBeginPositionMap =
                new HashMap<>();

        int j = 0;
        for (Token token : select(aJCas, Token.class)) {
            if (tokensLayerCreated) {
                tokensLayer.addToken(token.getCoveredText());
            }

            tokensBeginPositionMap.put(token.getBegin(), tokensLayer.getToken(j));
            j++;
        }

        return tokensBeginPositionMap;
    }

    private void writePosTags(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap)
    {
        if (!JCasUtil.exists(aJCas, POS.class)) {
            // Do nothing if there are no part-of-speech tags in the CAS
            getLogger().debug("Layer [" + TextCorpusLayerTag.POSTAGS.getXmlName() + "]: empty");
            return;
        }

        // Tokens layer must already exist
        TokensLayer tokensLayer = aTextCorpus.getTokensLayer();

        // Create the POS tag annotation layer; the tagset name defaults to "STTS" unless a
        // TagsetDescription for the POS layer is present in the CAS
        String posTagSet = "STTS";
        for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) {
            if (tagSet.getLayer().equals(POS.class.getName())) {
                posTagSet = tagSet.getName();
                break;
            }
        }

        PosTagsLayer posLayer = aTextCorpus.createPosTagsLayer(posTagSet);

        getLogger().debug("Layer [" + TextCorpusLayerTag.POSTAGS.getXmlName() + "]: created");

        int j = 0;
        for (Token coveredToken : select(aJCas, Token.class)) {
            POS pos = coveredToken.getPos();

            if (pos != null && posLayer != null) {
                String posValue = coveredToken.getPos().getPosValue();
                posLayer.addTag(posValue, tokensLayer.getToken(j));
            }

            j++;
        }
    }
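    /**
     * Write the lemma layer, adding one lemma per token and skipping tokens that carry no lemma
     * annotation.
     */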
    private void writeLemmas(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap)
    {
        if (!JCasUtil.exists(aJCas, Lemma.class)) {
            // Do nothing if there are no lemmas in the CAS
            getLogger().debug("Layer [" + TextCorpusLayerTag.LEMMAS.getXmlName() + "]: empty");
            return;
        }

        // Tokens layer must already exist
        TokensLayer tokensLayer = aTextCorpus.getTokensLayer();

        // Create the lemma annotation layer
        LemmasLayer lemmasLayer = aTextCorpus.createLemmasLayer();

        getLogger().debug("Layer [" + TextCorpusLayerTag.LEMMAS.getXmlName() + "]: created");

        int j = 0;
        for (Token coveredToken : select(aJCas, Token.class)) {
            Lemma lemma = coveredToken.getLemma();
            if (lemma != null && lemmasLayer != null) {
                String lemmaValue = coveredToken.getLemma().getValue();
                lemmasLayer.addLemma(lemmaValue, tokensLayer.getToken(j));
            }
            j++;
        }
    }

    private void writeSentence(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap)
    {
        // If the sentence layer does not exist yet, add one (sentences are required for BRAT)
        SentencesLayer sentencesLayer = aTextCorpus.getSentencesLayer();
        if (sentencesLayer != null) {
            getLogger().debug("Layer [" + TextCorpusLayerTag.SENTENCES.getXmlName() + "]: found");
            return;
        }

        sentencesLayer = aTextCorpus.createSentencesLayer();

        getLogger().debug("Layer [" + TextCorpusLayerTag.SENTENCES.getXmlName() + "]: created");

        for (Sentence sentence : select(aJCas, Sentence.class)) {
            List<eu.clarin.weblicht.wlfxb.tc.api.Token> tokens = new ArrayList<>();
            for (Token token : selectCovered(Token.class, sentence)) {
                tokens.add(aTokensBeginPositionMap.get(token.getBegin()));
            }
            sentencesLayer.addSentence(tokens);
        }
    }

    private void writeDependency(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap)
    {
        if (!JCasUtil.exists(aJCas, Dependency.class)) {
            // Do nothing if there are no dependencies in the CAS
            getLogger().debug("Layer [" + TextCorpusLayerTag.PARSING_DEPENDENCY.getXmlName()
                    + "]: empty");
            return;
        }

        DependencyParsingLayer dependencyParsingLayer = null;
        String tagSetName = "tiger";
        for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) {
            if (tagSet.getLayer().equals(Dependency.class.getName())) {
                tagSetName = tagSet.getName();
                break;
            }
        }

        // The layer is flagged as containing non-basic dependencies if any dependency in the CAS
        // has a flavor other than BASIC
        Optional<Dependency> hasNonBasic = select(aJCas, Dependency.class).stream()
                .filter(dep -> dep.getFlavor() != null
                        && !DependencyFlavor.BASIC.equals(dep.getFlavor()))
                .findAny();

        dependencyParsingLayer = aTextCorpus.createDependencyParsingLayer(tagSetName,
                hasNonBasic.isPresent(), true);

        getLogger().debug("Layer [" + TextCorpusLayerTag.PARSING_DEPENDENCY.getXmlName()
                + "]: created");

        for (Sentence s : select(aJCas, Sentence.class)) {
            List<eu.clarin.weblicht.wlfxb.tc.api.Dependency> deps = new ArrayList<>();
            for (Dependency d : selectCovered(Dependency.class, s)) {
                eu.clarin.weblicht.wlfxb.tc.api.Dependency dependency = dependencyParsingLayer
                        .createDependency(d.getDependencyType(),
                                aTokensBeginPositionMap.get(d.getDependent().getBegin()),
                                aTokensBeginPositionMap.get(d.getGovernor().getBegin()));

                deps.add(dependency);
            }
            if (!deps.isEmpty()) {
                dependencyParsingLayer.addParse(deps);
            }
        }
    }
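    /**
     * Write the named entity layer. Each entity is mapped to the TCF tokens covered by its span;
     * the tagset name defaults to "BART" unless a TagsetDescription for the NamedEntity layer is
     * present in the CAS.
     */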
    private void writeNamedEntity(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap)
    {
        if (!JCasUtil.exists(aJCas, NamedEntity.class)) {
            // Do nothing if there are no named entities in the CAS
            getLogger().debug("Layer [" + TextCorpusLayerTag.NAMED_ENTITIES.getXmlName()
                    + "]: empty");
            return;
        }

        String tagSetName = "BART";
        for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) {
            if (tagSet.getLayer().equals(NamedEntity.class.getName())) {
                tagSetName = tagSet.getName();
                break;
            }
        }

        NamedEntitiesLayer namedEntitiesLayer = aTextCorpus.createNamedEntitiesLayer(tagSetName);

        getLogger().debug("Layer [" + TextCorpusLayerTag.NAMED_ENTITIES.getXmlName()
                + "]: created");

        for (NamedEntity namedEntity : select(aJCas, NamedEntity.class)) {
            List<Token> tokensInCas = selectCovered(aJCas, Token.class, namedEntity.getBegin(),
                    namedEntity.getEnd());
            List<eu.clarin.weblicht.wlfxb.tc.api.Token> tokensInTcf = new ArrayList<>();

            for (Token token : tokensInCas) {
                tokensInTcf.add(aTokensBeginPositionMap.get(token.getBegin()));
            }

            namedEntitiesLayer.addEntity(namedEntity.getValue(), tokensInTcf);
        }
    }

    private void writeCoreference(JCas aJCas, TextCorpus aTextCorpus,
            Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap)
    {
        if (!JCasUtil.exists(aJCas, CoreferenceChain.class)) {
            // Do nothing if there are no coreference chains in the CAS
            getLogger().debug("Layer [" + TextCorpusLayerTag.REFERENCES.getXmlName()
                    + "]: empty");
            return;
        }

        String tagSetName = "TueBaDz";
        for (TagsetDescription tagSet : select(aJCas, TagsetDescription.class)) {
            if (tagSet.getLayer().equals(CoreferenceLink.class.getName())) {
                tagSetName = tagSet.getName();
                break;
            }
        }

        ReferencesLayer coreferencesLayer = aTextCorpus.createReferencesLayer(null, tagSetName,
                null);

        getLogger().debug("Layer [" + TextCorpusLayerTag.REFERENCES.getXmlName() + "]: created");

        for (CoreferenceChain chain : select(aJCas, CoreferenceChain.class)) {
            CoreferenceLink prevLink = null;
            Reference prevRef = null;

            List<Reference> refs = new ArrayList<>();
            for (CoreferenceLink link : chain.links()) {
                // Get covered tokens
                List<eu.clarin.weblicht.wlfxb.tc.api.Token> tokens = new ArrayList<>();
                for (Token token : selectCovered(Token.class, link)) {
                    tokens.add(aTokensBeginPositionMap.get(token.getBegin()));
                }

                // Create the current reference
                Reference ref = coreferencesLayer.createReference(link.getReferenceType(),
                        tokens, null);

                // Special handling for expletive relations
                if (REL_TYPE_EXPLETIVE.equals(link.getReferenceRelation())) {
                    coreferencesLayer.addRelation(ref, REL_TYPE_EXPLETIVE);
                    // An expletive relation must not have a following element in the chain, so
                    // we skip the rest of the loop body and create no relation to a successor.
                    continue;
                }

                // Create the relation between the previous and the current reference
                if (prevLink != null) {
                    coreferencesLayer.addRelation(prevRef, prevLink.getReferenceRelation(), ref);
                }

                prevLink = link;
                prevRef = ref;
                refs.add(ref);
            }
            coreferencesLayer.addReferent(refs);
        }
    }
}