/* * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.dkpro.core.io.nif.internal; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import org.apache.commons.collections4.iterators.IteratorIterable; import org.apache.jena.rdf.model.Model; import org.apache.jena.rdf.model.Property; import org.apache.jena.rdf.model.Resource; import org.apache.jena.rdf.model.Statement; import org.apache.jena.vocabulary.RDF; import org.apache.uima.cas.Type; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; public class Nif2DKPro { private MappingProvider posMappingProvider; public void setPosMappingProvider(MappingProvider aPosMappingProvider) { posMappingProvider = aPosMappingProvider; } public void convert(Statement aContext, JCas aJCas) { Model m = aContext.getModel(); final Resource tSentence = m.createResource(NIF.TYPE_SENTENCE); final Resource tWord = m.createResource(NIF.TYPE_WORD); final Resource tTitle = m.createResource(NIF.TYPE_TITLE); final Resource tParagraph = m.createResource(NIF.TYPE_PARAGRAPH); final Property pReferenceContext = m.createProperty(NIF.PROP_REFERENCE_CONTEXT); final Property pIsString = m.createProperty(NIF.PROP_IS_STRING); final Property pBeginIndex = m.createProperty(NIF.PROP_BEGIN_INDEX); final Property pEndIndex = m.createProperty(NIF.PROP_END_INDEX); final Property pLemma = m.createProperty(NIF.PROP_LEMMA); final Property pStem = m.createProperty(NIF.PROP_STEM); final Property pPosTag = m.createProperty(NIF.PROP_POS_TAG); final Property pTaIdentRef = m.createProperty(ITS.PROP_TA_IDENT_REF); final Property pTaClassRef = m.createProperty(ITS.PROP_TA_CLASS_REF); // Convert context node -> document text String text = m .getProperty(aContext.getSubject(), pIsString) .getString(); aJCas.setDocumentText(text); // Convert headings/titles Iterator<Resource> headingIterator = m .listResourcesWithProperty(RDF.type, tTitle) .filterKeep(res -> res.getProperty( pReferenceContext).getResource().equals(aContext.getSubject())); for (Resource nifTitle : new IteratorIterable<Resource>(headingIterator)) { int begin = nifTitle.getProperty(pBeginIndex).getInt(); int end = nifTitle.getProperty(pEndIndex).getInt(); Heading uimaHeading = new Heading(aJCas, begin, end); uimaHeading.addToIndexes(); assert assertSanity(nifTitle, uimaHeading); } // Convert paragraphs Iterator<Resource> paragraphIterator = m .listResourcesWithProperty(RDF.type, tParagraph) .filterKeep(res -> res.getProperty( pReferenceContext).getResource().equals(aContext.getSubject())); for (Resource nifParagraph : new IteratorIterable<Resource>(paragraphIterator)) { int begin = nifParagraph.getProperty(pBeginIndex).getInt(); int end = nifParagraph.getProperty(pEndIndex).getInt(); Paragraph uimaParagraph = new Paragraph(aJCas, begin, end); uimaParagraph.addToIndexes(); assert assertSanity(nifParagraph, uimaParagraph); } // Convert sentences List<Resource> nifSentences = m .listResourcesWithProperty(RDF.type, tSentence) .filterKeep(res -> res.getProperty( pReferenceContext).getResource().equals(aContext.getSubject())) .toList(); nifSentences.sort((a, b) -> a.getProperty(pBeginIndex).getInt() - b.getProperty(pBeginIndex).getInt()); for (Resource nifSentence : nifSentences) { int begin = nifSentence.getProperty(pBeginIndex).getInt(); int end = nifSentence.getProperty(pEndIndex).getInt(); Sentence uimaSentence = new Sentence(aJCas, begin, end); uimaSentence.addToIndexes(); assert assertSanity(nifSentence, uimaSentence); } // Convert tokens Iterator<Resource> tokenIterator = m .listResourcesWithProperty(RDF.type, tWord) .filterKeep(res -> res.getProperty( pReferenceContext).getResource().equals(aContext.getSubject())); for (Resource nifWord : new IteratorIterable<Resource>(tokenIterator)) { int begin = nifWord.getProperty(pBeginIndex).getInt(); int end = nifWord.getProperty(pEndIndex).getInt(); Token uimaToken = new Token(aJCas, begin, end); uimaToken.addToIndexes(); assert assertSanity(nifWord, uimaToken); // Convert lemma if (nifWord.hasProperty(pLemma)) { Lemma uimaLemma = new Lemma(aJCas, uimaToken.getBegin(), uimaToken.getEnd()); uimaLemma.setValue(nifWord.getProperty(pLemma).getString()); uimaLemma.addToIndexes(); uimaToken.setLemma(uimaLemma); } // Convert stem if (nifWord.hasProperty(pLemma)) { Stem uimaStem = new Stem(aJCas, uimaToken.getBegin(), uimaToken.getEnd()); uimaStem.setValue(nifWord.getProperty(pStem).getString()); uimaStem.addToIndexes(); uimaToken.setStem(uimaStem); } // Convert posTag (this is discouraged, the better alternative should be oliaLink) if (nifWord.hasProperty(pPosTag)) { String tag = nifWord.getProperty(pStem).getString(); Type posTag = posMappingProvider.getTagType(tag); POS uimaPos = (POS) aJCas.getCas().createAnnotation(posTag, uimaToken.getBegin(), uimaToken.getEnd()); uimaPos.setPosValue(tag.intern()); uimaPos.setCoarseValue(uimaPos.getClass().equals(POS.class) ? null : uimaPos.getType().getShortName().intern()); uimaPos.addToIndexes(); uimaToken.setPos(uimaPos); } } // Convert named entities // // NIF uses taIdentRef to link to a unique instance of an entity and taClassRef to identify // the category of the entity. Named entity recognizers in DKPro Core just categorizes the // entity, e.g. as a person, location, or whatnot. For what NIF uses, we'd need a named // entity linker, not just a recognizer. Furthermore, the DKPro Core named entity // recognizers are not mapped to a common tag set (unlike e.g. POS which is mapped to // the universal POS tags). // // So, what we do here is treating the URI of the taClassRef in NIF simply as the // named entity category and store it. // // Here we use duck-typing, i.e. it has a taClassRef property then it is likely a named // entity. NIF 2.1 [1] appears to introduce a representation of named entities using the // class "EntityOccurrence", but e.g. kore50 [2] doesn't seem to use that - it uses "Phrase" // instead. // // [1] http://nif.readthedocs.io/en/2.1-rc/prov-and-conf.html // [2] https://datahub.io/dataset/kore-50-nif-ner-corpus Set<Resource> nifNamedEntities1 = m .listResourcesWithProperty(pTaIdentRef) .filterKeep(res -> res.getProperty( pReferenceContext).getResource().equals(aContext.getSubject())) .toSet(); Set<Resource> nifNamedEntities2 = m .listResourcesWithProperty(pTaIdentRef) .filterKeep(res -> res.getProperty( pReferenceContext).getResource().equals(aContext.getSubject())) .toSet(); Set<Resource> nifNamedEntities = new HashSet<Resource>(); nifNamedEntities.addAll(nifNamedEntities1); nifNamedEntities.addAll(nifNamedEntities2); for (Resource nifNamedEntity : nifNamedEntities) { int begin = nifNamedEntity.getProperty(pBeginIndex).getInt(); int end = nifNamedEntity.getProperty(pEndIndex).getInt(); NamedEntity uimaNamedEntity = new NamedEntity(aJCas, begin, end); if (nifNamedEntity.hasProperty(pTaClassRef)) { uimaNamedEntity .setValue(nifNamedEntity.getProperty(pTaClassRef).getResource().getURI()); } if (nifNamedEntity.hasProperty(pTaIdentRef)) { uimaNamedEntity.setIdentifier( nifNamedEntity.getProperty(pTaIdentRef).getResource().getURI()); } uimaNamedEntity.addToIndexes(); assert assertSanity(nifNamedEntity, uimaNamedEntity); } } private static boolean assertSanity(Resource aNif, Annotation aUima) { final Property pAnchorOf = aNif.getModel().createProperty(NIF.PROP_ANCHOR_OF); int docLength = aUima.getCAS().getDocumentText().length(); if (aNif.hasProperty(pAnchorOf)) { String nifText = aNif.getProperty(pAnchorOf).getString(); String uimaText = aUima.getCoveredText(); assert nifText.equals(uimaText); } assert aUima.getBegin() >= 0 && aUima.getBegin() <= docLength; assert aUima.getEnd() >= 0 && aUima.getEnd() <= docLength; return true; } }