/* * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.dkpro.core.io.nif.internal; import static org.apache.jena.datatypes.xsd.XSDDatatype.XSDnonNegativeInteger; import static org.apache.jena.datatypes.xsd.XSDDatatype.XSDstring; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import org.apache.commons.lang3.StringUtils; import org.apache.jena.ontology.Individual; import org.apache.jena.ontology.OntModel; import org.apache.jena.rdf.model.Property; import org.apache.jena.rdf.model.Resource; import org.apache.uima.jcas.JCas; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; public class DKPro2Nif { public static void convert(JCas aJCas, OntModel aTarget) { // Shorten down variable name for model OntModel m = aTarget; // Set up query instances final Resource tContext = m.createResource(NIF.TYPE_CONTEXT); final Resource tSentence = m.createResource(NIF.TYPE_SENTENCE); final Resource tWord = m.createResource(NIF.TYPE_WORD); final Resource tTitle = m.createResource(NIF.TYPE_TITLE); final Resource tParagraph = m.createResource(NIF.TYPE_PARAGRAPH); final Resource tEntityOccurrence = m.createResource(NIF.TYPE_ENTITY_OCCURRENCE); final Property pReferenceContext = m.createProperty(NIF.PROP_REFERENCE_CONTEXT); final Property pIsString = m.createProperty(NIF.PROP_IS_STRING); final Property pAnchorOf = m.createProperty(NIF.PROP_ANCHOR_OF); final Property pBeginIndex = m.createProperty(NIF.PROP_BEGIN_INDEX); final Property pEndIndex = m.createProperty(NIF.PROP_END_INDEX); final Property pStem = m.createProperty(NIF.PROP_STEM); final Property pLemma = m.createProperty(NIF.PROP_LEMMA); final Property pPosTag = m.createProperty(NIF.PROP_POS_TAG); final Property pWord = m.createProperty(NIF.PROP_WORD); final Property pNextWord = m.createProperty(NIF.PROP_NEXT_WORD); final Property pPreviousWord = m.createProperty(NIF.PROP_PREVIOUS_WORD); final Property pSentence = m.createProperty(NIF.PROP_SENTENCE); final Property pNextSentence = m.createProperty(NIF.PROP_NEXT_SENTENCE); final Property pPreviousSentence = m.createProperty(NIF.PROP_PREVIOUS_SENTENCE); final Property pTaIdentRef = m.createProperty(ITS.PROP_TA_IDENT_REF); final Property pTaClassRef = m.createProperty(ITS.PROP_TA_CLASS_REF); // Get a URI for the document DocumentMetaData dmd = DocumentMetaData.get(aJCas); String docuri = dmd.getDocumentUri() != null ? dmd.getDocumentUri() : "urn:" + dmd.getDocumentId(); // Convert document -> context node Individual context; { String uri = String.format("%s#offset_%d_%d", docuri, 0, aJCas.getDocumentText().length()); context = m.createIndividual(uri, tContext); context.addLiteral(pIsString, m.createTypedLiteral(aJCas.getDocumentText(), XSDstring)); context.addLiteral(pBeginIndex, m.createTypedLiteral(0, XSDnonNegativeInteger)); context.addLiteral(pEndIndex, m.createTypedLiteral(aJCas.getDocumentText().length(), XSDnonNegativeInteger)); } // Convert headings/titles for (Heading uimaHeading : select(aJCas, Heading.class)) { String headingUri = String.format("%s#offset_%d_%d", docuri, uimaHeading.getBegin(), uimaHeading.getEnd()); Individual nifTitle = m.createIndividual(headingUri, tTitle); nifTitle.addProperty(pReferenceContext, context); nifTitle.addLiteral(pAnchorOf, uimaHeading.getCoveredText()); nifTitle.addLiteral(pBeginIndex, m.createTypedLiteral(uimaHeading.getBegin(), XSDnonNegativeInteger)); nifTitle.addLiteral(pEndIndex, m.createTypedLiteral(uimaHeading.getEnd(), XSDnonNegativeInteger)); } // Convert paragraphs for (Paragraph uimaParagraph : select(aJCas, Paragraph.class)) { String paragraphUri = String.format("%s#offset_%d_%d", docuri, uimaParagraph.getBegin(), uimaParagraph.getEnd()); Individual nifParagraph = m.createIndividual(paragraphUri, tParagraph); nifParagraph.addProperty(pReferenceContext, context); nifParagraph.addLiteral(pAnchorOf, uimaParagraph.getCoveredText()); nifParagraph.addLiteral(pBeginIndex, m.createTypedLiteral(uimaParagraph.getBegin(), XSDnonNegativeInteger)); nifParagraph.addLiteral(pEndIndex, m.createTypedLiteral(uimaParagraph.getEnd(), XSDnonNegativeInteger)); } // Convert sentences Individual previousNifSentence = null; for (Sentence uimaSentence : select(aJCas, Sentence.class)) { String sentenceUri = String.format("%s#offset_%d_%d", docuri, uimaSentence.getBegin(), uimaSentence.getEnd()); Individual nifSentence = m.createIndividual(sentenceUri, tSentence); nifSentence.addProperty(pReferenceContext, context); nifSentence.addLiteral(pAnchorOf, uimaSentence.getCoveredText()); nifSentence.addLiteral(pBeginIndex, m.createTypedLiteral(uimaSentence.getBegin(), XSDnonNegativeInteger)); nifSentence.addLiteral(pEndIndex, m.createTypedLiteral(uimaSentence.getEnd(), XSDnonNegativeInteger)); // Link word sequence if (previousNifSentence != null) { previousNifSentence.addProperty(pNextSentence, nifSentence); nifSentence.addProperty(pPreviousSentence, previousNifSentence); } previousNifSentence = nifSentence; // Convert tokens Individual previousNifWord = null; for (Token uimaToken : selectCovered(Token.class, uimaSentence)) { String wordUri = String.format("%s#offset_%d_%d", docuri, uimaToken.getBegin(), uimaToken.getEnd()); Individual nifWord = m.createIndividual(wordUri, tWord); nifWord.addProperty(pReferenceContext, context); nifWord.addLiteral(pAnchorOf, uimaToken.getCoveredText()); nifWord.addLiteral(pBeginIndex, m.createTypedLiteral(uimaToken.getBegin(), XSDnonNegativeInteger)); nifWord.addLiteral(pEndIndex, m.createTypedLiteral(uimaToken.getEnd(), XSDnonNegativeInteger)); // Link sentence <-> word nifWord.addProperty(pSentence, nifSentence); nifSentence.addProperty(pWord, nifWord); // Link word sequence if (previousNifWord != null) { previousNifWord.addProperty(pNextWord, nifWord); nifWord.addProperty(pPreviousWord, previousNifWord); } previousNifWord = nifWord; // Convert stem if (uimaToken.getStemValue() != null) { nifWord.addProperty(pStem, uimaToken.getStemValue()); } // Convert lemma if (uimaToken.getLemmaValue() != null) { nifWord.addProperty(pLemma, uimaToken.getLemmaValue()); } // Convert posTag (this is discouraged, the better alternative should be oliaLink) if (uimaToken.getPosValue() != null) { nifWord.addProperty(pPosTag, uimaToken.getPosValue()); } } } // Convert named entities // // Actually, the named entity in NIF is different from the one in DKPro Core. NIF uses // taIdentRef to link to a unique instance of an entity. Named entity recognizers in DKPro // Core just categorizes the entity, e.g. as a person, location, or whatnot. For what NIF // uses, we'd need a named entity linker, not just a recognizer. // // We create NEs using the NIF 2.1 class "EntityOccurence". // // So here, we check if the DKPro Core NE value/identifier looks like a URI and if yes, then // we store it into the NIF taIdentRef property - otherwise we ignore it because NIF does // not have the concept of a NE category. for (NamedEntity uimaNamedEntity : select(aJCas, NamedEntity.class)) { String neClass = uimaNamedEntity.getValue(); String neIdentifier = uimaNamedEntity.getValue(); boolean neClassIsUri = StringUtils.startsWith(neClass, "http://"); boolean neIdentifierIsUri = StringUtils.startsWith(neIdentifier, "http://"); // The crudest form of checking for a URI, but since "http://" appears to be the default // prefix in the semantic web, let's just stick with it for the moment. if (!neClassIsUri && !neIdentifierIsUri) { continue; } String neUri = String.format("%s#offset_%d_%d", docuri, uimaNamedEntity.getBegin(), uimaNamedEntity.getEnd()); Individual nifNamedEntity = m.createIndividual(neUri, tEntityOccurrence); nifNamedEntity.addProperty(pReferenceContext, context); nifNamedEntity.addLiteral(pAnchorOf, uimaNamedEntity.getCoveredText()); nifNamedEntity.addLiteral(pBeginIndex, m.createTypedLiteral(uimaNamedEntity.getBegin(), XSDnonNegativeInteger)); nifNamedEntity.addLiteral(pEndIndex, m.createTypedLiteral(uimaNamedEntity.getEnd(), XSDnonNegativeInteger)); if (neClassIsUri) { nifNamedEntity.addProperty(pTaClassRef, m.createResource(neClass)); } if (neIdentifierIsUri) { nifNamedEntity.addProperty(pTaClassRef, m.createResource(neIdentifier)); } } } }