/******************************************************************************/
/* Copyright (C) 2010-2011, Sebastian Hellmann                                */
/*                                                                            */
/* Licensed under the Apache License, Version 2.0 (the "License");            */
/* you may not use this file except in compliance with the License.           */
/* You may obtain a copy of the License at                                    */
/*                                                                            */
/*     http://www.apache.org/licenses/LICENSE-2.0                             */
/*                                                                            */
/* Unless required by applicable law or agreed to in writing, software        */
/* distributed under the License is distributed on an "AS IS" BASIS,          */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   */
/* See the License for the specific language governing permissions and        */
/* limitations under the License.                                             */
/******************************************************************************/

package org.nlp2rdf.implementation.opennlp;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.nlp2rdf.core.vocab.NIFDatatypeProperties;
import org.nlp2rdf.core.vocab.NIFObjectProperties;
import org.nlp2rdf.core.vocab.NIFOntClasses;

import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;

import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.ontology.OntModelSpec;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.vocabulary.RDF;

/**
 * Turns a stream of NIF document resources into OpenNLP {@link SentenceSample}s.
 *
 * <p>For each resource delivered by the wrapped stream, the sentences that
 * reference it as their context are located in the resource's model. Their
 * character offsets are parsed from the sentence URIs and combined with the
 * document text into one sample. When no sentence references the resource,
 * all sentence-typed resources of the model are concatenated instead (see
 * {@link #read()}).
 */
public class NIFSentenceSampleStream extends FilterObjectStream<Resource, SentenceSample> {

    /** URI fragment prefix of the form {@code #char=<start>,<end>}. */
    private static final String CHAR_FRAGMENT = "#char=";

    /** URI fragment prefix of the form {@code #offset_<start>_<end>}. */
    private static final String OFFSET_FRAGMENT = "#offset_";

    /**
     * Creates a sample stream on top of a stream of document resources.
     *
     * @param resources the resources to convert; each one is treated as a
     *                  document whose model also holds its sentences
     */
    public NIFSentenceSampleStream(ObjectStream<Resource> resources) {
        super(resources);
    }

    /**
     * Extracts the start and end index from the URI fragment of a sentence
     * resource.
     *
     * <p>Two fragment layouts are recognised, checked in this order:
     * {@code #char=<start>,<end>} (labelled "nif 2.0" by the original author)
     * and {@code #offset_<start>_<end>} (labelled "nif 2.1"). The last
     * occurrence of the marker in the URI is used; anything after the second
     * number is ignored.
     *
     * @param uri the sentence URI; may be {@code null} (e.g. for an anonymous
     *            resource)
     * @return a two-element array {@code {start, end}}, or {@code null} when
     *         the URI is {@code null}, carries neither marker, has fewer than
     *         two components, or the components are not integers
     */
    private int[] extractSentenceOffsets(String uri) {
        if (uri == null) {
            return null;
        }

        String[] indexes;
        int markerPos = uri.lastIndexOf(CHAR_FRAGMENT);
        if (markerPos >= 0) {
            // nif 2.0
            indexes = uri.substring(markerPos + CHAR_FRAGMENT.length()).split(",");
        } else {
            // nif 2.1
            markerPos = uri.lastIndexOf(OFFSET_FRAGMENT);
            if (markerPos < 0) {
                return null;
            }
            indexes = uri.substring(markerPos + OFFSET_FRAGMENT.length()).split("_");
        }

        if (indexes.length < 2) {
            return null;
        }

        try {
            return new int[] { Integer.parseInt(indexes[0]), Integer.parseInt(indexes[1]) };
        } catch (NumberFormatException notNumeric) {
            // A non-numeric fragment is reported like any other unparsable
            // URI; the caller decides how to surface it.
            return null;
        }
    }

    /**
     * Reads the next document resource and converts it into a sentence sample.
     *
     * <p>Sentences are the subjects that point to the document through the
     * {@code referenceContext} property and are typed as NIF sentences. Their
     * spans are ordered by start offset; if two sentences share a start
     * offset, the one encountered last wins.
     *
     * <p>If no subject references the document at all, the method falls back
     * to every sentence-typed resource in the model and concatenates their
     * text. That fallback has no defined order because there is no context to
     * take offsets from.
     *
     * @return the sample for the next document, or {@code null} when the
     *         wrapped stream is exhausted
     * @throws IOException if the wrapped stream fails, the document has no
     *                     {@code isString} value, or a sentence URI carries
     *                     no parsable character offsets
     */
    // NOTE(review): the original carried the remark "get properties to central
    // final class" here - presumably a TODO to hoist the property lookups below
    // into shared constants; confirm with the author.
    public SentenceSample read() throws IOException {
        Resource documentRes = samples.read();
        if (documentRes == null) {
            return null;
        }

        Model rdfModel = documentRes.getModel();
        Property docRefProp = rdfModel.createProperty(NIFObjectProperties.referenceContext.getUri());
        Property stringProp = rdfModel.createProperty(NIFDatatypeProperties.isString.getUri());
        // The sentence class is only ever used as the object of rdf:type, so a
        // plain resource is the fitting wrapper for its URI.
        Resource sentenceClass = rdfModel.createResource(NIFOntClasses.Sentence.getUri());

        List<Resource> sentenceResources =
                rdfModel.listSubjectsWithProperty(docRefProp, documentRes).toList();

        // fall back to getting single sentences
        // no order because no context
        if (sentenceResources.isEmpty()) {
            sentenceResources = rdfModel.listSubjectsWithProperty(RDF.type, sentenceClass).toList();
            return makeNoContextSample(sentenceResources, rdfModel);
        }

        Statement contentStatement = documentRes.getProperty(stringProp);
        if (contentStatement == null) {
            throw new IOException("Document resource " + documentRes
                    + " has no " + stringProp.getURI() + " value");
        }
        String documentContent = contentStatement.getString();

        // Keyed by start offset: keeps the spans sorted and lets a later
        // sentence with the same start replace an earlier one.
        Map<Integer, Span> spansByStart = new TreeMap<Integer, Span>();
        for (Resource sentenceRes : sentenceResources) {
            if (!sentenceRes.hasProperty(RDF.type, sentenceClass)) {
                continue;
            }
            int[] offsets = extractSentenceOffsets(sentenceRes.getURI());
            if (offsets == null) {
                throw new IOException("Cannot extract character offsets from sentence resource "
                        + sentenceRes);
            }
            spansByStart.put(Integer.valueOf(offsets[0]), new Span(offsets[0], offsets[1]));
        }

        List<Span> sentenceSpans = new ArrayList<Span>(spansByStart.values());
        return new SentenceSample(documentContent,
                sentenceSpans.toArray(new Span[sentenceSpans.size()]));
    }

    /**
     * Builds a sample from sentences that are not attached to a document.
     *
     * <p>The text of each sentence is taken from its {@code anchorOf} value
     * or, failing that, its {@code isString} value. The trimmed texts are
     * joined with a newline after each one, and every sentence gets a span
     * covering exactly its trimmed text. Resources with neither property are
     * skipped.
     *
     * @param sentenceResources the sentence resources, in the order they
     *                          should appear in the sample
     * @param model             the model used to look up the text properties
     * @return a sample over the concatenated text; empty text and no spans if
     *         no resource carried any text
     */
    private SentenceSample makeNoContextSample(List<Resource> sentenceResources, Model model) {
        Property anchorProp = model.createProperty(NIFDatatypeProperties.anchorOf.getUri());
        Property stringProp = model.createProperty(NIFDatatypeProperties.isString.getUri());

        List<Span> sentenceSpans = new ArrayList<Span>();
        StringBuilder text = new StringBuilder();

        for (Resource sentenceResource : sentenceResources) {
            String sentence = textOf(sentenceResource, anchorProp);
            if (sentence == null) {
                sentence = textOf(sentenceResource, stringProp);
            }
            if (sentence == null) {
                continue;
            }

            sentence = sentence.trim();
            // The next sentence starts where the text built so far ends.
            int start = text.length();
            text.append(sentence).append('\n');
            sentenceSpans.add(new Span(start, start + sentence.length()));
        }

        return new SentenceSample(text.toString(),
                sentenceSpans.toArray(new Span[sentenceSpans.size()]));
    }

    /**
     * Returns the text stored under a property of a resource.
     *
     * <p>For literal values the lexical form is returned, so a datatype or
     * language tag never leaks into the sentence text. Non-literal values
     * fall back to their string representation.
     *
     * @return the text, or {@code null} if the resource lacks the property
     */
    private static String textOf(Resource resource, Property property) {
        Statement statement = resource.getProperty(property);
        if (statement == null) {
            return null;
        }
        if (statement.getObject().isLiteral()) {
            return statement.getString();
        }
        return statement.getObject().toString();
    }
}