/******************************************************************************/
/*  Copyright (C) 2010-2011, Sebastian Hellmann                               */
/*                                                                            */
/*  Licensed under the Apache License, Version 2.0 (the "License");           */
/*  you may not use this file except in compliance with the License.          */
/*  You may obtain a copy of the License at                                   */
/*                                                                            */
/*      http://www.apache.org/licenses/LICENSE-2.0                            */
/*                                                                            */
/*  Unless required by applicable law or agreed to in writing, software       */
/*  distributed under the License is distributed on an "AS IS" BASIS,         */
/*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
/*  See the License for the specific language governing permissions and       */
/*  limitations under the License.                                            */
/******************************************************************************/
package org.nlp2rdf.implementation.opennlp;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.nlp2rdf.core.vocab.NIFDatatypeProperties;
import org.nlp2rdf.core.vocab.NIFObjectProperties;
import org.nlp2rdf.core.vocab.NIFOntClasses;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.vocabulary.RDF;

import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;

/**
 * Adapts a stream of NIF document resources (Jena {@link Resource}s) into
 * OpenNLP {@link TokenSample}s, so NIF-annotated corpora can be used to train
 * or evaluate an OpenNLP tokenizer.
 *
 * <p>Word boundaries are recovered from the URI fragments of the word
 * resources; both the NIF 2.0 ({@code #char=start,end}) and the NIF 2.1
 * ({@code #offset_start_end}) fragment schemes are supported.
 */
public class NIFTokenSampleStream extends FilterObjectStream<Resource, TokenSample> {

    public NIFTokenSampleStream(ObjectStream<Resource> resources) {
        super(resources);
    }

    /**
     * Extracts the begin/end character offsets encoded in the URI fragment of
     * a word resource.
     *
     * @param uri the word resource URI
     * @return a two-element array {@code {begin, end}}, or {@code null} if the
     *         fragment is absent, incomplete, or not numeric
     */
    private int[] extractWordOffsets(String uri) {
        String[] indexes;
        if (uri.contains("#char=")) {
            // NIF 2.0 scheme: ...#char=start,end
            String fragment = uri.substring(uri.lastIndexOf("#char=") + "#char=".length());
            if (fragment.isEmpty()) {
                return null;
            }
            indexes = fragment.split(",");
        } else if (uri.contains("#offset_")) {
            // NIF 2.1 scheme: ...#offset_start_end
            String fragment = uri.substring(uri.lastIndexOf("#offset_") + "#offset_".length());
            if (fragment.isEmpty()) {
                return null;
            }
            indexes = fragment.split("_");
        } else {
            // Unknown fragment scheme. The original code fell through with a
            // null-filled String[2] and crashed on Integer.valueOf(null);
            // fail gracefully instead so read() can skip the resource.
            return null;
        }
        if (indexes.length < 2) {
            return null;
        }
        try {
            return new int[] { Integer.parseInt(indexes[0]), Integer.parseInt(indexes[1]) };
        } catch (NumberFormatException e) {
            return null; // fragment present but not numeric
        }
    }

    /**
     * Reads the next document resource from the underlying stream and converts
     * it to a {@link TokenSample}.
     *
     * @return the next sample, or {@code null} at end of stream
     * @throws IOException if the underlying resource stream fails
     */
    public TokenSample read() throws IOException {
        Resource documentRes = samples.read();
        if (documentRes == null) {
            return null; // underlying stream exhausted
        }
        Model rdfModel = documentRes.getModel();
        Property docRefProp =
                rdfModel.createProperty(NIFObjectProperties.referenceContext.getUri());
        Property stringProp =
                rdfModel.createProperty(NIFDatatypeProperties.isString.getUri());
        Property wordProp = rdfModel.createProperty(NIFOntClasses.Word.getUri());

        // All resources anchored in this document via nif:referenceContext.
        List<Resource> wordResources =
                rdfModel.listSubjectsWithProperty(docRefProp, documentRes).toList();
        // Fall back to getting single sentences: no context, hence no
        // guaranteed order from the reference-context lookup.
        if (wordResources.isEmpty()) {
            wordResources = rdfModel.listSubjectsWithProperty(RDF.type, wordProp).toList();
        }

        String documentContent = documentRes.getProperty(stringProp).getString();

        // Collect one Span per word, keyed by start offset; a TreeMap keeps
        // the spans in document order without a separate sort pass, and the
        // offsets are parsed only once per word (was twice before).
        Map<Integer, Span> spansByStart = new TreeMap<Integer, Span>();
        for (Resource wordRes : wordResources) {
            if (!wordRes.hasProperty(RDF.type, wordProp)) {
                continue;
            }
            int[] borders = extractWordOffsets(wordRes.getURI());
            if (borders == null) {
                continue; // URI carries no parsable offsets; previously an NPE
            }
            spansByStart.put(borders[0], new Span(borders[0], borders[1]));
        }
        List<Span> wordSpans = new ArrayList<Span>(spansByStart.values());
        return new TokenSample(documentContent, wordSpans.toArray(new Span[wordSpans.size()]));
    }
}