/******************************************************************************/
/* Copyright (C) 2010-2011, Sebastian Hellmann */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
/* You may obtain a copy of the License at */
/* */
/* http://www.apache.org/licenses/LICENSE-2.0 */
/* */
/* Unless required by applicable law or agreed to in writing, software */
/* distributed under the License is distributed on an "AS IS" BASIS, */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
/* See the License for the specific language governing permissions and */
/* limitations under the License. */
/******************************************************************************/
package org.nlp2rdf.implementation.opennlp;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import org.nlp2rdf.core.NIFParameters;
import org.nlp2rdf.core.NIFWrapper;
import org.nlp2rdf.core.Text2RDF;
import org.nlp2rdf.core.urischemes.URIScheme;
import org.nlp2rdf.core.vocab.NIFAnnotationProperties;
import org.nlp2rdf.core.vocab.NIFDatatypeProperties;
import org.nlp2rdf.core.vocab.NIFObjectProperties;
import org.nlp2rdf.core.vocab.NIFOntClasses;
import org.nlp2rdf.vm.olia.models.OliaInterface;
import org.nlp2rdf.vm.olia.models.Penn;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.hp.hpl.jena.ontology.Individual;
import com.hp.hpl.jena.ontology.OntClass;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.vocabulary.OWL;
/**
 * Annotates the text of a NIF context with sentence, word and part-of-speech
 * information produced by OpenNLP models.
 *
 * <p>The models are read from the folder given by the {@code modelFolder}
 * option and are expected to be named {@code <lang>-sent.bin},
 * {@code <lang>-token.bin} and {@code <lang>-pos.bin}. POS tags are mapped to
 * OLiA individuals and categories through an {@link OliaInterface} tagset that
 * is looked up by simple class name in {@code org.nlp2rdf.vm.olia.models}.
 */
public class OpenNLPWrapper {

    private static final Logger log = LoggerFactory.getLogger(OpenNLPWrapper.class);

    /** Package prefix prepended to the value of the "tagset" option. */
    private static final String TAGSET_PACKAGE = "org.nlp2rdf.vm.olia.models.";

    /** Tagset used when the "tagset" option is absent. */
    private static final String DEFAULT_TAGSET = "Penn";

    /** Language code used when the "language" option is absent. */
    private static final String DEFAULT_LANGUAGE = "en";

    private final OntModel nifModel;
    private final Individual context;
    private final String documentText;
    private final String prefix;
    private final URIScheme uriScheme;

    /** Folder containing the OpenNLP model files; {@code null} if none was configured. */
    private final File modelFolder;
    private final String lang;

    /** Mapping from POS tags to OLiA resources; {@code null} if the tagset could not be loaded. */
    private final OliaInterface tagset;

    /** POS tags for which a "missing" warning was already logged, to warn only once per tag. */
    private final Set<String> unknownTags = new HashSet<String>();

    Text2RDF text2RDF = new Text2RDF();

    /**
     * Creates a wrapper for the given context and reads its configuration from
     * the options in {@code parameters}.
     *
     * <p>Recognised options: {@code modelFolder} (without it
     * {@link #processText} does nothing), {@code language} (defaults to
     * {@code "en"}) and {@code tagset} (defaults to {@code "Penn"}).
     *
     * @param context    the NIF context individual; its {@code isString}
     *                   property supplies the document text
     * @param parameters prefix, URI scheme and command line options
     * @param model      the model that receives all generated annotations
     */
    public OpenNLPWrapper(Individual context, NIFParameters parameters, OntModel model) {
        this.nifModel = model;
        this.context = context;
        this.documentText = context.getProperty(NIFDatatypeProperties.isString.getDatatypeProperty(nifModel)).getString();
        this.prefix = parameters.getPrefix();
        this.uriScheme = parameters.getUriScheme();

        if (parameters.getOptions().has("modelFolder")) {
            this.modelFolder = new File(parameters.getOptions().valueOf("modelFolder").toString());
        } else {
            log.error("No model specified, please specify via -modelFolder");
            this.modelFolder = null;
        }

        if (parameters.getOptions().has("language")) {
            this.lang = parameters.getOptions().valueOf("language").toString();
        } else {
            log.warn("No language specified, defaulting to english");
            this.lang = DEFAULT_LANGUAGE;
        }

        String tagsetName;
        if (parameters.getOptions().has("tagset")) {
            tagsetName = parameters.getOptions().valueOf("tagset").toString();
        } else {
            log.warn("No tagset specified, defaulting to Penn");
            tagsetName = DEFAULT_TAGSET;
        }
        this.tagset = loadTagset(tagsetName);
    }

    /**
     * Instantiates the tagset class with the given simple name from
     * {@link #TAGSET_PACKAGE} via its no-argument constructor.
     *
     * @param tagsetName simple class name of the tagset
     * @return the tagset instance, or {@code null} if the class is missing, is
     *         not an {@link OliaInterface} or cannot be instantiated (the
     *         failure is logged)
     */
    private static OliaInterface loadTagset(String tagsetName) {
        String className = TAGSET_PACKAGE + tagsetName;
        try {
            Class<?> tagsetClass = OpenNLPWrapper.class.getClassLoader().loadClass(className);
            // Check the type before instantiating so that a wrong option value
            // neither runs an unrelated constructor nor ends in a ClassCastException.
            if (!OliaInterface.class.isAssignableFrom(tagsetClass)) {
                log.error("Tagset class " + className + " is not an OliaInterface, POS tags will not be linked");
                return null;
            }
            return (OliaInterface) tagsetClass.newInstance();
        } catch (ClassNotFoundException e) {
            log.error("Tagset class not found: " + className + ", POS tags will not be linked", e);
        } catch (InstantiationException e) {
            log.error("Could not instantiate tagset class " + className + ", POS tags will not be linked", e);
        } catch (IllegalAccessException e) {
            log.error("Could not access tagset class " + className + ", POS tags will not be linked", e);
        }
        return null;
    }

    /**
     * @return the model passed to the constructor, which holds all annotations
     *         added so far
     */
    public OntModel getNifModel() {
        return nifModel;
    }

    /**
     * Runs sentence detection, tokenization and POS tagging on the document
     * text and adds the results to the NIF model.
     *
     * <p>Does nothing if no model folder was configured. If the sentence model
     * cannot be loaded nothing is added; if the tokenizer or POS model cannot
     * be loaded only the sentences are added. Failures are logged, not thrown.
     *
     * @param context       not used; the context given to the constructor is annotated
     * @param nifParameters not used; the parameters given to the constructor apply
     */
    public void processText(Individual context, NIFParameters nifParameters) {
        if (modelFolder == null) {
            return;
        }

        // sentence detection
        Span[] sentences = sentDetect(documentText);
        if (sentences == null) {
            log.error("Sentence detection failed, skipping tokenization and POS tagging");
            return;
        }
        List<Individual> sentenceResources = addSentences(sentences);

        // tokenizing & pos tagging
        Tokenizer tokenizer;
        POSTaggerME tagger;
        try {
            tokenizer = new TokenizerME(loadTokenizerModel());
            tagger = new POSTaggerME(loadPosModel());
        } catch (FileNotFoundException fnf) {
            log.error("Model file not found "+ fnf.getMessage());
            return;
        } catch (IOException ioe) {
            log.error("Could not load tokenizer or POS model from " + modelFolder, ioe);
            return;
        }

        for (Individual sentence : sentenceResources) {
            String sentString = sentence.getProperty(NIFDatatypeProperties.anchorOf.getDatatypeProperty(nifModel)).getString();
            Span[] tokenSpans = tokenizer.tokenizePos(sentString);
            String[] tags = tagger.tag(getStringsForSpans(tokenSpans, sentString));
            // Token offsets are relative to the sentence but have to be relative
            // to the document, so they are shifted by the sentence's start offset.
            int sentStart = sentence.getProperty(NIFDatatypeProperties.beginIndex.getDatatypeProperty(nifModel)).getInt();
            List<Individual> sentenceWords = addWords(sentence, tokenSpans, sentStart);
            for (int i = 0; i < sentenceWords.size(); i++) {
                addPos(sentenceWords.get(i), tags[i]);
            }
        }
    }

    /**
     * Resolves a model file inside the model folder.
     *
     * @param suffix file name part after the language code, e.g. {@code "-sent.bin"}
     * @return the file {@code <modelFolder>/<lang><suffix>}
     */
    private File modelFile(String suffix) {
        return new File(modelFolder, lang + suffix);
    }

    /** Reads the tokenizer model {@code <lang>-token.bin} and closes the file afterwards. */
    private TokenizerModel loadTokenizerModel() throws IOException {
        InputStream modelIn = new FileInputStream(modelFile("-token.bin"));
        try {
            return new TokenizerModel(modelIn);
        } finally {
            closeQuietly(modelIn);
        }
    }

    /** Reads the POS model {@code <lang>-pos.bin} and closes the file afterwards. */
    private POSModel loadPosModel() throws IOException {
        InputStream modelIn = new FileInputStream(modelFile("-pos.bin"));
        try {
            return new POSModel(modelIn);
        } finally {
            closeQuietly(modelIn);
        }
    }

    /** Closes the stream; a failure to close is only logged because the model was already read. */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException e) {
            log.debug("Could not close model stream", e);
        }
    }

    /**
     * Extracts the text covered by each span.
     *
     * @param spans spans with offsets into {@code text}
     * @param text  the text the spans refer to
     * @return the covered strings, in the order of {@code spans}
     */
    private String[] getStringsForSpans(Span[] spans, String text) {
        String[] strings = new String[spans.length];
        for (int i = 0; i < spans.length; i++) {
            strings[i] = spans[i].getCoveredText(text).toString();
        }
        return strings;
    }

    /**
     * Detects sentence boundaries with the model {@code <lang>-sent.bin}.
     *
     * @param text the text to split
     * @return the sentence spans with offsets into {@code text}, or
     *         {@code null} if no model folder is configured or the model could
     *         not be read (the failure is logged)
     */
    public Span[] sentDetect(String text) {
        if (modelFolder == null) {
            log.error("No model specified, please specify via -modelFolder");
            return null;
        }
        InputStream modelIn = null;
        try {
            // Loading sentence detection model
            modelIn = new FileInputStream(modelFile("-sent.bin"));
            SentenceModel sentenceModel = new SentenceModel(modelIn);
            SentenceDetector sentenceDetector = new SentenceDetectorME(sentenceModel);
            return sentenceDetector.sentPosDetect(text);
        } catch (FileNotFoundException fnf) {
            log.error("Model file not found "+ fnf.getMessage());
        } catch (IOException ioe) {
            log.error("Could not load sentence model from " + modelFolder, ioe);
        } finally {
            closeQuietly(modelIn);
        }
        return null;
    }

    /**
     * Adds one Sentence individual per span to the NIF model.
     *
     * @param sentences spans with offsets into the document text; {@code null}
     *                  is treated as empty
     * @return the created individuals, in span order
     */
    public List<Individual> addSentences(Span[] sentences) {
        return addSpans(sentences, documentText, NIFOntClasses.Sentence.getOntClass(nifModel), 0);
    }

    /**
     * Adds one Word individual per span and links each word and the sentence
     * to each other via the {@code sentence} and {@code word} properties.
     *
     * @param sentence the sentence individual the words belong to
     * @param words    spans with offsets relative to the sentence
     * @param offset   start offset of the sentence within the document text
     * @return the created word individuals, in span order
     */
    public List<Individual> addWords(Individual sentence, Span[] words, int offset) {
        List<Individual> wordResources = addSpans(words, documentText, NIFOntClasses.Word.getOntClass(nifModel), offset);
        for (Individual wordRes : wordResources) {
            wordRes.addProperty(NIFObjectProperties.sentence.getObjectProperty(nifModel), sentence);
            sentence.addProperty(NIFObjectProperties.word.getObjectProperty(nifModel), wordRes);
        }
        return wordResources;
    }

    /**
     * Links a word to the OLiA individuals and categories registered for its
     * POS tag. Tags without a mapping are reported once per tag.
     *
     * <p>Note from the original author: the tagset depends on the training
     * corpus of the POS model, e.g. Tiger for German models.
     *
     * @param wordResource the word individual to annotate
     * @param posTag       the tag assigned by the POS tagger
     */
    private void addPos(Individual wordResource, String posTag) {
        if (tagset == null) {
            // Loading failed and was already reported by the constructor.
            return;
        }
        List<String> oliaIndividual = (List<String>) tagset.getTags().get(posTag);
        if (oliaIndividual == null) {
            warnOnce("missing tag in olia model: ", posTag);
            return;
        }
        for (String s : oliaIndividual) {
            wordResource.addProperty(NIFObjectProperties.oliaLink.getObjectProperty(nifModel), nifModel.createIndividual(s, OWL.Thing));
            List<String> taglinks = (List<String>) tagset.getLinks().get(s);
            if (taglinks == null) {
                warnOnce("missing oliaLinks for ", posTag);
                continue;
            }
            for (String oc : taglinks) {
                wordResource.addProperty(NIFAnnotationProperties.oliaCategory.getAnnotationProperty(nifModel), nifModel.createClass(oc));
            }
        }
    }

    /** Logs {@code message + posTag} as a warning unless a warning for that tag was logged before. */
    private void warnOnce(String message, String posTag) {
        if (unknownTags.add(posTag)) {
            log.warn(message + posTag);
        }
    }

    /**
     * Same as {@link #addSpans(Span[], String, OntClass, int)} with an offset of 0.
     */
    public List<Individual> addSpans(Span[] spans, String text, OntClass spanClass) {
        return addSpans(spans, text, spanClass, 0);
    }

    /**
     * Creates one individual per span, types it with {@code spanClass}, links
     * it to the context via {@code referenceContext} and stores the covered
     * text as {@code anchorOf}.
     *
     * @param spans     spans to add; {@code null} is treated as empty
     * @param text      text from which the anchor strings are cut; the shifted
     *                  offsets must lie within it
     * @param spanClass ontology class assigned to every created individual
     * @param offset    value added to the start and end of every span
     * @return the created individuals, in span order
     */
    public List<Individual> addSpans(Span[] spans, String text, OntClass spanClass, int offset) {
        List<Individual> resources = new ArrayList<Individual>();
        if (spans == null) {
            return resources;
        }
        for (Span span : spans) {
            int start = span.getStart() + offset;
            int end = span.getEnd() + offset;
            org.nlp2rdf.core.Span nifSpan = new org.nlp2rdf.core.Span(start, end);
            Individual spanResource = text2RDF.createCStringIndividual(prefix, context, nifSpan, uriScheme, nifModel);
            spanResource.addOntClass(spanClass);
            spanResource.addProperty(NIFObjectProperties.referenceContext.getObjectProperty(nifModel), context);
            spanResource.addLiteral(NIFDatatypeProperties.anchorOf.getDatatypeProperty(nifModel), nifModel.createLiteral(text.substring(start, end)));
            resources.add(spanResource);
        }
        return resources;
    }
}