/*
 * Copyright (C) 2010-2011, Sebastian Hellmann
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.nlp2rdf.implementation.snowball;

import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.ontology.Individual;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;
import org.nlp2rdf.core.*;
import org.nlp2rdf.core.urischemes.URIScheme;
import org.nlp2rdf.core.vocab.NIFDatatypeProperties;
import org.nlp2rdf.core.vocab.NIFOntClasses;
import org.nlp2rdf.implementation.stanfordcorenlp.StanfordWrapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.PorterStemmer;

import java.util.*;

/**
 * A wrapper for Tartarus' Snowball stemmer. The simple name of a class from
 * org.tartarus.snowball.ext (e.g. "PorterStemmer") can be given to initialize
 * the stemmer, see:
 * http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
 * <p/>
 * This decorator attaches the stem to each :Word it finds.
 * <p/>
 * User: Sebastian Hellmann -
 * http://bis.informatik.uni-leipzig.de/SebastianHellmann
 */
public class SnowballWrapper extends NIFWrapper {
    private static Logger log = LoggerFactory.getLogger(SnowballWrapper.class);
    private String stemmerClass = "PorterStemmer";

    /**
     * For the English PorterStemmer
     */
    public SnowballWrapper() {
        this("PorterStemmer");
    }

    /**
     * @param stemmerClass the simple name of a stemmer class from
     *                     http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
     */
    public SnowballWrapper(String stemmerClass) {
        this.stemmerClass = stemmerClass;
        // Disabled: dynamic loading of the configured stemmer class.
        // openNLPTokenizer = new OpenNLPTokenizer();
        // try {
        //     decoratee = (SnowballProgram) Class.forName(
        //             "org.tartarus.snowball.ext." + stemmerClass).newInstance();
        // } catch (Exception e) {
        //     String msg = "Correct class was not given, please use e.g. \"PorterStemmer\" from: "
        //             + "http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html\n"
        //             + "Received: " + stemmerClass
        //             + ", transformed to org.tartarus.snowball.ext." + stemmerClass;
        //     log.error(msg, e);
        //     throw new InvalidParameterException(msg);
        // }
    }
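    // A minimal sketch of how the disabled initialization above could be
    // revived. The helper name loadStemmer and the use of
    // IllegalArgumentException (instead of java.security.InvalidParameterException)
    // are assumptions of this sketch, not part of the original code; stemmer
    // classes are assumed to live in org.tartarus.snowball.ext, as the imports
    // of this file suggest (e.g. PorterStemmer).
    private static SnowballProgram loadStemmer(String stemmerClass) {
        try {
            // e.g. "PorterStemmer" -> org.tartarus.snowball.ext.PorterStemmer
            return (SnowballProgram) Class.forName(
                    "org.tartarus.snowball.ext." + stemmerClass).newInstance();
        } catch (Exception e) {
            throw new IllegalArgumentException(
                    "Unknown Snowball stemmer class: " + stemmerClass, e);
        }
    }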
    private final Set<String> stopWords = new HashSet<String>(Arrays.asList(
            "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
            "you", "your", "yours", "yourself", "yourselves",
            "he", "him", "his", "himself", "she", "her", "hers", "herself",
            "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
            "what", "which", "who", "whom", "this", "that", "these", "those",
            "am", "is", "are", "was", "were", "be", "been", "being",
            "have", "has", "had", "having", "do", "does", "did", "doing",
            "would", "should", "could", "ought",
            "i'm", "you're", "he's", "she's", "it's", "we're", "they're",
            "i've", "you've", "we've", "they've",
            "i'd", "you'd", "he'd", "she'd", "we'd", "they'd",
            "i'll", "you'll", "he'll", "she'll", "we'll", "they'll",
            "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't", "hadn't",
            "doesn't", "don't", "didn't", "won't", "wouldn't", "shan't", "shouldn't",
            "can't", "cannot", "couldn't", "mustn't",
            "let's", "that's", "who's", "what's", "here's", "there's",
            "when's", "where's", "why's", "how's",
            "a", "an", "the", "and", "but", "if", "or", "because", "as",
            "until", "while", "of", "at", "by", "for", "with", "about",
            "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in",
            "out", "on", "off", "over", "under", "again", "further", "then",
            "once", "here", "there", "when", "where", "why", "how", "all",
            "any", "both", "each", "few", "more", "most", "other", "some",
            "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very"));

    public void process(Individual context, OntModel inputModel,
            OntModel outputModel, NIFParameters nifParameters) {
        // Tokenize and sentence-split first, so the output model contains the
        // nif:Word individuals that are decorated with stems below.
        new StanfordWrapper().process(context, inputModel, outputModel, nifParameters);
        String contextString = context.getPropertyValue(
                NIFDatatypeProperties.isString.getDatatypeProperty(inputModel))
                .asLiteral().getString();
        int wordCount = 0;
        for (ExtendedIterator<Individual> it = outputModel.listIndividuals(
                NIFOntClasses.Word.getOntClass(outputModel)); it.hasNext(); ) {
            Individual word = it.next();
            /********************************
             * Stem
             ******/
            // EnglishStemmer stem = new EnglishStemmer();
            // NOTE: hard-wired to the English PorterStemmer; the configured
            // stemmerClass is not honored yet (see the constructor).
            int begin = word.getPropertyValue(
                    NIFDatatypeProperties.beginIndex.getDatatypeProperty(outputModel))
                    .asLiteral().getInt();
            int end = word.getPropertyValue(
                    NIFDatatypeProperties.endIndex.getDatatypeProperty(outputModel))
                    .asLiteral().getInt();
            String targetString = new Span(begin, end)
                    .getCoveredText(contextString).toString().toLowerCase();
            PorterStemmer stemmer = new PorterStemmer();
            stemmer.setCurrent(targetString);
            stemmer.stem();
            String stemmedWord = stemmer.getCurrent();
            word.addProperty(
                    NIFDatatypeProperties.stem.getDatatypeProperty(outputModel),
                    stemmedWord, XSDDatatype.XSDstring);
            wordCount++;
        }
        log.debug("Attached stems to {} word(s)", wordCount);
    }
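    // A self-contained sketch (not called by the code above) isolating the
    // per-token stemming step used in process(); the helper name stemToken is
    // an illustration, not part of the original class. With the English Porter
    // stemmer, e.g., "running" stems to "run".
    static String stemToken(String token) {
        PorterStemmer stemmer = new PorterStemmer();
        stemmer.setCurrent(token.toLowerCase());
        stemmer.stem();
        return stemmer.getCurrent();
    }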
    // Unused stub; the intended stemming and stop-word marking is preserved
    // in the comment inside the method body.
    public void processText2(String prefix, Individual context,
            URIScheme urischeme, OntModel model) {
        String contextString = context.getPropertyValue(
                NIFDatatypeProperties.isString.getDatatypeProperty(model))
                .asLiteral().getString();
        /**
         * model.listIndividuals(NIFOntClasses)
         * for (Word w : Word.list(model)) {
         *     try {
         *         w.addStem(stem(w.getAnchorOf()).toLowerCase());
         *         if (stopWords.contains(w.getAnchorOf())) {
         *             StopWord.create(w.getURI(), model);
         *         }
         *     } catch (Exception e) {
         *         log.warn("Stemming failed for " + w.getAnchorOf() + ", " + w.getURI(), e);
         *     }
         * }
         **/
    }

    // Unused until the dynamic initialization in the constructor is restored.
    public SnowballProgram decoratee;

    /**
     * public void processText(String prefix, URIGenerator urigenerator, String text,
     *         OntModel model) {
     *     TreeMap<Span, List<Span>> tokenizedText = openNLPTokenizer.tokenizeText(text);
     *     Text2RDF text2RDF = new Text2RDF();
     *     Individual context = text2RDF.createDocumentAnnotation(prefix, text, urigenerator, model);
     *     text2RDF.generateNIFModel(prefix, text, tokenizedText, urigenerator, context, model);
     *     processNIFModel(model);
     *     // add additional data
     *     new Text2RDF().addNextAndPreviousProperties(prefix, text, urigenerator, model);
     * }
     **/
}
/*
public void processText3(Individual context, OntModel inputModel,
        OntModel outputModel, NIFParameters nifParameters) {
    String contextString = context.getPropertyValue(
            NIFDatatypeProperties.isString.getDatatypeProperty(inputModel))
            .asLiteral().getString();
    String prefix = nifParameters.getPrefix();
    URIScheme urischeme = nifParameters.getUriScheme();

    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit"); // ner, dcoref
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(contextString);
    // run all Annotators on this text
    pipeline.annotate(document);

    // these are all the sentences in this document;
    // a CoreMap is essentially a Map that uses class objects as keys and
    // has values with custom types
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);

    // read all sentences and words into an intermediate structure
    // NOTE: this can be greatly optimized, of course;
    // for now it is just simple and cheap to implement it like this
    int wordCount = 0;
    TreeMap<Span, List<Span>> tokenizedText = new TreeMap<Span, List<Span>>();
    for (CoreMap sentence : sentences) {
        Span sentenceSpan = new Span(
                sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
        List<Span> wordSpans = new ArrayList<Span>();
        for (CoreLabel coreLabel : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            wordSpans.add(new Span(
                    coreLabel.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                    coreLabel.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)));
            wordCount++;
        }
        tokenizedText.put(sentenceSpan, wordSpans);
    }
    Text2RDF t = new Text2RDF();
    t.generateNIFModel(prefix, context, urischeme, inputModel, tokenizedText);
    outputModel.add(RLOGSLF4JBinding.log(nifParameters.getLogPrefix(),
            "Finished creating " + tokenizedText.size() + " sentence(s) with "
                    + wordCount + " word(s)", RLOGIndividuals.DEBUG,
            this.getClass().getCanonicalName(), null, null));

    // traversing the words in the current sentence;
    // a CoreLabel is a CoreMap with additional token-specific methods
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            Span wordSpan = new Span(
                    token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                    token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
            // the word should exist already
            Individual wordIndividual = outputModel.getIndividual(
                    urischeme.generate(prefix, contextString, wordSpan));
            if (wordIndividual == null) {
                log.error("SKIPPING: word was not found in the model: "
                        + urischeme.generate(prefix, contextString, wordSpan));
                continue;
            }

            ********************************
            * Stem
            ******
            // EnglishStemmer stem = new EnglishStemmer();
            String word = wordSpan.getCoveredText(contextString).toString().toLowerCase();
            PorterStemmer stem = new PorterStemmer();
            stem.setCurrent(word);
            stem.stem();
            String stemmedWord = stem.getCurrent();
            // attach a stem only for non-stop-words whose stem differs from the token
            if (!(stopWords.contains(word)))
                if (!(word.equals(stemmedWord)))
                    wordIndividual.addProperty(
                            NIFDatatypeProperties.stem.getDatatypeProperty(outputModel),
                            stemmedWord);
        }
    }
}
*/
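// A hedged usage sketch (hypothetical setup values; `context` would be a
// nif:Context individual whose nif:isString holds the text, and
// `nifParameters` would come from the surrounding NIF tooling):
//
//   OntModel input  = ModelFactory.createOntologyModel();
//   OntModel output = ModelFactory.createOntologyModel();
//   new SnowballWrapper().process(context, input, output, nifParameters);
//   // each nif:Word in `output` now carries a nif:stem literal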