/*
 * Copyright (C) 2010-2011, Sebastian Hellmann
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.nlp2rdf.implementation.snowball;

import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.ontology.Individual;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;
import org.nlp2rdf.core.*;
import org.nlp2rdf.core.urischemes.URIScheme;
import org.nlp2rdf.core.vocab.NIFDatatypeProperties;
import org.nlp2rdf.core.vocab.NIFOntClasses;
import org.nlp2rdf.implementation.stanfordcorenlp.StanfordWrapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.PorterStemmer;

import java.util.*;

/**
 * A wrapper for Tartarus' Snowball stemmer. The simple name of a class from
 * org.tartarus.snowball.ext (e.g. "PorterStemmer") can be given to initialize
 * the stemmer, see:
 * http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
 * <p/>
 * This decorator attaches the stem to each :Word it finds.
 * <p/>
 * User: Sebastian Hellmann -
 * http://bis.informatik.uni-leipzig.de/SebastianHellmann
 */
public class SnowballWrapper extends NIFWrapper {
    private static Logger log = LoggerFactory.getLogger(SnowballWrapper.class);
    private String stemmerClass = "PorterStemmer";

    /**
     * For the English PorterStemmer
     */
    public SnowballWrapper() {
        this("PorterStemmer");
    }

    /**
     * @param stemmerClass the simple name of a stemmer class from
     *                     http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
     */
    public SnowballWrapper(String stemmerClass) {
        this.stemmerClass = stemmerClass;
        // Disabled: dynamic loading of the configured stemmer class.
        // openNLPTokenizer = new OpenNLPTokenizer();
        // try {
        //     decoratee = (SnowballProgram) Class.forName(
        //             "org.tartarus.snowball.ext." + stemmerClass).newInstance();
        // } catch (Exception e) {
        //     String msg = "Correct class was not given, please use e.g. \"PorterStemmer\" from: "
        //             + "http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html\n"
        //             + "Received: " + stemmerClass
        //             + ", transformed to org.tartarus.snowball.ext." + stemmerClass;
        //     log.error(msg, e);
        //     throw new InvalidParameterException(msg);
        // }
    }
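    // A minimal sketch of how the disabled initialization above could be
    // revived. The helper name loadStemmer and the use of
    // IllegalArgumentException (instead of java.security.InvalidParameterException)
    // are assumptions of this sketch, not part of the original code; stemmer
    // classes are assumed to live in org.tartarus.snowball.ext, as the imports
    // of this file suggest (e.g. PorterStemmer).
    private static SnowballProgram loadStemmer(String stemmerClass) {
        try {
            // e.g. "PorterStemmer" -> org.tartarus.snowball.ext.PorterStemmer
            return (SnowballProgram) Class.forName(
                    "org.tartarus.snowball.ext." + stemmerClass).newInstance();
        } catch (Exception e) {
            throw new IllegalArgumentException(
                    "Unknown Snowball stemmer class: " + stemmerClass, e);
        }
    }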
    private final Set<String> stopWords = new HashSet<String>(Arrays.asList(
            "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
            "you", "your", "yours", "yourself", "yourselves",
            "he", "him", "his", "himself", "she", "her", "hers", "herself",
            "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
            "what", "which", "who", "whom", "this", "that", "these", "those",
            "am", "is", "are", "was", "were", "be", "been", "being",
            "have", "has", "had", "having", "do", "does", "did", "doing",
            "would", "should", "could", "ought",
            "i'm", "you're", "he's", "she's", "it's", "we're", "they're",
            "i've", "you've", "we've", "they've",
            "i'd", "you'd", "he'd", "she'd", "we'd", "they'd",
            "i'll", "you'll", "he'll", "she'll", "we'll", "they'll",
            "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't", "hadn't",
            "doesn't", "don't", "didn't", "won't", "wouldn't", "shan't", "shouldn't",
            "can't", "cannot", "couldn't", "mustn't",
            "let's", "that's", "who's", "what's", "here's", "there's",
            "when's", "where's", "why's", "how's",
            "a", "an", "the", "and", "but", "if", "or", "because", "as",
            "until", "while", "of", "at", "by", "for", "with", "about",
            "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in",
            "out", "on", "off", "over", "under", "again", "further", "then",
            "once", "here", "there", "when", "where", "why", "how", "all",
            "any", "both", "each", "few", "more", "most", "other", "some",
            "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very"));

    public void process(Individual context, OntModel inputModel,
            OntModel outputModel, NIFParameters nifParameters) {
        // Tokenize and sentence-split first, so the output model contains the
        // nif:Word individuals that are decorated with stems below.
        new StanfordWrapper().process(context, inputModel, outputModel, nifParameters);
        String contextString = context.getPropertyValue(
                NIFDatatypeProperties.isString.getDatatypeProperty(inputModel))
                .asLiteral().getString();
        int wordCount = 0;
        for (ExtendedIterator<Individual> it = outputModel.listIndividuals(
                NIFOntClasses.Word.getOntClass(outputModel)); it.hasNext(); ) {
            Individual word = it.next();
            /********************************
             * Stem
             ******/
            // EnglishStemmer stem = new EnglishStemmer();
            // NOTE: hard-wired to the English PorterStemmer; the configured
            // stemmerClass is not honored yet (see the constructor).
            int begin = word.getPropertyValue(
                    NIFDatatypeProperties.beginIndex.getDatatypeProperty(outputModel))
                    .asLiteral().getInt();
            int end = word.getPropertyValue(
                    NIFDatatypeProperties.endIndex.getDatatypeProperty(outputModel))
                    .asLiteral().getInt();
            String targetString = new Span(begin, end)
                    .getCoveredText(contextString).toString().toLowerCase();
            PorterStemmer stemmer = new PorterStemmer();
            stemmer.setCurrent(targetString);
            stemmer.stem();
            String stemmedWord = stemmer.getCurrent();
            word.addProperty(
                    NIFDatatypeProperties.stem.getDatatypeProperty(outputModel),
                    stemmedWord, XSDDatatype.XSDstring);
            wordCount++;
        }
        log.debug("Attached stems to {} word(s)", wordCount);
    }
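    // A self-contained sketch (not called by the code above) isolating the
    // per-token stemming step used in process(); the helper name stemToken is
    // an illustration, not part of the original class. With the English Porter
    // stemmer, e.g., "running" stems to "run".
    static String stemToken(String token) {
        PorterStemmer stemmer = new PorterStemmer();
        stemmer.setCurrent(token.toLowerCase());
        stemmer.stem();
        return stemmer.getCurrent();
    }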
    // Unused stub; the intended stemming and stop-word marking is preserved
    // in the comment inside the method body.
    public void processText2(String prefix, Individual context,
            URIScheme urischeme, OntModel model) {
        String contextString = context.getPropertyValue(
                NIFDatatypeProperties.isString.getDatatypeProperty(model))
                .asLiteral().getString();
        /**
         * model.listIndividuals(NIFOntClasses)
         * for (Word w : Word.list(model)) {
         *     try {
         *         w.addStem(stem(w.getAnchorOf()).toLowerCase());
         *         if (stopWords.contains(w.getAnchorOf())) {
         *             StopWord.create(w.getURI(), model);
         *         }
         *     } catch (Exception e) {
         *         log.warn("Stemming failed for " + w.getAnchorOf() + ", " + w.getURI(), e);
         *     }
         * }
         **/
    }

    // Unused until the dynamic initialization in the constructor is restored.
    public SnowballProgram decoratee;

    /**
     * public void processText(String prefix, URIGenerator urigenerator, String text,
     *         OntModel model) {
     *     TreeMap<Span, List<Span>> tokenizedText = openNLPTokenizer.tokenizeText(text);
     *     Text2RDF text2RDF = new Text2RDF();
     *     Individual context = text2RDF.createDocumentAnnotation(prefix, text, urigenerator, model);
     *     text2RDF.generateNIFModel(prefix, text, tokenizedText, urigenerator, context, model);
     *     processNIFModel(model);
     *     // add additional data
     *     new Text2RDF().addNextAndPreviousProperties(prefix, text, urigenerator, model);
     * }
     **/
}
/*
public void processText3(Individual context, OntModel inputModel,
        OntModel outputModel, NIFParameters nifParameters) {
    String contextString = context.getPropertyValue(
            NIFDatatypeProperties.isString.getDatatypeProperty(inputModel))
            .asLiteral().getString();
    String prefix = nifParameters.getPrefix();
    URIScheme urischeme = nifParameters.getUriScheme();

    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit"); // ner, dcoref
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(contextString);
    // run all Annotators on this text
    pipeline.annotate(document);

    // these are all the sentences in this document;
    // a CoreMap is essentially a Map that uses class objects as keys and
    // has values with custom types
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);

    // read all sentences and words into an intermediate structure
    // NOTE: this can be greatly optimized, of course;
    // for now it is just simple and cheap to implement it like this
    int wordCount = 0;
    TreeMap<Span, List<Span>> tokenizedText = new TreeMap<Span, List<Span>>();
    for (CoreMap sentence : sentences) {
        Span sentenceSpan = new Span(
                sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
        List<Span> wordSpans = new ArrayList<Span>();
        for (CoreLabel coreLabel : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            wordSpans.add(new Span(
                    coreLabel.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                    coreLabel.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)));
            wordCount++;
        }
        tokenizedText.put(sentenceSpan, wordSpans);
    }
    Text2RDF t = new Text2RDF();
    t.generateNIFModel(prefix, context, urischeme, inputModel, tokenizedText);
    outputModel.add(RLOGSLF4JBinding.log(nifParameters.getLogPrefix(),
            "Finished creating " + tokenizedText.size() + " sentence(s) with "
                    + wordCount + " word(s)", RLOGIndividuals.DEBUG,
            this.getClass().getCanonicalName(), null, null));

    // traversing the words in the current sentence;
    // a CoreLabel is a CoreMap with additional token-specific methods
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            Span wordSpan = new Span(
                    token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                    token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
            // the word should exist already
            Individual wordIndividual = outputModel.getIndividual(
                    urischeme.generate(prefix, contextString, wordSpan));
            if (wordIndividual == null) {
                log.error("SKIPPING: word was not found in the model: "
                        + urischeme.generate(prefix, contextString, wordSpan));
                continue;
            }

            ********************************
            * Stem
            ******
            // EnglishStemmer stem = new EnglishStemmer();
            String word = wordSpan.getCoveredText(contextString).toString().toLowerCase();
            PorterStemmer stem = new PorterStemmer();
            stem.setCurrent(word);
            stem.stem();
            String stemmedWord = stem.getCurrent();
            // attach a stem only for non-stop-words whose stem differs from the token
            if (!(stopWords.contains(word)))
                if (!(word.equals(stemmedWord)))
                    wordIndividual.addProperty(
                            NIFDatatypeProperties.stem.getDatatypeProperty(outputModel),
                            stemmedWord);
        }
    }
}
*/
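// A hedged usage sketch (hypothetical setup values; `context` would be a
// nif:Context individual whose nif:isString holds the text, and
// `nifParameters` would come from the surrounding NIF tooling):
//
//   OntModel input  = ModelFactory.createOntologyModel();
//   OntModel output = ModelFactory.createOntologyModel();
//   new SnowballWrapper().process(context, input, output, nifParameters);
//   // each nif:Word in `output` now carries a nif:stem literal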