StanfordTokenizer.java example

Explorer

KOSHIK-master
- src
  - main
    - java
      - org
        apache
        avro
        mapreduce
        TextStats.java
        mahout
        text
        wikipedia
        XmlInputFormat.java
      - se
        lth
        cs
        koshik
        analysis
        ContentProcessor.java
        is2
        IS2Lemmatizer.java
        IS2POSTagger.java
        IS2SyntacticDependencyParser.java
        lth
        LTHSemanticRoleLabeler.java
        malt
        MaltParserProcessor.java
        opennlp
        SentenceDetectorProcessor.java
        stagger
        StaggerProcessor.java
        stanford
        StanfordTokenizer.java
        wikipedia
        TextConverter.java
        example
        AvroStats.java
        DocumentToText.java
        ExtractWikiSectionsLinks.java
        ReadDocument.java
        input
        TextFileImportMapper.java
        conll
        CoNLL2006FileImportMapper.java
        CoNLL2006Reader.java
        CoNLL2009FileImportMapper.java
        CoNLL2009Reader.java
        CoNLLFeature.java
        CoNLLReader.java
        wikipedia
        WikipediaImportMapper.java
        language
        EnglishWikipediaPage.java
        SwedishWikipediaPage.java
        WikipediaPage.java
        WikipediaPageFactory.java
        io
        hadoop
        WholeTextFileInputFormat.java
        model
        Annotation.java
        Document.java
        avro
        AvroAnnotation.java
        AvroDocument.java
        text
        RootToken.java
        Sentence.java
        Span.java
        Token.java
        wikipedia
        InternalLink.java
        Section.java
        util
        EnglishPipeline.java
        Import.java
        SwedishPipeline.java
  - src
    - main
      - java
        org
        apache
        avro
        mapreduce
        TextStats.java
        mahout
        text
        wikipedia
        XmlInputFormat.java
        se
        lth
        cs
        koshik
        analysis
        ContentProcessor.java
        is2
        IS2Lemmatizer.java
        IS2POSTagger.java
        IS2SyntacticDependencyParser.java
        lth
        CharacterMapper.java
        LTHSemanticRoleLabeler.java
        LTHSimpleChineseLemmatizer.java
        LTHStanfordChineseSegmenterWrapper.java
        SimpleSentenceDetector.java
        malt
        MaltParserProcessor.java
        opennlp
        SentenceDetectorProcessor.java
        stagger
        StaggerProcessor.java
        stanford
        StanfordTokenizer.java
        wikipedia
        TextConverter.java
        example
        AvroStats.java
        DocumentToCoNLL2009.java
        ExtractWikiSectionsLinks.java
        ReadDocument.java
        input
        TextFileImportMapper.java
        conll
        CoNLL2006FileImportMapper.java
        CoNLL2006Reader.java
        CoNLL2009FileImportMapper.java
        CoNLL2009Reader.java
        CoNLLFeature.java
        CoNLLReader.java
        wikipedia
        WikipediaImportMapper.java
        language
        ChineseWikipediaPage.java
        EnglishWikipediaPage.java
        SwedishWikipediaPage.java
        WikipediaPage.java
        WikipediaPageFactory.java
        io
        hadoop
        WholeTextFileInputFormat.java
        model
        Annotation.java
        Document.java
        avro
        AvroAnnotation.java
        AvroDocument.java
        text
        RootToken.java
        Sentence.java
        Span.java
        Token.java
        wikipedia
        InternalLink.java
        Section.java
        util
        ChinesePipeline.java
        EnglishPipeline.java
        Import.java
        SwedishPipeline.java

/**
 * KOSHIK is an NLP framework for large scale processing using Hadoop. 
 * Copyright © 2014 Peter Exner
 * 
 * This file is part of KOSHIK.
 *
 * KOSHIK is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * KOSHIK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with KOSHIK.  If not, see <http://www.gnu.org/licenses/>.
 */

package se.lth.cs.koshik.analysis.stanford;

import se.lth.cs.srl.preprocessor.tokenization.StanfordPTBTokenizer;
import se.lth.cs.srl.preprocessor.tokenization.Tokenizer;
import se.lth.cs.koshik.analysis.ContentProcessor;
import se.lth.cs.koshik.input.conll.CoNLLFeature;
import se.lth.cs.koshik.model.Document;
import se.lth.cs.koshik.model.text.Sentence;
import se.lth.cs.koshik.model.text.Token;

/**
 * Content processor that tokenizes every {@link Sentence} of a document and
 * records each token as a {@link Token} annotation.
 *
 * <p>The token strings come from a {@link StanfordPTBTokenizer}. Each string is
 * located again in the sentence text, scanning left to right, so that the
 * annotation can carry character offsets. Token strings that cannot be found in
 * the remaining text are skipped without creating an annotation.
 */
public class StanfordTokenizer implements ContentProcessor {
	/** Penn Treebank escape used by the tokenizer for an opening round parenthesis. */
	private static final String LEFT_PAREN_TOKEN = "-LRB-";

	/** Penn Treebank escape used by the tokenizer for a closing round parenthesis. */
	private static final String RIGHT_PAREN_TOKEN = "-RRB-";

	private final Tokenizer tokenizer;

	/** Creates a processor backed by a new {@link StanfordPTBTokenizer}. */
	public StanfordTokenizer() {
		tokenizer = new StanfordPTBTokenizer();
	}

	/**
	 * Tokenizes all sentences selected from {@code document} and creates one
	 * {@link Token} per token that could be located in the sentence text.
	 *
	 * <p>Each created token gets begin/end offsets (the position inside the
	 * sentence plus the sentence's own begin), the feature {@link CoNLLFeature#ID}
	 * set to the token's index in the tokenizer output, and the feature
	 * {@link CoNLLFeature#FORM} set to the token string as returned by the
	 * tokenizer (parentheses therefore keep their {@code -LRB-}/{@code -RRB-} form).
	 *
	 * @param document the document whose sentences are tokenized
	 * @throws Exception declared by the {@link ContentProcessor} contract
	 */
	@Override
	public void process(Document document) throws Exception {
		for (Sentence sentence : document.select(Sentence.class)) {
			String text = sentence.getContent();
			int sentenceBegin = sentence.getBegin();
			String[] tokens = tokenizer.tokenize(text);

			// Position in the sentence text up to which tokens have been matched.
			int offset = 0;

			// Index 0 is deliberately skipped, as in the original implementation.
			// NOTE(review): presumably the tokenizer reserves slot 0 for a root
			// placeholder, which would make the IDs below 1-based -- confirm.
			for (int i = 1; i < tokens.length; i++) {
				String form = tokens[i];
				boolean parenthesis = isParenthesisToken(form);

				int begin = locate(text, form, offset);
				if (begin == -1) {
					// Token text does not occur in the rest of the sentence
					// (e.g. the tokenizer rewrote it); leave offset untouched.
					continue;
				}

				Token koshikToken = new Token(document);
				koshikToken.setBegin(begin + sentenceBegin);
				// An escaped parenthesis covers exactly one character of the text.
				offset = begin + (parenthesis ? 1 : form.length());
				koshikToken.setEnd(offset + sentenceBegin);
				koshikToken.setFeature(CoNLLFeature.ID, Integer.toString(i));
				koshikToken.setFeature(CoNLLFeature.FORM, form);
			}
		}
	}

	/** Tells whether {@code form} is one of the escaped round-parenthesis tokens. */
	private static boolean isParenthesisToken(String form) {
		return form.equalsIgnoreCase(LEFT_PAREN_TOKEN) || form.equalsIgnoreCase(RIGHT_PAREN_TOKEN);
	}

	/**
	 * Finds where {@code form} starts in {@code text} at or after {@code from}.
	 *
	 * <p>Escaped parentheses are looked up by the character they stand for
	 * instead of by guessing from the characters at {@code from}; that guess
	 * pointed at the wrong character whenever the text at {@code from} was not
	 * the parenthesis itself (skipped preceding token, repeated whitespace, end
	 * of sentence).
	 *
	 * @return the index of the match, or {@code -1} if there is none
	 */
	private static int locate(String text, String form, int from) {
		if (form.equalsIgnoreCase(LEFT_PAREN_TOKEN)) {
			return text.indexOf('(', from);
		}
		if (form.equalsIgnoreCase(RIGHT_PAREN_TOKEN)) {
			return text.indexOf(')', from);
		}
		return text.indexOf(form, from);
	}

}