Tokenizer.java example

Explorer
context-master
- src
  - context
/*
 
 * Copyright (c) 2015 University of Illinois Board of Trustees, All rights reserved.   
 * Developed at GSLIS/ the iSchool, by Dr. Jana Diesner, Amirhossein Aleyasen,    
 * Chieh-Li Chin, Shubhanshu Mishra, Kiumars Soltani, and Liang Tao.     
 *   
 * This program is free software; you can redistribute it and/or modify it under   
 * the terms of the GNU General Public License as published by the Free Software   
 * Foundation; either version 2 of the License, or any later version.   
 *    
 * This program is distributed in the hope that it will be useful, but WITHOUT   
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or    
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for   
 * more details.   
 *    
 * You should have received a copy of the GNU General Public License along with   
 * this program; if not, see <http://www.gnu.org/licenses>.   
 *
 
 
 */
package context.core.tokenizer;

import context.core.task.pos.POSTagger;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

/**
 * Splits raw text into sentences and tokens with Stanford CoreNLP and wraps
 * every token in a {@link CustomToken} that also carries its part-of-speech tag.
 *
 * <p>One CoreNLP pipeline, configured with the {@code tokenize} and
 * {@code ssplit} annotators, is built when this class is loaded and is reused
 * by every call to {@link #tokenize(String, String)}.
 *
 * @author Aale
 */
public class Tokenizer {

    /** Shared pipeline; assigned once in the static initializer below. */
    private static final StanfordCoreNLP pipeline;

    static {
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit");
        pipeline = new StanfordCoreNLP(props);
    }

    /**
     * Tokenizes {@code text} sentence by sentence and returns one
     * {@link CustomToken} per token, in document order.
     *
     * <p>Each entry is keyed by {@code word/docId/sentenceIndex/tokenIndex},
     * where both indices start at 0 and the token index restarts for every
     * sentence. Every token is stored with its word, begin/end position,
     * document id, sentence index, token index and POS tag, and is marked as
     * not being a multiword.
     *
     * @param text  the raw text to tokenize; {@code null} or empty text yields
     *              an empty map without invoking the pipeline
     * @param docId identifier of the document the text belongs to; copied onto
     *              every token and embedded in every map key
     * @return an insertion-ordered, mutable map from token key to token; never
     *         {@code null}
     */
    public static Map<String, CustomToken> tokenize(String text, String docId) {
        Map<String, CustomToken> customTokens = new LinkedHashMap<>();
        if (text == null || text.isEmpty()) {
            // Nothing to annotate, so skip the pipeline entirely.
            return customTokens;
        }

        Annotation document = new Annotation(text);
        pipeline.annotate(document);

        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        if (sentences == null) {
            // The annotation lookup returns null when no sentences were recorded.
            return customTokens;
        }

        int sentIndex = 0;
        for (CoreMap sentence : sentences) {
            final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
            // NOTE(review): "en" presumably selects the English tagging model - confirm in POSTagger.
            final List<TaggedWord> taggedWords = POSTagger.tag(sent, "en");

            // Token index is local to the sentence, so it restarts at 0 each time.
            int index = 0;
            for (TaggedWord token : taggedWords) {
                String word = token.word();

                CustomToken ctoken = new CustomToken();
                ctoken.setWord(word);
                ctoken.setBeginPosition(token.beginPosition());
                ctoken.setEndPosition(token.endPosition());
                ctoken.setDocId(docId);
                ctoken.setSentenceIndex(sentIndex);
                ctoken.setMultiword(false);
                ctoken.setIndex(index);
                ctoken.setPos(token.tag());

                // Sentence and token indices keep keys unique for repeated words.
                customTokens.put(word + "/" + docId + "/" + sentIndex + "/" + index, ctoken);
                index++;
            }
            sentIndex++;
        }
        return customTokens;
    }

    /**
     * Small demo: tokenizes a fixed two-sentence sample as document {@code "1"}
     * and prints every key and token, tab-separated, one per line.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String text = "Amir is a master student in Computer Science at UIUC. Shub is a PhD student in GSLIS at UIUC";
        final Map<String, CustomToken> tokens = Tokenizer.tokenize(text, "1");
        // Iterate entries directly rather than looking each key up again.
        for (Map.Entry<String, CustomToken> entry : tokens.entrySet()) {
            System.out.println(entry.getKey() + "\t" + entry.getValue());
        }
    }
}