/*
 * Copyright (c) 2015 University of Illinois Board of Trustees, All rights reserved.
 * Developed at GSLIS/ the iSchool, by Dr. Jana Diesner, Amirhossein Aleyasen,
 * Chieh-Li Chin, Shubhanshu Mishra, Kiumars Soltani, and Liang Tao.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, see <http://www.gnu.org/licenses>.
 *
 */
package context.core.tokenizer;

import context.core.task.pos.POSTagger;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

/**
 * Static helper that splits raw text into sentences and tokens with a shared
 * Stanford CoreNLP pipeline and wraps every token in a {@link CustomToken}.
 *
 * @author Aale
 */
public class Tokenizer {

    /** Shared pipeline, built once when the class is initialized. */
    private static final StanfordCoreNLP pipeline = createPipeline();

    /**
     * Builds the CoreNLP pipeline with only the {@code tokenize} and
     * {@code ssplit} annotators enabled.
     *
     * @return the configured pipeline
     */
    private static StanfordCoreNLP createPipeline() {
        final Properties config = new Properties();
        config.setProperty("annotators", "tokenize, ssplit");
        return new StanfordCoreNLP(config);
    }

    /**
     * Tokenizes {@code text} and returns one {@link CustomToken} per tagged word.
     *
     * <p>The text is annotated by the shared pipeline; the tokens of each
     * resulting sentence are passed to {@code POSTagger.tag(tokens, "en")} and
     * every returned {@link TaggedWord} becomes one map entry. Entries are kept
     * in insertion order (sentence by sentence, word by word) and are keyed by
     * {@code word/docId/sentenceIndex/tokenIndex}, where both indices start at 0
     * and the token index restarts for each sentence.
     *
     * @param text the raw text to tokenize
     * @param docId identifier stored on every token and embedded in every key
     * @return insertion-ordered map from token key to token
     */
    public static Map<String, CustomToken> tokenize(String text, String docId) {
        final Map<String, CustomToken> result = new LinkedHashMap<>();

        final Annotation annotated = new Annotation(text);
        pipeline.annotate(annotated);

        final List<CoreMap> sentenceList = annotated.get(SentencesAnnotation.class);
        int sentenceNo = 0;
        for (CoreMap current : sentenceList) {
            // A CoreLabel is a CoreMap with additional token-specific methods.
            final List<CoreLabel> labels = current.get(TokensAnnotation.class);
            // NOTE(review): "en" is presumably a language code understood by POSTagger - confirm.
            final List<TaggedWord> tagged = POSTagger.tag(labels, "en");

            int tokenNo = 0;
            for (TaggedWord tw : tagged) {
                final CustomToken wrapped = toCustomToken(tw, docId, sentenceNo, tokenNo);
                result.put(keyFor(tw.word(), docId, sentenceNo, tokenNo), wrapped);
                tokenNo++;
            }
            sentenceNo++;
        }
        return result;
    }

    /**
     * Copies the word, POS tag and character offsets of {@code source} into a new
     * {@link CustomToken} and stamps it with its document id and indices. The
     * token is always marked as not being a multiword.
     */
    private static CustomToken toCustomToken(TaggedWord source, String docId,
            int sentenceNo, int tokenNo) {
        final String surface = source.word();
        final String posTag = source.tag();

        final CustomToken out = new CustomToken();
        out.setWord(surface);
        out.setBeginPosition(source.beginPosition());
        out.setEndPosition(source.endPosition());
        out.setDocId(docId);
        out.setSentenceIndex(sentenceNo);
        out.setMultiword(false);
        out.setIndex(tokenNo);
        out.setPos(posTag);
        return out;
    }

    /** Builds the map key {@code word/docId/sentenceIndex/tokenIndex}. */
    private static String keyFor(String word, String docId, int sentenceNo, int tokenNo) {
        return word + "/" + docId + "/" + sentenceNo + "/" + tokenNo;
    }

    /**
     * Small demo: tokenizes a fixed two-sentence sample under document id "1"
     * and prints every key and token, tab-separated, one per line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String sample = "Amir is a master student in Computer Science at UIUC. Shub is a PhD student in GSLIS at UIUC";
        final Map<String, CustomToken> tokenMap = Tokenizer.tokenize(sample, "1");
        for (Map.Entry<String, CustomToken> entry : tokenMap.entrySet()) {
            System.out.println(entry.getKey() + "\t" + entry.getValue());
        }
    }
}