/** * KOSHIK is an NLP framework for large scale processing using Hadoop. * Copyright © 2014 Peter Exner * * This file is part of KOSHIK. * * KOSHIK is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * KOSHIK is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with KOSHIK. If not, see <http://www.gnu.org/licenses/>. */ package se.lth.cs.koshik.analysis.stanford; import se.lth.cs.srl.preprocessor.tokenization.StanfordPTBTokenizer; import se.lth.cs.srl.preprocessor.tokenization.Tokenizer; import se.lth.cs.koshik.analysis.ContentProcessor; import se.lth.cs.koshik.input.conll.CoNLLFeature; import se.lth.cs.koshik.model.Document; import se.lth.cs.koshik.model.text.Sentence; import se.lth.cs.koshik.model.text.Token; public class StanfordTokenizer implements ContentProcessor { private Tokenizer tokenizer; public StanfordTokenizer() { tokenizer = new StanfordPTBTokenizer(); } @Override public void process(Document document) throws Exception { for(Sentence sentence:document.select(Sentence.class)) { String tokens[] = tokenizer.tokenize(sentence.getContent()); int offset = 0; int begin = 0; for(int i=1; i<tokens.length;i++) { if(tokens[i].equalsIgnoreCase("-LRB-") || tokens[i].equalsIgnoreCase("-RRB-")) { int end = offset + 2; if(end > sentence.getContent().length()) { end = sentence.getContent().length(); } begin = sentence.getContent().indexOf(sentence.getContent().substring(offset, end).trim(), offset); } else { begin = sentence.getContent().indexOf(tokens[i], offset); } if(begin != -1) { Token koshikToken = new Token(document); koshikToken.setBegin(begin + sentence.getBegin()); if(tokens[i].equalsIgnoreCase("-LRB-") || tokens[i].equalsIgnoreCase("-RRB-")) { offset = begin + 1; } else { offset = begin + tokens[i].length(); } koshikToken.setEnd(offset + sentence.getBegin()); koshikToken.setFeature(CoNLLFeature.ID, Integer.toString(i)); koshikToken.setFeature(CoNLLFeature.FORM, tokens[i]); } } } } }