/*******************************************************************************
 * Copyright (c) 2012 György Orosz, Attila Novák.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v3
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/
 *
 * This file is part of PurePos.
 *
 * PurePos is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PurePos is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * Contributors:
 * György Orosz - initial API and implementation
 ******************************************************************************/
package hu.ppke.itk.nlpg.corpusreader;

import hu.ppke.itk.nlpg.docmodel.ISentence;
import hu.ppke.itk.nlpg.docmodel.IToken;
import hu.ppke.itk.nlpg.docmodel.internal.Sentence;

import java.util.ArrayList;
import java.util.List;

/**
 * Reader class for reading sentences from a corpus.
 *
 * A sentence string is cut into pieces with the {@code separator} regular
 * expression and every piece is handed to the wrapped token reader; the
 * tokens it returns are collected, in order, into a {@link Sentence}.
 *
 * @author György Orosz
 *
 */
public class SentenceReader extends AbstractDocElementReader<ISentence> {

	/** Reader that turns a single piece of the sentence into a token. */
	private final AbstractDocElementReader<IToken> wordParser;

	/**
	 * Creates a sentence reader that delegates token parsing to the given
	 * reader and sets {@code separator} to the regular expression
	 * {@code \s} (one whitespace character).
	 *
	 * @param wordParser
	 *            reader used for each piece of the sentence
	 */
	SentenceReader(AbstractDocElementReader<IToken> wordParser) {
		this.wordParser = wordParser;
		separator = "\\s";
	}

	/**
	 * Parses one sentence.
	 *
	 * @param text
	 *            the sentence as a string
	 * @return for the empty string, a {@link Sentence} constructed with
	 *         {@code null}; otherwise a {@link Sentence} holding every
	 *         non-null token produced by the token reader
	 * @throws ParsingException
	 *             if splitting yields an empty piece (for example two
	 *             separators in a row, or a leading separator), or if the
	 *             token reader itself throws
	 */
	@Override
	public ISentence read(String text) throws ParsingException {
		// The empty string is special-cased before any splitting happens.
		if (text.length() == 0) {
			return new Sentence(null);
		}

		List<IToken> parsed = new ArrayList<IToken>();
		for (String chunk : text.split(separator)) {
			// String.split drops trailing empty pieces, so an empty chunk
			// here comes from a leading or doubled separator.
			if (chunk.length() == 0) {
				throw new ParsingException("Empty word in: '" + text + "'");
			}
			IToken token = wordParser.read(chunk);
			// Pieces for which the token reader returns null are skipped.
			if (token != null) {
				parsed.add(token);
			}
		}
		return new Sentence(parsed);
	}
}