/**
* KOSHIK is an NLP framework for large scale processing using Hadoop.
* Copyright © 2014 Peter Exner
*
* This file is part of KOSHIK.
*
* KOSHIK is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* KOSHIK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with KOSHIK. If not, see <http://www.gnu.org/licenses/>.
*/
package se.lth.cs.koshik.input.conll;
import java.util.ArrayList;
import java.util.List;
import se.lth.cs.koshik.model.Document;
import se.lth.cs.koshik.model.text.Sentence;
public abstract class CoNLLReader {
protected static final String SENTENCE_SEPARATOR = " ";
protected static final String TOKEN_SEPARATOR = " ";
public void read(String text, Document document) {
StringBuilder content = new StringBuilder();
String[] textLines = text.split("\n");
List<String> lines = new ArrayList<String>();
Sentence sentence = new Sentence(document);
String separator = "";
for(String textLine:textLines) {
textLine = textLine.trim();
if (isFirstLine(textLine) && lines.size() > 0) {
content.append(separator);
separator = SENTENCE_SEPARATOR;
this.processSentenceLines(lines, content, document, sentence);
lines = new ArrayList<String>();
sentence = new Sentence(document);
}
if(textLine.length() > 0) {
lines.add(textLine);
}
}
if(lines.size() > 0) {
content.append(separator);
this.processSentenceLines(lines, content, document, sentence);
}
content.append("A");
document.setContent(content.toString());
}
private static boolean isFirstLine(String line) {
String[] columns = line.split("\\t");
if(columns.length > 0) {
if(columns[0].trim().equalsIgnoreCase("1")) {
return true;
}
}
return false;
}
protected abstract void processSentenceLines(List<String> sentenceLines, StringBuilder content, Document document, Sentence sentence);
}