/**
 * KOSHIK is an NLP framework for large scale processing using Hadoop.
 * Copyright © 2014 Peter Exner
 *
 * This file is part of KOSHIK.
 *
 * KOSHIK is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * KOSHIK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with KOSHIK. If not, see <http://www.gnu.org/licenses/>.
 */
package se.lth.cs.koshik.input.conll;

import java.util.List;

import se.lth.cs.koshik.model.Document;
import se.lth.cs.koshik.model.text.Sentence;
import se.lth.cs.koshik.model.text.Token;

/**
 * Reads sentences given as tab-separated lines, one token per line, and
 * turns them into {@link Token}s with CoNLL features, dependency features and
 * semantic roles.
 *
 * <p>Columns are read by position: 0 ID, 1 FORM, 2 LEMMA, 3 PLEMMA, 4 POS,
 * 5 PPOS, 6 FEAT, 7 PFEAT, 8 HEAD, 9 PHEAD, 10 DEPREL, 11 PDEPREL, 12 a
 * {@code Y} flag marking that column 13 holds a predicate, 13 PRED, and from
 * column 14 onwards one argument column per predicate of the sentence.
 *
 * <p>Malformed lines are reported on {@code System.err} with the message
 * {@code "Unknown format: <line>"}. A column that is missing from a line is
 * treated like the empty marker {@code "_"}; a line too short to carry the
 * mandatory token columns produces no token.
 */
public class CoNLL2009Reader extends CoNLLReader {
	/** Separator between the columns of a line. */
	private static final String COLUMN_SEPARATOR = "\t";

	/** Marker used in the data for "no value". */
	private static final String EMPTY_VALUE = "_";

	/** Number of columns preceding the per-predicate argument columns. */
	private static final int FIXED_COLUMN_COUNT = 14;

	/** Column whose value {@code Y} marks that {@link #PREDICATE_COLUMN} is filled. */
	private static final int PREDICATE_FLAG_COLUMN = 12;

	/** Column holding the predicate label. */
	private static final int PREDICATE_COLUMN = 13;

	/**
	 * Processes the lines of one sentence: sets the sentence span to the text
	 * appended to {@code content}, then builds tokens, dependency features and
	 * semantic roles, in that order.
	 *
	 * @param sentenceLines the lines of the sentence, one token per line
	 * @param content       the document text built so far; token forms are appended to it
	 * @param document      the document the tokens are created in
	 * @param sentence      the sentence whose begin/end offsets are set
	 */
	@Override
	protected void processSentenceLines(List<String> sentenceLines, StringBuilder content, Document document, Sentence sentence) {
		sentence.setBegin(content.length());
		buildTokens(sentenceLines, content, document, sentence);
		sentence.setEnd(content.length());

		buildDependencyEdges(sentenceLines, document, sentence);
		buildSemanticRoleEdges(sentenceLines, document, sentence);
	}

	/**
	 * Creates one token per line, copies columns 0-7 (and the predicate, when
	 * flagged) into token features and appends the token form to
	 * {@code content}, separating forms with {@code TOKEN_SEPARATOR}.
	 *
	 * <p>Lines that do not reach the predicate-flag column cannot be read
	 * safely and are skipped after being reported; previously they raised an
	 * {@code ArrayIndexOutOfBoundsException}.
	 */
	private void buildTokens(List<String> sentenceLines, StringBuilder content, Document document, Sentence sentence) {
		String separator = "";

		for (String sentenceLine : sentenceLines) {
			String[] fields = sentenceLine.split(COLUMN_SEPARATOR);
			reportIfMalformed(sentenceLine, fields, fields.length >= FIXED_COLUMN_COUNT);

			// Columns 0-7 and the predicate flag are read unconditionally below.
			if (fields.length <= PREDICATE_FLAG_COLUMN) {
				continue;
			}

			Token token = new Token(document);
			token.setFeature(CoNLLFeature.ID, fields[0]);
			token.setFeature(CoNLLFeature.FORM, fields[1]);
			token.setFeature(CoNLLFeature.LEMMA, fields[2]);
			token.setFeature(CoNLLFeature.PLEMMA, fields[3]);
			token.setFeature(CoNLLFeature.POS, fields[4]);
			token.setFeature(CoNLLFeature.PPOS, fields[5]);
			token.setFeature(CoNLLFeature.FEAT, fields[6]);
			token.setFeature(CoNLLFeature.PFEAT, fields[7]);

			// A missing predicate column reads as "_" and therefore sets no PRED.
			String predicate = columnOrEmpty(fields, PREDICATE_COLUMN);
			if (fields[PREDICATE_FLAG_COLUMN].equalsIgnoreCase("Y") && !predicate.equalsIgnoreCase(EMPTY_VALUE)) {
				token.setFeature(CoNLLFeature.PRED, predicate);
			}

			content.append(separator);
			token.setBegin(content.length());
			content.append(fields[1]);
			token.setEnd(content.length());

			separator = TOKEN_SEPARATOR;
		}
	}

	/**
	 * Copies HEAD/DEPREL (columns 8/10) and PHEAD/PDEPREL (columns 9/11) onto
	 * the token of each line. A pair is only set when both of its columns are
	 * present and different from {@code "_"}.
	 *
	 * <p>Nothing is done, apart from reporting an internal error, when the
	 * sentence does not hold exactly one token per line.
	 */
	private void buildDependencyEdges(List<String> sentenceLines, Document document, Sentence sentence) {
		if (sentenceLines.size() != sentence.getTokens().size()) {
			System.err.println("Internal error: number of tokens not equal to number of conll tokens");
			return;
		}

		// Tokens are addressed by their 1-based position in the sentence. A
		// running counter is used instead of List.indexOf, which is quadratic
		// and resolves to the first occurrence for textually identical lines.
		int index = 0;
		for (String sentenceLine : sentenceLines) {
			index++;
			String[] fields = sentenceLine.split(COLUMN_SEPARATOR);
			reportIfMalformed(sentenceLine, fields, fields.length >= FIXED_COLUMN_COUNT);

			String head = columnOrEmpty(fields, 8);
			String dependencyRelation = columnOrEmpty(fields, 10);
			if (!head.equalsIgnoreCase(EMPTY_VALUE) && !dependencyRelation.equalsIgnoreCase(EMPTY_VALUE)) {
				sentence.getToken(index).setFeature(CoNLLFeature.HEAD, head);
				sentence.getToken(index).setFeature(CoNLLFeature.DEPREL, dependencyRelation);
			}

			String predictedHead = columnOrEmpty(fields, 9);
			String predictedDependencyRelation = columnOrEmpty(fields, 11);
			if (!predictedHead.equalsIgnoreCase(EMPTY_VALUE) && !predictedDependencyRelation.equalsIgnoreCase(EMPTY_VALUE)) {
				sentence.getToken(index).setFeature(CoNLLFeature.PHEAD, predictedHead);
				sentence.getToken(index).setFeature(CoNLLFeature.PDEPREL, predictedDependencyRelation);
			}
		}
	}

	/**
	 * Adds semantic roles to the token of each line. Column {@code 14 + i}
	 * holds the role of the token with respect to the i-th predicate token of
	 * the sentence; {@code "_"} or a missing column means no role.
	 *
	 * <p>A line is reported as malformed when it does not have exactly
	 * {@code 14 + numberOfPredicates} columns. Nothing is done, apart from
	 * reporting an internal error, when the sentence does not hold exactly
	 * one token per line, since lines are mapped to tokens by position.
	 *
	 * @param sentenceLines the lines of the sentence, one token per line
	 * @param document      the document being built (not used here)
	 * @param sentence      the sentence whose tokens receive the roles
	 */
	protected void buildSemanticRoleEdges(List<String> sentenceLines, Document document, Sentence sentence) {
		if (sentenceLines.size() != sentence.getTokens().size()) {
			System.err.println("Internal error: number of tokens not equal to number of conll tokens");
			return;
		}

		int numberOfPredicates = sentence.getPredicateTokens().size();

		// 1-based token position; see buildDependencyEdges for why a counter is used.
		int index = 0;
		for (String sentenceLine : sentenceLines) {
			index++;
			String[] fields = sentenceLine.split(COLUMN_SEPARATOR);
			reportIfMalformed(sentenceLine, fields, fields.length == FIXED_COLUMN_COUNT + numberOfPredicates);

			int argumentColumn = FIXED_COLUMN_COUNT;
			for (Token predicateToken : sentence.getPredicateTokens()) {
				String role = columnOrEmpty(fields, argumentColumn);
				if (!role.equalsIgnoreCase(EMPTY_VALUE)) {
					sentence.getToken(index).addSemanticRole(predicateToken, role);
				}
				argumentColumn++;
			}
		}
	}

	/**
	 * Prints {@code "Unknown format: <line>"} to {@code System.err} unless the
	 * column count is acceptable and the first column parses as an integer.
	 * The first column is only inspected when {@code columnCountOk} is true.
	 */
	private static void reportIfMalformed(String sentenceLine, String[] fields, boolean columnCountOk) {
		if (columnCountOk && isInteger(fields[0])) {
			return;
		}
		System.err.println("Unknown format: " + sentenceLine);
	}

	/** Returns whether {@code value} is accepted by {@link Integer#parseInt(String)}. */
	private static boolean isInteger(String value) {
		try {
			Integer.parseInt(value);
			return true;
		} catch (NumberFormatException e) {
			return false;
		}
	}

	/** Returns the column at {@code index}, or {@code "_"} when the line has no such column. */
	private static String columnOrEmpty(String[] fields, int index) {
		return index < fields.length ? fields[index] : EMPTY_VALUE;
	}
}