///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2007 University of Texas at Austin and (C) 2005
// University of Pennsylvania and Copyright (C) 2002, 2003 University
// of Massachusetts Amherst, Department of Computer Science.
//
// This software is licensed under the terms of the Common Public
// License, Version 1.0 or (at your option) any subsequent version.
//
// The license is approved by the Open Source Initiative, and is
// available from their website at http://www.opensource.org.
///////////////////////////////////////////////////////////////////////////////
package mstparser.io;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import mstparser.DependencyInstance;
import mstparser.RelationalFeature;
/**
* A reader for files in CoNLL format.
*
* <p> Created: Sat Nov 10 15:25:10 2001 </p>
*
* @author Jason Baldridge
* @version $Id: CONLLReader.java 112 2007-03-23 19:19:28Z jasonbaldridge $
* @see mstparser.io.DependencyReader
*/
public class CONLLReader extends DependencyReader {

    /** Relation label given to the artificial root and to every token when the input is unlabeled. */
    private static final String NO_TYPE = "<no-type>";

    /** When true, the per-token feature matrix is transposed to feature-major order in {@link #getNext()}. */
    protected boolean discourseMode = false;

    /**
     * New attribute from "MSTParserStacked": when true, a lemma equal to "_" is
     * replaced by the first three characters of the (normalized) form.
     */
    protected boolean useStemmingIfLemmasAbsent = false;

    /** Used for stacked learning (afm 03-10-08): true if the input file contains output predictions. */
    protected boolean stacked = false;

    /**
     * Creates a non-stacked reader that never substitutes stems for missing lemmas.
     *
     * @param discourseMode whether instances are read in discourse mode
     */
    public CONLLReader(boolean discourseMode) {
        this(discourseMode, false, false);
    }

    /**
     * Constructor for "MSTParserStacked".
     *
     * @param discourseMode whether instances are read in discourse mode
     * @param stacked whether each token line carries predicted head/label columns
     *        in addition to the gold ones
     * @param useStemmingIfLemmasAbsent whether a lemma of "_" is replaced by a
     *        three-character prefix of the form
     */
    public CONLLReader(boolean discourseMode, boolean stacked, boolean useStemmingIfLemmasAbsent) {
        this.discourseMode = discourseMode;
        this.stacked = stacked;
        this.useStemmingIfLemmasAbsent = useStemmingIfLemmasAbsent;
    }

    /**
     * Reads the next instance from {@code inputReader}.
     *
     * <p>Token lines are consumed until end of input, an empty line, or a line
     * starting with "*". Each token line is split on tabs and the following
     * zero-based columns are used: 1 form, 2 lemma, 3 coarse POS, 4 POS,
     * 5 features (separated by "|"). In non-stacked mode column 6 is the head
     * and column 7 the relation label. In stacked mode columns 6/7 are stored as
     * the predicted head/label and columns 8/9 as the head/label. When
     * {@code confScores} is set, column 10 is parsed as the confidence score.
     * Index 0 of every array holds an artificial root entry.</p>
     *
     * <p>Any lines remaining before the next empty line are handed to
     * {@link RelationalFeature}.</p>
     *
     * @return the next instance, or {@code null} if no token line was read; in
     *         that case {@code inputReader} is closed
     * @throws IOException if reading from {@code inputReader} fails
     */
    @Override
    public DependencyInstance getNext() throws IOException {
        ArrayList<String[]> lineList = new ArrayList<String[]>();

        String line = inputReader.readLine();
        while (line != null && !line.equals("") && !line.startsWith("*")) {
            lineList.add(line.split("\t"));
            line = inputReader.readLine();
        }

        int length = lineList.size();
        if (length == 0) {
            // No token lines were read: treated as end of input.
            inputReader.close();
            return null;
        }

        // Slot 0 of every array is reserved for the artificial root.
        String[] forms = new String[length + 1];
        String[] lemmas = new String[length + 1];
        String[] cpos = new String[length + 1];
        String[] pos = new String[length + 1];
        String[][] feats = new String[length + 1][];
        String[] deprels = new String[length + 1];
        int[] heads = new int[length + 1];

        // Stacked input data (for stacked learning --- afm 03-10-08); only allocated when needed.
        String[] predictedDeprels = null;
        int[] predictedHeads = null;
        if (stacked) {
            predictedDeprels = new String[length + 1];
            predictedHeads = new int[length + 1];
        }

        // Confidence scores; only allocated when the reader is configured to read them.
        double[] confscores = confScores ? new double[length + 1] : null;

        forms[0] = "<root>";
        lemmas[0] = "<root-LEMMA>";
        cpos[0] = "<root-CPOS>";
        pos[0] = "<root-POS>";
        deprels[0] = NO_TYPE;
        heads[0] = -1;
        if (confScores) {
            confscores[0] = 1;
        }

        // Passed to normalize() for forms and lemmas and then handed to the instance.
        List<String> numbers = new LinkedList<String>();

        for (int i = 0; i < length; i++) {
            String[] info = lineList.get(i);
            int tok = i + 1;

            forms[tok] = normalize(info[1], numbers);
            lemmas[tok] = normalize(info[2], numbers);

            // For languages that do not have lemma information --- afm 06-12-2008
            if (useStemmingIfLemmasAbsent && lemmas[tok].equals("_")) {
                String form = forms[tok];
                lemmas[tok] = (form.length() > 3) ? form.substring(0, 3) : form;
            }

            cpos[tok] = info[3];
            pos[tok] = info[4];

            // A feature column of "_" is kept as the single feature "_" rather
            // than being mapped to an empty list.
            feats[tok] = info[5].split("\\|");

            if (stacked) {
                // For stacked learning --- afm 03-10-08
                predictedDeprels[tok] = labeled ? info[7] : NO_TYPE;
                predictedHeads[tok] = Integer.parseInt(info[6]);
                deprels[tok] = labeled ? info[9] : NO_TYPE;
                heads[tok] = Integer.parseInt(info[8]);
            } else {
                deprels[tok] = labeled ? info[7] : NO_TYPE;
                heads[tok] = Integer.parseInt(info[6]);
            }

            if (confScores) {
                confscores[tok] = Double.parseDouble(info[10]);
            }
        }

        // The root gets as many placeholder features as the first token has features.
        feats[0] = new String[feats[1].length];
        for (int i = 0; i < feats[1].length; i++) {
            feats[0][i] = "<root-feat>" + i;
        }

        // The following stuff is for discourse and can be safely ignored if
        // you are doing sentential parsing. (In theory it could be useful for
        // sentential parsing.)
        if (discourseMode) {
            // Transpose from [token][feature] to [feature][token]; every token
            // must therefore provide at least as many features as the root.
            String[][] extendedFeats = new String[feats[0].length][length + 1];
            for (int i = 0; i < extendedFeats.length; i++) {
                for (int j = 0; j < length + 1; j++) {
                    extendedFeats[i][j] = feats[j][i];
                }
            }
            feats = extendedFeats;
        }

        // Remaining non-empty lines of this record (e.g. the "*" line that ended
        // the token loop) are parsed as relational features; the constructor is
        // also given the reader.
        ArrayList<RelationalFeature> rfeats = new ArrayList<RelationalFeature>();
        while (line != null && !line.equals("")) {
            rfeats.add(new RelationalFeature(length, line, inputReader));
            line = inputReader.readLine();
        }
        RelationalFeature[] rfeatsList = new RelationalFeature[rfeats.size()];
        rfeats.toArray(rfeatsList);
        // End of discourse stuff.

        if (stacked) {
            return new DependencyInstance(forms, lemmas, cpos, pos, feats, deprels, heads, rfeatsList,
                    predictedDeprels, predictedHeads, stacked, numbers);
        }
        return new DependencyInstance(forms, lemmas, cpos, pos, feats, deprels, heads, rfeatsList,
                confscores, numbers);
    }

    /**
     * Inspects the first line of {@code file}.
     *
     * @param file path of the file to inspect
     * @return {@code true} if the first line contains any non-whitespace
     *         character; {@code false} if it is blank or the file is empty
     * @throws IOException if the file cannot be opened or read
     */
    @Override
    protected boolean fileContainsLabels(String file) throws IOException {
        BufferedReader in = new BufferedReader(new FileReader(file));
        try {
            String line = in.readLine();
            // readLine() returns null for an empty file; treat that as "no labels"
            // instead of failing with a NullPointerException.
            return line != null && line.trim().length() > 0;
        } finally {
            // Close the reader even when readLine() throws.
            in.close();
        }
    }
}