///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2007 University of Texas at Austin and (C) 2005
// University of Pennsylvania and Copyright (C) 2002, 2003 University
// of Massachusetts Amherst, Department of Computer Science.
//
// This software is licensed under the terms of the Common Public
// License, Version 1.0 or (at your option) any subsequent version.
//
// The license is approved by the Open Source Initiative, and is
// available from their website at http://www.opensource.org.
///////////////////////////////////////////////////////////////////////////////

package mstparser.io;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
import java.util.regex.Pattern;

import mstparser.DependencyParser;

/**
 * A class that defines common behavior and abstract methods for readers for
 * different formats.
 *
 * <p>Concrete readers are obtained through the static factory methods, which
 * select an implementation from a format name ({@code "MST"} or
 * {@code "CONLL"}). A reader is then pointed at a file with
 * {@link #startReading(String)} and drained with {@link #getNext()}.</p>
 *
 * <p> Created: Sat Nov 10 15:25:10 2001 </p>
 *
 * @author Jason Baldridge
 * @version $Id: DependencyReader.java 112 2007-03-23 19:19:28Z jasonbaldridge $
 */
public abstract class DependencyReader {

    /**
     * Tokens matching this pattern are replaced by {@code <num>} in
     * {@link #normalize(String, List)}. Compiled once because normalize is
     * invoked per token.
     */
    private static final Pattern NUMBER_PATTERN =
        Pattern.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+");

    /** Reader over the file most recently passed to {@link #startReading(String)}. */
    protected BufferedReader inputReader;

    /** Whether the current input contains labels, as reported by {@link #fileContainsLabels(String)}. */
    protected boolean labeled = true;

    /** Set to true by {@link #createDependencyReaderWithConfidenceScores(String)}. */
    protected boolean confScores = false;

    /**
     * Creates a reader for the given format with discourse mode, stacking and
     * stemming all disabled.
     *
     * @param format the format name, {@code "MST"} or {@code "CONLL"}
     * @return a reader for the format
     * @throws IOException if the reader cannot be created
     */
    public static DependencyReader createDependencyReader(String format)
            throws IOException {
        return createDependencyReader(format, false);
    }

    /**
     * Creates a reader for the given format with stacking and stemming
     * disabled.
     *
     * @param format the format name, {@code "MST"} or {@code "CONLL"}
     * @param discourseMode forwarded to the CONLL reader
     * @return a reader for the format
     * @throws IOException if the reader cannot be created
     */
    public static DependencyReader createDependencyReader(String format,
            boolean discourseMode) throws IOException {
        // Forward the caller's discourseMode; it used to be replaced by a
        // hard-coded false, silently disabling discourse mode.
        return createDependencyReader(format, discourseMode, false, false);
    }

    /**
     * Creates a reader for the given format.
     *
     * <p>{@code "MST"} yields an {@code MSTReader}, which takes none of the
     * flags. {@code "CONLL"} yields a {@code CONLLReader} configured with the
     * flags. Any other name prints a warning to {@code DependencyParser.out}
     * and falls back to a {@code CONLLReader}.</p>
     *
     * @param format the format name, {@code "MST"} or {@code "CONLL"}
     * @param discourseMode forwarded to the CONLL reader
     * @param stacked forwarded to the CONLL reader
     * @param useStemmingIfLemmasAbsent forwarded to the CONLL reader
     * @return a reader for the format
     * @throws IOException if the reader cannot be created
     */
    public static DependencyReader createDependencyReader(String format,
            boolean discourseMode, boolean stacked,
            boolean useStemmingIfLemmasAbsent) throws IOException {
        if (format.equals("MST")) {
            return new MSTReader();
        }
        if (!format.equals("CONLL")) {
            // Unknown format: warn, then fall through to the CONLL default.
            DependencyParser.out.println("!!!!!!!  Not a supported format: " + format);
            DependencyParser.out.println("********* Assuming CONLL format. **********");
        }
        return new CONLLReader(discourseMode, stacked, useStemmingIfLemmasAbsent);
    }

    /**
     * Creates a default reader for the given format and marks it as carrying
     * confidence scores.
     *
     * @param format the format name, {@code "MST"} or {@code "CONLL"}
     * @return a reader whose {@code confScores} flag is set
     * @throws IOException if the reader cannot be created
     */
    public static DependencyReader createDependencyReaderWithConfidenceScores(
            String format) throws IOException {
        DependencyReader reader = createDependencyReader(format);
        reader.confScores = true;
        return reader;
    }

    /**
     * Creates a reader for the given format with discourse mode, stacking and
     * stemming all passed as {@code false}.
     *
     * @param format the format name, {@code "MST"} or {@code "CONLL"}
     * @return a reader for the format
     * @throws IOException if the reader cannot be created
     */
    public static DependencyReader createDependencyReaderWithStacked(String format)
            throws IOException {
        // NOTE(review): despite the method name, 'stacked' is passed as false,
        // so this is currently identical to createDependencyReader(format).
        // Presumably the third argument was meant to be true; confirm against
        // callers before changing, since it alters how CONLL input is read.
        return createDependencyReader(format, false, false, false);
    }

    /**
     * Opens the given file for reading as UTF-8 and records whether it
     * contains labels.
     *
     * <p>If this reader already had a file open, that reader is closed once
     * the new file has been opened successfully.</p>
     *
     * @param file path of the file to read
     * @return true if {@link #fileContainsLabels(String)} reports labels
     * @throws IOException if the file cannot be inspected or opened
     */
    public boolean startReading(String file) throws IOException {
        labeled = fileContainsLabels(file);

        // Open the new file first so that a failure here leaves the previous
        // reader untouched.
        BufferedReader freshReader = new BufferedReader(
            new InputStreamReader(new FileInputStream(file), "UTF8"));

        // Release the previous file handle instead of leaking it.
        if (inputReader != null) {
            inputReader.close();
        }
        inputReader = freshReader;

        return labeled;
    }

    /**
     * Reports the label status recorded by the last call to
     * {@link #startReading(String)} (true before any file has been opened).
     *
     * @return the current value of the {@code labeled} flag
     */
    public boolean isLabeled() {
        return labeled;
    }

    /**
     * Reads the next instance from the current input.
     *
     * @return the next instance; end-of-input behavior is defined by the
     *         implementing reader
     * @throws IOException if reading fails
     */
    public abstract mstparser.DependencyInstance getNext() throws IOException;

    /**
     * Determines whether the named file contains labels.
     *
     * @param filename path of the file to inspect
     * @return true if the file contains labels
     * @throws IOException if the file cannot be read
     */
    protected abstract boolean fileContainsLabels(String filename) throws IOException;

    /**
     * Replaces a numeric token with the placeholder {@code <num>}.
     *
     * <p>A token counts as numeric when the whole string is a run of digits,
     * digits with a single decimal point, or a digit followed by digits and
     * commas. Any other token is returned unchanged.</p>
     *
     * @param s the token to normalize
     * @param numbers if non-null, receives the original text of each token
     *        that was replaced
     * @return {@code "<num>"} for a numeric token, otherwise {@code s}
     */
    protected String normalize(String s, List<String> numbers) {
        if (!NUMBER_PATTERN.matcher(s).matches()) {
            return s;
        }
        if (numbers != null) {
            numbers.add(s);
        }
        return "<num>";
    }
}