/**
* Copyright (c) 2010, Regents of the University of Colorado All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided
* with the distribution. Neither the name of the University of Colorado at
* Boulder nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package clear.engine;
import clear.dep.DepTree;
import clear.morph.MorphEnAnalyzer;
import clear.reader.AbstractReader;
import clear.treebank.*;
import clear.util.IOUtil;
import java.io.File;
import java.io.PrintStream;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
public class PennToDep {
@Option(name = "-i", usage = "name of a file containing phrase structure tree", required = true, metaVar = "REQUIRED")
String s_inputFile;
@Option(name = "-o", usage = "name of a file containing dependency trees", required = true, metaVar = "REQUIRED")
String s_outputFile;
@Option(name = "-h", usage = "name of a file containing head-percolation rules", required = true, metaVar = "REQUIRED")
String s_headruleFile;
@Option(name = "-m", usage = "name of a file containing dictionaries for morphological analyzer", metaVar = "OPTIONAL")
String s_dictFile = null;
@Option(name = "-l", usage = "language ::= " + AbstractReader.LANG_EN + " (default) | " + AbstractReader.LANG_KR, metaVar = "OPTIONAL")
String s_language = AbstractReader.LANG_EN;
@Option(name = "-n", usage = "minimum sentence length (inclusive; default = 1)", metaVar = "OPTIONAL")
int n_length = 1;
@Option(name = "-f", usage = "if set, include function tags", metaVar = "OPTIONAL")
boolean b_funcTag = false;
@Option(name = "-e", usage = "if set, include empty categories", metaVar = "OPTIONAL")
boolean b_ec = false;
@Option(name = "-r", usage = "if set, reverse dependencies of auxiliaries and modals", metaVar = "OPTIONAL")
boolean b_reverseVC = false;
public PennToDep(String[] args) {
CmdLineParser cmd = new CmdLineParser(this);
try {
cmd.parseArgument(args);
convert();
} catch (CmdLineException e) {
System.err.println(e.getMessage());
cmd.printUsage(System.err);
}
}
public void convert() {
TBReader reader = new TBReader(s_inputFile);
TBHeadRules headrules = new TBHeadRules(s_headruleFile);
MorphEnAnalyzer morph = (s_dictFile != null) ? new MorphEnAnalyzer(s_dictFile) : null;
PrintStream fout = IOUtil.createPrintFileStream(s_outputFile);
TBTree tree;
AbstractTBConvert converter;
if (s_language.equals(AbstractReader.LANG_KR)) {
converter = new TBKrConvert(headrules);
} else {
converter = new TBEnConvert(headrules, morph, b_funcTag, b_ec, b_reverseVC);
}
String filename = s_inputFile.substring(s_inputFile.lastIndexOf(File.separator) + 1);
int i = 0;
System.out.print("\r" + filename + ": 0");
while ((tree = reader.nextTree()) != null) {
DepTree dTree = converter.toDepTree(tree);
if (dTree.size() >= n_length) {
fout.println(dTree + "\n");
i++;
}
if (i % 1000 == 0) {
System.out.print("\r" + filename + ": " + i);
}
}
fout.flush();
fout.close();
System.out.println("\r" + filename + ": " + i);
}
static public void main(String[] args) {
PennToDep ptd = new PennToDep(args);
}
}