/** * Copyright 2014, Emory University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.emory.clir.clearnlp.bin; import java.io.BufferedReader; import java.io.File; import java.io.PrintStream; import java.util.ArrayList; import java.util.List; import java.util.Set; import org.kohsuke.args4j.Option; import edu.emory.clir.clearnlp.collection.map.IntObjectHashMap; import edu.emory.clir.clearnlp.collection.triple.ObjectIntIntTriple; import edu.emory.clir.clearnlp.component.mode.morph.AbstractMPAnalyzer; import edu.emory.clir.clearnlp.component.utils.NLPUtils; import edu.emory.clir.clearnlp.constituent.CTNode; import edu.emory.clir.clearnlp.constituent.CTReader; import edu.emory.clir.clearnlp.constituent.CTTree; import edu.emory.clir.clearnlp.conversion.AbstractC2DConverter; import edu.emory.clir.clearnlp.dependency.DEPLibEn; import edu.emory.clir.clearnlp.dependency.DEPNode; import edu.emory.clir.clearnlp.dependency.DEPTree; import edu.emory.clir.clearnlp.lexicon.propbank.PBInstance; import edu.emory.clir.clearnlp.lexicon.propbank.PBReader; import edu.emory.clir.clearnlp.ner.BILOU; import edu.emory.clir.clearnlp.pos.POSLibEn; import edu.emory.clir.clearnlp.util.BinUtils; import edu.emory.clir.clearnlp.util.FileUtils; import edu.emory.clir.clearnlp.util.IOUtils; import edu.emory.clir.clearnlp.util.Splitter; import edu.emory.clir.clearnlp.util.arc.SRLArc; import edu.emory.clir.clearnlp.util.lang.TLanguage; public class C2DConvert { @Option(name="-h", usage="headrule file (required)", required=true, metaVar="<filename>") private String s_headruleFile; @Option(name="-i", usage="input path (required)", required=true, metaVar="<filepath>") private String s_inputPath; @Option(name="-pe", usage="parse file extension (default: parse)", required=false, metaVar="<string>") private String s_parseExt = "parse"; @Option(name="-re", usage="propbank file extension (default: prop)", required=false, metaVar="<string>") private String s_propExt = "prop"; @Option(name="-ne", usage="named entity file extension (default: name)", required=false, metaVar="<string>") private String s_nameExt = "name"; @Option(name="-oe", usage="output file extension (default: dep)", required=false, metaVar="<string>") private String s_outputExt = "dep"; @Option(name="-l", usage="language (default: english)", required=false, metaVar="<language>") private String s_language = TLanguage.ENGLISH.toString(); @Option(name="-n", usage="if set, normalize empty category indices", required=false, metaVar="<boolean>") private boolean b_normalize = false; @Option(name="-r", usage="if set, traverse parse files recursively", required=false, metaVar="<boolean>") private boolean b_recursive = false; public C2DConvert() {} public C2DConvert(String[] args) throws Exception { BinUtils.initArgs(args, this); List<String> parseFiles = FileUtils.getFileList(s_inputPath, s_parseExt, b_recursive); TLanguage language = TLanguage.getType(s_language); int n; AbstractC2DConverter converter = NLPUtils.getC2DConverter(language, IOUtils.createFileInputStream(s_headruleFile)); AbstractMPAnalyzer analyzer = NLPUtils.getMPAnalyzer(language); for (String parseFile : parseFiles) { n = convert(converter, analyzer, parseFile, s_parseExt, s_propExt, s_nameExt, s_outputExt, b_normalize); System.out.printf("%s: %d trees\n", parseFile, n); } } protected int convert(AbstractC2DConverter converter, AbstractMPAnalyzer analyzer, String parseFile, String parseExt, String propExt, String nameExt, String outputExt, boolean normalize) throws Exception { IntObjectHashMap<List<ObjectIntIntTriple<String>>> mName = getNamedEntityMap(parseFile, parseExt, nameExt); IntObjectHashMap<List<PBInstance>> mProp = getPBInstanceMap(parseFile, parseExt, propExt); PrintStream fout = IOUtils.createBufferedPrintStream(parseFile+"."+outputExt); CTReader reader = new CTReader(IOUtils.createFileInputStream(parseFile)); List<ObjectIntIntTriple<String>> names = null; List<PBInstance> instances = null; CTTree cTree; DEPTree dTree; int n; for (n=0; (cTree = reader.nextTree()) != null; n++) { if (normalize) cTree.normalizeIndices(); if (mProp != null && (instances = mProp.get(n)) != null) initPropBank(cTree, instances); if (mName != null && (names = mName.get(n)) != null) initNamedEntities(cTree, names); dTree = converter.toDEPTree(cTree); if (dTree != null) { if (instances != null) { retainOnyVerbPredicates(dTree); DEPLibEn.postLabel(dTree); } analyzer.process(dTree); fout.println(dTree.toString()+"\n"); } else System.err.println("No token in the tree "+(n+1)+"\n"+cTree.toStringLine()); } reader.close(); fout.close(); return n; } private IntObjectHashMap<List<PBInstance>> getPBInstanceMap(String parseFile, String parseExt, String propExt) { String filename = getFilename(parseFile, parseExt, propExt); return filename != null ? new PBReader(IOUtils.createFileInputStream(filename)).getInstanceMap() : null; } private String getFilename(String parseFile, String parseExt, String otherExt) { if (parseExt == null || otherExt == null) return null; String filename = FileUtils.replaceExtension(parseFile, parseExt, otherExt); if (filename == null || !new File(filename).isFile()) return null; return filename; } private void initPropBank(CTTree tree, List<PBInstance> instances) { for (PBInstance instance : instances) { if (!instance.isTemporaryInstance()) tree.initPBInstance(instance); } } private void retainOnyVerbPredicates(DEPTree tree) { Set<DEPNode> verbs; DEPNode head; SRLArc arc; for (DEPNode node : tree) { if (node.isSemanticHead() && !POSLibEn.isVerb(node.getPOSTag())) { verbs = node.getSemanticHeadSet("AM-PRR"); for (DEPNode arg : tree) { if (node != arg && (arc = arg.getSemanticHeadArc(node)) != null) { head = arg.getHead(); if (verbs.contains(head) || (head = getRCVerb(verbs, arg)) != null) arc.setNode(head); else arg.removeSemanticHead(arc); } } node.clearRolesetID(); } } } private DEPNode getRCVerb(Set<DEPNode> verbs, DEPNode arg) { for (DEPNode verb : verbs) { if (verb.isDependentOf(arg)) return verb; } return null; } private IntObjectHashMap<List<ObjectIntIntTriple<String>>> getNamedEntityMap(String parseFile, String parseExt, String nameExt) throws Exception { String filename = getFilename(parseFile, parseExt, nameExt); if (filename == null) return null; IntObjectHashMap<List<ObjectIntIntTriple<String>>> map = new IntObjectHashMap<>(); BufferedReader fin = IOUtils.createBufferedReader(filename); String[] tmp; String line; int treeID; while ((line = fin.readLine()) != null) { tmp = Splitter.splitSpace(line); treeID = Integer.parseInt(tmp[1]); map.put(treeID, getNamedEntityList(tmp)); } fin.close(); return map; } private List<ObjectIntIntTriple<String>> getNamedEntityList(String[] names) { int i, bIdx, eIdx, size = names.length; List<ObjectIntIntTriple<String>> list = new ArrayList<>(size-2); String[] t0, t1; String ent; for (i=2; i<size; i++) { t0 = Splitter.splitHyphens(names[i]); t1 = Splitter.splitColons(t0[0]); ent = t0[1]; bIdx = Integer.parseInt(t1[0]); eIdx = Integer.parseInt(t1[1]); list.add(new ObjectIntIntTriple<>(ent, bIdx, eIdx)); } return list; } private void initNamedEntities(CTTree cTree, List<ObjectIntIntTriple<String>> names) { if (names == null) return; int i; for (CTNode node : cTree.getTerminalList()) node.setNamedEntityTag(BILOU.O.toString()); for (ObjectIntIntTriple<String> t : names) { if (t.i1 == t.i2) cTree.getTerminal(t.i1).setNamedEntityTag(BILOU.U+"-"+t.o); else { cTree.getTerminal(t.i1).setNamedEntityTag(BILOU.B+"-"+t.o); cTree.getTerminal(t.i2).setNamedEntityTag(BILOU.L+"-"+t.o); for (i=t.i1+1; i<t.i2; i++) cTree.getTerminal(i).setNamedEntityTag(BILOU.I+"-"+t.o); } } } public static void main(String[] args) { try { new C2DConvert(args); } catch (Exception e) {e.printStackTrace();} } }