package edu.stanford.nlp.trees;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.trees.international.pennchinese.CTBErrorCorrectingTreeNormalizer;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;

/**
 * Contains several utility methods to convert constituency trees to
 * dependency trees.
 *
 * Used by {@link GrammaticalStructure#main(String[])}
 */
public class GrammaticalStructureConversionUtils {

  public static final String DEFAULT_PARSER_FILE = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

  /**
   * Print typed dependencies in either the Stanford dependency representation
   * or in the CoNLL-X format.
   *
   * @param gs The grammatical structure the dependencies were extracted from
   * @param deps Typed dependencies to print
   * @param tree Tree corresponding to typed dependencies (only necessary if conllx
   *          == true)
   * @param conllx If true use CoNLL-X format, otherwise use Stanford representation
   * @param extraSep If true, in the Stanford representation, the extra dependencies
   *          (which do not preserve the tree structure) are printed after the
   *          basic dependencies
   * @param convertToUPOS If true, convert the POS tags to universal POS tags and output
   *          them along with the original POS tags
   */
  public static void printDependencies(GrammaticalStructure gs, Collection<TypedDependency> deps, Tree tree,
                                       boolean conllx, boolean extraSep, boolean convertToUPOS) {
    System.out.println(dependenciesToString(gs, deps, tree, conllx, extraSep, convertToUPOS));
  }

  /**
   * Calls dependenciesToCoNLLXString with the basic dependencies
   * from a grammatical structure.
   *
   * (see {@link #dependenciesToCoNLLXString(Collection, CoreMap)})
   */
  public static String dependenciesToCoNLLXString(GrammaticalStructure gs, CoreMap sentence) {
    return dependenciesToCoNLLXString(gs.typedDependencies(), sentence);
  }

  /**
   * Returns a dependency tree in CoNLL-X format.
   * It requires a CoreMap for the sentence with a TokensAnnotation.
   * Each token has to contain a word and a POS tag.
   *
   * @param deps The list of TypedDependency relations.
   * @param sentence The corresponding CoreMap for the sentence.
   * @return Dependency tree in CoNLL-X format.
   */
  public static String dependenciesToCoNLLXString(Collection<TypedDependency> deps, CoreMap sentence) {
    StringBuilder bf = new StringBuilder();

    HashMap<Integer, TypedDependency> indexedDeps = new HashMap<>(deps.size());
    for (TypedDependency dep : deps) {
      indexedDeps.put(dep.dep().index(), dep);
    }

    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (tokens == null) {
      throw new RuntimeException("dependenciesToCoNLLXString: CoreMap does not have required TokensAnnotation.");
    }
    int idx = 1;

    for (CoreLabel token : tokens) {
      String word = token.value();
      String pos = token.tag();
      String cPos = (token.get(CoreAnnotations.CoarseTagAnnotation.class) != null) ?
          token.get(CoreAnnotations.CoarseTagAnnotation.class) : pos;
      String lemma = token.lemma() != null ? token.lemma() : "_";
      Integer gov = indexedDeps.containsKey(idx) ? indexedDeps.get(idx).gov().index() : 0;
      String reln = indexedDeps.containsKey(idx) ? indexedDeps.get(idx).reln().toString() : "erased";
      String out = String.format("%d\t%s\t%s\t%s\t%s\t_\t%d\t%s\t_\t_\n", idx, word, lemma, cPos, pos, gov, reln);
      bf.append(out);
      idx++;
    }
    return bf.toString();
  }
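  // Illustrative sketch (not part of the original API) of the input contract for
  // dependenciesToCoNLLXString(Collection, CoreMap): the CoreMap only needs a
  // TokensAnnotation whose CoreLabels carry a 1-based index, a word, and a POS tag.
  // The single-token sentence built here is hypothetical.
  private static String demoCoNLLXConversion(Collection<TypedDependency> deps) {
    CoreLabel token = new CoreLabel();
    token.setIndex(1);       // CoNLL-X token numbering starts at 1
    token.setValue("died");  // value() is what ends up in the FORM column
    token.setWord("died");
    token.setTag("VBD");     // a POS tag is required for the POSTAG column
    List<CoreLabel> tokens = new ArrayList<>();
    tokens.add(token);
    CoreMap sentence = new CoreLabel();  // a CoreLabel doubles as a CoreMap
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    // Tokens with no incoming dependency in deps get attached to ROOT (index 0) as "erased".
    return dependenciesToCoNLLXString(deps, sentence);
  }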
  public static String dependenciesToString(GrammaticalStructure gs, Collection<TypedDependency> deps, Tree tree,
                                            boolean conllx, boolean extraSep, boolean convertToUPOS) {
    StringBuilder bf = new StringBuilder();

    Map<Integer, Integer> indexToPos = Generics.newHashMap();
    indexToPos.put(0, 0); // to deal with the special node "ROOT"
    List<Tree> gsLeaves = gs.root.getLeaves();
    for (int i = 0; i < gsLeaves.size(); i++) {
      TreeGraphNode leaf = (TreeGraphNode) gsLeaves.get(i);
      indexToPos.put(leaf.label.index(), i + 1);
    }

    if (conllx) {
      List<Tree> leaves = tree.getLeaves();
      List<Label> uposLabels = null;
      if (convertToUPOS) {
        Tree uposTree = UniversalPOSMapper.mapTree(tree);
        uposLabels = uposTree.preTerminalYield();
      } else {
        uposLabels = tree.preTerminalYield();
      }

      int index = 0;
      CoreMap sentence = new CoreLabel();
      List<CoreLabel> tokens = new ArrayList<>(leaves.size());
      for (Tree leaf : leaves) {
        index++;
        if (!indexToPos.containsKey(index)) {
          continue;
        }
        CoreLabel token = new CoreLabel();
        token.setIndex(index);
        token.setValue(leaf.value());
        token.setWord(leaf.value());
        token.setTag(leaf.parent(tree).value());
        token.set(CoreAnnotations.CoarseTagAnnotation.class, uposLabels.get(index - 1).value());
        tokens.add(token);
      }
      sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
      bf.append(dependenciesToCoNLLXString(deps, sentence));
    } else {
      if (extraSep) {
        List<TypedDependency> extraDeps = new ArrayList<>();
        for (TypedDependency dep : deps) {
          if (dep.extra()) {
            extraDeps.add(dep);
          } else {
            bf.append(toStringIndex(dep, indexToPos));
            bf.append("\n");
          }
        }
        // now we print the separator for extra dependencies, and print these if there are some
        if (!extraDeps.isEmpty()) {
          bf.append("======\n");
          for (TypedDependency dep : extraDeps) {
            bf.append(toStringIndex(dep, indexToPos));
            bf.append("\n");
          }
        }
      } else {
        for (TypedDependency dep : deps) {
          bf.append(toStringIndex(dep, indexToPos));
          bf.append("\n");
        }
      }
    }
    return bf.toString();
  }

  private static String toStringIndex(TypedDependency td, Map<Integer, Integer> indexToPos) {
    IndexedWord gov = td.gov();
    IndexedWord dep = td.dep();
    return td.reln() + "(" + gov.value() + "-" + indexToPos.get(gov.index()) + gov.toPrimes()
        + ", " + dep.value() + "-" + indexToPos.get(dep.index()) + dep.toPrimes() + ")";
  }

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(GrammaticalStructureConversionUtils.class);

  private static String[] parseClassConstructArgs(String namePlusArgs) {
    String[] args = StringUtils.EMPTY_STRING_ARRAY;
    String name = namePlusArgs;
    if (namePlusArgs.matches(".*\\([^)]*\\)$")) {
      String argStr = namePlusArgs.replaceFirst("^.*\\(([^)]*)\\)$", "$1");
      args = argStr.split(",");
      name = namePlusArgs.replaceFirst("\\([^)]*\\)$", "");
    }
    String[] tokens = new String[1 + args.length];
    tokens[0] = name;
    System.arraycopy(args, 0, tokens, 1, args.length);
    return tokens;
  }
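  // Illustrative sketch (not part of the original API): parseClassConstructArgs splits a
  // specification such as "MyReader(foo,bar)" into the class name followed by its
  // constructor arguments. The class name used here is hypothetical.
  private static void demoParseClassConstructArgs() {
    String[] toks = parseClassConstructArgs("MyReader(foo,bar)");
    // toks[0] == "MyReader", toks[1] == "foo", toks[2] == "bar"
    assert toks.length == 3;
    // A bare name (no parentheses) yields a single-element array:
    assert parseClassConstructArgs("MyReader").length == 1;
  }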
  private static DependencyReader loadAlternateDependencyReader(String altDepReaderName) {
    Class<? extends DependencyReader> altDepReaderClass = null;
    String[] toks = parseClassConstructArgs(altDepReaderName);
    altDepReaderName = toks[0];
    String[] depReaderArgs = new String[toks.length - 1];
    System.arraycopy(toks, 1, depReaderArgs, 0, toks.length - 1);

    try {
      Class<?> cl = Class.forName(altDepReaderName);
      altDepReaderClass = cl.asSubclass(DependencyReader.class);
    } catch (ClassNotFoundException e) {
      // have a second go below
    }
    if (altDepReaderClass == null) {
      try {
        Class<?> cl = Class.forName("edu.stanford.nlp.trees." + altDepReaderName);
        altDepReaderClass = cl.asSubclass(DependencyReader.class);
      } catch (ClassNotFoundException e) {
        // fall through; failure is handled below
      }
    }
    if (altDepReaderClass == null) {
      log.info("Can't load dependency reader " + altDepReaderName + " or edu.stanford.nlp.trees." + altDepReaderName);
      return null;
    }

    DependencyReader altDepReader; // initialized below
    if (depReaderArgs.length == 0) {
      try {
        altDepReader = altDepReaderClass.newInstance();
      } catch (InstantiationException e) {
        throw new RuntimeException(e);
      } catch (IllegalAccessException e) {
        log.info("The no-argument constructor of " + altDepReaderName + " is not public.");
        return null;
      }
    } else {
      try {
        altDepReader = altDepReaderClass.getConstructor(String[].class).newInstance((Object) depReaderArgs);
      } catch (IllegalArgumentException | SecurityException | InvocationTargetException e) {
        throw new RuntimeException(e);
      } catch (InstantiationException e) {
        e.printStackTrace();
        return null;
      } catch (IllegalAccessException e) {
        log.info("The " + depReaderArgs.length + "-argument constructor of " + altDepReaderName + " is not public.");
        return null;
      } catch (NoSuchMethodException e) {
        log.info("A String[] constructor for " + altDepReaderName + " does not exist.");
        return null;
      }
    }
    return altDepReader;
  }
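  // Illustrative sketch (not part of the original API) of how an -altreader specification
  // maps onto the reflective load above. "my.pkg.MyDepReader" and "opt1" are hypothetical;
  // the method returns null, after logging, if the class cannot be found or constructed.
  private static DependencyReader demoLoadReader() {
    // Equivalent to passing: -altreader my.pkg.MyDepReader(opt1)
    return loadAlternateDependencyReader("my.pkg.MyDepReader(opt1)");
  }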
  private static DependencyPrinter loadAlternateDependencyPrinter(String altDepPrinterName) {
    Class<? extends DependencyPrinter> altDepPrinterClass = null;
    String[] toks = parseClassConstructArgs(altDepPrinterName);
    altDepPrinterName = toks[0];
    String[] depPrintArgs = new String[toks.length - 1];
    System.arraycopy(toks, 1, depPrintArgs, 0, toks.length - 1);

    try {
      Class<?> cl = Class.forName(altDepPrinterName);
      altDepPrinterClass = cl.asSubclass(DependencyPrinter.class);
    } catch (ClassNotFoundException e) {
      // have a second go below
    }
    if (altDepPrinterClass == null) {
      try {
        Class<?> cl = Class.forName("edu.stanford.nlp.trees." + altDepPrinterName);
        altDepPrinterClass = cl.asSubclass(DependencyPrinter.class);
      } catch (ClassNotFoundException e) {
        // fall through; failure is handled below
      }
    }
    if (altDepPrinterClass == null) {
      System.err.printf("Unable to load alternative printer %s or %s. Is your classpath set correctly?%n",
          altDepPrinterName, "edu.stanford.nlp.trees." + altDepPrinterName);
      return null;
    }
    try {
      DependencyPrinter depPrinter;
      if (depPrintArgs.length == 0) {
        depPrinter = altDepPrinterClass.newInstance();
      } else {
        depPrinter = altDepPrinterClass.getConstructor(String[].class).newInstance((Object) depPrintArgs);
      }
      return depPrinter;
    } catch (IllegalArgumentException | SecurityException | InstantiationException
        | IllegalAccessException | InvocationTargetException e) {
      e.printStackTrace();
      return null;
    } catch (NoSuchMethodException e) {
      if (depPrintArgs.length == 0) {
        System.err.printf("Can't find no-argument constructor %s().%n", altDepPrinterName);
      } else {
        System.err.printf("Can't find constructor %s(%s).%n", altDepPrinterName, Arrays.toString(depPrintArgs));
      }
      return null;
    }
  }

  private static Function<List<? extends HasWord>, Tree> loadParser(String parserFile, String parserOptions,
                                                                    boolean makeCopulaHead) {
    if (parserFile == null || "".equals(parserFile)) {
      parserFile = DEFAULT_PARSER_FILE;
      if (parserOptions == null) {
        parserOptions = "-retainTmpSubcategories";
      }
    }
    if (parserOptions == null) {
      parserOptions = "";
    }
    if (makeCopulaHead) {
      parserOptions = "-makeCopulaHead " + parserOptions;
    }
    parserOptions = parserOptions.trim();
    // Load the parser by reflection, so that this class doesn't require the parser
    // for runtime use:
    //   LexicalizedParser lp = LexicalizedParser.loadModel(parserFile);
    // For example, the tregex package uses TreePrint, which uses
    // GrammaticalStructure, which would then import the
    // LexicalizedParser. The tagger can read trees, which means it
    // would depend on tregex and therefore depend on the parser.
    Function<List<? extends HasWord>, Tree> lp;
    try {
      Class<?>[] classes = new Class<?>[] { String.class, String[].class };
      Method method = Class.forName("edu.stanford.nlp.parser.lexparser.LexicalizedParser")
          .getMethod("loadModel", classes);
      String[] opts = StringUtils.EMPTY_STRING_ARRAY;
      if ( ! parserOptions.isEmpty()) {
        opts = parserOptions.split(" +");
      }
      lp = (Function<List<? extends HasWord>, Tree>) method.invoke(null, parserFile, opts);
    } catch (Exception cnfe) {
      throw new RuntimeException(cnfe);
    }
    return lp;
  }
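  // Illustrative sketch (not part of the original API): loadParser returns the parser as a
  // plain Function, so callers never compile against LexicalizedParser. Passing nulls
  // selects the default English model, which is assumed to be on the classpath; the
  // sentence is hypothetical.
  private static Tree demoParse() {
    Function<List<? extends HasWord>, Tree> parser = loadParser(null, null, false);
    List<Word> words = PTBTokenizer.newPTBTokenizer(new StringReader("Sam died today .")).tokenize();
    return parser.apply(words);
  }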
  /**
   * Allows a collection of trees, that is, a Treebank, to appear to be a collection
   * of GrammaticalStructures.
   *
   * @author danielcer
   */
  private static class TreeBankGrammaticalStructureWrapper implements Iterable<GrammaticalStructure> {

    private final Iterable<Tree> trees;
    private final boolean keepPunct;
    private final TreebankLangParserParams params;

    private final Map<GrammaticalStructure, Tree> origTrees = new WeakHashMap<>();

    public TreeBankGrammaticalStructureWrapper(Iterable<Tree> wrappedTrees, boolean keepPunct,
                                               TreebankLangParserParams params) {
      trees = wrappedTrees;
      this.keepPunct = keepPunct;
      this.params = params;
    }

    @Override
    public Iterator<GrammaticalStructure> iterator() {
      return new GsIterator();
    }

    public Tree getOriginalTree(GrammaticalStructure gs) {
      return origTrees.get(gs);
    }

    private class GsIterator implements Iterator<GrammaticalStructure> {

      private final Iterator<Tree> tbIterator = trees.iterator();
      private final Predicate<String> puncFilter;
      private final HeadFinder hf;
      private GrammaticalStructure next;

      public GsIterator() {
        if (keepPunct) {
          puncFilter = Filters.acceptFilter();
        } else if (params.generateOriginalDependencies()) {
          puncFilter = params.treebankLanguagePack().punctuationWordRejectFilter();
        } else {
          puncFilter = params.treebankLanguagePack().punctuationTagRejectFilter();
        }
        hf = params.typedDependencyHeadFinder();
        primeGs();
      }

      private void primeGs() {
        GrammaticalStructure gs = null;
        while (gs == null && tbIterator.hasNext()) {
          Tree t = tbIterator.next();
          // log.info("GsIterator: Next tree is");
          // log.info(t);
          if (t == null) {
            continue;
          }
          try {
            gs = params.getGrammaticalStructure(t, puncFilter, hf);
            origTrees.put(gs, t);
            next = gs;
            return;
          } catch (NullPointerException npe) {
            log.info("Bad tree caused the dump below. Continuing....");
            log.info(t);
            npe.printStackTrace();
          }
        }
        next = null;
      }

      @Override
      public boolean hasNext() {
        return next != null;
      }

      @Override
      public GrammaticalStructure next() {
        GrammaticalStructure ret = next;
        if (ret == null) {
          throw new NoSuchElementException();
        }
        primeGs();
        return ret;
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    }
  } // end static class TreeBankGrammaticalStructureWrapper
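  // Illustrative sketch (not part of the original API): the wrapper lets a Treebank be
  // consumed as a stream of GrammaticalStructures while keeping a handle on each source
  // tree. The treebank and params are assumed to be initialized elsewhere.
  private static void demoWrapper(Iterable<Tree> treebank, TreebankLangParserParams params) {
    TreeBankGrammaticalStructureWrapper gsBank =
        new TreeBankGrammaticalStructureWrapper(treebank, true, params);  // true: keep punctuation
    for (GrammaticalStructure gs : gsBank) {
      Tree original = gsBank.getOriginalTree(gs);  // the constituency tree gs was built from
      System.out.println(original + " -> " + gs);
    }
  }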
  /**
   * Enum to identify the supported conversion options. Each option bundles a language
   * abbreviation, a TreeNormalizer, the TreebankLangParserParams class to load, and
   * flags for the output representation. To add a new option, add it to the list below;
   * {@code getConverterOptions} finds it by its abbreviation or enum name.
   */
  public enum ConverterOptions {

    UniversalEnglish("en", new NPTmpRetainingTreeNormalizer(0, false, 1, false),
        "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams", false, true),
    UniversalChinese("zh", new CTBErrorCorrectingTreeNormalizer(false, false, false, false),
        "edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams", false, false),
    English("en-sd", new NPTmpRetainingTreeNormalizer(0, false, 1, false),
        "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams", true, true),
    Chinese("zh-sd", new CTBErrorCorrectingTreeNormalizer(false, false, false, false),
        "edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams", true, false);

    public final String abbreviation;
    public final TreeNormalizer treeNormalizer;
    public final String tlPPClassName;
    /* Generate old Stanford Dependencies instead of UD, when set to true. */
    public final boolean stanfordDependencies;
    /* Conversion to UPOS is currently only supported for English. */
    public final boolean convertToUPOS;

    ConverterOptions(String abbreviation, TreeNormalizer treeNormalizer, String tlPPClassName,
                     boolean stanfordDependencies, boolean convertToUPOS) {
      this.abbreviation = abbreviation;
      this.treeNormalizer = treeNormalizer;
      this.tlPPClassName = tlPPClassName;
      this.stanfordDependencies = stanfordDependencies;
      this.convertToUPOS = convertToUPOS;
    }

    private static final Map<String, ConverterOptions> nameToConverterOptionsMap = initializeNameMap();

    private static Map<String, ConverterOptions> initializeNameMap() {
      Map<String, ConverterOptions> map = Generics.newHashMap();
      for (ConverterOptions opts : ConverterOptions.values()) {
        if (opts.abbreviation != null) {
          map.put(opts.abbreviation.toUpperCase(), opts);
        }
        map.put(opts.toString().toUpperCase(), opts);
      }
      return Collections.unmodifiableMap(map);
    }

    public static ConverterOptions getConverterOptions(String language) {
      if (language == null) {
        return nameToConverterOptionsMap.get("EN");
      }
      ConverterOptions opts = nameToConverterOptionsMap.get(language.toUpperCase());
      return opts != null ? opts : nameToConverterOptionsMap.get("EN");
    }
  }
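  // Illustrative sketch (not part of the original API): the language lookup is
  // case-insensitive over both the abbreviation and the enum name, and falls back to
  // Universal English for unknown languages. "klingon" is a deliberately unknown key.
  private static void demoConverterOptions() {
    ConverterOptions zh = ConverterOptions.getConverterOptions("zh");        // UniversalChinese
    ConverterOptions sd = ConverterOptions.getConverterOptions("en-sd");     // English (Stanford Dependencies)
    ConverterOptions dflt = ConverterOptions.getConverterOptions("klingon"); // falls back to UniversalEnglish
    System.out.println(zh + " " + sd + " " + dflt);
  }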
  /**
   * Given sentences or trees, output the typed dependencies.
   * <p>
   * By default, the method outputs the collapsed typed dependencies with
   * processing of conjuncts. The input can be given as plain text (one sentence
   * per line) using the option -sentFile, or as trees using the option
   * -treeFile. For -sentFile, the input has to be strictly one sentence per
   * line. You can specify where to find a parser with -parserFile
   * serializedParserPath. See LexicalizedParser for more flexible processing of
   * text files (including with Stanford Dependencies output). The above options
   * assume a file as input. You can also feed trees (only) via stdin by using
   * the option -filter. If one does not specify a -parserFile, one
   * can specify which language pack to use with -tLPP. This option
   * specifies a class which determines which GrammaticalStructure to
   * use, which HeadFinder to use, etc. It will default to
   * edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams,
   * but any TreebankLangParserParams can be specified.
   * <p>
   * If no method of producing trees is given other than to use the
   * LexicalizedParser, but no parser is specified, the default English parser
   * is used. You can specify options to load with the parser using the
   * -parserOpts flag. If the default parser is used, and no options are
   * provided, the option -retainTmpSubcategories is used.
   * <p>
   * The following options can be used to specify the types of dependencies
   * wanted:
   * <ul>
   * <li> -collapsed collapsed dependencies
   * <li> -basic non-collapsed dependencies that preserve a tree structure
   * <li> -nonCollapsed non-collapsed dependencies that do not preserve a tree
   * structure (the basic dependencies plus the extra ones)
   * <li> -CCprocessed
   * collapsed dependencies and conjunctions processed (dependencies are added
   * for each conjunct) -- this is the default if no options are passed
   * <li> -collapsedTree collapsed dependencies retaining a tree structure
   * <li> -makeCopulaHead Contrary to the approach argued for in the SD papers,
   * nevertheless make the verb 'to be' the head, not the predicate noun, adjective,
   * etc. (However, when the verb 'to be' is used as an auxiliary verb, the main
   * verb is still treated as the head.)
   * <li> -originalDependencies generate the dependencies using the original converter
   * instead of the Universal Dependencies converter.
   * </ul>
   * <p>
   * The {@code -conllx} option will output the dependencies in the CoNLL-X format,
   * instead of in the standard Stanford format (relation(governor, dependent)),
   * and will retain punctuation by default.
   * When used in the "collapsed" format, words such as prepositions and conjunctions
   * which get collapsed into the grammatical relations, and are no longer part of the
   * sentence per se, will be annotated with "erased" as grammatical relation
   * and attached to the fake "ROOT" node with index 0.
   * <p>
   * There is also an option to retain dependencies involving punctuation:
   * {@code -keepPunct}
   * <p>
   * The {@code -extraSep} option, used with -nonCollapsed, will print the basic
   * dependencies first, then a separator ======, and then the extra
   * dependencies that do not preserve the tree structure. The -test option is
   * used for debugging: it prints the grammatical structure, as well as the
   * basic, collapsed and CCprocessed dependencies. It also checks the
   * connectivity of the collapsed dependencies. If the collapsed dependencies
   * list doesn't constitute a connected graph, it prints the possible offending
   * nodes (one of them is the real root of the graph).
   * <p>
   * Using -conllxFile, you can pass a file containing Stanford dependencies
   * in the CoNLL format (e.g., the basic dependencies), and obtain another
   * representation using one of the representation options.
   * <p>
   * Usage: <br>
   * <code>java edu.stanford.nlp.trees.GrammaticalStructure [-treeFile FILE | -sentFile FILE | -conllxFile FILE | -filter] <br>
   * [-collapsed -basic -CCprocessed -test -originalDependencies]</code>
   *
   * @param args Command-line arguments, as above
   * @param defaultLang The language to use when none is given with -language
   */
  @SuppressWarnings("unchecked")
  public static void convertTrees(String[] args, String defaultLang) {

    /* Use a tree normalizer that removes all empty nodes.
       This prevents wrong indexing of the nodes in the dependency relations. */
    Iterable<GrammaticalStructure> gsBank = null;
    Properties props = StringUtils.argsToProperties(args);

    String language = props.getProperty("language", defaultLang);
    ConverterOptions opts = ConverterOptions.getConverterOptions(language);

    MemoryTreebank tb = new MemoryTreebank(opts.treeNormalizer);
    Iterable<Tree> trees = tb;

    String encoding = props.getProperty("encoding", "utf-8");
    try {
      System.setOut(new PrintStream(System.out, true, encoding));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }

    String treeFileName = props.getProperty("treeFile");
    String sentFileName = props.getProperty("sentFile");
    String conllXFileName = props.getProperty("conllxFile");
    String altDepPrinterName = props.getProperty("altprinter");
    String altDepReaderName = props.getProperty("altreader");
    String altDepReaderFilename = props.getProperty("altreaderfile");
    String filter = props.getProperty("filter");

    boolean makeCopulaHead = props.getProperty("makeCopulaHead") != null;
    boolean generateOriginalDependencies = props.getProperty("originalDependencies") != null || opts.stanfordDependencies;

    // TODO: if a parser is specified, load this from the parser
    // instead of ever loading it from this way
    String tLPP = props.getProperty("tLPP", opts.tlPPClassName);
    TreebankLangParserParams params = ReflectionLoading.loadByReflection(tLPP);
    params.setGenerateOriginalDependencies(generateOriginalDependencies);
    if (makeCopulaHead) {
      // TODO: generalize and allow for more options
      String[] options = { "-makeCopulaHead" };
      params.setOptionFlag(options, 0);
    }

    if (sentFileName == null && (altDepReaderName == null || altDepReaderFilename == null)
        && treeFileName == null && conllXFileName == null && filter == null) {
      try {
        System.err.printf("Usage: java %s%n", GrammaticalStructure.class.getCanonicalName());
        System.err.println("Options:");
        System.err.println("  Dependency representation:");
        System.err.println("    -basic:\t\tGenerate basic dependencies.");
        System.err.println("    -enhanced:\t\tGenerate enhanced dependencies, currently only implemented for English UD.");
        System.err.println("    -enhanced++:\tGenerate enhanced++ dependencies (default), currently only implemented for English UD.");
        System.err.println("    -collapsed:\t\tGenerate collapsed dependencies, deprecated.");
        System.err.println("    -CCprocessed:\tGenerate CC-processed dependencies, deprecated.");
        System.err.println("    -collapsedTree:\tGenerate collapsed-tree dependencies, deprecated.");
        System.err.println("");
        System.err.println("  Input:");
        System.err.println("    -treeFile <FILE>:\tConvert from constituency trees in <FILE>.");
        System.err.println("    -sentFile <FILE>:\tParse and convert sentences from <FILE>. Only implemented for English.");
        System.err.println("");
        System.err.println("  Output:");
        System.err.println("    -conllx:\t\tOutput dependencies in CoNLL-X format.");
        System.err.println("");
        System.err.println("  Language:");
        System.err.println("    -language [en|zh|en-sd|zh-sd]:\t(Universal English Dependencies, Universal Chinese Dependencies, English Stanford Dependencies, Chinese Stanford Dependencies)");
        System.err.println("");
        System.err.println("Example:");
        TreeReader tr = new PennTreeReader(new StringReader("((S (NP (NNP Sam)) (VP (VBD died) (NP-TMP (NN today)))))"));
        tb.add(tr.readTree());
      } catch (Exception e) {
        log.info("Horrible error: " + e);
        e.printStackTrace();
      }
    } else if (altDepReaderName != null && altDepReaderFilename != null) {
      DependencyReader altDepReader = loadAlternateDependencyReader(altDepReaderName);
      if (altDepReader == null) {
        return; // loadAlternateDependencyReader has already logged the problem
      }
      try {
        gsBank = altDepReader.readDependencies(altDepReaderFilename);
      } catch (IOException e) {
        log.info("Error reading " + altDepReaderFilename);
        return;
      }
    } else if (treeFileName != null) {
      tb.loadPath(treeFileName);
    } else if (filter != null) {
      tb.load(IOUtils.readerFromStdin());
    } else if (conllXFileName != null) {
      try {
        gsBank = params.readGrammaticalStructureFromFile(conllXFileName);
      } catch (RuntimeIOException e) {
        log.info("Error reading " + conllXFileName);
        return;
      }
    } else {
      String parserFile = props.getProperty("parserFile");
      String parserOpts = props.getProperty("parserOpts");
      boolean tokenized = props.getProperty("tokenized") != null;
      Function<List<? extends HasWord>, Tree> lp = loadParser(parserFile, parserOpts, makeCopulaHead);
      trees = new LazyLoadTreesByParsing(sentFileName, encoding, tokenized, lp);

      // Instead of getting this directly from the LP, use reflection
      // so that a package which uses GrammaticalStructure doesn't
      // necessarily have to use LexicalizedParser.
      try {
        Method method = lp.getClass().getMethod("getTLPParams");
        params = (TreebankLangParserParams) method.invoke(lp);
        params.setGenerateOriginalDependencies(generateOriginalDependencies);
      } catch (Exception cnfe) {
        throw new RuntimeException(cnfe);
      }
    }

    // treat the output according to the options passed
    boolean basic = props.getProperty("basic") != null;
    boolean collapsed = props.getProperty("collapsed") != null;
    boolean CCprocessed = props.getProperty("CCprocessed") != null;
    boolean collapsedTree = props.getProperty("collapsedTree") != null;
    boolean nonCollapsed = props.getProperty("nonCollapsed") != null;
    boolean extraSep = props.getProperty("extraSep") != null;
    boolean parseTree = props.getProperty("parseTree") != null;
    boolean test = props.getProperty("test") != null;
    boolean keepPunct = true; // always keep punctuation marks
    boolean conllx = props.getProperty("conllx") != null;
    // todo: Support checkConnected on more options (including basic)
    boolean checkConnected = props.getProperty("checkConnected") != null;
    boolean portray = props.getProperty("portray") != null;
    boolean enhanced = props.getProperty("enhanced") != null;
    boolean enhancedPlusPlus = props.getProperty("enhanced++") != null;

    // If requested, load the alternative printer
    DependencyPrinter altDepPrinter = null;
    if (altDepPrinterName != null) {
      altDepPrinter = loadAlternateDependencyPrinter(altDepPrinterName);
    }

    // log.info("First tree in tb is");
    // log.info(((MemoryTreebank) tb).get(0));

    Method m = null;
    if (test) {
      // see if we can use SemanticGraph(Factory) to check for being a DAG.
      // Do this by reflection to avoid this becoming a dependency when we distribute the parser.
      try {
        Class<?> sgf = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphFactory");
        m = sgf.getDeclaredMethod("makeFromTree", GrammaticalStructure.class, boolean.class, boolean.class,
            boolean.class, boolean.class, boolean.class, boolean.class, Predicate.class, String.class, int.class);
      } catch (Exception e) {
        log.info("Test cannot check for cycles in tree format (classes not available)");
      }
    }

    if (gsBank == null) {
      gsBank = new TreeBankGrammaticalStructureWrapper(trees, keepPunct, params);
    }

    for (GrammaticalStructure gs : gsBank) {

      Tree tree;
      if (gsBank instanceof TreeBankGrammaticalStructureWrapper) {
        // log.info("Using TreeBankGrammaticalStructureWrapper branch");
        tree = ((TreeBankGrammaticalStructureWrapper) gsBank).getOriginalTree(gs);
      } else {
        // log.info("Using gs.root() branch");
        tree = gs.root(); // recover tree
      }

      if (test) {
        // print the grammatical structure, the basic, collapsed and CCprocessed dependencies
        System.out.println("============= parse tree =======================");
        tree.pennPrint();
        System.out.println();

        System.out.println("------------- GrammaticalStructure -------------");
        System.out.println(gs);

        boolean allConnected = true;
        boolean connected;
        Collection<TypedDependency> bungRoots = null;
        System.out.println("------------- basic dependencies ---------------");
        List<TypedDependency> gsb = gs.typedDependencies(GrammaticalStructure.Extras.NONE);
        System.out.println(StringUtils.join(gsb, "\n"));
        connected = GrammaticalStructure.isConnected(gsb);
        if ( ! connected && bungRoots == null) {
          bungRoots = GrammaticalStructure.getRoots(gsb);
        }
        allConnected = connected && allConnected;

        System.out.println("------------- non-collapsed dependencies (basic + extra) ---------------");
        List<TypedDependency> gse = gs.typedDependencies(GrammaticalStructure.Extras.MAXIMAL);
        System.out.println(StringUtils.join(gse, "\n"));
        connected = GrammaticalStructure.isConnected(gse);
        if ( ! connected && bungRoots == null) {
          bungRoots = GrammaticalStructure.getRoots(gse);
        }
        allConnected = connected && allConnected;

        System.out.println("------------- collapsed dependencies -----------");
        System.out.println(StringUtils.join(gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), "\n"));

        System.out.println("------------- collapsed dependencies tree -----------");
        System.out.println(StringUtils.join(gs.typedDependenciesCollapsedTree(), "\n"));

        System.out.println("------------- CCprocessed dependencies --------");
        List<TypedDependency> gscc = gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL);
        System.out.println(StringUtils.join(gscc, "\n"));

        System.out.println("-----------------------------------------------");
        // connectivity tests
        connected = GrammaticalStructure.isConnected(gscc);
        if ( ! connected && bungRoots == null) {
          bungRoots = GrammaticalStructure.getRoots(gscc);
        }
        allConnected = connected && allConnected;
        if (allConnected) {
          System.out.println("dependencies form connected graphs.");
        } else {
          System.out.println("dependency graph NOT connected! possible offending nodes: " + bungRoots);
        }

        // test for collapsed dependencies being a tree:
        // make sure at least it doesn't contain cycles (i.e., is a DAG).
        // Do this by reflection so the parser doesn't need SemanticGraph and its libraries.
        if (m != null) {
          try {
            // the first arg is null because it's a static method
            Object semGraph = m.invoke(null, gs, false, true, false, false, false, false, null, null, 0);
            Class<?> sg = Class.forName("edu.stanford.nlp.semgraph.SemanticGraph");
            Method mDag = sg.getDeclaredMethod("isDag");
            boolean isDag = (Boolean) mDag.invoke(semGraph);

            System.out.println("tree dependencies form a DAG: " + isDag);
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      } // end of "test" output
      else {
        if (parseTree) {
          System.out.println("============= parse tree =======================");
          tree.pennPrint();
          System.out.println();
        }

        if (basic) {
          if (collapsed || CCprocessed || collapsedTree || nonCollapsed || enhanced || enhancedPlusPlus) {
            System.out.println("------------- basic dependencies ---------------");
          }
          if (altDepPrinter == null) {
            printDependencies(gs, gs.typedDependencies(GrammaticalStructure.Extras.NONE), tree, conllx, false, opts.convertToUPOS);
          } else {
            System.out.println(altDepPrinter.dependenciesToString(gs, gs.typedDependencies(GrammaticalStructure.Extras.NONE), tree));
          }
        }

        if (nonCollapsed) {
          if (basic || CCprocessed || collapsed || collapsedTree) {
            System.out.println("----------- non-collapsed dependencies (basic + extra) -----------");
          }
          printDependencies(gs, gs.allTypedDependencies(), tree, conllx, extraSep, opts.convertToUPOS);
        }

        if (collapsed) {
          if (basic || CCprocessed || collapsedTree || nonCollapsed) {
            System.out.println("----------- collapsed dependencies -----------");
          }
          printDependencies(gs, gs.typedDependenciesCollapsed(GrammaticalStructure.Extras.MAXIMAL), tree, conllx, false, opts.convertToUPOS);
        }

        if (CCprocessed) {
          if (basic || collapsed || collapsedTree || nonCollapsed) {
            System.out.println("---------- CCprocessed dependencies ----------");
          }
          List<TypedDependency> deps = gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL);
          if (checkConnected) {
            if (!GrammaticalStructure.isConnected(deps)) {
              log.info("Graph is not connected for:");
              log.info(tree);
              log.info("possible offending nodes: " + GrammaticalStructure.getRoots(deps));
            }
          }
          printDependencies(gs, deps, tree, conllx, false, opts.convertToUPOS);
        }

        if (collapsedTree) {
          if (basic || CCprocessed || collapsed || nonCollapsed) {
            System.out.println("----------- collapsed dependencies tree -----------");
          }
          printDependencies(gs, gs.typedDependenciesCollapsedTree(), tree, conllx, false, opts.convertToUPOS);
        }

        if (enhanced) {
          if (basic || enhancedPlusPlus) {
            System.out.println("----------- enhanced dependencies -----------");
          }
          printDependencies(gs, gs.typedDependenciesEnhanced(), tree, conllx, false, opts.convertToUPOS);
        }

        if (enhancedPlusPlus) {
          if (basic || enhanced) {
            System.out.println("----------- enhanced++ dependencies -----------");
          }
          printDependencies(gs, gs.typedDependenciesEnhancedPlusPlus(), tree, conllx, false, opts.convertToUPOS);
        }

        // default use: enhanced++ for UD, CCprocessed for SD (to parallel what happens within the parser)
        if (!basic && !collapsed && !CCprocessed && !collapsedTree && !nonCollapsed && !enhanced && !enhancedPlusPlus) {
          // System.out.println("----------- CCprocessed dependencies -----------");
          if (generateOriginalDependencies) {
            printDependencies(gs, gs.typedDependenciesCCprocessed(GrammaticalStructure.Extras.MAXIMAL), tree, conllx, false, opts.convertToUPOS);
          } else {
            printDependencies(gs, gs.typedDependenciesEnhancedPlusPlus(), tree, conllx, false, opts.convertToUPOS);
          }
        }
      }

      if (portray) {
        try {
          // put up a window showing it
          Class<?> sgu = Class.forName("edu.stanford.nlp.semgraph.SemanticGraphUtils");
          Method mRender = sgu.getDeclaredMethod("render", GrammaticalStructure.class, String.class);
          // the first arg is null because it's a static method
          mRender.invoke(null, gs, "Collapsed, CC processed deps");
        } catch (Exception e) {
          throw new RuntimeException("Couldn't use swing to portray semantic graph", e);
        }
      }
    } // end for
  } // end convertTrees

  // todo [cdm 2013]: Take this out and make it a trees class: TreeIterableByParsing
  static class LazyLoadTreesByParsing implements Iterable<Tree> {

    final Reader reader;
    final String filename;
    final boolean tokenized;
    final String encoding;
    final Function<List<? extends HasWord>, Tree> lp;

    public LazyLoadTreesByParsing(String filename, String encoding, boolean tokenized,
                                  Function<List<? extends HasWord>, Tree> lp) {
      this.filename = filename;
      this.encoding = encoding;
      this.reader = null;
      this.tokenized = tokenized;
      this.lp = lp;
    }

    @Override
    public Iterator<Tree> iterator() {
      final BufferedReader iReader;
      if (reader != null) {
        iReader = new BufferedReader(reader);
      } else {
        try {
          iReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), encoding));
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }

      return new Iterator<Tree>() {

        String line; // = null;

        @Override
        public boolean hasNext() {
          if (line != null) {
            return true;
          } else {
            try {
              line = iReader.readLine();
            } catch (IOException e) {
              throw new RuntimeException(e);
            }
            if (line == null) {
              try {
                if (reader == null) iReader.close();
              } catch (Exception e) {
                throw new RuntimeException(e);
              }
              return false;
            }
            return true;
          }
        }

        @Override
        public Tree next() {
          if (line == null) {
            throw new NoSuchElementException();
          }
          Reader lineReader = new StringReader(line);
          line = null;
          List<Word> words;
          if (tokenized) {
            words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize();
          } else {
            words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize();
          }
          if ( ! words.isEmpty()) {
            // the parser throws an exception if told to parse an empty sentence
            Tree parseTree = lp.apply(words);
            return parseTree;
          } else {
            return new SimpleTree();
          }
        }

        @Override
        public void remove() {
          throw new UnsupportedOperationException();
        }

      };
    }

  } // end static class LazyLoadTreesByParsing

}
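// Illustrative command-line sketch (the input file name is hypothetical), matching the
// usage documented on convertTrees above: reads constituency trees from wsj.mrg and
// prints basic dependencies in CoNLL-X format:
//
//   java edu.stanford.nlp.trees.GrammaticalStructure -treeFile wsj.mrg -basic -conllx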