package edu.stanford.nlp.trees.international.pennchinese;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams;
import edu.stanford.nlp.parser.ViterbiParserWithOptions;
import edu.stanford.nlp.trees.*;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Filters;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import java.io.*;
import java.util.*;
import java.lang.reflect.Constructor;
import static edu.stanford.nlp.trees.GrammaticalRelation.DEPENDENT;
/**
 * A {@link GrammaticalStructure} for Chinese that uses the relation inventory
 * of {@link UniversalChineseGrammaticalRelations}.
 *
 * <p>Instances are built either from a constituency {@link Tree} (the
 * {@code Tree}-based constructors) or from an already-extracted list of typed
 * dependencies (the CoNLL-X constructor and the static CoNLL-X helpers).
 *
 * @author Galen Andrew
 * @author Pi-Chuan Chang
 * @author Daniel Cer - support for printing CoNLL-X format, encoding update,
 *                      and preliminary changes to make
 *                      ChineseGrammaticalStructure behave more like
 *                      EnglishGrammaticalStructure on the command line
 *                      (ultimately, both classes should probably use the same
 *                      abstract main method).
 */
public class UniversalChineseGrammaticalStructure extends GrammaticalStructure {

  /** A logger for this class. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(UniversalChineseGrammaticalStructure.class);

  /** Head finder used by the constructors that do not take one explicitly. */
  private static final HeadFinder shf = new UniversalChineseSemanticHeadFinder();
  //private static HeadFinder shf = new ChineseHeadFinder();

  /**
   * Construct a new {@code GrammaticalStructure} from an
   * existing parse tree. The new {@code GrammaticalStructure}
   * has the same tree structure and label values as the given tree
   * (but no shared storage). As part of construction, the parse tree
   * is analyzed using definitions from {@link GrammaticalRelation}
   * to populate the new {@code GrammaticalStructure} with as many labeled
   * grammatical relations as it can.
   *
   * <p>Uses the punctuation word reject filter of a fresh
   * {@link ChineseTreebankLanguagePack} and the class's default head finder.
   *
   * @param t Tree to process
   */
  public UniversalChineseGrammaticalStructure(Tree t) {
    this(t, new ChineseTreebankLanguagePack().punctuationWordRejectFilter());
  }

  /**
   * Construct a structure from a parse tree using the class's default head
   * finder and the given punctuation filter.
   *
   * @param t Tree to process
   * @param puncFilter Punctuation filter handed to the superclass constructor
   */
  public UniversalChineseGrammaticalStructure(Tree t, Predicate<String> puncFilter) {
    this(t, puncFilter, shf);
  }

  /**
   * Construct a structure from a parse tree using the given head finder.
   *
   * <p>NOTE(review): this passes a {@code null} punctuation filter to the
   * superclass constructor; confirm that the superclass tolerates that.
   *
   * @param t Tree to process
   * @param hf Head finder handed to the superclass constructor
   */
  public UniversalChineseGrammaticalStructure(Tree t, HeadFinder hf) {
    this(t, null, hf);
  }

  /**
   * Construct a structure from a parse tree with an explicit punctuation
   * filter and head finder. The relation set and its lock come from
   * {@link UniversalChineseGrammaticalRelations}; no tree transformer is
   * supplied and the tag filter accepts everything.
   *
   * @param t Tree to process
   * @param puncFilter Punctuation filter handed to the superclass constructor
   * @param hf Head finder handed to the superclass constructor
   */
  public UniversalChineseGrammaticalStructure(Tree t, Predicate<String> puncFilter, HeadFinder hf) {
    super(t,
          UniversalChineseGrammaticalRelations.values(),
          UniversalChineseGrammaticalRelations.valuesLock(),
          null,
          hf,
          puncFilter,
          Filters.acceptFilter());
  }

  /**
   * Used for postprocessing CoNLL X dependencies.
   *
   * @param projectiveDependencies Typed dependencies to wrap
   * @param root Root node handed to the superclass constructor
   */
  public UniversalChineseGrammaticalStructure(List<TypedDependency> projectiveDependencies, TreeGraphNode root) {
    super(projectiveDependencies, root);
  }

  /**
   * Collapses the given dependencies in place. Only preposition collapsing
   * (see {@link #collapsePrepAndPoss}) is applied; the {@code CCprocess} and
   * {@code includeExtras} arguments are not consulted by this implementation.
   *
   * @param list Dependencies to collapse; modified in place
   * @param CCprocess Ignored
   * @param includeExtras Ignored
   */
  @Override
  protected void collapseDependencies(List<TypedDependency> list, boolean CCprocess, Extras includeExtras) {
    // Conjunction collapsing (collapseConj) and multiword-preposition
    // collapsing (collapseMultiwordPreps) are intentionally not invoked here.
    collapsePrepAndPoss(list);
  }

  /**
   * Collapses pairs of generic {@code dep} relations that meet at a word
   * tagged {@code "P"} into a single relation named after that word.
   *
   * <p>For {@code dep(g, p)} and {@code dep(p, d)} where {@code p} is tagged
   * {@code "P"}, both are removed and replaced by {@code r(g, d)}, where
   * {@code r} is looked up (or, failing that, created) from the value of
   * {@code p}. Any other dependency that was governed by {@code p} is
   * re-attached to {@code g}. The collection is rewritten in place: newly
   * created dependencies come first, followed by the surviving originals in
   * their original order.
   *
   * <p>A dependent without a POS tag is simply never treated as a
   * preposition.
   *
   * @param list Dependencies to collapse; modified in place
   */
  private static void collapsePrepAndPoss(Collection<TypedDependency> list) {
    Collection<TypedDependency> newTypedDeps = new ArrayList<>();

    // Construct a map from words to the set of typed
    // dependencies in which the word appears as governor.
    Map<IndexedWord, Set<TypedDependency>> map = Generics.newHashMap();
    for (TypedDependency typedDep : list) {
      Set<TypedDependency> governed = map.get(typedDep.gov());
      if (governed == null) {
        governed = Generics.<TypedDependency>newHashSet();
        map.put(typedDep.gov(), governed);
      }
      governed.add(typedDep);
    }
    //log.info("here's the map: " + map);

    for (TypedDependency td1 : list) {
      if (td1.reln() == GrammaticalRelation.KILL) {
        continue; // already consumed by an earlier collapse
      }

      IndexedWord td1Dep = td1.dep();
      String td1DepPOS = td1Dep.tag();

      // find all other typedDeps having our dep as gov
      Set<TypedDependency> possibles = map.get(td1Dep);
      if (possibles == null) {
        continue;
      }

      // look for the "second half"
      for (TypedDependency td2 : possibles) {
        // Constant-first comparison: the tag may be null when no POS tag was
        // set on the word, and that must not abort the whole collapse.
        if (td1.reln() == DEPENDENT && td2.reln() == DEPENDENT && "P".equals(td1DepPOS)) {
          GrammaticalRelation td3reln = UniversalChineseGrammaticalRelations.valueOf(td1Dep.value());
          if (td3reln == null) {
            td3reln = GrammaticalRelation.valueOf(Language.UniversalChinese, td1Dep.value());
          }
          TypedDependency td3 = new TypedDependency(td3reln, td1.gov(), td2.dep());
          //log.info("adding: " + td3);
          newTypedDeps.add(td3);
          // Remember these are "used up". Once td1 is marked KILL the test
          // above can no longer succeed, so at most one pair is collapsed
          // per td1.
          td1.setReln(GrammaticalRelation.KILL);
          td2.setReln(GrammaticalRelation.KILL);
        }
      }

      // Now we need to see if there any TDs that will be "orphaned"
      // by this collapse.  Example: if we have:
      //   dep(drew, on)
      //   dep(on, book)
      //   dep(on, right)
      // the first two will be collapsed to on(drew, book), but then
      // the third one will be orphaned, since its governor no
      // longer appears.  So, change its governor to 'drew'.
      if (td1.reln().equals(GrammaticalRelation.KILL)) {
        for (TypedDependency td2 : possibles) {
          if (!td2.reln().equals(GrammaticalRelation.KILL)) {
            //log.info("td1 & td2: " + td1 + " & " + td2);
            td2.setGov(td1.gov());
          }
        }
      }
    }

    // now copy remaining unkilled TDs from here to new
    for (TypedDependency td : list) {
      if (!td.reln().equals(GrammaticalRelation.KILL)) {
        newTypedDeps.add(td);
      }
    }

    list.clear(); // forget all (esp. killed) TDs
    list.addAll(newTypedDeps);
  }

  /**
   * Command-line entry point. If a {@code sentFile} property is present in the
   * arguments, an error is logged and nothing else happens; otherwise the
   * arguments are handed to
   * {@link GrammaticalStructureConversionUtils#convertTrees} with language
   * code {@code "zh"}.
   *
   * @param args Command-line arguments
   */
  public static void main(String[] args) {
    Properties params = StringUtils.argsToProperties(args);
    if (params.getProperty("sentFile") != null) {
      log.error("Parsing sentences to constituency trees is not supported for Chinese. " +
          "Please parse your sentences first and then convert them to dependency trees using the -treeFile option." );
      return;
    }
    GrammaticalStructureConversionUtils.convertTrees(args, "zh");
  }

  /**
   * Reads grammatical structures from a CoNLL-X file, resolving relation names
   * through {@link UniversalChineseGrammaticalRelations#shortNameToGRel} and
   * building instances with a {@link FromDependenciesFactory}.
   *
   * @param fileName Path of the CoNLL-X file to read
   * @return The structures produced by the superclass reader
   * @throws IOException Propagated from the superclass reader
   */
  public static List<GrammaticalStructure> readCoNLLXGrammaticalStructureCollection(String fileName) throws IOException {
    return readCoNLLXGrammaticalStructureCollection(
        fileName, UniversalChineseGrammaticalRelations.shortNameToGRel, new FromDependenciesFactory());
  }

  /**
   * Builds a single structure from CoNLL-X token fields, resolving relation
   * names through {@link UniversalChineseGrammaticalRelations#shortNameToGRel}.
   *
   * @param tokenFields One list of CoNLL-X fields per token
   * @return The structure built by the superclass helper, cast to this class
   */
  public static UniversalChineseGrammaticalStructure buildCoNLLXGrammaticalStructure(List<List<String>> tokenFields) {
    return (UniversalChineseGrammaticalStructure) buildCoNLLXGrammaticalStructure(
        tokenFields, UniversalChineseGrammaticalRelations.shortNameToGRel, new FromDependenciesFactory());
  }

  /**
   * Factory that wraps a list of typed dependencies and a root node in a
   * {@link UniversalChineseGrammaticalStructure}.
   */
  public static class FromDependenciesFactory
      implements GrammaticalStructureFromDependenciesFactory {

    /**
     * Creates a structure via the dependency-list constructor.
     *
     * @param tdeps Typed dependencies to wrap
     * @param root Root node for the new structure
     * @return A new {@code UniversalChineseGrammaticalStructure}
     */
    public UniversalChineseGrammaticalStructure build(List<TypedDependency> tdeps, TreeGraphNode root) {
      return new UniversalChineseGrammaticalStructure(tdeps, root);
    }
  }

  private static final long serialVersionUID = 8877651855167458256L;

}