package edu.stanford.nlp.trees.international.pennchinese; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.tregex.TregexPattern; import java.io.Serializable; import java.util.function.Predicate; /** * Filters the fragments which end documents in Chinese Treebank */ public class FragmentTreeFilter implements Predicate<Tree>, Serializable { static final TregexPattern threeNodePattern = TregexPattern.compile("FRAG=root <, (PU <: /(/) <2 (VV <: /完/) <- (PU=a <: /)/) <3 =a : =root !> (__ > __)"); static final TregexPattern oneNodePattern = TregexPattern.compile("FRAG=root <: (VV <: /完/) : =root !> (__ > __)"); static final TregexPattern automaticInitialPattern = TregexPattern.compile("automatic=root <: (initial !< __) : =root !> __"); static final TregexPattern manuallySegmentedPattern = TregexPattern.compile("manually=root <: (segmented !< __) : =root !> __"); static final TregexPattern onthewayPattern = TregexPattern.compile("FRAG=root <: (NR <: (ontheway !< __)) : =root !> (__ > __)"); static final TregexPattern singlePuncFragPattern = TregexPattern.compile("__ !> __ <: (PU=punc <: __)"); static final TregexPattern singlePuncPattern = TregexPattern.compile("PU=punc !> __ <: __"); static final TregexPattern metaPattern = TregexPattern.compile("META !> __ <: NN"); // The ctb tree reader uses CHTBTokenizer, which filters out SGML // and accidentally catches five trees in ctb7. // TODO: One alternative would be to get rid of the specialized tokenizer static final TregexPattern bracketPattern = TregexPattern.compile("/[<>]/"); static final TregexPattern[] patterns = { threeNodePattern, oneNodePattern, automaticInitialPattern, manuallySegmentedPattern, onthewayPattern, singlePuncFragPattern, singlePuncPattern, metaPattern, bracketPattern }; public boolean test(Tree tree) { for (TregexPattern pattern : patterns) { if (pattern.matcher(tree).find()) { return false; } } return true; } private static final long serialVersionUID = 1L; }