package edu.stanford.nlp.trees.international.pennchinese;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.*;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Filters;
import edu.stanford.nlp.ling.HasWord;
/**
* Language pack for the UPenn/Colorado/Brandeis Chinese treebank.
* The native character set for the Chinese Treebank was GB18030, but later became UTF-8.
* This file (like the rest of JavaNLP) is in UTF-8.
*
* @author Roger Levy
*/
public class ChineseTreebankLanguagePack extends AbstractTreebankLanguagePack {
/** Explicit serial version UID, pinned for Java serialization compatibility. */
private static final long serialVersionUID = 5757403475523638802L;
/**
 * Optional tokenizer factory override. It has no initializer, so it stays
 * {@code null} until {@link #setTokenizerFactory} stores a value in it.
 */
private TokenizerFactory<? extends HasWord> tf;
/**
 * Stores the tokenizer factory that {@link #getTokenizerFactory()} should return.
 * Storing {@code null} makes {@link #getTokenizerFactory()} defer to the
 * superclass implementation again.
 *
 * @param tf The tokenizer factory to use, or {@code null} for the superclass default
 */
public void setTokenizerFactory(TokenizerFactory<? extends HasWord> tf) {
this.tf = tf;
}
/**
 * Returns the tokenizer factory for this language pack.
 * A factory stored through {@link #setTokenizerFactory} takes precedence;
 * when none is stored, the superclass's factory is returned instead.
 *
 * @return The tokenizer factory to use
 */
@Override
public TokenizerFactory<? extends HasWord> getTokenizerFactory() {
  if (tf == null) {
    // No override installed: defer to the superclass.
    return super.getTokenizerFactory();
  }
  return tf;
}
/** Name of the character encoding returned by {@link #getEncoding()}. */
public static final String ENCODING = "utf-8";
/**
 * Return the input Charset encoding for the Treebank.
 * See documentation for the {@code Charset} class.
 * This implementation always returns {@link #ENCODING}.
 *
 * @return Name of Charset
 */
@Override
public String getEncoding() {
return ENCODING;
}
/**
 * Accepts a String that is a punctuation
 * tag name, and rejects everything else.
 * The only tag accepted is {@code "PU"}; a {@code null} argument is
 * rejected rather than causing a {@code NullPointerException}.
 *
 * @param str The tag to test (may be {@code null})
 * @return Whether this is a punctuation tag
 */
@Override
public boolean isPunctuationTag(String str) {
  // Constant-first comparison: equals(null) is false, so a null tag is simply rejected.
  return "PU".equals(str);
}
/**
 * Accepts a String that is a punctuation
 * word, and rejects everything else.
 * If one can't tell for sure (as for ' in the Penn Treebank), it
 * makes the best guess that it can.
 * <p>
 * The word is tested against the comma, sentence-end, dou hao, quote mark,
 * parenthesis, colon, dash and "other" filters, in that order, and is
 * accepted as soon as one of them accepts it.
 *
 * @param str The word to test
 * @return Whether this is a punctuation word
 */
@Override
public boolean isPunctuationWord(String str) {
  if (chineseCommaAcceptFilter().test(str)) {
    return true;
  }
  if (chineseEndSentenceAcceptFilter().test(str)) {
    return true;
  }
  if (chineseDouHaoAcceptFilter().test(str)) {
    return true;
  }
  if (chineseQuoteMarkAcceptFilter().test(str)) {
    return true;
  }
  if (chineseParenthesisAcceptFilter().test(str)) {
    return true;
  }
  if (chineseColonAcceptFilter().test(str)) {
    return true;
  }
  if (chineseDashAcceptFilter().test(str)) {
    return true;
  }
  // Last category decides the result when none of the earlier ones matched.
  return chineseOtherAcceptFilter().test(str);
}
/**
 * Accepts a String that is a sentence end
 * punctuation tag, and rejects everything else.
 * TODO FIXME: this is testing whether it is a sentence final word,
 * not a sentence final tag: the argument is checked against the
 * sentence-end word filter.
 *
 * @param str The String to test
 * @return Whether this is a sentence final punctuation tag
 */
@Override
public boolean isSentenceFinalPunctuationTag(String str) {
  final Predicate<String> sentenceEnd = chineseEndSentenceAcceptFilter();
  return sentenceEnd.test(str);
}
/**
 * Returns a String array of punctuation tags for this treebank/language.
 * The returned array is the shared internal {@code tags} array, not a copy,
 * so callers should not modify it.
 *
 * @return The punctuation tags
 */
@Override
public String[] punctuationTags() {
return tags;
}
/**
 * Returns a String array of punctuation words for this treebank/language.
 * The returned array is the shared internal {@code punctWords} array that is
 * filled in by the static initializer of this class; it is not a copy, so
 * callers should not modify it.
 *
 * @return The punctuation words
 */
@Override
public String[] punctuationWords() {
return punctWords;
}
/**
 * Returns a String array of sentence final punctuation tags for this
 * treebank/language.
 * This is the same shared {@code tags} array that {@link #punctuationTags()}
 * returns; it is not a copy, so callers should not modify it.
 *
 * @return The sentence final punctuation tags
 */
@Override
public String[] sentenceFinalPunctuationTags() {
return tags;
}
/**
 * Returns a String array of sentence final punctuation words for this
 * treebank/language.
 * The returned array is the shared internal {@code endSentence} array, not a
 * copy, so callers should not modify it.
 *
 * @return The sentence final punctuation words
 */
@Override
public String[] sentenceFinalPunctuationWords() {
return endSentence;
}
/**
 * Accepts a String that is a punctuation
 * tag that should be ignored by EVALB-style evaluation,
 * and rejects everything else.
 * Traditionally, EVALB has ignored a subset of the total set of
 * punctuation tags in the English Penn Treebank (quotes and
 * period, comma, colon, etc., but not brackets).
 * Here the argument is tested against a filter built from the
 * punctuation tag array.
 *
 * @param str The tag to test
 * @return Whether this is a EVALB-ignored punctuation tag
 */
@Override
public boolean isEvalBIgnoredPunctuationTag(String str) {
  final Predicate<String> ignoredTags = Filters.collectionAcceptFilter(tags);
  return ignoredTags.test(str);
}
/**
 * Characters that introduce annotations on a category label.
 * The first 3 are used by the Penn Treebank; # is used by the
 * BLLIP corpus, and ^ and ~ are used by Klein's
 * lexparser. Identical to PennTreebankLanguagePack.
 */
private static final char[] annotationIntroducingChars = {'-', '=', '|', '#', '^', '~'};
/**
 * Return an array of characters at which a String should be
 * truncated to give the basic syntactic category of a label.
 * The idea here is that Penn treebank style labels follow a syntactic
 * category with various functional and crossreferencing information
 * introduced by special characters (such as "NP-SBJ=1"). This would
 * be truncated to "NP" by the array containing '-' and '='.
 * The returned array is the shared internal array, not a copy, so
 * callers should not modify it.
 *
 * @return An array of characters that set off label name suffixes
 */
@Override
public char[] labelAnnotationIntroducingCharacters() {
return annotationIntroducingChars;
}
/**
 * The treebank start symbols returned by {@link #startSymbols()}.
 * This is valid for "BobChrisTreeNormalizer" conventions
 * only. Again, identical to PennTreebankLanguagePack.
 */
private static final String[] startSymbols = {"ROOT"};
/**
 * Returns a String array of treebank start symbols.
 * The returned array is the shared internal array (containing only
 * {@code "ROOT"}), not a copy, so callers should not modify it.
 *
 * @return The start symbols
 */
@Override
public String[] startSymbols() {
return startSymbols;
}
/**
 * Punctuation part-of-speech tags. Returned by both punctuationTags() and
 * sentenceFinalPunctuationTags(), and used by isEvalBIgnoredPunctuationTag().
 */
private static final String[] tags = {"PU"};
/**
 * Comma tokens used by chineseCommaAcceptFilter().
 * NOTE(review): the first two entries are identical as written; presumably one
 * was meant to be a full-width variant -- confirm against the file encoding.
 */
private static final String[] comma = {",", ",", " "}; // last is an "ideographic space"...?
/**
 * Sentence-ending tokens used by chineseEndSentenceAcceptFilter() and returned
 * by sentenceFinalPunctuationWords().
 * NOTE(review): several entries repeat as written; presumably full-width
 * variants were intended -- confirm against the file encoding.
 */
private static final String[] endSentence = {"。", ".", "!", "?", "?", "!", "."};
/** Token used by chineseDouHaoAcceptFilter(). */
private static final String[] douHao = {"、"};
private static final String[] quoteMark = {"“", "”", "‘", "’", "《", "》", "『", "』", "〈", "〉",
"「", "」", """, "<", ">", "'", "`", "'", "`", "「", "」"};
/** Bracket tokens used by chineseParenthesisAcceptFilter(); includes the -LRB- and -RRB- forms. */
private static final String[] parenthesis = {"(", ")", "[", "]", "{", "}", "-LRB-", "-RRB-", "【", "】",
"〔", "〖", "〘", "〚", "⦅", "〕", "〗", "〙", "〛", "⦆" }; // ( and ) still must be escaped
/** Colon and semicolon tokens used by chineseColonAcceptFilter(). */
private static final String[] colon = {":", ";", "∶", ":"};
/**
 * Dash, tilde and ellipsis tokens (single characters and multi-character runs)
 * used by chineseDashAcceptFilter().
 * NOTE(review): some entries repeat as written; presumably full-width
 * variants were intended -- confirm against the file encoding.
 */
private static final String[] dash = {"…", "―", "——", "———", "————", "—", "——", "———",
"-", "--", "---", "--", "---", "----", "-----", "------",
"──", "━", "━━", "—-", "-", "----", "~", "~~", "~~~", "~~~~", "~~~~~", "……", "~",
"..." /* 3 full width dots as ellipsis */ };
/** Remaining punctuation tokens, used by chineseOtherAcceptFilter(). */
private static final String[] other = {"·", "/", "/", "*", "&", "/", "//", "*", "※", "■", "●", "|" }; // slashes are used in urls
// Note that these next four should contain only things in quoteMark and parenthesis. All such things are there but straight quotes
/** Opening quotation marks, used by chineseLeftQuoteMarkAcceptFilter(). */
private static final String[] leftQuoteMark = {"“", "‘", "《", "『", "〈", "「", "<", "`", "`", "「"};
/** Closing quotation marks, used by chineseRightQuoteMarkAcceptFilter(). */
private static final String[] rightQuoteMark = {"”", "’", "》", "』", "〉", "」", ">", "'", "」"};
/** Opening brackets, used by chineseLeftParenthesisAcceptFilter(). */
private static final String[] leftParenthesis = {"(", "-LRB-", "[", "{", "【", "〔", "〖", "〘", "〚", "⦅"};
/** Closing brackets, used by chineseRightParenthesisAcceptFilter(). */
private static final String[] rightParenthesis = {")", "-RRB-", "]", "}", "】", "〕", "〗", "〙", "〛", "⦆"};
// "〔", "〖", "〘", "〚", "⦅", "〕", "〗", "〙", "〛", "⦆"
/**
 * All punctuation words, returned by punctuationWords(). Declared without a
 * value here and assigned in the static initializer that follows.
 */
private static final String[] punctWords;
/*
 * Builds punctWords as the concatenation, in this order, of the comma,
 * endSentence, douHao, quoteMark, parenthesis, colon, dash and other arrays.
 *
 * The array is sized from exactly the arrays that are copied. The left/right
 * quote mark and parenthesis arrays are intentionally not part of it: they are
 * meant to hold only entries already present in quoteMark and parenthesis, and
 * counting their lengths without copying them would leave null entries at the
 * end of punctWords.
 */
static {
  final String[][] groups = {comma, endSentence, douHao, quoteMark, parenthesis, colon, dash, other};

  // First pass: total number of words across all copied groups.
  int total = 0;
  for (String[] group : groups) {
    total += group.length;
  }

  // Second pass: copy each group to its offset in the combined array.
  final String[] words = new String[total];
  int offset = 0;
  for (String[] group : groups) {
    System.arraycopy(group, 0, words, offset, group.length);
    offset += group.length;
  }
  punctWords = words;
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code comma} array.
 *
 * @return A filter over the comma tokens
 */
public static Predicate<String> chineseCommaAcceptFilter() {
return Filters.collectionAcceptFilter(comma);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code endSentence} array.
 *
 * @return A filter over the sentence-ending tokens
 */
public static Predicate<String> chineseEndSentenceAcceptFilter() {
return Filters.collectionAcceptFilter(endSentence);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code douHao} array.
 *
 * @return A filter over the dou hao token
 */
public static Predicate<String> chineseDouHaoAcceptFilter() {
return Filters.collectionAcceptFilter(douHao);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code quoteMark} array.
 *
 * @return A filter over the quotation-mark tokens
 */
public static Predicate<String> chineseQuoteMarkAcceptFilter() {
return Filters.collectionAcceptFilter(quoteMark);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code parenthesis} array.
 *
 * @return A filter over the bracket tokens
 */
public static Predicate<String> chineseParenthesisAcceptFilter() {
return Filters.collectionAcceptFilter(parenthesis);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code colon} array.
 *
 * @return A filter over the colon and semicolon tokens
 */
public static Predicate<String> chineseColonAcceptFilter() {
return Filters.collectionAcceptFilter(colon);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code dash} array.
 *
 * @return A filter over the dash, tilde and ellipsis tokens
 */
public static Predicate<String> chineseDashAcceptFilter() {
return Filters.collectionAcceptFilter(dash);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code other} array.
 *
 * @return A filter over the remaining punctuation tokens
 */
public static Predicate<String> chineseOtherAcceptFilter() {
return Filters.collectionAcceptFilter(other);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code leftParenthesis} array.
 *
 * @return A filter over the opening bracket tokens
 */
public static Predicate<String> chineseLeftParenthesisAcceptFilter() {
return Filters.collectionAcceptFilter(leftParenthesis);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code rightParenthesis} array.
 *
 * @return A filter over the closing bracket tokens
 */
public static Predicate<String> chineseRightParenthesisAcceptFilter() {
return Filters.collectionAcceptFilter(rightParenthesis);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code leftQuoteMark} array.
 *
 * @return A filter over the opening quotation-mark tokens
 */
public static Predicate<String> chineseLeftQuoteMarkAcceptFilter() {
return Filters.collectionAcceptFilter(leftQuoteMark);
}
/**
 * Returns the predicate produced by {@code Filters.collectionAcceptFilter}
 * for the {@code rightQuoteMark} array.
 *
 * @return A filter over the closing quotation-mark tokens
 */
public static Predicate<String> chineseRightQuoteMarkAcceptFilter() {
return Filters.collectionAcceptFilter(rightQuoteMark);
}
/**
 * Returns the extension of treebank files for this treebank.
 * This is "fid".
 *
 * @return The file extension, without a leading dot
 */
@Override
public String treebankFileExtension() {
return "fid";
}
/**
 * Returns a factory for grammatical structures of this language.
 * A {@code ChineseGrammaticalStructureFactory} is created when
 * {@link #generateOriginalDependencies()} is true, and a
 * {@code UniversalChineseGrammaticalStructureFactory} otherwise.
 *
 * @return A newly created grammatical structure factory
 */
@Override
public GrammaticalStructureFactory grammaticalStructureFactory() {
  final boolean useOriginal = generateOriginalDependencies();
  if (!useOriginal) {
    return new UniversalChineseGrammaticalStructureFactory();
  }
  return new ChineseGrammaticalStructureFactory();
}
/**
 * Returns a factory for grammatical structures of this language, built
 * with the given punctuation filter.
 * A {@code ChineseGrammaticalStructureFactory} is created when
 * {@link #generateOriginalDependencies()} is true, and a
 * {@code UniversalChineseGrammaticalStructureFactory} otherwise.
 *
 * @param puncFilt The punctuation filter handed to the factory constructor
 * @return A newly created grammatical structure factory
 */
@Override
public GrammaticalStructureFactory grammaticalStructureFactory(Predicate<String> puncFilt) {
  final boolean useOriginal = generateOriginalDependencies();
  if (!useOriginal) {
    return new UniversalChineseGrammaticalStructureFactory(puncFilt);
  }
  return new ChineseGrammaticalStructureFactory(puncFilt);
}
/**
 * Returns a factory for grammatical structures of this language, built
 * with the given punctuation filter and head finder.
 * A {@code ChineseGrammaticalStructureFactory} is created when
 * {@link #generateOriginalDependencies()} is true, and a
 * {@code UniversalChineseGrammaticalStructureFactory} otherwise.
 *
 * @param puncFilt The punctuation filter handed to the factory constructor
 * @param hf The head finder handed to the factory constructor
 * @return A newly created grammatical structure factory
 */
@Override
public GrammaticalStructureFactory grammaticalStructureFactory(Predicate<String> puncFilt, HeadFinder hf) {
  final boolean useOriginal = generateOriginalDependencies();
  if (!useOriginal) {
    return new UniversalChineseGrammaticalStructureFactory(puncFilt, hf);
  }
  return new ChineseGrammaticalStructureFactory(puncFilt, hf);
}
/**
 * Reports whether this language pack supports grammatical structures.
 * This implementation always returns {@code true}.
 *
 * @return {@code true}
 */
@Override
public boolean supportsGrammaticalStructures() {
return true;
}
/**
 * Returns a factory for tree readers for this treebank: a new
 * {@code CTBTreeReaderFactory} constructed with a new
 * {@code BobChrisTreeNormalizer}.
 *
 * @return A newly created tree reader factory
 */
@Override
public TreeReaderFactory treeReaderFactory() {
  return new CTBTreeReaderFactory(new BobChrisTreeNormalizer());
}
/**
 * {@inheritDoc}
 * <p>
 * This implementation returns a new {@code ChineseHeadFinder} constructed
 * with this language pack.
 */
@Override
public HeadFinder headFinder() {
return new ChineseHeadFinder(this);
}
/**
 * {@inheritDoc}
 * <p>
 * This implementation returns a new {@code ChineseSemanticHeadFinder}
 * constructed with this language pack when
 * {@link #generateOriginalDependencies()} is true, and a new
 * {@code UniversalChineseSemanticHeadFinder} otherwise.
 */
@Override
public HeadFinder typedDependencyHeadFinder() {
  final boolean useOriginal = generateOriginalDependencies();
  if (!useOriginal) {
    return new UniversalChineseSemanticHeadFinder();
  }
  return new ChineseSemanticHeadFinder(this);
}
/**
 * Reports whether the original (rather than the universal) Chinese
 * dependency classes should be used by the factory and head finder methods
 * of this class.
 * Returns the value of the {@code generateOriginalDependencies} field, which
 * is not declared in this class (presumably inherited from the superclass --
 * confirm).
 *
 * @return The current value of the {@code generateOriginalDependencies} field
 */
@Override
public boolean generateOriginalDependencies() {
return generateOriginalDependencies;
}
}