package edu.berkeley.nlp.treebank;
import java.io.Serializable;
import edu.berkeley.nlp.tokenizer.TokenizerFactory;
import edu.berkeley.nlp.util.Filter;
import edu.berkeley.nlp.util.Filters;
/**
 * Language pack for the Chinese treebank. It supplies the treebank's
 * character encoding, punctuation tag and word inventories, label
 * annotation delimiters, start symbols and file extension.
 * (Look into using native2ascii to edit this file as a GB file.)
 *
 * <p>All punctuation tables are shared static arrays. The accessor methods
 * return those arrays directly, so callers must treat the returned arrays
 * as read-only.
 *
 * @author Roger Levy
 */
public class ChineseTreebankLanguagePack extends AbstractTreebankLanguagePack implements Serializable {

  private static final long serialVersionUID = 5757403475523638802L;

  /** Name of the character encoding reported by {@link #getEncoding()}. */
  public static final String ENCODING = "GB18030";

  /**
   * Tokenizer factory registered through {@link #setTokenizerFactory}.
   * It is only stored here; nothing in this class reads it.
   */
  private static TokenizerFactory tf;

  /**
   * The first 3 are used by the Penn Treebank; # is used by the
   * BLLIP corpus, and ^ and ~ are used by Klein's
   * lexparser. Identical to PennTreebankLanguagePack.
   */
  private static final char[] annotationIntroducingChars = {'-', '=', '|', '#', '^', '~'};

  /**
   * This is valid for "BobChrisTreeNormalizer" conventions
   * only. Again, identical to PennTreebankLanguagePack.
   */
  private static final String[] startSymbols = {"ROOT"};

  /** The single punctuation tag used by this language pack. */
  private static final String[] tags = {"PU"};

  // Punctuation word groups. Each group backs one of the public
  // chinese*AcceptFilter() factories further down.

  /** Comma-like tokens; U+3000 (an "ideographic space"...?) is listed here as well. */
  private static final String[] comma = {",", "\uff0c", "\u3000"};

  /** Sentence-ending tokens. */
  private static final String[] endSentence = {"\u3002", "\uff0e", "\uff01", "\uff1f", "?", "!", "."};

  /** The "dou hao" token. */
  private static final String[] douHao = {"\u3001"};

  /** Opening and closing quotation marks of all kinds. */
  private static final String[] quoteMark = {"\u201c", "\u201d", "\u2018", "\u2019", "\u300a", "\u300b", "\u300e", "\u300f", "\u3008", "\u3009", "\u300c", "\u300d", "\uff02", "\uff1c", "\uff1e", "`", "\uff07"};

  /** Opening and closing parentheses, including the -LRB-/-RRB- tokens. */
  private static final String[] parenthesis = {"\uff08", "\uff09", "-LRB-", "-RRB-", "\u3010", "\u3011"};

  /** Colon- and semicolon-like tokens. */
  private static final String[] colon = {"\uff1a", "\uff1b", "\u2236", ":"};

  /** Dashes, ellipses and tildes, including multi-character runs. */
  private static final String[] dash = {"\u2026", "\u2014", "\u2014\u2014", "\u2014\u2014\u2014", "\uff0d", "\uff0d\uff0d", "\u2500\u2500", "\u2501", "\u2501\u2501", "\u2014\uff0d", "-", "----", "~", "\u2026\u2026", "\uff5e"};

  /**
   * Remaining punctuation; slashes are used in urls.
   * NOTE(review): U+FF0F appears twice. The duplicate is kept so that the
   * contents and length of {@link #punctuationWords()} stay unchanged.
   */
  private static final String[] other = {"\u00b7", "\uff0f", "\uff0f", "\uff0a", "\uff06", "/", "//", "*"};

  /** Opening quotation marks only. */
  private static final String[] leftQuoteMark = {"\u201c", "\u2018", "\u300a", "\u300e", "\u3008", "\u300c", "\uff1c", "`"};

  /** Closing quotation marks only. */
  private static final String[] rightQuoteMark = {"\u201d", "\u2019", "\u300b", "\u300f", "\u3009", "\u300d", "\uff1e", "\uff07"};

  /** Opening parentheses only. */
  private static final String[] leftParenthesis = {"\uff08", "-LRB-", "\u3010"};

  /** Closing parentheses only. */
  private static final String[] rightParenthesis = {"\uff09", "-RRB-", "\u3011"};

  /**
   * Everything returned by {@link #punctuationWords()}: the tag array first,
   * followed by every word group in a fixed order. This initializer must stay
   * textually after the arrays it concatenates, because static initializers
   * run in textual order.
   */
  private static final String[] punctWords =
      concat(tags, comma, endSentence, douHao, quoteMark, parenthesis, colon, dash, other);

  /**
   * Concatenates the given arrays, in argument order, into one new array.
   *
   * @param groups The arrays to join
   * @return A fresh array holding every element of every group
   */
  private static String[] concat(String[]... groups) {
    int total = 0;
    for (String[] group : groups) {
      total += group.length;
    }
    String[] joined = new String[total];
    int offset = 0;
    for (String[] group : groups) {
      System.arraycopy(group, 0, joined, offset, group.length);
      offset += group.length;
    }
    return joined;
  }

  /**
   * Registers the tokenizer factory for this language pack. The value is
   * stored in a static field shared by all instances.
   *
   * @param tf The tokenizer factory to store
   */
  public static void setTokenizerFactory(TokenizerFactory tf) {
    ChineseTreebankLanguagePack.tf = tf;
  }

  /**
   * Return the input Charset encoding for the Treebank.
   * See documentation for the <code>Charset</code> class.
   *
   * @return Name of Charset
   */
  public String getEncoding() {
    return ENCODING;
  }

  /**
   * Accepts a String that is a punctuation
   * tag name, and rejects everything else.
   * A <code>null</code> argument is rejected rather than causing a
   * NullPointerException.
   *
   * @param str The tag to test; may be <code>null</code>
   * @return Whether this is a punctuation tag
   */
  public boolean isPunctuationTag(String str) {
    // Constant-first comparison so that null is simply "not a punctuation tag".
    return "PU".equals(str);
  }

  /**
   * Accepts a String that is a punctuation
   * word, and rejects everything else.
   * If one can't tell for sure (as for ' in the Penn Treebank), it
   * makes the best guess that it can.
   *
   * <p>The word is tested against each punctuation word group in turn,
   * stopping at the first group that accepts it. The tag "PU" is not in any
   * of those groups, even though it appears in {@link #punctuationWords()}.
   *
   * @param str The word to test
   * @return Whether this is a punctuation word
   */
  public boolean isPunctuationWord(String str) {
    return chineseCommaAcceptFilter().accept(str)
        || chineseEndSentenceAcceptFilter().accept(str)
        || chineseDouHaoAcceptFilter().accept(str)
        || chineseQuoteMarkAcceptFilter().accept(str)
        || chineseParenthesisAcceptFilter().accept(str)
        || chineseColonAcceptFilter().accept(str)
        || chineseDashAcceptFilter().accept(str)
        || chineseOtherAcceptFilter().accept(str);
  }

  /**
   * Accepts a String that is a sentence end
   * punctuation tag, and rejects everything else.
   *
   * <p>NOTE(review): the argument is tested against the sentence-final
   * <i>word</i> list, not against the tag list. "PU" is therefore rejected
   * here although {@link #sentenceFinalPunctuationTags()} returns it.
   * The behavior is left unchanged because callers may rely on it; confirm
   * the intent before altering it.
   *
   * @param str The string to test
   * @return Whether the sentence-final word filter accepts the string
   */
  public boolean isSentenceFinalPunctuationTag(String str) {
    return chineseEndSentenceAcceptFilter().accept(str);
  }

  /**
   * Returns a String array of punctuation tags for this treebank/language.
   *
   * @return The punctuation tags (shared array; do not modify)
   */
  public String[] punctuationTags() {
    return tags;
  }

  /**
   * Returns a String array of punctuation words for this treebank/language.
   * The first element is the punctuation tag "PU"; the word groups follow.
   *
   * @return The punctuation words (shared array; do not modify)
   */
  public String[] punctuationWords() {
    return punctWords;
  }

  /**
   * Returns a String array of sentence final punctuation tags for this
   * treebank/language. This is the same array as {@link #punctuationTags()}.
   *
   * @return The sentence final punctuation tags (shared array; do not modify)
   */
  public String[] sentenceFinalPunctuationTags() {
    return tags;
  }

  /**
   * Returns a String array of sentence final punctuation words for this
   * treebank/language.
   *
   * @return The sentence final punctuation words (shared array; do not modify)
   */
  public String[] sentenceFinalPunctuationWords() {
    return endSentence;
  }

  /**
   * Accepts a String that is a punctuation
   * tag that should be ignored by EVALB-style evaluation,
   * and rejects everything else.
   * Traditionally, EVALB has ignored a subset of the total set of
   * punctuation tags in the English Penn Treebank (quotes and
   * period, comma, colon, etc., but not brackets).
   * Here the test is made against the full punctuation tag list.
   *
   * @param str The tag to test
   * @return Whether this is a EVALB-ignored punctuation tag
   */
  public boolean isEvalBIgnoredPunctuationTag(String str) {
    return Filters.collectionAcceptFilter(tags).accept(str);
  }

  /**
   * Return an array of characters at which a String should be
   * truncated to give the basic syntactic category of a label.
   * The idea here is that Penn treebank style labels follow a syntactic
   * category with various functional and crossreferencing information
   * introduced by special characters (such as "NP-SBJ=1"). This would
   * be truncated to "NP" by the array containing '-' and "=".
   *
   * @return An array of characters that set off label name suffixes
   *         (shared array; do not modify)
   */
  public char[] labelAnnotationIntroducingCharacters() {
    return annotationIntroducingChars;
  }

  /**
   * Returns a String array of treebank start symbols.
   *
   * @return The start symbols (shared array; do not modify)
   */
  public String[] startSymbols() {
    return startSymbols;
  }

  /** @return A new filter built from the comma word group */
  public static Filter<String> chineseCommaAcceptFilter() {
    return Filters.collectionAcceptFilter(comma);
  }

  /** @return A new filter built from the sentence-ending word group */
  public static Filter<String> chineseEndSentenceAcceptFilter() {
    return Filters.collectionAcceptFilter(endSentence);
  }

  /** @return A new filter built from the "dou hao" word group */
  public static Filter<String> chineseDouHaoAcceptFilter() {
    return Filters.collectionAcceptFilter(douHao);
  }

  /** @return A new filter built from the quotation mark word group */
  public static Filter<String> chineseQuoteMarkAcceptFilter() {
    return Filters.collectionAcceptFilter(quoteMark);
  }

  /** @return A new filter built from the parenthesis word group */
  public static Filter<String> chineseParenthesisAcceptFilter() {
    return Filters.collectionAcceptFilter(parenthesis);
  }

  /** @return A new filter built from the colon word group */
  public static Filter<String> chineseColonAcceptFilter() {
    return Filters.collectionAcceptFilter(colon);
  }

  /** @return A new filter built from the dash word group */
  public static Filter<String> chineseDashAcceptFilter() {
    return Filters.collectionAcceptFilter(dash);
  }

  /** @return A new filter built from the miscellaneous punctuation word group */
  public static Filter<String> chineseOtherAcceptFilter() {
    return Filters.collectionAcceptFilter(other);
  }

  /** @return A new filter built from the opening parenthesis group */
  public static Filter<String> chineseLeftParenthesisAcceptFilter() {
    return Filters.collectionAcceptFilter(leftParenthesis);
  }

  /** @return A new filter built from the closing parenthesis group */
  public static Filter<String> chineseRightParenthesisAcceptFilter() {
    return Filters.collectionAcceptFilter(rightParenthesis);
  }

  /** @return A new filter built from the opening quotation mark group */
  public static Filter<String> chineseLeftQuoteMarkAcceptFilter() {
    return Filters.collectionAcceptFilter(leftQuoteMark);
  }

  /** @return A new filter built from the closing quotation mark group */
  public static Filter<String> chineseRightQuoteMarkAcceptFilter() {
    return Filters.collectionAcceptFilter(rightQuoteMark);
  }

  /**
   * Returns the extension of treebank files for this treebank.
   * This is "fid".
   *
   * @return The file extension, without a leading dot
   */
  public String treebankFileExtension() {
    return "fid";
  }
}