package edu.umd.hooka;

import java.util.Locale;

import org.apache.hadoop.conf.Configuration;

import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;

/**
 * This class contains tokenizers for several languages.
 * The method to tokenize a sentence is preprocessWordsImpl. The input is an
 * array of Strings, generated by splitting the input sentence by space
 * characters.
 *
 * <p>Every implementation in this file returns exactly one output token per
 * input token; {@link #preprocessWordsForAlignment(String[])} asserts this.
 *
 * @author ferhanture
 *
 */
public abstract class AlignmentWordPreprocessor {

    /**
     * Normalizes the tokens of one sentence by delegating to
     * {@link #preprocessWordsImpl(String[])}.
     *
     * @param arg the tokens of the sentence
     * @return the array produced by the implementation; when assertions are
     *         enabled it is checked to have the same length as {@code arg}
     */
    public final String[] preprocessWordsForAlignment(String[] arg) {
        final String[] res = preprocessWordsImpl(arg);
        assert res.length == arg.length;
        return res;
    }

    /**
     * Language-specific normalization of the tokens of one sentence.
     *
     * @param arg the tokens of the sentence
     * @return one normalized token per input token
     */
    protected abstract String[] preprocessWordsImpl(String[] arg);

    /**
     * Selects the preprocessor for a language.
     *
     * <p>A {@code null} language yields a pass-through preprocessor. German,
     * Arabic and Hungarian have dedicated truncators; English and every other
     * language use the default {@link Truncator}. Languages are compared by
     * identity against the values returned by
     * {@code Language.languageForISO639_1}.
     *
     * @param lp   the language pair; not used by this method
     * @param l    the language to preprocess, or {@code null}
     * @param conf the configuration handed to the preprocessor's constructor
     * @return a preprocessor for {@code l}, never {@code null}
     */
    public static AlignmentWordPreprocessor CreatePreprocessor(LanguagePair lp, Language l, Configuration conf) {
        if (l == null) {
            return new NullPreprocessor(conf);
        }
        if (l == Language.languageForISO639_1("en")) {
            return new Truncator(conf);
        }
        if (l == Language.languageForISO639_1("de")) {
            return new GermanTruncator(conf);
        }
        if (l == Language.languageForISO639_1("ar")) {
            return new ArabicRawTruncator(conf);
        }
        if (l == Language.languageForISO639_1("hu")) {
            return new HungarianTruncator(conf);
        }
        return new Truncator(conf);
    }

    /**
     * Lower-cases a token independently of the JVM's default locale.
     *
     * <p>The prefix tables in this file are written in lower case, so the
     * case mapping must not vary with the platform locale (for example the
     * dotless-i mapping applied under a Turkish default locale).
     */
    static String lowerCase(String word) {
        return word.toLowerCase(Locale.ROOT);
    }

    /**
     * Truncates a token according to the first rule whose prefix it starts
     * with; when no rule matches, the first {@code baseLength} characters are
     * kept.
     *
     * @param word       the token, already normalized by the caller
     * @param baseLength number of characters kept when no rule matches
     * @param rules      prefix rules, tried in array order; only the first
     *                   match is applied
     * @return the truncated token; never longer than {@code word}
     */
    static String truncate(String word, int baseLength, PrefixRule[] rules) {
        int start = 0;
        int length = baseLength;
        for (PrefixRule rule : rules) {
            if (word.startsWith(rule.prefix)) {
                start += rule.offset;
                length += rule.extra;
                break;
            }
        }
        // A token no longer than the dropped prefix is kept from its beginning.
        if (start >= word.length()) {
            start = 0;
        }
        // Clamp so that short tokens are returned whole instead of overrunning.
        if (word.length() < start + length) {
            length = word.length() - start;
        }
        return word.substring(start, start + length);
    }

    /**
     * One entry of a truncator's prefix table: how the kept window changes
     * for tokens that start with {@link #prefix}.
     */
    static final class PrefixRule {
        /** Prefix a token must start with for this rule to apply. */
        final String prefix;
        /** Number of leading characters dropped before the kept window. */
        final int offset;
        /** Number of characters added to the truncator's base length. */
        final int extra;

        private PrefixRule(String prefix, int offset, int extra) {
            this.prefix = prefix;
            this.offset = offset;
            this.extra = extra;
        }

        /** Rule that keeps {@code extra} additional characters. */
        static PrefixRule extend(String prefix, int extra) {
            return new PrefixRule(prefix, 0, extra);
        }

        /** Rule that drops the first {@code count} characters of the token. */
        static PrefixRule drop(String prefix, int count) {
            return new PrefixRule(prefix, count, 0);
        }
    }
}

/** Preprocessor that returns its input array unchanged. */
class NullPreprocessor extends AlignmentWordPreprocessor {
    public NullPreprocessor(Configuration c) {
    }

    @Override
    protected String[] preprocessWordsImpl(String[] arg) {
        return arg;
    }
}

/**
 * Truncator for Arabic: keeps the first {@link #length} characters of each
 * lower-cased token, plus two more when the token starts with {@link #AL}
 * or one more when it starts with {@link #A}.
 */
class ArabicRawTruncator extends AlignmentWordPreprocessor {
    int length = 4;
    static final String AL = "\u0627\u0644";
    static final String A = "\u0627";

    // AL must precede A because A is a prefix of AL and the first match wins.
    private static final PrefixRule[] RULES = {
        PrefixRule.extend(AL, 2),
        PrefixRule.extend(A, 1),
    };

    public ArabicRawTruncator(Configuration conf) {
    }

    @Override
    protected String[] preprocessWordsImpl(String[] arg) {
        String[] res = new String[arg.length];
        for (int i = 0; i < arg.length; ++i) {
            res[i] = truncate(lowerCase(arg[i]), length, RULES);
        }
        return res;
    }
}

/**
 * Default truncator, used for English and for languages without a dedicated
 * implementation: keeps the first {@link #length} characters of each
 * lower-cased token, extended for tokens that start with one of the listed
 * prefixes.
 */
class Truncator extends AlignmentWordPreprocessor {
    int length = 4;

    // Order matters: the first matching prefix wins, so a longer prefix must
    // precede any shorter prefix it starts with ("con" before "co",
    // "intra"/"inter" before "in").
    private static final PrefixRule[] RULES = {
        PrefixRule.extend("con", 2),
        PrefixRule.extend("intra", 4),
        PrefixRule.extend("pro", 2),
        PrefixRule.extend("anti", 3),
        PrefixRule.extend("inter", 4),
        PrefixRule.extend("in", 2),
        PrefixRule.extend("im", 2),
        PrefixRule.extend("re", 2),
        PrefixRule.extend("de", 1),
        PrefixRule.extend("pre", 2),
        PrefixRule.extend("un", 2),
        PrefixRule.extend("co", 2),
        PrefixRule.extend("qu", 1),
        PrefixRule.extend("ad", 1),
        PrefixRule.extend("en", 2),
        PrefixRule.extend("al-", 2),
        PrefixRule.extend("sim", 2),
        PrefixRule.extend("sym", 2),
    };

    public Truncator(Configuration conf) {
    }

    @Override
    protected String[] preprocessWordsImpl(String[] arg) {
        String[] res = new String[arg.length];
        for (int i = 0; i < arg.length; ++i) {
            res[i] = truncate(lowerCase(arg[i]), length, RULES);
        }
        return res;
    }
}

/**
 * Truncator for Hungarian: keeps the first {@link #length} characters of
 * each lower-cased token, extended for the prefixes "con" and "intra".
 */
class HungarianTruncator extends AlignmentWordPreprocessor {
    int length = 6;

    private static final PrefixRule[] RULES = {
        PrefixRule.extend("con", 2),
        PrefixRule.extend("intra", 4),
    };

    public HungarianTruncator(Configuration conf) {
    }

    @Override
    protected String[] preprocessWordsImpl(String[] arg) {
        String[] res = new String[arg.length];
        for (int i = 0; i < arg.length; ++i) {
            res[i] = truncate(lowerCase(arg[i]), length, RULES);
        }
        return res;
    }
}

/**
 * Truncator for German. Each token is lower-cased and every "sch" is
 * collapsed to the single character "S" before prefix matching. Tokens that
 * start with a listed prefix keep additional characters, except for the
 * prefix "ge", which is dropped from the front instead.
 */
class GermanTruncator extends AlignmentWordPreprocessor {
    int length = 4;

    // Order matters: the first matching prefix wins, so a longer prefix must
    // precede any shorter prefix it starts with (e.g. "gegen" before "ge",
    // "zuge" before "zu", "ange" before "an").
    private static final PrefixRule[] RULES = {
        PrefixRule.extend("gegen", 5),
        PrefixRule.extend("zusammen", 8),
        PrefixRule.extend("zuge", 4),
        PrefixRule.extend("einge", 5),
        PrefixRule.extend("aufge", 5),
        PrefixRule.extend("ausge", 5),
        PrefixRule.extend("hinge", 5),
        PrefixRule.extend("herge", 5),
        PrefixRule.extend("ein", 3),
        PrefixRule.extend("zer", 2),
        PrefixRule.extend("ver", 3),
        PrefixRule.extend("ent", 2),
        PrefixRule.extend("auf", 3),
        PrefixRule.extend("aus", 3),
        PrefixRule.extend("abge", 4),
        PrefixRule.extend("bei", 3),
        PrefixRule.extend("voran", 5),
        PrefixRule.extend("vor", 3),
        PrefixRule.extend("mit", 3),
        PrefixRule.extend("ab", 2),
        PrefixRule.extend("be", 1),
        PrefixRule.extend("\u00FCber", 4),
        PrefixRule.extend("unter", 5),
        PrefixRule.drop("ge", 2),
        PrefixRule.extend("er", 1),
        PrefixRule.extend("zu", 2),
        PrefixRule.extend("ange", 3),
        PrefixRule.extend("an", 2),
        PrefixRule.extend("durch", 5),
        PrefixRule.extend("nieder", 5),
        PrefixRule.extend("dar", 2),
    };

    public GermanTruncator(Configuration conf) {
    }

    @Override
    protected String[] preprocessWordsImpl(String[] arg) {
        String[] res = new String[arg.length];
        for (int i = 0; i < arg.length; ++i) {
            // Literal replacement; no regular expression is needed here.
            final String cur = lowerCase(arg[i]).replace("sch", "S");
            res[i] = truncate(cur, length, RULES);
        }
        return res;
    }
}