package edu.umd.hooka;
import org.apache.hadoop.conf.Configuration;
import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;
/**
 * Base class for the per-language word preprocessors applied before alignment.
 *
 * <p>Callers hand a sentence that has already been split on space characters to
 * {@link #preprocessWordsForAlignment(String[])}. The per-word transformation itself is
 * supplied by each subclass through {@link #preprocessWordsImpl(String[])}. Use
 * {@link #CreatePreprocessor(LanguagePair, Language, Configuration)} to pick the
 * implementation for a language.
 *
 * @author ferhanture
 */
public abstract class AlignmentWordPreprocessor {

  /**
   * Runs the subclass-specific preprocessing over the words of one sentence.
   *
   * @param arg the words of the sentence, one array element per word
   * @return the preprocessed words; with assertions enabled, the result is checked to have
   *     exactly as many elements as {@code arg}
   */
  public final String[] preprocessWordsForAlignment(String[] arg) {
    final String[] processed = preprocessWordsImpl(arg);
    // Word positions must stay one-to-one with the input.
    assert processed.length == arg.length;
    return processed;
  }

  /**
   * Transforms every word of a sentence.
   *
   * @param arg the words of the sentence
   * @return an array of the same length holding the transformed words
   */
  protected abstract String[] preprocessWordsImpl(String[] arg);

  /**
   * Selects the preprocessor for a language.
   *
   * <p>A {@code null} language yields a pass-through preprocessor. English ("en"), German
   * ("de"), Arabic ("ar") and Hungarian ("hu") each map to a dedicated implementation, and
   * any other language falls back to the same truncator that English uses.
   *
   * @param lp the language pair being processed; not consulted by this method
   * @param l the language whose words will be preprocessed, or {@code null}
   * @param conf configuration forwarded to the constructor of the chosen preprocessor
   * @return a new preprocessor instance, never {@code null}
   */
  public static AlignmentWordPreprocessor CreatePreprocessor(LanguagePair lp,
      Language l,
      Configuration conf) {
    final AlignmentWordPreprocessor chosen;
    // NOTE(review): languages are compared by identity, which presumably relies on
    // languageForISO639_1 returning one canonical instance per code -- confirm.
    if (l == null) {
      chosen = new NullPreprocessor(conf);
    } else if (l == Language.languageForISO639_1("en")) {
      chosen = new Truncator(conf);
    } else if (l == Language.languageForISO639_1("de")) {
      chosen = new GermanTruncator(conf);
    } else if (l == Language.languageForISO639_1("ar")) {
      chosen = new ArabicRawTruncator(conf);
    } else if (l == Language.languageForISO639_1("hu")) {
      chosen = new HungarianTruncator(conf);
    } else {
      chosen = new Truncator(conf);
    }
    return chosen;
  }
}
/**
 * Pass-through preprocessor: hands the input words back untouched.
 */
class NullPreprocessor extends AlignmentWordPreprocessor {

  /**
   * Creates the pass-through preprocessor.
   *
   * @param c accepted but never read
   */
  public NullPreprocessor(Configuration c) {
    // Nothing to configure.
  }

  /**
   * Returns the given array itself; no copy is made and no element is modified.
   *
   * @param arg the words of the sentence
   * @return {@code arg}, the very same array instance
   */
  @Override
  protected String[] preprocessWordsImpl(String[] arg) {
    return arg;
  }
}
/**
 * Preprocessor for raw Arabic text: lower-cases each word and keeps only its leading
 * characters.
 *
 * <p>A word is normally cut to {@link #length} characters. Words starting with
 * {@link #AL} keep two more characters and words starting with {@link #A} keep one more,
 * so that the prefix does not use up the whole budget. Lengths are counted in
 * {@code char} units, as reported by {@link String#length()}.
 */
class ArabicRawTruncator extends AlignmentWordPreprocessor {
  /** Base number of leading characters kept from each word. */
  int length = 4;

  /** ALEF (U+0627) followed by LAM (U+0644); presumably the definite article "al-". */
  static final String AL = "\u0627\u0644";

  /** A single ALEF (U+0627). */
  static final String A = "\u0627";

  /**
   * Creates the truncator.
   *
   * @param conf accepted but never read
   */
  public ArabicRawTruncator(Configuration conf) {
  }

  /**
   * Lower-cases and truncates every word.
   *
   * @param arg the words of the sentence; elements must not be {@code null}
   * @return a new array of the same length holding the truncated words
   */
  @Override
  protected String[] preprocessWordsImpl(String[] arg) {
    final String[] res = new String[arg.length];
    for (int i = 0; i < arg.length; ++i) {
      // Locale.ROOT keeps the result independent of the JVM default locale; this only
      // matters for non-Arabic tokens (e.g. Latin 'I' under a Turkish default locale).
      final String cur = arg[i].toLowerCase(Locale.ROOT);
      int keep = length;
      if (cur.startsWith(AL)) {
        keep += 2;
      } else if (cur.startsWith(A)) {
        keep += 1;
      }
      // Short words are kept whole.
      res[i] = cur.substring(0, Math.min(keep, cur.length()));
    }
    return res;
  }
}
/**
 * General-purpose preprocessor: lower-cases each word and keeps only its leading
 * characters.
 *
 * <p>A word is normally cut to {@link #length} characters. If it starts with one of the
 * prefixes in the rule table, the cut point moves right by that prefix's allowance, so the
 * prefix alone does not consume the whole budget. Rules are tried in table order and only
 * the first match applies, which is why longer prefixes such as "inter" are listed before
 * shorter ones such as "in". Lengths are counted in {@code char} units.
 */
class Truncator extends AlignmentWordPreprocessor {
  /** Base number of leading characters kept from each word. */
  int length = 4;

  /** One prefix together with the number of extra characters it allows. */
  private static final class PrefixRule {
    final String prefix;
    final int extra;

    PrefixRule(String prefix, int extra) {
      this.prefix = prefix;
      this.extra = extra;
    }
  }

  /** Ordered rule table; the order is significant because the first match wins. */
  private static final PrefixRule[] RULES = {
    new PrefixRule("con", 2),
    new PrefixRule("intra", 4),
    new PrefixRule("pro", 2),
    new PrefixRule("anti", 3),
    new PrefixRule("inter", 4),
    new PrefixRule("in", 2),
    new PrefixRule("im", 2),
    new PrefixRule("re", 2),
    new PrefixRule("de", 1),
    new PrefixRule("pre", 2),
    new PrefixRule("un", 2),
    new PrefixRule("co", 2),
    new PrefixRule("qu", 1),
    new PrefixRule("ad", 1),
    new PrefixRule("en", 2),
    new PrefixRule("al-", 2),
    new PrefixRule("sim", 2),
    new PrefixRule("sym", 2),
  };

  /**
   * Creates the truncator.
   *
   * @param conf accepted but never read
   */
  public Truncator(Configuration conf) {
  }

  /**
   * Lower-cases and truncates every word.
   *
   * @param arg the words of the sentence; elements must not be {@code null}
   * @return a new array of the same length holding the truncated words
   */
  @Override
  protected String[] preprocessWordsImpl(String[] arg) {
    final String[] res = new String[arg.length];
    for (int i = 0; i < arg.length; ++i) {
      // Locale.ROOT keeps lower-casing independent of the JVM default locale. Under a
      // Turkish default, 'I' would become dotless 'ı' and the "in"/"im"/"inter"/"intra"
      // prefixes would silently stop matching.
      final String cur = arg[i].toLowerCase(Locale.ROOT);
      final int keep = length + extraCharsFor(cur);
      // Short words are kept whole.
      res[i] = cur.substring(0, Math.min(keep, cur.length()));
    }
    return res;
  }

  /** Returns the allowance of the first rule whose prefix starts {@code word}, or 0. */
  private static int extraCharsFor(String word) {
    for (PrefixRule rule : RULES) {
      if (word.startsWith(rule.prefix)) {
        return rule.extra;
      }
    }
    return 0;
  }
}
/**
 * Preprocessor for Hungarian: lower-cases each word and keeps only its leading characters.
 *
 * <p>A word is normally cut to {@link #length} characters. Words starting with "con" keep
 * two more characters and words starting with "intra" keep four more. Lengths are counted
 * in {@code char} units.
 *
 * <p>NOTE(review): both prefixes also appear in the general-purpose truncator; confirm
 * they are the intended set for Hungarian.
 */
class HungarianTruncator extends AlignmentWordPreprocessor {
  /** Base number of leading characters kept from each word. */
  int length = 6;

  /**
   * Creates the truncator.
   *
   * @param conf accepted but never read
   */
  public HungarianTruncator(Configuration conf) {
  }

  /**
   * Lower-cases and truncates every word.
   *
   * @param arg the words of the sentence; elements must not be {@code null}
   * @return a new array of the same length holding the truncated words
   */
  @Override
  protected String[] preprocessWordsImpl(String[] arg) {
    final String[] res = new String[arg.length];
    for (int i = 0; i < arg.length; ++i) {
      // Locale.ROOT keeps lower-casing independent of the JVM default locale. Under a
      // Turkish default, 'I' would become dotless 'ı' and "intra" would stop matching.
      final String cur = arg[i].toLowerCase(Locale.ROOT);
      int keep = length;
      if (cur.startsWith("con")) {
        keep += 2;
      } else if (cur.startsWith("intra")) {
        keep += 4;
      }
      // Short words are kept whole.
      res[i] = cur.substring(0, Math.min(keep, cur.length()));
    }
    return res;
  }
}
/**
 * Preprocessor for German: lower-cases each word, collapses "sch" and keeps only a short
 * window of characters from the front of the word.
 *
 * <p>Processing of a single word:
 * <ol>
 *   <li>The word is lower-cased and every "sch" is replaced by a single upper-case "S", so
 *       the trigraph counts as one character towards the limit. The output can therefore
 *       contain an upper-case "S".</li>
 *   <li>The first matching rule of the table is applied. Most rules extend the number of
 *       characters kept, so that the prefix does not use up the whole budget. The "ge"
 *       rule instead skips the two prefix characters.</li>
 *   <li>The window of {@link #length} (plus any extension) characters starting after the
 *       skipped part is returned, shortened if the word ends earlier. If skipping would
 *       leave nothing, the skip is dropped.</li>
 * </ol>
 *
 * <p>Rules are tried in table order and only the first match applies, which is why longer
 * prefixes such as "zusammen" or "einge" are listed before "zu" or "ein".
 */
class GermanTruncator extends AlignmentWordPreprocessor {
  /** Base number of characters kept from each word. */
  int length = 4;

  /** One prefix with the number of characters to skip and to keep additionally. */
  private static final class PrefixRule {
    final String prefix;
    final int skip;
    final int extra;

    PrefixRule(String prefix, int skip, int extra) {
      this.prefix = prefix;
      this.skip = skip;
      this.extra = extra;
    }
  }

  /** Ordered rule table; the order is significant because the first match wins. */
  private static final PrefixRule[] RULES = {
    keepExtra("gegen", 5),
    keepExtra("zusammen", 8),
    keepExtra("zuge", 4),
    keepExtra("einge", 5),
    keepExtra("aufge", 5),
    keepExtra("ausge", 5),
    keepExtra("hinge", 5),
    keepExtra("herge", 5),
    keepExtra("ein", 3),
    keepExtra("zer", 2),
    keepExtra("ver", 3),
    keepExtra("ent", 2),
    keepExtra("auf", 3),
    keepExtra("aus", 3),
    keepExtra("abge", 4),
    keepExtra("bei", 3),
    keepExtra("voran", 5),
    keepExtra("vor", 3),
    keepExtra("mit", 3),
    keepExtra("ab", 2),
    keepExtra("be", 1),
    keepExtra("\u00FCber", 4),
    keepExtra("unter", 5),
    skipPrefix("ge", 2),
    keepExtra("er", 1),
    keepExtra("zu", 2),
    keepExtra("ange", 3),
    keepExtra("an", 2),
    keepExtra("durch", 5),
    keepExtra("nieder", 5),
    keepExtra("dar", 2),
  };

  /**
   * Creates the truncator.
   *
   * @param conf accepted but never read
   */
  public GermanTruncator(Configuration conf) {
  }

  /**
   * Normalises and truncates every word.
   *
   * @param arg the words of the sentence; elements must not be {@code null}
   * @return a new array of the same length holding the truncated words
   */
  @Override
  protected String[] preprocessWordsImpl(String[] arg) {
    final String[] res = new String[arg.length];
    for (int i = 0; i < arg.length; ++i) {
      // Locale.ROOT keeps lower-casing independent of the JVM default locale. Under a
      // Turkish default, 'I' would become dotless 'ı' and "ein"/"mit"/"hinge"/... would
      // stop matching. replace() is a literal substitution and avoids compiling a regex
      // for every word.
      final String cur = arg[i].toLowerCase(Locale.ROOT).replace("sch", "S");
      int start = 0;
      int keep = length;
      for (PrefixRule rule : RULES) {
        if (cur.startsWith(rule.prefix)) {
          start += rule.skip;
          keep += rule.extra;
          break;
        }
      }
      // Never skip the entire word: fall back to truncating from the beginning.
      if (start >= cur.length()) {
        start = 0;
      }
      // Clamp the window to the end of the word.
      if (cur.length() < start + keep) {
        keep = cur.length() - start;
      }
      res[i] = cur.substring(start, start + keep);
    }
    return res;
  }

  /** Builds a rule that keeps {@code extra} additional characters for {@code prefix}. */
  private static PrefixRule keepExtra(String prefix, int extra) {
    return new PrefixRule(prefix, 0, extra);
  }

  /** Builds a rule that skips the first {@code skip} characters for {@code prefix}. */
  private static PrefixRule skipPrefix(String prefix, int skip) {
    return new PrefixRule(prefix, skip, 0);
  }
}