package org.basex.util.ft;
import static org.basex.util.Token.*;
import java.util.Arrays;
import java.util.Collection;
/**
 * English stemming algorithm, based on the publication from
 * Porter (1980), "An algorithm for suffix stripping".
 *
 * <p>The token passed to {@link #stem(byte[])} is rewritten in place; a
 * shortened copy is only created if the stem is shorter than the input.
 * All rewriting rules replace a suffix with a suffix of equal or smaller
 * length, so the input array is never outgrown. The instance keeps the
 * current token in fields while stemming.</p>
 *
 * @author BaseX Team 2005-12, BSD License
 * @author Christian Gruen
 */
final class EnglishStemmer extends InternalStemmer {
  /** Suffix "at" (step 1b). */
  private static final byte[] AT = token("at");
  /** Suffix "bl" (step 1b). */
  private static final byte[] BL = token("bl");
  /** Suffix "ed" (step 1b). */
  private static final byte[] ED = token("ed");
  /** Suffix "eed" (step 1b). */
  private static final byte[] EED = token("eed");
  /** Suffix "ies" (step 1a). */
  private static final byte[] IES = token("ies");
  /** Suffix "ing" (step 1b). */
  private static final byte[] ING = token("ing");
  /** Suffix "ion" (step 4). */
  private static final byte[] ION = token("ion");
  /** Suffix "iz" (step 1b). */
  private static final byte[] IZ = token("iz");
  /** Suffix "ll" (step 5b). */
  private static final byte[] LL = token("ll");
  /** Suffix "sion" (step 4). */
  private static final byte[] SION = token("sion");
  /** Suffix "sses" (step 1a). */
  private static final byte[] SSES = token("sses");
  /** Suffix "tion" (step 4). */
  private static final byte[] TION = token("tion");
  /** Suffix character 's' (step 1a). */
  private static final byte S = 's';
  /** Suffix character 'y' (step 1c). */
  private static final byte Y = 'y';
  /** Suffix character 'e' (step 5a). */
  private static final byte E = 'e';
  /** Suffix character 'l' (step 5b). */
  private static final byte L = 'l';

  /**
   * Step 2: pairs of suffix and replacement. The first matching suffix wins,
   * so a suffix that ends with another listed suffix must be listed first
   * ("ization" before "ation", "ational" before "tional").
   */
  private static final byte[][][] ST2 = {
    tokens("abli", "able"), tokens("alism", "al"), tokens("aliti", "al"),
    tokens("alli", "al"), tokens("anci", "ance"),
    tokens("ization", "ize"), tokens("ation", "ate"),
    tokens("ational", "ate"), tokens("ator", "ate"), tokens("biliti", "ble"),
    tokens("eli", "e"), tokens("enci", "ence"), tokens("entli", "ent"),
    tokens("fulness", "ful"), tokens("iveness", "ive"),
    tokens("iviti", "ive"), tokens("izer", "ize"),
    tokens("ousli", "ous"), tokens("ousness", "ous"),
    tokens("tional", "tion")
  };

  /** Step 3: pairs of suffix and replacement; the first matching suffix wins. */
  private static final byte[][][] ST3 = {
    tokens("alize", "al"), tokens("ative", ""),
    tokens("ful", ""), tokens("ical", "ic"), tokens("icate", "ic"),
    tokens("iciti", "ic"), tokens("ness", "")
  };

  /**
   * Step 4: suffixes to be removed. The first matching suffix wins, so
   * "ement" and "ment" must be listed before "ent". The suffix "ion"
   * (preceded by 's' or 't') is handled separately in {@link #s()}.
   */
  private static final byte[][] ST4 = tokens(
    "able", "al", "ance", "ant", "ate", "ement", "ment", "ent", "ence", "er",
    "ible", "ic", "ism", "iti", "ive", "ize", "ou", "ous"
  );

  /** Token to be stemmed (modified in place). */
  private byte[] tok;
  /** Current end of the token (exclusive); shrinks while suffixes are cut. */
  private int te;
  /** Length of the stem in front of the suffix that was matched last. */
  private int tt;

  /**
   * Constructor.
   * @param fti full-text iterator
   */
  EnglishStemmer(final FTIterator fti) {
    super(fti);
  }

  @Override
  Stemmer get(final Language l, final FTIterator fti) {
    return new EnglishStemmer(fti);
  }

  @Override
  Collection<Language> languages() {
    return collection("en");
  }

  /**
   * Stems the specified token. The input array is modified in place.
   * @param str token to be stemmed
   * @return the input array if the length of the token did not change
   *   (its contents may still have been rewritten), or a shortened copy
   */
  @Override
  protected byte[] stem(final byte[] str) {
    te = str.length;
    tok = str;
    return !s() ? str : Arrays.copyOf(str, te);
  }

  /**
   * Stems the current word by applying steps 1 to 5 of the algorithm.
   * Words with less than three characters are left untouched.
   * @return true if the length of the word has changed
   */
  private boolean s() {
    if(te < 3) return false;

    // step 1a: plurals (sses -> ss, ies -> i, ss -> ss, s -> "")
    if(e(S)) {
      if(e(SSES) || e(IES)) te -= 2;
      else if(l(te - 2) != 's') --te;
    }

    // step 1b: eed -> ee; ed/ing are removed if the stem contains a vowel
    if(e(EED)) {
      if(m() > 0) --te;
    } else if((e(ED) || e(ING)) && v()) {
      te = tt;
      if(e(AT) || e(BL) || e(IZ)) {
        // at -> ate, bl -> ble, iz -> ize: append 'e' to the complete stem
        tt = te;
        a((byte) 'e');
      } else if(te > 1) {
        final int c = l(te - 1);
        // only a double consonant is reduced (not l, s or z); double vowels
        // as in "see(ing)" must be preserved
        if(c == l(te - 2) && !v(te - 1) && c != 'l' && c != 's' && c != 'z') {
          --te;
        } else if(m() == 1) {
          // short stem ending with consonant-vowel-consonant: restore 'e'
          if(c(te)) a((byte) 'e');
        }
      }
    }

    // step 1c: y -> i if the stem contains a vowel
    if(e(Y) && v()) a((byte) 'i');

    // step 2: replace the first matching suffix if the stem measure is > 0
    for(final byte[][] s : ST2) {
      if(e(s[0])) {
        if(m() > 0) a(s[1]);
        break;
      }
    }

    // step 3: replace the first matching suffix if the stem measure is > 0
    for(final byte[][] s : ST3) {
      if(e(s[0])) {
        if(m() > 0) a(s[1]);
        break;
      }
    }

    // step 4: remove the first matching suffix if the stem measure is > 1;
    // "ion" is only removed if it is preceded by 's' or 't'
    if((e(TION) || e(SION)) && e(ION) && m() > 1) {
      te -= 3;
    } else {
      for(final byte[] s : ST4) {
        if(e(s)) {
          if(m() > 1) te = tt;
          break;
        }
      }
    }

    // step 5a: remove a final 'e'
    if(e(E)) {
      final int m = m();
      if(m > 1 || m == 1 && !c(te - 1)) --te;
    }
    // step 5b: ll -> l
    if(e(LL) && e(L) && m() > 1) --te;

    return te != tok.length;
  }

  /**
   * Checks if the three characters in front of the specified position form
   * a consonant-vowel-consonant pattern, in which the last consonant is
   * none of 'w', 'x' and 'y'.
   * @param l position (exclusive end of the checked characters)
   * @return result of check
   */
  private boolean c(final int l) {
    if(l < 3) return false;
    final int c = l(l - 1);
    return c != 'w' && c != 'x' && c != 'y' &&
      !v(l - 1) && v(l - 2) && !v(l - 3);
  }

  /**
   * Checks if the current word ends with the specified suffix. If yes,
   * the stem length {@link #tt} is set to the start of the suffix.
   * @param s suffix
   * @return result of check
   */
  private boolean e(final byte[] s) {
    final int sl = s.length;
    final int l = te - sl;
    if(l < 0) return false;
    for(int i = 0; i < sl; ++i)
      if(l(l + i) != s[i]) return false;
    tt = l;
    return true;
  }

  /**
   * Checks if the current word ends with the specified character. If yes,
   * the stem length {@link #tt} is set to the position of the character.
   * @param s suffix
   * @return result of check
   */
  private boolean e(final byte s) {
    final int l = te - 1;
    if(l < 0 || l(l) != s) return false;
    tt = l;
    return true;
  }

  /**
   * Returns the measure of the current stem (the first {@link #tt}
   * characters), i.e., the number of vowel sequences that are followed
   * by a consonant.
   * @return measure
   */
  private int m() {
    int c = 0;
    int i = -1;
    // indicates if the previous character was a vowel
    boolean v = false;
    while(++i < tt) {
      if(v ^ v(i)) {
        // a switch from vowel to consonant completes a sequence
        if(v) ++c;
        v ^= true;
      }
    }
    return c;
  }

  /**
   * Checks if the current stem (the first {@link #tt} characters)
   * contains a vowel.
   * @return result of check
   */
  private boolean v() {
    for(int i = 0; i < tt; ++i)
      if(v(i)) return true;
    return false;
  }

  /**
   * Checks if the character at the specified position is a vowel.
   * 'y' counts as a vowel if it is not the first character and if it is
   * not preceded by a vowel.
   * @param p position
   * @return result of check
   */
  private boolean v(final int p) {
    final int c = l(p);
    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' ||
      c == 'y' && p != 0 && !v(p - 1);
  }

  /**
   * Returns the lower-case variant of the character at the specified
   * position.
   * @param p position
   * @return lower-case character
   */
  private int l(final int p) {
    return lc(tok[p]);
  }

  /**
   * Replaces the suffix that was matched last with the specified character:
   * the word is cut to {@link #tt} characters, and the character is added.
   * @param c character
   */
  private void a(final byte c) {
    te = tt;
    tok[te++] = c;
  }

  /**
   * Replaces the suffix that was matched last with the specified token:
   * the word is cut to {@link #tt} characters, and the token is added.
   * @param t token
   */
  private void a(final byte[] t) {
    te = tt;
    for(final byte c : t) tok[te++] = c;
  }
}