package org.basex.util.ft;

import java.util.Collection;

import org.basex.util.TokenBuilder;

/**
 * German stemming algorithm, derived from the Apache Lucene project and the
 * report "A Fast and Simple Stemming Algorithm for German Words" by
 * Jörg Caumanns.
 *
 * <p>A word is processed in five steps: frequent character sequences are
 * masked by single placeholder bytes ({@link #subst}), suffixes are stripped
 * ({@link #strip}), two special cases are normalized ({@link #opt}), the
 * placeholders are expanded again ({@link #resub}), and a "ge" in front of
 * another "ge" is removed ({@link #part}).</p>
 *
 * @author BaseX Team 2005-12, BSD License
 * @author Christian Gruen
 */
final class GermanStemmer extends InternalStemmer {
  /**
   * Substitution counter. Reset by {@link #subst}; incremented once for each
   * sharp s and by the number of additional input bytes that are folded into
   * each placeholder. {@link #strip} adds it to the current length in its
   * minimum-length checks.
   */
  private int subst;

  /**
   * Constructor.
   * @param fti full-text iterator
   */
  GermanStemmer(final FTIterator fti) {
    super(fti);
  }

  @Override
  Stemmer get(final Language l, final FTIterator fti) {
    return new GermanStemmer(fti);
  }

  @Override
  Collection<Language> languages() {
    return collection("de");
  }

  /**
   * Stems the specified word by running all steps of the algorithm.
   * @param word word to be stemmed (may be empty)
   * @return stemmed word
   */
  @Override
  protected byte[] stem(final byte[] word) {
    // subst() resets the substitution counter itself
    final TokenBuilder masked = subst(new TokenBuilder(word));
    final TokenBuilder stripped = opt(strip(masked));
    return part(resub(stripped)).finish();
  }

  /**
   * Does some substitutions. The input is not modified; the result is written
   * to a new token builder:
   * <ul>
   *   <li>a character equal to the previously written one becomes {@code '*'}</li>
   *   <li>the umlauts ä, ö and ü become {@code a}, {@code o} and {@code u}</li>
   *   <li>the sharp s (ß) becomes {@code ss}</li>
   *   <li>{@code sch}, {@code ch}, {@code ei}, {@code ie}, {@code ig} and
   *     {@code st} become the placeholders {@code '\1'} to {@code '\6'}</li>
   * </ul>
   * @param tb string builder
   * @return substituted string
   */
  private TokenBuilder subst(final TokenBuilder tb) {
    subst = 0;
    final int s = tb.size();
    final TokenBuilder tmp = new TokenBuilder(s);
    // previously written (already substituted) character
    int ls = 0;
    // look-ahead character, or 0 at the end of the input; the check avoids
    // reading position 0 of an empty input
    int nx = s > 0 ? tb.cp(0) : 0;
    for(int c = 0; c < s;) {
      // number of additional input bytes consumed by a placeholder
      int sb = 0;
      int ch = nx;
      // move behind the current character and fetch the next one
      c += tb.cl(c);
      nx = c < s ? tb.cp(c) : 0;
      if(ch == ls) {
        ch = '*';
      } else if(ch == '\u00e4') {
        ch = 'a';
      } else if(ch == '\u00f6') {
        ch = 'o';
      } else if(ch == '\u00fc') {
        ch = 'u';
      } else if(ch == '\u00df') {
        // sharp s: the first 's' is written here, the second one below
        tmp.add('s');
        ch = 's';
        subst++;
      } else if(ch == 's' && nx == 'c' && c + 1 < s && tb.get(c + 1) == 'h') {
        ch = '\1';
        sb = 2;
      } else if(ch == 'c' && nx == 'h') {
        ch = '\2';
        sb = 1;
      } else if(ch == 'e' && nx == 'i') {
        ch = '\3';
        sb = 1;
      } else if(ch == 'i' && nx == 'e') {
        ch = '\4';
        sb = 1;
      } else if(ch == 'i' && nx == 'g') {
        ch = '\5';
        sb = 1;
      } else if(ch == 's' && nx == 't') {
        ch = '\6';
        sb = 1;
      }
      if(sb > 0) {
        // skip the bytes that were folded into the placeholder
        c += sb;
        nx = c < s ? tb.cp(c) : 0;
        subst += sb;
      }
      ls = ch;
      tmp.add(ch);
    }
    return tmp;
  }

  /**
   * Strips suffixes. As long as more than three bytes are left, a trailing
   * {@code nd}, {@code em} or {@code er} (if the length plus the substitution
   * counter is large enough) or a single trailing {@code e}, {@code s},
   * {@code n} or {@code t} is removed. The builder is modified in place.
   * @param tb token builder
   * @return token builder
   */
  private TokenBuilder strip(final TokenBuilder tb) {
    while(tb.size() > 3) {
      final int tl = tb.size();
      final int c1 = tb.get(tl - 1);
      final int c2 = tb.get(tl - 2);
      if(tl + subst > 5 && c2 == 'n' && c1 == 'd') {
        tb.size(tl - 2);
      } else if(tl + subst > 4 && c2 == 'e' && (c1 == 'm' || c1 == 'r')) {
        tb.size(tl - 2);
      } else if(c1 == 'e' || c1 == 's' || c1 == 'n' || c1 == 't') {
        tb.size(tl - 1);
      } else {
        break;
      }
    }
    return tb;
  }

  /**
   * Does optimizations. If the builder has more than five bytes and ends with
   * {@code erin*} (for example, what is left of "...erinnen" after stripping),
   * the trailing marker is dropped and suffixes are stripped once more.
   * Afterwards, a trailing {@code z} is replaced by {@code x}. The builder is
   * modified in place.
   * @param tb token builder
   * @return token builder
   */
  private TokenBuilder opt(final TokenBuilder tb) {
    int tl = tb.size();
    if(tl > 5 && tb.get(tl - 5) == 'e' && tb.get(tl - 4) == 'r' &&
        tb.get(tl - 3) == 'i' && tb.get(tl - 2) == 'n' && tb.get(tl - 1) == '*') {
      tb.size(tl - 1);
      strip(tb);
    }
    tl = tb.size();
    // an empty builder has no last byte to inspect
    if(tl > 0 && tb.get(tl - 1) == 'z') tb.set((byte) 'x', tl - 1);
    return tb;
  }

  /**
   * Undoes the changes made by substitute: placeholders are expanded to the
   * character sequences they stand for, and each {@code '*'} is replaced by a
   * copy of the character that was written to the result right before it.
   * A {@code '*'} with no preceding character is kept as is.
   * @param tb token builder
   * @return new token builder
   */
  private static TokenBuilder resub(final TokenBuilder tb) {
    final TokenBuilder tmp = new TokenBuilder();
    final int s = tb.size();
    // Offset in tmp at which the most recently written character starts, or
    // -1 if nothing was written yet. The offset is tracked separately because
    // input and output positions differ once a placeholder has been expanded.
    int last = -1;
    for(int c = 0; c < s; c++) {
      final int ch = tb.get(c);
      if(ch == '*' && last >= 0) {
        // duplicate all bytes of the preceding character
        final int end = tmp.size();
        for(int i = last; i < end; i++) {
          final int b = tmp.get(i);
          tmp.add(b);
        }
        last = end;
      } else if(ch == '\1') {
        tmp.add('s').add('c').add('h');
        last = tmp.size() - 1;
      } else if(ch == '\2') {
        tmp.add('c').add('h');
        last = tmp.size() - 1;
      } else if(ch == '\3') {
        tmp.add('e').add('i');
        last = tmp.size() - 1;
      } else if(ch == '\4') {
        tmp.add('i').add('e');
        last = tmp.size() - 1;
      } else if(ch == '\5') {
        tmp.add('i').add('g');
        last = tmp.size() - 1;
      } else if(ch == '\6') {
        tmp.add('s').add('t');
        last = tmp.size() - 1;
      } else {
        // UTF-8 continuation bytes (10xxxxxx) belong to the current character
        if((ch & 0xC0) != 0x80) last = tmp.size();
        tmp.add(ch);
      }
    }
    return tmp;
  }

  /**
   * Removes a particle denotion ("ge") from a term: the first two bytes of
   * the first occurrence of {@code gege} are deleted. The builder is modified
   * in place.
   *
   * <p>NOTE(review): the scan stops at offset {@code size - 5}, so a
   * {@code gege} that forms the last four bytes is not examined. Confirm
   * that this is intended.</p>
   * @param tb token builder
   * @return token builder
   */
  private static TokenBuilder part(final TokenBuilder tb) {
    for(int c = 0; c < tb.size() - 4; c++) {
      if(tb.get(c) == 'g' && tb.get(c + 1) == 'e' &&
          tb.get(c + 2) == 'g' && tb.get(c + 3) == 'e') {
        tb.delete(c, 2);
        break;
      }
    }
    return tb;
  }
}