package org.xbib.elasticsearch.index.analysis.baseform;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static org.xbib.elasticsearch.index.analysis.baseform.MatchResult.EXACT_MATCH;
import static org.xbib.elasticsearch.index.analysis.baseform.MatchResult.NO_MATCH;
import static org.xbib.elasticsearch.index.analysis.baseform.MatchResult.SEQUENCE_IS_A_PREFIX;
/**
 * Dictionary backed by a finite state automaton (FSA) that maps a word to
 * the entry stored for it.
 *
 * <p>Entries are stored in the automaton as {@code key+value} byte sequences
 * encoded in UTF-8; {@link #load(Reader)} produces them by replacing every
 * tab character of an input line with {@code '+'}. Presumably the key is an
 * inflected form and the value its base form - confirm against the dictionary
 * files that are actually loaded.
 *
 * <p>{@link #load(Reader)} must be called before any {@code lookup} method;
 * until then the internal matcher is {@code null} and a lookup fails with a
 * {@link NullPointerException}.
 */
public class Dictionary {

    /** Charset used both for the stored entries and for encoding lookup keys. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /** Automaton holding all entries; assigned by {@link #load(Reader)}. */
    private FSA fsa;

    /** Traversal helper bound to {@link #fsa}; assigned by {@link #load(Reader)}. */
    private FSATraversal matcher;

    /**
     * Reads all lines from the given reader, builds the automaton from them
     * and replaces any previously loaded automaton.
     *
     * <p>Every tab in a line is replaced by {@code '+'} before the line is
     * encoded as UTF-8 and added to the automaton. The reader is always
     * closed, even when reading fails.
     *
     * @param in source of the dictionary lines; closed by this method
     * @return this dictionary, to allow call chaining
     * @throws IOException if reading from or closing the reader fails
     */
    public Dictionary load(Reader in) throws IOException {
        List<byte[]> lines = new ArrayList<byte[]>();
        BufferedReader reader = new BufferedReader(in);
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line.replace('\t', '+').getBytes(UTF8));
            }
        } finally {
            // Release the reader even if readLine() failed half-way through.
            reader.close();
        }
        // The entries are sorted with the builder's own ordering before being added.
        Collections.sort(lines, FSABuilder.LEXICAL_ORDERING);
        FSABuilder builder = new FSABuilder();
        for (byte[] b : lines) {
            builder.add(b, 0, b.length);
        }
        this.fsa = builder.complete();
        this.matcher = new FSATraversal(fsa);
        return this;
    }

    /**
     * Looks up the given word and returns the form it resolves to.
     *
     * @param prefix the word to look up
     * @return the resolved form, or the word itself (as a string) when the
     *         dictionary has no entry for it
     * @throws CharacterCodingException if the word cannot be encoded as UTF-8
     */
    public String lookup(CharSequence prefix) throws CharacterCodingException {
        return lookup(UTF8.newEncoder().encode(CharBuffer.wrap(prefix)), prefix.toString());
    }

    /**
     * Looks up the UTF-8 encoded word in {@code buf} and follows the chain of
     * entries (a found value is looked up again as a key) until a form
     * without a further entry is reached.
     *
     * <p>The chain also ends when a value is empty, when a value equals the
     * key it was found for, or when a value has already been visited during
     * this lookup. The last rule guarantees termination for dictionaries that
     * contain cyclic entries such as {@code a -> b} and {@code b -> a}; in
     * that case the last form reached before the repetition is returned.
     *
     * @param buf    array-backed buffer whose remaining bytes are the UTF-8
     *               encoded key
     * @param result value to return when the key has no entry
     * @return the resolved form, or {@code result} if there is no entry
     */
    public String lookup(ByteBuffer buf, String result) {
        final int root = fsa.getRootNode();
        // Forms already used as keys in this lookup, to detect cycles.
        // Chains are expected to be short, so a linear scan is sufficient.
        final List<String> visited = new ArrayList<String>();
        ByteBuffer key = buf;
        String current = result;
        while (true) {
            // arrayOffset() is required for buffers that do not start at
            // index 0 of their backing array (slices, offset wraps).
            MatchResult match = matcher.match(key.array(),
                    key.arrayOffset() + key.position(), key.remaining(), root);
            if (match.kind != SEQUENCE_IS_A_PREFIX) {
                // Exact match or no match: the key is not followed by anything.
                return current;
            }
            // A value exists only if the key is followed by the '+' separator
            // and the separator arc does not itself end a sequence.
            final int arc = fsa.getArc(match.node, (byte) '+');
            if (arc == 0 || fsa.isArcFinal(arc)) {
                return current;
            }
            FSAFinalStatesIterator finalStatesIterator = new FSAFinalStatesIterator(fsa, root);
            finalStatesIterator.restartFrom(fsa.getEndNode(arc));
            if (!finalStatesIterator.hasNext()) {
                return current;
            }
            // Only the first sequence delivered by the iterator is used.
            ByteBuffer next = finalStatesIterator.next();
            String value = new String(next.array(),
                    next.arrayOffset() + next.position(), next.remaining(), UTF8);
            if (value.isEmpty() || value.equals(current)) {
                return value;
            }
            if (visited.contains(value)) {
                // Cyclic entries: stop instead of looping forever.
                return current;
            }
            visited.add(current);
            current = value;
            key = next;
        }
    }
}