package org.xbib.elasticsearch.common.fsa;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * A dictionary backed by a finite state automaton ({@link FSA}).
 *
 * Entries are stored as UTF-8 byte sequences in which the fields of an input line
 * are joined by {@code '+'}. A lookup matches the requested form against the automaton
 * and, when the form is followed by a {@code '+'} arc, returns the sequence stored behind
 * that separator.
 *
 * The {@code fsa} and {@code matcher} fields are only assigned by the {@code load*} methods,
 * so one of them has to be called before any {@code lookup} that reaches the automaton.
 */
public class Dictionary {

    /** Separator byte/char placed between the fields of a dictionary entry. */
    private static final char SEPARATOR = '+';

    /** Highest recursion level at which a chained lookup is still attempted. */
    private static final int MAX_LOOKUP_LEVEL = 3;

    private FSA fsa;

    private FSATraversal matcher;

    /**
     * Format of file: sourceform "\t" targetform1 "\t" targetform2 ...
     *
     * Every tab of a line is replaced by {@code '+'}; the resulting lines are encoded as UTF-8,
     * sorted with {@link FSABuilder#LEXICAL_ORDERING} and added to a new automaton.
     * The given reader is closed when reading finishes.
     *
     * @param reader the reader
     * @return the dictionary
     * @throws IOException if dictionary load fails
     */
    public Dictionary loadLines(Reader reader) throws IOException {
        List<byte[]> lines = new ArrayList<>();
        try (BufferedReader bufferedReader = new BufferedReader(reader)) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                lines.add(line.replace('\t', SEPARATOR).getBytes(StandardCharsets.UTF_8));
            }
        }
        return build(lines);
    }

    /**
     * Like {@link #loadLines(Reader)}, but the tab-separated fields of every line are put
     * into reverse order before they are joined with {@code '+'}.
     * The given reader is closed when reading finishes.
     *
     * @param reader the reader
     * @return the dictionary
     * @throws IOException if dictionary load fails
     */
    public Dictionary loadLinesReverse(Reader reader) throws IOException {
        List<byte[]> lines = new ArrayList<>();
        try (BufferedReader bufferedReader = new BufferedReader(reader)) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                // Arrays.asList is a fixed-size view over the array, which is enough for an in-place reverse
                List<String> fields = Arrays.asList(line.split("\t"));
                Collections.reverse(fields);
                lines.add(String.join(String.valueOf(SEPARATOR), fields).getBytes(StandardCharsets.UTF_8));
            }
        }
        return build(lines);
    }

    /**
     * Loads an automaton from the given stream via {@code FSABuilder.load}.
     * The stream is wrapped in a {@link DataInputStream} and is not closed by this method.
     *
     * @param inputStream the input stream
     * @return the dictionary
     * @throws IOException if dictionary load fails
     */
    public Dictionary loadFSA(InputStream inputStream) throws IOException {
        FSABuilder builder = new FSABuilder();
        this.fsa = builder.load(new DataInputStream(inputStream));
        this.matcher = new FSATraversal(fsa);
        return this;
    }

    /**
     * Looks up a form in the dictionary.
     *
     * @param prefix the form to look up; {@code null} or empty input is returned unchanged
     * @return the looked-up form, or the input if the dictionary has no replacement for it
     * @throws CharacterCodingException if the input can not be encoded as UTF-8
     */
    public CharSequence lookup(CharSequence prefix) throws CharacterCodingException {
        if (prefix == null || prefix.length() == 0) {
            return prefix;
        }
        return lookup(StandardCharsets.UTF_8.newEncoder().encode(CharBuffer.wrap(prefix)), prefix.toString());
    }

    /**
     * Looks up an already encoded form, starting at recursion level 0.
     *
     * @param buf the encoded form (the bytes between position and limit are used)
     * @param request the same form as a string; returned when no replacement is found
     * @return the looked-up form, or {@code request}
     */
    public CharSequence lookup(ByteBuffer buf, String request) {
        return lookup(buf, request, 0);
    }

    /**
     * Looks up an already encoded form.
     *
     * If the bytes are a prefix of a stored sequence and the matched node has a non-final
     * {@code '+'} arc, the first sequence produced by a {@link FSAFinalStatesIterator} restarted
     * at the end node of that arc is decoded as UTF-8. An empty result, or a result equal to
     * {@code request}, is returned directly; any other result is looked up again with
     * {@code level + 1}. In all other cases {@code request} is returned.
     *
     * @param buf the encoded form (the bytes between position and limit are used); the buffer's
     *            position is not modified
     * @param request the same form as a string; returned when no replacement is found
     * @param level the current recursion level; above {@code 3} the request is returned as is
     * @return the looked-up form, or {@code request}
     */
    public CharSequence lookup(ByteBuffer buf, String request, int level) {
        if (level > MAX_LOOKUP_LEVEL) {
            return request;
        }
        ByteBuffer input = arrayBacked(buf);
        // position() is relative to arrayOffset(), so both are needed to index the backing array
        MatchResult match = matcher.match(input.array(), input.arrayOffset() + input.position(),
                input.remaining(), fsa.getRootNode());
        if (match.getKind() != MatchResult.SEQUENCE_IS_A_PREFIX) {
            return request;
        }
        final int arc = fsa.getArc(match.getNode(), (byte) SEPARATOR);
        if (arc == 0 || fsa.isArcFinal(arc)) {
            return request;
        }
        FSAFinalStatesIterator finalStatesIterator = new FSAFinalStatesIterator(fsa, fsa.getRootNode());
        finalStatesIterator.restartFrom(fsa.getEndNode(arc));
        if (!finalStatesIterator.hasNext()) {
            return request;
        }
        ByteBuffer buffer = arrayBacked(finalStatesIterator.next());
        String s = new String(buffer.array(), buffer.arrayOffset() + buffer.position(),
                buffer.remaining(), StandardCharsets.UTF_8);
        return s.isEmpty() || s.equals(request) ? s : lookup(buffer, s, level + 1);
    }

    /**
     * @return the automaton assigned by the last load, or {@code null} if nothing was loaded
     */
    public FSA fsa() {
        return fsa;
    }

    /**
     * Sorts the encoded entries, builds the automaton from them and installs it together
     * with a matching traversal helper.
     */
    private Dictionary build(List<byte[]> entries) throws IOException {
        entries.sort(FSABuilder.LEXICAL_ORDERING);
        FSABuilder builder = new FSABuilder();
        for (byte[] entry : entries) {
            builder.add(entry, 0, entry.length);
        }
        this.fsa = builder.complete();
        this.matcher = new FSATraversal(fsa);
        return this;
    }

    /**
     * Returns the buffer itself if it exposes a backing array, otherwise a heap copy of its
     * remaining bytes (direct and read-only buffers do not support {@code array()}).
     */
    private static ByteBuffer arrayBacked(ByteBuffer buf) {
        if (buf.hasArray()) {
            return buf;
        }
        ByteBuffer copy = ByteBuffer.allocate(buf.remaining());
        // duplicate() keeps the caller's position and limit untouched
        copy.put(buf.duplicate());
        copy.flip();
        return copy;
    }
}