package org.xbib.elasticsearch.common.fst;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.NoOutputs;

/**
 * Compiles an FST without outputs (an FSA) from a UTF-8 encoded word list.
 *
 * <p>Each input line contributes its first whitespace-delimited token, lower-cased.
 * Every token is stored twice: once as-is followed by {@code '>'}, and once reversed
 * followed by {@code '<'}. The entries are de-duplicated and sorted here before they
 * are handed to the FST builder.
 */
public class FstCompiler {

    /** Separator used to isolate the first token of a line. */
    private static final Pattern pattern = Pattern.compile("\\s+");

    /**
     * Reads words from {@code inputStream}, builds the automaton and saves it to
     * {@code outputStream}.
     *
     * <p>Lines containing {@code '#'} anywhere are skipped, as are lines that are
     * empty or consist only of whitespace. The input stream is always closed when
     * reading ends; the output stream is closed after the automaton has been written.
     *
     * @param inputStream the input stream, read as UTF-8 text, one word per line
     * @param outputStream the output stream that receives the serialized FST
     * @throws IOException if reading or writing fails, or if the input contains no words
     */
    public void compile(InputStream inputStream, OutputStream outputStream) throws IOException {
        final HashSet<BytesRef> words = new HashSet<>();
        // try-with-resources so the reader is released even when readLine() fails.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            final StringBuilder stringBuilder = new StringBuilder();
            String last = null;
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.indexOf('#') >= 0) {
                    continue;
                }
                // Trim before splitting: splitting a whitespace-only line yields an empty
                // array, and a line with leading whitespace yields an empty first token.
                final String trimmed = line.trim();
                if (trimmed.isEmpty()) {
                    continue;
                }
                // Locale.ROOT keeps the result independent of the JVM's default locale.
                final String word = pattern.split(trimmed)[0].toLowerCase(Locale.ROOT);
                if (word.isEmpty() || word.equals(last)) {
                    // Cheap shortcut for consecutive duplicates; the set handles the rest.
                    continue;
                }
                last = word;
                /*
                 * Add the word to the hash set in left-to-right characters order and reversed
                 * for easier matching later on.
                 */
                stringBuilder.setLength(0);
                stringBuilder.append(word);
                final int len = stringBuilder.length();
                stringBuilder.append('>');
                words.add(new BytesRef(stringBuilder));
                stringBuilder.setLength(len);
                stringBuilder.reverse().append('<');
                words.add(new BytesRef(stringBuilder));
            }
        }

        // The builder requires its inputs in sorted order.
        final BytesRef[] all = words.toArray(new BytesRef[0]);
        Arrays.sort(all);

        final Object nothing = NoOutputs.getSingleton().getNoOutput();
        final Builder<Object> builder = new Builder<>(INPUT_TYPE.BYTE4, NoOutputs.getSingleton());
        final IntsRefBuilder intsRef = new IntsRefBuilder();
        for (BytesRef bytesRef : all) {
            intsRef.clear();
            // Decodes the UTF-8 bytes into code points, matching INPUT_TYPE.BYTE4.
            intsRef.copyUTF8Bytes(bytesRef);
            builder.add(intsRef.get(), nothing);
        }

        final FST<Object> fst = builder.finish();
        if (fst == null) {
            // finish() returns null when the automaton accepts nothing.
            throw new IOException("no words found in input, nothing to compile");
        }
        try (final OutputStreamDataOutput out = new OutputStreamDataOutput(outputStream)) {
            fst.save(out);
        }
    }
}