package org.xbib.elasticsearch.common.decompound.fst; import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.FST.INPUT_TYPE; import org.apache.lucene.util.fst.NoOutputs; import java.io.IOException; import java.io.InputStream; import java.io.UncheckedIOException; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Deque; import java.util.Iterator; import java.util.List; import static org.apache.lucene.util.UnicodeUtil.newString; /** * This is a copy of org.apache.lucene.analysis.de.compounds.GermanCompoundSplitter from * https://github.com/dweiss/compound-splitter */ public class FstDecompounder { private static final String[] morphemes = { "e", "es", "en", "er", "n", "ens", "ns", "s" }; /** * A static FSA with inflected and base surface forms. * * @see "http://www.wolfganglezius.de/doku.php?id=cl:surfaceForms" */ private final FST<Object> surfaceForms; /** * A static FSA with glue glueMorphemes. This could be merged into a single FSA * together with {@link #surfaceForms}, but I leave it separate for now. */ private final FST<Object> glueMorphemes; public FstDecompounder(InputStream inputStream, String[] glue) throws IOException { try { this.surfaceForms = new FST<>(new InputStreamDataInput(inputStream), NoOutputs.getSingleton()); } finally { inputStream.close(); } // set up glue morphemes this.glueMorphemes = createGlueMorphemes(glue != null && glue.length > 0 ? glue : morphemes); } private FST<Object> createGlueMorphemes(String[] glue) throws IOException { for (int i = 0; i < glue.length; i++) { glue[i] = new StringBuilder(glue[i]).reverse().toString(); } Arrays.sort(glue); final Builder<Object> builder = new Builder<>(INPUT_TYPE.BYTE4, NoOutputs.getSingleton()); final Object nothing = NoOutputs.getSingleton().getNoOutput(); IntsRefBuilder intsBuilder = new IntsRefBuilder(); for (String morpheme : glue) { fromUTF16ToUTF32(morpheme, intsBuilder); builder.add(intsBuilder.get(), nothing); } return builder.finish(); } public List<String> decompound(String word) { CharSequence chars = split(word); if (chars != null) { return Arrays.asList(chars.toString().split(",'")); } return Collections.singletonList(word); } /** * Splits the input sequence of characters into separate words if this sequence is * potentially a compound word. * * @param word The word to be split. * @return Returns <code>null</code> if this word is not recognized at all. Returns a * character sequence with '.'-delimited compound chunks (if ambiguous * interpretations are possible, they are separated by a ',' character). The * returned buffer will change with each call to <code>split</code> so copy the * content if needed. */ public CharSequence split(CharSequence word) { try { StringBuilder builder = new StringBuilder(); builder.append(word); builder.reverse(); for (int i = builder.length(); --i > 0; ) { builder.setCharAt(i, Character.toLowerCase(builder.charAt(i))); } IntsRefBuilder utf32Builder = new IntsRefBuilder(); IntsRef utf32 = fromUTF16ToUTF32(builder, utf32Builder).get(); builder.setLength(0); Deque<Chunk> chunks = new ArrayDeque<>(); matchWord(utf32, utf32.offset, builder, chunks); return builder.length() == 0 ? null : builder; } catch (IOException e) { throw new UncheckedIOException(e); } } /** * Consume a word, then recurse into glue morphemes/ further words. */ private void matchWord(IntsRef utf32, int offset, StringBuilder builder, Deque<Chunk> chunks) throws IOException { FST.Arc<Object> arc = surfaceForms.getFirstArc(new FST.Arc<>()); FST.Arc<Object> scratch = new FST.Arc<>(); List<Chunk> wordsFromHere = new ArrayList<>(); BytesReader br = surfaceForms.getBytesReader(); for (int i = offset; i < utf32.length; i++) { int chr = utf32.ints[i]; arc = surfaceForms.findTargetArc(chr, arc, arc, br); if (arc == null) { break; } if (surfaceForms.findTargetArc('<', arc, scratch, br) != null) { wordsFromHere.add(new Chunk(offset, i + 1, ChunkType.WORD)); } } /* * This array stores the minimum number of decomposition words during traversals to * avoid splitting a larger word into smaller chunks. */ IntsRefBuilder maxPathsBuilder = new IntsRefBuilder(); maxPathsBuilder.grow(utf32.length + 1); Arrays.fill(maxPathsBuilder.ints(), 0, utf32.length + 1, Integer.MAX_VALUE); int[] maxPaths = maxPathsBuilder.ints(); for (int j = wordsFromHere.size(); --j >= 0; ) { final Chunk ch = wordsFromHere.get(j); if (chunks.size() + 1 > maxPaths[ch.end]) { continue; } maxPaths[ch.end] = chunks.size() + 1; chunks.addLast(ch); if (ch.end == utf32.offset + utf32.length) { // add match to the builder if (builder.length() > 0) { builder.append(","); } boolean first = true; Iterator<Chunk> i = chunks.descendingIterator(); while (i.hasNext()) { Chunk chunk = i.next(); if (chunk.type == ChunkType.WORD) { if (!first) { builder.append('.'); } first = false; builder.append(new StringBuilder(newString(utf32.ints, chunk.start, chunk.end - chunk.start)).reverse()); } } } else { matchWord(utf32, ch.end, builder, chunks); matchGlueMorpheme(utf32, ch.end, builder, chunks); } chunks.removeLast(); } } /** * Consume a maximal glue morpheme, if any, and consume the next word. */ private void matchGlueMorpheme(IntsRef utf32, final int offset, StringBuilder builder, Deque<Chunk> chunks) throws IOException { FST.Arc<Object> arc = glueMorphemes.getFirstArc(new FST.Arc<>()); BytesReader br = glueMorphemes.getBytesReader(); for (int i = offset; i < utf32.length; i++) { int chr = utf32.ints[i]; arc = glueMorphemes.findTargetArc(chr, arc, arc, br); if (arc == null) { break; } if (arc.isFinal()) { chunks.addLast(new Chunk(offset, i + 1, ChunkType.GLUE_MORPHEME)); if (i + 1 < utf32.offset + utf32.length) { matchWord(utf32, i + 1, builder, chunks); } chunks.removeLast(); } } } /** * Convert a character sequence into full unicode codepoints. */ private static IntsRefBuilder fromUTF16ToUTF32(CharSequence s, IntsRefBuilder builder) { builder.clear(); for (int charIdx = 0, charLimit = s.length(); charIdx < charLimit; ) { final int utf32 = Character.codePointAt(s, charIdx); builder.append(utf32); charIdx += Character.charCount(utf32); } return builder; } /** * Category for a given chunk of a compound. */ enum ChunkType { GLUE_MORPHEME, WORD } /** * A slice of a compound word. */ private final class Chunk { final int start; final int end; final ChunkType type; Chunk(int start, int end, ChunkType type) { this.start = start; this.end = end; this.type = type; } } }