/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Set;

import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;

/**
 * Dictionary represented by a text file.
 *
 * <p>Format allowed: 1 entry per line:<br>
 * An entry can be: <br>
 * <ul>
 * <li>suggestion</li>
 * <li>suggestion <code>fieldDelimiter</code> weight</li>
 * <li>suggestion <code>fieldDelimiter</code> weight <code>fieldDelimiter</code> payload</li>
 * </ul>
 * where the default <code>fieldDelimiter</code> is {@value #DEFAULT_FIELD_DELIMITER}<br>
 * <p>
 * <b>NOTE:</b>
 * <ul>
 * <li>In order to have payload enabled, the first entry has to have a payload</li>
 * <li>If the weight for an entry is not specified then a value of 1 is used</li>
 * <li>A payload cannot be specified without having the weight specified for an entry</li>
 * <li>If the payload for an entry is not specified (assuming payload is enabled)
 *  then an empty payload is returned</li>
 * <li>An entry cannot have more than two <code>fieldDelimiter</code></li>
 * <li>Lines are split with {@link String#split(String)}, so the <code>fieldDelimiter</code>
 *  is interpreted as a regular expression and trailing empty fields are dropped. A line
 *  that consists only of delimiters is therefore read like an empty line: an empty
 *  suggestion with a weight of 1</li>
 * <li>Every iterator returned by {@link #getEntryIterator()} reads from the same underlying
 *  reader, which is closed once the end of the input has been reached</li>
 * </ul>
 * <p>
 * <b>Example:</b><br>
 * word1 word2 TAB 100 TAB payload1<br>
 * word3 TAB 101<br>
 * word4 word3 TAB 102<br>
 */
public class FileDictionary implements Dictionary {

  /**
   * Tab-delimited fields are most common thus the default, but one can override this via the constructor
   */
  public static final String DEFAULT_FIELD_DELIMITER = "\t";

  /** Source of the dictionary lines; shared by all iterators created from this dictionary. */
  private final BufferedReader in;

  /** Set once the reader has returned end-of-input (and has been closed). */
  private boolean done = false;

  /** Delimiter handed to {@link String#split(String)} for every line. */
  private final String fieldDelimiter;

  /**
   * Creates a dictionary based on an inputstream.
   * Using {@link #DEFAULT_FIELD_DELIMITER} as the
   * field separator in a line.
   * <p>
   * NOTE: content is treated as UTF-8
   */
  public FileDictionary(InputStream dictFile) {
    this(dictFile, DEFAULT_FIELD_DELIMITER);
  }

  /**
   * Creates a dictionary based on a reader.
   * Using {@link #DEFAULT_FIELD_DELIMITER} as the
   * field separator in a line.
   */
  public FileDictionary(Reader reader) {
    this(reader, DEFAULT_FIELD_DELIMITER);
  }

  /**
   * Creates a dictionary based on a reader.
   * Using <code>fieldDelimiter</code> to separate out the
   * fields in a line.
   * <p>
   * NOTE: <code>fieldDelimiter</code> is passed to {@link String#split(String)}
   * and is therefore interpreted as a regular expression
   */
  public FileDictionary(Reader reader, String fieldDelimiter) {
    in = new BufferedReader(reader);
    this.fieldDelimiter = fieldDelimiter;
  }

  /**
   * Creates a dictionary based on an inputstream.
   * Using <code>fieldDelimiter</code> to separate out the
   * fields in a line.
   * <p>
   * NOTE: content is treated as UTF-8; <code>fieldDelimiter</code> is passed to
   * {@link String#split(String)} and is therefore interpreted as a regular expression
   */
  public FileDictionary(InputStream dictFile, String fieldDelimiter) {
    in = new BufferedReader(IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8));
    this.fieldDelimiter = fieldDelimiter;
  }

  /**
   * Returns an iterator over the entries of the underlying reader.
   * Creating the iterator already reads the first line.
   *
   * @throws RuntimeException wrapping the {@link IOException} if reading the first line fails
   * @throws IllegalArgumentException if the first line has more than 3 fields
   */
  @Override
  public InputIterator getEntryIterator() {
    try {
      return new FileIterator();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Iterates over the lines of the enclosing dictionary's reader, exposing the
   * suggestion, weight and (optionally) payload of the current line.
   */
  final class FileIterator implements InputIterator {
    private long curWeight;
    private final BytesRefBuilder spare = new BytesRefBuilder();
    private BytesRefBuilder curPayload = new BytesRefBuilder();
    private boolean isFirstLine = true;
    private boolean hasPayloads = false;

    /**
     * Reads the first line eagerly: whether it carries a payload decides
     * whether payloads are enabled for the whole iteration.
     */
    private FileIterator() throws IOException {
      String line = in.readLine();
      if (line == null) {
        done = true;
        IOUtils.close(in);
      } else {
        String[] fields = splitLine(line);
        // payloads are enabled only if the very first entry has one
        hasPayloads = fields.length == 3;
        readEntry(fields);
      }
    }

    @Override
    public long weight() {
      return curWeight;
    }

    /**
     * Advances to the next entry.
     *
     * @return the suggestion of the next entry, or <code>null</code> once the input is exhausted
     * @throws IOException if reading from the underlying reader fails
     * @throws IllegalArgumentException if the line has more than 3 fields
     */
    @Override
    public BytesRef next() throws IOException {
      if (done) {
        return null;
      }
      if (isFirstLine) {
        // the first entry was already parsed by the constructor
        isFirstLine = false;
        return spare.get();
      }
      String line = in.readLine();
      if (line == null) {
        done = true;
        IOUtils.close(in);
        return null;
      }
      readEntry(splitLine(line));
      return spare.get();
    }

    @Override
    public BytesRef payload() {
      return (hasPayloads) ? curPayload.get() : null;
    }

    @Override
    public boolean hasPayloads() {
      return hasPayloads;
    }

    /** Splits a raw line into its fields, rejecting lines with more than 3 fields. */
    private String[] splitLine(String line) {
      String[] fields = line.split(fieldDelimiter);
      if (fields.length > 3) {
        throw new IllegalArgumentException("More than 3 fields in one line");
      }
      return fields;
    }

    /** Stores suggestion, weight and payload of an already split line as the current entry. */
    private void readEntry(String[] fields) {
      // String.split drops trailing empty strings, so a line made only of delimiters
      // produces no fields at all; read it like an empty line instead of failing on fields[0]
      spare.copyChars(fields.length == 0 ? "" : fields[0]);
      if (fields.length >= 2) {
        readWeight(fields[1]);
      } else {
        // no weight given
        curWeight = 1;
      }
      if (hasPayloads) {
        if (fields.length == 3) {
          curPayload.copyChars(fields[2]);
        } else {
          // have an empty payload
          curPayload = new BytesRefBuilder();
        }
      }
    }

    /** Parses the weight as a long, falling back to a truncated double. */
    private void readWeight(String weight) {
      // keep reading floats for bw compat
      try {
        curWeight = Long.parseLong(weight);
      } catch (NumberFormatException e) {
        curWeight = (long) Double.parseDouble(weight);
      }
    }

    @Override
    public Set<BytesRef> contexts() {
      return null;
    }

    @Override
    public boolean hasContexts() {
      return false;
    }
  }
}