/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.feeds;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Constructor;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Properties;

import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.IOUtils;

/**
 * A {@link ContentSource} reading one line at a time as a
 * {@link org.apache.lucene.document.Document} from a single file. This saves IO
 * cost (over DirContentSource) of recursing through a directory and opening a
 * new file for every document.<br>
 * The expected format of each line is (arguments are separated by &lt;TAB&gt;):
 * <i>title, date, body</i>. If a line is read in a different format, a
 * {@link RuntimeException} will be thrown. In general, you should use this
 * content source for files that were created with {@link WriteLineDocTask}.<br>
 * <br>
 * Config properties:
 * <ul>
 * <li>docs.file=&lt;path to the file&gt;
 * <li>content.source.encoding - default to UTF-8.
 * <li>line.parser - default to {@link HeaderLineParser} if a header line exists which differs
 *     from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
 * </ul>
 */
public class LineDocSource extends ContentSource {

  /** Reader of a single input line into {@link DocData}. */
  public abstract static class LineParser {
    protected final String[] header;

    /**
     * Construct with the header
     * @param header header line found in the input file, or null if none
     */
    public LineParser(String[] header) {
      this.header = header;
    }

    /** parse an input line and fill doc data appropriately */
    public abstract void parseLine(DocData docData, String line);
  }

  /**
   * {@link LineParser} which ignores the header passed to its constructor
   * and assumes simply that field names and their order are the same
   * as in {@link WriteLineDocTask#DEFAULT_FIELDS}
   */
  public static class SimpleLineParser extends LineParser {
    public SimpleLineParser(String[] header) {
      super(header);
    }

    /**
     * Splits the line into exactly three separator-delimited fields and stores them
     * as title, date and body, in that order.
     *
     * @throws RuntimeException if the line has fewer or more than three fields
     */
    @Override
    public void parseLine(DocData docData, String line) {
      int k1 = 0;
      int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
      if (k2<0) {
        throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
      }
      docData.setTitle(line.substring(k1,k2));
      k1 = k2+1;
      k2 = line.indexOf(WriteLineDocTask.SEP, k1);
      if (k2<0) {
        throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
      }
      docData.setDate(line.substring(k1,k2));
      k1 = k2+1;
      k2 = line.indexOf(WriteLineDocTask.SEP, k1);
      if (k2>=0) {
        throw new RuntimeException("line: [" + line + "] is in an invalid format (too many separators)!");
      }
      // last one
      docData.setBody(line.substring(k1));
    }
  }

  /**
   * {@link LineParser} which sets field names and order by
   * the header - any header - of the lines file.
   * It is less efficient than {@link SimpleLineParser} but more powerful.
   */
  public static class HeaderLineParser extends LineParser {
    private enum FieldName { NAME , TITLE , DATE , BODY, PROP }

    /** Maps each column position of the header to the doc data field it feeds. */
    private final FieldName[] posToF;

    /**
     * Construct with the header, classifying every column once up front.
     * Columns whose name is not one of the name/title/date/body fields are
     * stored as properties keyed by the column name.
     * @param header header line found in the input file
     */
    public HeaderLineParser(String[] header) {
      super(header);
      posToF = new FieldName[header.length];
      for (int i=0; i<header.length; i++) {
        String f = header[i];
        if (DocMaker.NAME_FIELD.equals(f)) {
          posToF[i] = FieldName.NAME;
        } else if (DocMaker.TITLE_FIELD.equals(f)) {
          posToF[i] = FieldName.TITLE;
        } else if (DocMaker.DATE_FIELD.equals(f)) {
          posToF[i] = FieldName.DATE;
        } else if (DocMaker.BODY_FIELD.equals(f)) {
          posToF[i] = FieldName.BODY;
        } else {
          posToF[i] = FieldName.PROP;
        }
      }
    }

    /**
     * Splits the line on the separator and assigns each field according to the header.
     *
     * @throws RuntimeException if the number of fields differs from the header length
     */
    @Override
    public void parseLine(DocData docData, String line) {
      int n = 0;
      int k1 = 0;
      int k2;
      while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
        if (n>=header.length) {
          throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
        }
        setDocDataField(docData, n, line.substring(k1,k2));
        ++n;
        k1 = k2 + 1;
      }
      // n separators were consumed, so the line holds n+1 fields
      if (n!=header.length-1) {
        throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
      }
      // last one
      setDocDataField(docData, n, line.substring(k1));
    }

    /** Stores {@code text} into the doc data field that the header assigns to {@code position}. */
    private void setDocDataField(DocData docData, int position, String text) {
      switch(posToF[position]) {
        case NAME:
          docData.setName(text);
          break;
        case TITLE:
          docData.setTitle(text);
          break;
        case DATE:
          docData.setDate(text);
          break;
        case BODY:
          docData.setBody(text);
          break;
        case PROP:
          // properties are created lazily, on the first non-standard column
          Properties p = docData.getProps();
          if (p==null) {
            p = new Properties();
            docData.setProps(p);
          }
          p.setProperty(header[position], text);
          break;
      }
    }
  }

  private Path file;
  private BufferedReader reader;
  private int readCount;

  private LineParser docDataLineReader = null;
  private boolean skipHeaderLine = false;

  /**
   * (Re)opens the input file, closing any previously opened reader first, and
   * skips the header line when one was detected on an earlier read.
   *
   * @throws RuntimeException wrapping any {@link IOException} raised while opening or reading
   */
  private synchronized void openFile() {
    try {
      if (reader != null) {
        reader.close();
      }
      InputStream is = StreamUtils.inputStream(file);
      reader = new BufferedReader(new InputStreamReader(is, encoding), StreamUtils.BUFFER_SIZE);
      if (skipHeaderLine) {
        reader.readLine(); // skip one line - the header line - already handled that info
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** Closes the underlying reader, if one is open. */
  @Override
  public void close() throws IOException {
    if (reader != null) {
      reader.close();
      reader = null;
    }
  }

  /**
   * Reads the next document line from the file and parses it into {@code docData}.
   * The very first line read also selects the line parser; if that line is a
   * header it is consumed and the following line is returned instead.
   *
   * @param docData instance to clear and fill with the parsed line
   * @return the given {@code docData}, filled
   * @throws NoMoreDataException if the end of the file is reached and {@code forever}
   *         is not set, or if the file still yields no document line right after
   *         being reopened (empty or header-only input)
   * @throws IOException if reading the line fails
   */
  @Override
  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    final String line;
    final int myID;

    synchronized(this) {
      String next;
      // Reopen at most once per call: an empty or header-only file would
      // otherwise be reopened endlessly when 'forever' is set.
      boolean reopened = false;
      while (true) {
        next = reader.readLine();
        if (next == null) {
          if (!forever || reopened) {
            throw new NoMoreDataException();
          }
          // Reset the file
          openFile();
          reopened = true;
          continue;
        }
        if (docDataLineReader == null) {
          // first line ever, one time initialization,
          docDataLineReader = createDocDataLineReader(next);
          if (skipHeaderLine) {
            // that line was the header, not a document
            continue;
          }
        }
        break;
      }
      line = next;
      // increment IDS only once...
      myID = readCount++;
    }

    // The date String was written in the format of DateTools.dateToString.
    docData.clear();
    docData.setID(myID);
    docDataLineReader.parseLine(docData, line);
    return docData;
  }

  /**
   * Chooses the line parser based on the first line of the file and on the
   * {@code line.parser} config property. Marks the header line for skipping
   * if the first line turns out to be a header.
   *
   * @param line first line read from the input file
   * @return the parser to use for all document lines
   * @throws RuntimeException if the configured parser class cannot be instantiated
   */
  private LineParser createDocDataLineReader(String line) {
    String[] header;
    String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;
    if (line.startsWith(headIndicator)) {
      header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
      skipHeaderLine = true; // mark to skip the header line when input file is reopened
    } else {
      header = WriteLineDocTask.DEFAULT_FIELDS;
    }

    // if a specific DocDataLineReader was configured, must respect it
    String docDataLineReaderClassName = getConfig().get("line.parser", null);
    if (docDataLineReaderClassName!=null) {
      try {
        final Class<? extends LineParser> clazz =
          Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
        Constructor<? extends LineParser> cnstr = clazz.getConstructor(String[].class);
        // cast so the array is passed as a single argument, not spread as varargs
        return cnstr.newInstance((Object)header);
      } catch (Exception e) {
        throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
      }
    }

    // if this is the simple case, use the simple parser
    if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
      return new SimpleLineParser(header);
    }
    return new HeaderLineParser(header);
  }

  /** Resets the source and reopens the input file from its beginning. */
  @Override
  public void resetInputs() throws IOException {
    super.resetInputs();
    openFile();
  }

  /**
   * Reads the {@code docs.file} property and falls back to UTF-8 when no
   * encoding has been set.
   *
   * @throws IllegalArgumentException if {@code docs.file} is not set
   */
  @Override
  public void setConfig(Config config) {
    super.setConfig(config);
    String fileName = config.get("docs.file", null);
    if (fileName == null) {
      throw new IllegalArgumentException("docs.file must be set");
    }
    file = Paths.get(fileName).toAbsolutePath();
    if (encoding == null) {
      encoding = IOUtils.UTF_8;
    }
  }

}