LineDocSource.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.feeds;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Constructor;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Properties;

import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.IOUtils;

/**
 * A {@link ContentSource} reading one line at a time as a
 * {@link org.apache.lucene.document.Document} from a single file. This saves IO
 * cost (over DirContentSource) of recursing through a directory and opening a
 * new file for every document.<br>
 * The expected format of each line is (arguments are separated by <TAB>):
 * <i>title, date, body</i>. If a line is read in a different format, a
 * {@link RuntimeException} will be thrown. In general, you should use this
 * content source for files that were created with {@link WriteLineDocTask}.<br>
 * <br>
 * Config properties:
 * <ul>
 * <li>docs.file=<path to the file>
 * <li>content.source.encoding - default to UTF-8.
 * <li>line.parser - default to {@link HeaderLineParser} if a header line exists which differs 
 *     from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
 * </ul>
 */
public class LineDocSource extends ContentSource {

  /** Reader of a single input line into {@link DocData}. */
  public static abstract class LineParser {
    protected final String[] header;
    /** Construct with the header 
     * @param header header line found in the input file, or null if none
     */
    public LineParser(String[] header) {
      this.header = header; 
    }
    /** parse an input line and fill doc data appropriately */
    public abstract void parseLine(DocData docData, String line);
  }
  
  /** 
   * {@link LineParser} which ignores the header passed to its constructor
   * and assumes simply that field names and their order are the same 
   * as in {@link WriteLineDocTask#DEFAULT_FIELDS} 
   */
  public static class SimpleLineParser extends LineParser {
    public SimpleLineParser(String[] header) {
      super(header);
    }
    @Override
    public void parseLine(DocData docData, String line) {
      int k1 = 0;
      int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
      if (k2<0) {
        throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
      }
      docData.setTitle(line.substring(k1,k2));
      k1 = k2+1;
      k2 = line.indexOf(WriteLineDocTask.SEP, k1);
      if (k2<0) {
        throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
      }
      docData.setDate(line.substring(k1,k2));
      k1 = k2+1;
      k2 = line.indexOf(WriteLineDocTask.SEP, k1);
      if (k2>=0) {
        throw new RuntimeException("line: [" + line + "] is in an invalid format (too many separators)!");
      }
      // last one
      docData.setBody(line.substring(k1));
    }
  }
  
  /** 
   * {@link LineParser} which sets field names and order by 
   * the header - any header - of the lines file.
   * It is less efficient than {@link SimpleLineParser} but more powerful.
   */
  public static class HeaderLineParser extends LineParser {
    private enum FieldName { NAME , TITLE , DATE , BODY, PROP } 
    private final FieldName[] posToF;
    public HeaderLineParser(String[] header) {
      super(header);
      posToF = new FieldName[header.length];
      for (int i=0; i<header.length; i++) {
        String f = header[i];
        if (DocMaker.NAME_FIELD.equals(f)) {
          posToF[i] = FieldName.NAME;
        } else if (DocMaker.TITLE_FIELD.equals(f)) {
          posToF[i] = FieldName.TITLE;
        } else if (DocMaker.DATE_FIELD.equals(f)) {
          posToF[i] = FieldName.DATE;
        } else if (DocMaker.BODY_FIELD.equals(f)) {
          posToF[i] = FieldName.BODY;
        } else {
          posToF[i] = FieldName.PROP;
        }
      }
    }
    
    @Override
    public void parseLine(DocData docData, String line) {
      int n = 0;
      int k1 = 0;
      int k2;
      while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
        if (n>=header.length) {
          throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
        }
        setDocDataField(docData, n, line.substring(k1,k2));
        ++n;
        k1 = k2 + 1;
      }
      if (n!=header.length-1) {
        throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
      }
      // last one
      setDocDataField(docData, n, line.substring(k1)); 
    }

    private void setDocDataField(DocData docData, int position, String text) {
      switch(posToF[position]) {
        case NAME: 
          docData.setName(text);
          break;
        case TITLE: 
          docData.setTitle(text);
          break;
        case DATE: 
          docData.setDate(text);
          break;
        case BODY: 
          docData.setBody(text);
          break;
        case PROP:
          Properties p = docData.getProps();
          if (p==null) {
            p = new Properties();
            docData.setProps(p);
          }
          p.setProperty(header[position], text);
          break;
      }
    }
  }
  
  private Path file;
  private BufferedReader reader;
  private int readCount;

  private LineParser docDataLineReader = null;
  private boolean skipHeaderLine = false;

  private synchronized void openFile() {
    try {
      if (reader != null) {
        reader.close();
      }
      InputStream is = StreamUtils.inputStream(file);
      reader = new BufferedReader(new InputStreamReader(is, encoding), StreamUtils.BUFFER_SIZE);
      if (skipHeaderLine) {
        reader.readLine(); // skip one line - the header line - already handled that info
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public void close() throws IOException {
    if (reader != null) {
      reader.close();
      reader = null;
    }
  }
  
  @Override
  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    final String line;
    final int myID;
    
    synchronized(this) {
      line = reader.readLine();
      if (line == null) {
        if (!forever) {
          throw new NoMoreDataException();
        }
        // Reset the file
        openFile();
        return getNextDocData(docData);
      }
      if (docDataLineReader == null) { // first line ever, one time initialization,
        docDataLineReader = createDocDataLineReader(line);
        if (skipHeaderLine) {
          return getNextDocData(docData);
        }
      }
      // increment IDS only once...
      myID = readCount++; 
    }
    
    // The date String was written in the format of DateTools.dateToString.
    docData.clear();
    docData.setID(myID);
    docDataLineReader.parseLine(docData, line);
    return docData;
  }

  private LineParser createDocDataLineReader(String line) {
    String[] header;
    String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;

    if (line.startsWith(headIndicator)) {
      header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
      skipHeaderLine = true; // mark to skip the header line when input file is reopened
    } else {
      header = WriteLineDocTask.DEFAULT_FIELDS;
    }
    
    // if a specific DocDataLineReader was configured, must respect it
    String docDataLineReaderClassName = getConfig().get("line.parser", null);
    if (docDataLineReaderClassName!=null) {
      try {
        final Class<? extends LineParser> clazz = 
          Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
        Constructor<? extends LineParser> cnstr = clazz.getConstructor(String[].class);
        return cnstr.newInstance((Object)header);
      } catch (Exception e) {
        throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
      }
    }

    // if this the simple case,   
    if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
      return new SimpleLineParser(header);
    }
    return new HeaderLineParser(header);
  }

  @Override
  public void resetInputs() throws IOException {
    super.resetInputs();
    openFile();
  }
  
  @Override
  public void setConfig(Config config) {
    super.setConfig(config);
    String fileName = config.get("docs.file", null);
    if (fileName == null) {
      throw new IllegalArgumentException("docs.file must be set");
    }
    file = Paths.get(fileName).toAbsolutePath();
    if (encoding == null) {
      encoding = IOUtils.UTF_8;
    }
  }

}