CSVRequestHandler.java example

Explorer
solrcene-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler;

import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.update.*;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.commons.csv.CSVStrategy;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.io.IOUtils;

import java.util.regex.Pattern;
import java.util.List;
import java.io.*;

/**
 * @version $Id: CSVRequestHandler.java 979807 2010-07-27 19:06:39Z yonik $
 */

/**
 * Request handler that accepts content streams of CSV formatted rows and
 * hands each stream to a {@link SingleThreadedCSVLoader}, which adds one
 * document per row through the supplied update processor.
 */
public class CSVRequestHandler extends ContentStreamHandlerBase {

  /** Human readable summary returned by {@link #getDescription()}. */
  private static final String DESCRIPTION =
      "Add/Update multiple documents with CSV formatted rows";

  /** Revision keyword string returned by {@link #getVersion()}. */
  private static final String VERSION = "$Revision: 979807 $";

  /** Id keyword string returned by {@link #getSourceId()}. */
  private static final String SOURCE_ID =
      "$Id: CSVRequestHandler.java 979807 2010-07-27 19:06:39Z yonik $";

  /** URL keyword string returned by {@link #getSource()}. */
  private static final String SOURCE =
      "$URL: https://svn.apache.org/repos/asf/lucene/dev/trunk/solr/src/java/org/apache/solr/handler/CSVRequestHandler.java $";

  /**
   * Creates the loader that will parse the CSV content of a request.
   *
   * @param req       the request whose parameters and schema configure the loader
   * @param processor the update processor that receives each parsed document
   * @return a new {@link SingleThreadedCSVLoader} bound to the request and processor
   */
  protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
    ContentStreamLoader loader = new SingleThreadedCSVLoader(req, processor);
    return loader;
  }

  //////////////////////// SolrInfoMBeans methods //////////////////////

  /** @return a short description of what this handler does */
  @Override
  public String getDescription() {
    return DESCRIPTION;
  }

  /** @return the source control revision keyword for this class */
  @Override
  public String getVersion() {
    return VERSION;
  }

  /** @return the source control id keyword for this class */
  @Override
  public String getSourceId() {
    return SOURCE_ID;
  }

  /** @return the source control URL keyword for this class */
  @Override
  public String getSource() {
    return SOURCE;
  }
}


/**
 * Base class for loaders that read CSV rows from a {@link ContentStream},
 * build one {@link SolrInputDocument} per row and pass it to an
 * {@link UpdateRequestProcessor}.
 *
 * <p>The constructor reads the request parameters named by the constants
 * below to configure the CSV parsing strategy, the field names and the
 * number of leading lines to skip.  Per-field behaviour (keeping empty
 * values, value mapping, trimming and splitting) is implemented by a chain
 * of {@code FieldAdder} objects built in {@link #prepareFields()}.
 *
 * <p>Subclasses decide how each parsed row is dispatched by implementing
 * {@link #addDoc(int, String[])}.
 */
abstract class CSVLoader extends ContentStreamLoader {
  // Names of the request parameters understood by this loader.
  public static final String SEPARATOR="separator";
  public static final String FIELDNAMES="fieldnames";
  public static final String HEADER="header";
  public static final String SKIP="skip";
  public static final String SKIPLINES="skipLines";
  public static final String MAP="map";
  public static final String TRIM="trim";
  public static final String EMPTY="keepEmpty";
  public static final String SPLIT="split";
  public static final String ENCAPSULATOR="encapsulator";
  public static final String ESCAPE="escape";
  public static final String OVERWRITE="overwrite";

  /** splits a map rule of the form "from:to" */
  private static final Pattern colonSplit = Pattern.compile(":");
  /** splits the comma separated list of field names */
  private static final Pattern commaSplit = Pattern.compile(",");

  final IndexSchema schema;
  final SolrParams params;
  final CSVStrategy strategy;
  final UpdateRequestProcessor processor;

  /** column names, either from the "fieldnames" parameter or from the header line of the input */
  String[] fieldnames;
  /** schema field for each column; a null entry means the column is skipped */
  SchemaField[] fields;
  /** adder chain for each column; null for skipped columns */
  CSVLoader.FieldAdder[] adders;

  int skipLines;    // number of lines to skip at start of file

  /** command object configured once from the "overwrite" parameter */
  final AddUpdateCommand templateAdd;

  /** prefix used in input error messages; replaced with the stream source info in load() */
  String errHeader="CSVLoader:";


  /** Add a field to a document unless it's zero length.
   * The FieldAdder hierarchy handles all the complexity of
   * further transforming or splitting field values to keep the
   * main logic loop clean.  All implementations of add() must be
   * MT-safe!
   */
  private class FieldAdder {
    void add(SolrInputDocument doc, int line, int column, String val) {
      if (val.length() > 0) {
        doc.addField(fields[column].getName(),val,1.0f);
      }
    }
  }

  /** add zero length fields */
  private class FieldAdderEmpty extends CSVLoader.FieldAdder {
    @Override
    void add(SolrInputDocument doc, int line, int column, String val) {
      doc.addField(fields[column].getName(),val,1.0f);
    }
  }

  /** trim fields */
  private class FieldTrimmer extends CSVLoader.FieldAdder {
    private final CSVLoader.FieldAdder base;
    FieldTrimmer(CSVLoader.FieldAdder base) { this.base=base; }
    @Override
    void add(SolrInputDocument doc, int line, int column, String val) {
      base.add(doc, line, column, val.trim());
    }
  }

  /** map a single value.
   * for just a couple of mappings, this is probably faster than
   * using a HashMap.
   */
  private class FieldMapperSingle extends CSVLoader.FieldAdder {
    private final String from;
    private final String to;
    private final CSVLoader.FieldAdder base;
    FieldMapperSingle(String from, String to, CSVLoader.FieldAdder base) {
      this.from=from;
      this.to=to;
      this.base=base;
    }
    @Override
    void add(SolrInputDocument doc, int line, int column, String val) {
      if (from.equals(val)) val=to;
      base.add(doc,line,column,val);
    }
  }

  /** Split a single value into multiple values based on
   * a CSVStrategy.
   */
  private class FieldSplitter extends CSVLoader.FieldAdder {
    private final CSVStrategy strategy;
    private final CSVLoader.FieldAdder base;
    FieldSplitter(CSVStrategy strategy, CSVLoader.FieldAdder base) {
      this.strategy = strategy;
      this.base = base;
    }

    @Override
    void add(SolrInputDocument doc, int line, int column, String val) {
      CSVParser parser = new CSVParser(new StringReader(val), strategy);
      try {
        String[] vals = parser.getLine();
        if (vals!=null) {
          for (String v: vals) base.add(doc,line,column,v);
        } else {
          // nothing could be parsed: pass the raw value on unchanged
          base.add(doc,line,column,val);
        }
      } catch (IOException e) {
        throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,e);
      }
    }
  }


  /**
   * Configures the loader from the request parameters.
   *
   * @param req       request supplying the parameters and the schema
   * @param processor processor that receives every document built by this loader
   * @throws SolrException with BAD_REQUEST if the separator, encapsulator or
   *         escape parameter is not exactly one character long, or if no
   *         field names are supplied and header=false
   */
  CSVLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
    this.processor = processor;
    this.params = req.getParams();
    schema = req.getSchema();

    // overwrite=true (the default) gives allowDups=false with both overwrite
    // flags set; overwrite=false inverts all three.
    boolean overwrite = params.getBool(OVERWRITE,true);
    templateAdd = new AddUpdateCommand();
    templateAdd.allowDups = !overwrite;
    templateAdd.overwriteCommitted = overwrite;
    templateAdd.overwritePending = overwrite;

    strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, CSVStrategy.ESCAPE_DISABLED, false, false, false, true);
    String sep = params.get(SEPARATOR);
    if (sep!=null) {
      requireSingleChar("separator", sep);
      strategy.setDelimiter(sep.charAt(0));
    }

    String encapsulator = params.get(ENCAPSULATOR);
    if (encapsulator!=null) {
      requireSingleChar("encapsulator", encapsulator);
    }

    String escape = params.get(ESCAPE);
    if (escape!=null) {
      requireSingleChar("escape", escape);
    }

    // if only encapsulator or escape is set, disable the other escaping mechanism
    if (encapsulator == null && escape != null) {
      strategy.setEncapsulator( CSVStrategy.ENCAPSULATOR_DISABLED);
      strategy.setEscape(escape.charAt(0));
    } else {
      if (encapsulator != null) {
        strategy.setEncapsulator(encapsulator.charAt(0));
      }
      if (escape != null) {
        char ch = escape.charAt(0);
        strategy.setEscape(ch);
        if (ch == '\\') {
          // If the escape is the standard backslash, then also enable
          // unicode escapes (it's harmless since 'u' would not otherwise
          // be escaped.
          strategy.setUnicodeEscapeInterpretation(true);
        }
      }
    }

    String fn = params.get(FIELDNAMES);
    fieldnames = fn != null ? commaSplit.split(fn,-1) : null;

    Boolean hasHeader = params.getBool(HEADER);

    skipLines = params.getInt(SKIPLINES,0);

    if (fieldnames==null) {
      if (null == hasHeader) {
        // assume the file has the headers if they aren't supplied in the args
        hasHeader=true;
      } else if (!hasHeader) {
        throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"CSVLoader: must specify fieldnames=<fields>* or header=true");
      }
    } else {
      // if the fieldnames were supplied and the file has a header, we need to
      // skip over that header.
      if (hasHeader!=null && hasHeader) skipLines++;

      prepareFields();
    }
  }

  /** reject a CSV control parameter whose value is not exactly one character long */
  private static void requireSingleChar(String what, String value) {
    if (value.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid "+what+":'"+value+"'");
  }

  /** create the FieldAdders that control how each field  is indexed */
  void prepareFields() {
    // Possible future optimization: for really rapid incremental indexing
    // from a POST, one could cache all of this setup info based on the params.
    // The link from FieldAdder to this would need to be severed for that to happen.

    fields = new SchemaField[fieldnames.length];
    adders = new CSVLoader.FieldAdder[fieldnames.length];
    String skipStr = params.get(SKIP);
    List<String> skipFields = skipStr==null ? null : StrUtils.splitSmart(skipStr,',');

    // the two terminal adders are shared by every column
    CSVLoader.FieldAdder adder = new CSVLoader.FieldAdder();
    CSVLoader.FieldAdder adderKeepEmpty = new CSVLoader.FieldAdderEmpty();

    for (int i=0; i<fields.length; i++) {
      String fname = fieldnames[i];
      // to skip a field, leave the entries in fields and adders null
      if (fname.length()==0 || (skipFields!=null && skipFields.contains(fname))) continue;

      fields[i] = schema.getField(fname);
      boolean keepEmpty = params.getFieldBool(fname,EMPTY,false);
      adders[i] = keepEmpty ? adderKeepEmpty : adder;

      // Order that operations are applied: split -> trim -> map -> add
      // so create in reverse order.
      // Creation of FieldAdders could be optimized and shared among fields

      String[] fmap = params.getFieldParams(fname,MAP);
      if (fmap!=null) {
        for (String mapRule : fmap) {
          String[] mapArgs = colonSplit.split(mapRule,-1);
          if (mapArgs.length!=2)
            throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Map rules must be of the form 'from:to' ,got '"+mapRule+"'");
          adders[i] = new CSVLoader.FieldMapperSingle(mapArgs[0], mapArgs[1], adders[i]);
        }
      }

      if (params.getFieldBool(fname,TRIM,false)) {
        adders[i] = new CSVLoader.FieldTrimmer(adders[i]);
      }

      if (params.getFieldBool(fname,SPLIT,false)) {
        // Each per-field setting falls back to its own default when the
        // parameter is absent or empty.
        String sepStr = params.getFieldParam(fname,SEPARATOR);
        char fsep = sepStr==null || sepStr.length()==0 ? ',' : sepStr.charAt(0);
        String encStr = params.getFieldParam(fname,ENCAPSULATOR);
        // NOTE(review): (char)-2 is presumably the same value as
        // CSVStrategy.ENCAPSULATOR_DISABLED -- confirm before replacing it.
        char fenc = encStr==null || encStr.length()==0 ? (char)-2 : encStr.charAt(0);
        String escStr = params.getFieldParam(fname,ESCAPE);
        // The escape default must depend on escStr itself, not on the
        // encapsulator: encStr may be null here while escStr is set.
        char fesc = escStr==null || escStr.length()==0 ? CSVStrategy.ESCAPE_DISABLED : escStr.charAt(0);

        CSVStrategy fstrat = new CSVStrategy(fsep,fenc,CSVStrategy.COMMENTS_DISABLED,fesc, false, false, false, false);
        adders[i] = new CSVLoader.FieldSplitter(fstrat, adders[i]);
      }
    }
  }

  /**
   * Reports a malformed input line.  Always throws.
   *
   * @param msg    description of the problem
   * @param line   the values parsed from the offending line
   * @param lineno line number reported in the message
   * @throws SolrException always, with BAD_REQUEST and a message listing the values
   */
  private void input_err(String msg, String[] line, int lineno) {
    StringBuilder sb = new StringBuilder();
    sb.append(errHeader).append(", line=").append(lineno).append(',').append(msg).append("\n\tvalues={");
    for (String val: line) { sb.append('\'').append(val).append("',"); }
    sb.append('}');
    throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,sb.toString());
  }

  /**
   * load the CSV input
   *
   * <p>Skips {@code skipLines} raw lines, reads the field names from the
   * first parsed line if none were configured, then calls
   * {@link #addDoc(int, String[])} for every remaining line.  The reader
   * obtained from the stream is always closed.
   *
   * @throws SolrException with BAD_REQUEST if field names were expected but
   *         the input is empty, or if a line does not have exactly one value
   *         per field
   * @throws IOException if reading the stream fails
   */
  public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream) throws IOException {
    errHeader = "CSVLoader: input=" + stream.getSourceInfo();
    Reader reader = null;
    try {
      reader = stream.getReader();
      if (skipLines>0) {
        // readLine() is needed to skip raw lines, so make sure we have a BufferedReader
        if (!(reader instanceof BufferedReader)) {
          reader = new BufferedReader(reader);
        }
        BufferedReader r = (BufferedReader)reader;
        for (int i=0; i<skipLines; i++) {
          r.readLine();
        }
      }

      CSVParser parser = new CSVParser(reader, strategy);

      // parse the fieldnames from the header of the file
      if (fieldnames==null) {
        fieldnames = parser.getLine();
        if (fieldnames==null) {
          throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Expected fieldnames in CSV input");
        }
        prepareFields();
      }

      // read the rest of the CSV file
      for(;;) {
        int line = parser.getLineNumber();  // for error reporting in MT mode
        String[] vals = parser.getLine();
        if (vals==null) break;

        if (vals.length != fields.length) {
          input_err("expected "+fields.length+" values but got "+vals.length, vals, line);
        }

        addDoc(line,vals);
      }
    } finally{
      // closeQuietly accepts a null reader, so no explicit null check is needed
      IOUtils.closeQuietly(reader);
    }
  }

  /** called for each line of values (document) */
  abstract void addDoc(int line, String[] vals) throws IOException;

  /**
   * this must be MT safe... may be called concurrently from multiple threads.
   *
   * <p>Fills {@code doc} from {@code vals} using the per-column adders,
   * stores it in {@code template.solrDoc} and hands the template to the
   * update processor.
   *
   * @param line     line number of the row, used only for error reporting
   * @param vals     the values of the row, one per column
   * @param doc      the document to populate
   * @param template the add command that carries the document to the processor
   */
  void doAdd(int line, String[] vals, SolrInputDocument doc, AddUpdateCommand template) throws IOException {
    // the line number is passed simply for error reporting in MT mode.
    // first, create the lucene document
    for (int i=0; i<vals.length; i++) {
      if (fields[i]==null) continue;  // ignore this field
      String val = vals[i];
      adders[i].add(doc, line, i, val);
    }

    template.solrDoc = doc;
    processor.processAdd(template);
  }

}


class SingleThreadedCSVLoader extends CSVLoader {
  SingleThreadedCSVLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
    super(req, processor);
  }

  void addDoc(int line, String[] vals) throws IOException {
    templateAdd.indexedId = null;
    SolrInputDocument doc = new SolrInputDocument();
    doAdd(line, vals, doc, templateAdd);
  }
}