GenerateSolrSequenceKeyBuilder.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.solr;

import java.security.SecureRandom;
import java.util.Arrays;
import java.util.Collection;
import java.util.Random;

import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Notifications;

import com.typesafe.config.Config;

/**
 * A command that assigns a record unique key that is the concatenation of the given
 * <code>baseIdField</code> record field, followed by a running count of the record number within
 * the current session. The count is reset to zero whenever a "startSession" notification is
 * received.
 * <p>
 * For example, assume a CSV file containing multiple records but no unique ids, and the
 * <code>baseIdField</code> field is the filesystem path of the file. Now this command can be used
 * to assign the following record values to Solr's unique key field:
 * <code>$path#0, $path#1, ... $path#N</code>.
 * <p>
 * The name of the unique key field is fetched from Solr's schema.xml file, as directed by the
 * <code>solrLocator</code> configuration parameter.
 */
public final class GenerateSolrSequenceKeyBuilder implements CommandBuilder {

  @Override
  public Collection<String> getNames() {
    return Arrays.asList(
        "generateSolrSequenceKey", 
        "sanitizeUniqueSolrKey" // old name (retained for backwards compatibility)
    );
  }

  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new GenerateSolrSequenceKey(this, config, parent, child, context);
  }
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class GenerateSolrSequenceKey extends AbstractCommand {
    
    private final boolean preserveExisting;
    private final String baseIdFieldName;
    private final String uniqueKeyName;
    private long recordCounter = 0;
  
    private final String idPrefix; // for load testing only; enables adding same document many times with a different unique key
    private final Random randomIdPrefix; // for load testing only; enables adding same document many times with a different unique key

    public GenerateSolrSequenceKey(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);
      this.baseIdFieldName = getConfigs().getString(config, "baseIdField", Fields.BASE_ID);
      this.preserveExisting = getConfigs().getBoolean(config, "preserveExisting", true);      
      
      Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
      SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
      LOG.debug("solrLocator: {}", locator);
      IndexSchema schema = locator.getIndexSchema();
      SchemaField uniqueKey = schema.getUniqueKeyField();
      uniqueKeyName = uniqueKey == null ? null : uniqueKey.getName();
      
      String tmpIdPrefix = getConfigs().getString(config, "idPrefix", null);  // for load testing only
      Random tmpRandomIdPrefx = null;
      if ("random".equals(tmpIdPrefix)) { // for load testing only
        tmpRandomIdPrefx = new Random(new SecureRandom().nextLong());    
        tmpIdPrefix = null;
      }
      idPrefix = tmpIdPrefix;
      randomIdPrefix = tmpRandomIdPrefx;
      validateArguments();
    }

    @Override
    protected boolean doProcess(Record doc) {      
      long num = recordCounter++;
      // LOG.debug("record #{} id before sanitizing doc: {}", num, doc);
      if (uniqueKeyName == null || (preserveExisting && doc.getFields().containsKey(uniqueKeyName))) {
        ; // we must preserve the existing id
      } else {
        Object baseId = doc.getFirstValue(baseIdFieldName);
        if (baseId == null) {
          throw new MorphlineRuntimeException("Record field " + baseIdFieldName
              + " must not be null as it is needed as a basis for a unique key for solr doc: " + doc);
        }
        doc.replaceValues(uniqueKeyName, baseId.toString() + "#" + num);          
      }
      
      // for load testing only; enables adding same document many times with a different unique key
      if (idPrefix != null) { 
        String id = doc.getFirstValue(uniqueKeyName).toString();
        id = idPrefix + id;
        doc.replaceValues(uniqueKeyName, id);
      } else if (randomIdPrefix != null) {
        String id = doc.getFirstValue(uniqueKeyName).toString();
        id = String.valueOf(Math.abs(randomIdPrefix.nextInt())) + "#" + id;
        doc.replaceValues(uniqueKeyName, id);
      }

      LOG.debug("record #{} unique key sanitized to this: {}", num, doc);
      
      return super.doProcess(doc);
    }
    
    @Override
    protected void doNotify(Record notification) {
      if (Notifications.containsLifecycleEvent(notification, Notifications.LifecycleEvent.START_SESSION)) {
        recordCounter = 0; // reset
      }
      super.doNotify(notification);
    }

  }
}