/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.solr;
import java.security.SecureRandom;
import java.util.Arrays;
import java.util.Collection;
import java.util.Random;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Notifications;
import com.typesafe.config.Config;
/**
* A command that assigns a record unique key that is the concatenation of the given
* <code>baseIdField</code> record field, followed by a running count of the record number within
* the current session. The count is reset to zero whenever a "startSession" notification is
* received.
* <p>
* For example, assume a CSV file containing multiple records but no unique ids, and the
* <code>baseIdField</code> field is the filesystem path of the file. Now this command can be used
* to assign the following record values to Solr's unique key field:
* <code>$path#0, $path#1, ... $path#N</code>.
* <p>
* The name of the unique key field is fetched from Solr's schema.xml file, as directed by the
* <code>solrLocator</code> configuration parameter.
*/
public final class GenerateSolrSequenceKeyBuilder implements CommandBuilder {
@Override
public Collection<String> getNames() {
return Arrays.asList(
"generateSolrSequenceKey",
"sanitizeUniqueSolrKey" // old name (retained for backwards compatibility)
);
}
@Override
public Command build(Config config, Command parent, Command child, MorphlineContext context) {
return new GenerateSolrSequenceKey(this, config, parent, child, context);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class GenerateSolrSequenceKey extends AbstractCommand {
private final boolean preserveExisting;
private final String baseIdFieldName;
private final String uniqueKeyName;
private long recordCounter = 0;
private final String idPrefix; // for load testing only; enables adding same document many times with a different unique key
private final Random randomIdPrefix; // for load testing only; enables adding same document many times with a different unique key
public GenerateSolrSequenceKey(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
super(builder, config, parent, child, context);
this.baseIdFieldName = getConfigs().getString(config, "baseIdField", Fields.BASE_ID);
this.preserveExisting = getConfigs().getBoolean(config, "preserveExisting", true);
Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
LOG.debug("solrLocator: {}", locator);
IndexSchema schema = locator.getIndexSchema();
SchemaField uniqueKey = schema.getUniqueKeyField();
uniqueKeyName = uniqueKey == null ? null : uniqueKey.getName();
String tmpIdPrefix = getConfigs().getString(config, "idPrefix", null); // for load testing only
Random tmpRandomIdPrefx = null;
if ("random".equals(tmpIdPrefix)) { // for load testing only
tmpRandomIdPrefx = new Random(new SecureRandom().nextLong());
tmpIdPrefix = null;
}
idPrefix = tmpIdPrefix;
randomIdPrefix = tmpRandomIdPrefx;
validateArguments();
}
@Override
protected boolean doProcess(Record doc) {
long num = recordCounter++;
// LOG.debug("record #{} id before sanitizing doc: {}", num, doc);
if (uniqueKeyName == null || (preserveExisting && doc.getFields().containsKey(uniqueKeyName))) {
; // we must preserve the existing id
} else {
Object baseId = doc.getFirstValue(baseIdFieldName);
if (baseId == null) {
throw new MorphlineRuntimeException("Record field " + baseIdFieldName
+ " must not be null as it is needed as a basis for a unique key for solr doc: " + doc);
}
doc.replaceValues(uniqueKeyName, baseId.toString() + "#" + num);
}
// for load testing only; enables adding same document many times with a different unique key
if (idPrefix != null) {
String id = doc.getFirstValue(uniqueKeyName).toString();
id = idPrefix + id;
doc.replaceValues(uniqueKeyName, id);
} else if (randomIdPrefix != null) {
String id = doc.getFirstValue(uniqueKeyName).toString();
id = String.valueOf(Math.abs(randomIdPrefix.nextInt())) + "#" + id;
doc.replaceValues(uniqueKeyName, id);
}
LOG.debug("record #{} unique key sanitized to this: {}", num, doc);
return super.doProcess(doc);
}
@Override
protected void doNotify(Record notification) {
if (Notifications.containsLifecycleEvent(notification, Notifications.LifecycleEvent.START_SESSION)) {
recordCounter = 0; // reset
}
super.doNotify(notification);
}
}
}