/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexwriter.solr;
import java.lang.invoke.MethodHandles;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.indexer.IndexWriter;
import org.apache.nutch.indexer.IndexerMapReduce;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchField;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.common.util.NamedList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.util.NutchConfiguration;
// Workaround: delete() URL-decodes keys so that URL-encoded URLs can be removed.
import java.net.URLDecoder;
/**
 * {@link IndexWriter} implementation that buffers document additions and
 * deletions and sends them in batches to every configured {@link SolrClient}.
 *
 * <p>
 * Additions are buffered in {@code inputDocs} and deletions in
 * {@code deleteIds}; both buffers are flushed by {@link #push()} once the
 * configured batch size ({@link SolrConstants#COMMIT_SIZE}) is reached, and
 * again on {@link #commit()} / {@link #close()}.
 */
public class SolrIndexWriter implements IndexWriter {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  private List<SolrClient> solrClients;
  private SolrMappingReader solrMapping;
  private ModifiableSolrParams params;

  private Configuration config;

  /** Documents waiting to be sent to Solr. */
  private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>();

  // not used by any method of this class
  private final List<SolrInputDocument> updateDocs = new ArrayList<SolrInputDocument>();

  /** Ids waiting to be deleted from Solr. */
  private final List<String> deleteIds = new ArrayList<String>();

  private int batchSize;
  private int totalAdds = 0;
  private int totalDeletes = 0;
  // not updated anywhere in this class
  private int totalUpdates = 0;
  private boolean delete = false;

  /**
   * Creates the Solr clients for the job and initialises the writer.
   *
   * @param job
   *          job configuration the clients and settings are read from
   * @param name
   *          not used by this implementation
   * @throws IOException
   *           if initialisation fails
   */
  public void open(JobConf job, String name) throws IOException {
    init(SolrUtils.getSolrClients(job), job);
  }

  /**
   * Initialises the writer with the given clients and reads batch size, field
   * mapping, delete flag and optional request parameters from the job.
   * Package protected for tests.
   *
   * @param solrClients
   *          clients every request is sent to
   * @param job
   *          job configuration
   * @throws IOException
   *           declared for callers; nothing in the visible code throws it
   */
  void init(List<SolrClient> solrClients, JobConf job) throws IOException {
    // Store the clients here so that callers that bypass open() (tests) do
    // not end up with a null client list.
    this.solrClients = solrClients;
    batchSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
    solrMapping = SolrMappingReader.getInstance(job);
    delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);

    // parse optional params given as "k1=v1&k2=v2"
    params = new ModifiableSolrParams();
    String paramString = job.get(IndexerMapReduce.INDEXER_PARAMS);
    if (paramString != null) {
      for (String pair : paramString.split("&")) {
        // limit 2: '=' characters after the first one belong to the value
        String[] kv = pair.split("=", 2);
        if (kv.length < 2 || kv[1].isEmpty()) {
          continue;
        }
        params.add(kv[0], kv[1]);
      }
    }
  }

  /**
   * Buffers the deletion of the document with the given key, if deletions are
   * enabled, and flushes the buffers once the batch size is reached.
   *
   * @param key
   *          document id; it is URL-decoded before being buffered
   * @throws IOException
   *           if the key cannot be decoded because the encoding is
   *           unsupported, or if flushing fails
   */
  public void delete(String key) throws IOException {
    try {
      key = URLDecoder.decode(key, "UTF8");
    } catch (UnsupportedEncodingException e) {
      LOG.error("Error decoding: " + key);
      throw new IOException("UnsupportedEncodingException for " + key, e);
    } catch (IllegalArgumentException e) {
      // best effort: keep the key as it was given
      LOG.warn("Could not decode: " + key
          + ", it probably wasn't encoded in the first place..");
    }

    // escape solr hash separator
    // NOTE(review): in a regex replacement string the backslash only escapes
    // the following character, so this call currently leaves the key
    // unchanged. Confirm whether '!' really has to be escaped before changing
    // it: ids are stored unescaped by write().
    key = key.replaceAll("!", "\\!");

    if (delete) {
      deleteIds.add(key);
      totalDeletes++;
    }

    if (deleteIds.size() >= batchSize) {
      push();
    }
  }

  /**
   * Immediately sends a delete-by-query request to every Solr client.
   *
   * @param query
   *          Solr query selecting the documents to delete
   * @throws IOException
   *           if a client fails to execute the request
   */
  public void deleteByQuery(String query) throws IOException {
    try {
      LOG.info("SolrWriter: deleting " + query);
      for (SolrClient solrClient : solrClients) {
        solrClient.deleteByQuery(query);
      }
    } catch (final SolrServerException e) {
      // report the query that failed, not the unrelated delete-by-id buffer
      LOG.error("Error deleting: " + query);
      throw makeIOException(e);
    }
  }

  /**
   * Handles an update exactly like an addition.
   *
   * @param doc
   *          document to index
   * @throws IOException
   *           if flushing fails
   */
  @Override
  public void update(NutchDocument doc) throws IOException {
    write(doc);
  }

  /**
   * Converts the document into a {@link SolrInputDocument}, buffers it and
   * flushes the buffers once the batch size is reached.
   *
   * @param doc
   *          document to index
   * @throws IOException
   *           if flushing fails
   */
  public void write(NutchDocument doc) throws IOException {
    final SolrInputDocument inputDoc = new SolrInputDocument();

    for (final Entry<String, NutchField> e : doc) {
      final String key = e.getKey();
      final NutchField field = e.getValue();

      for (final Object val : field.getValues()) {
        // normalise the string representation for a Date
        Object val2 = val;
        if (val instanceof Date) {
          val2 = DateUtil.getThreadLocalDateFormat().format(val);
        }

        // only String values can be stripped; anything else is passed on
        // as is instead of failing with a ClassCastException
        if (val instanceof String
            && (key.equals("content") || key.equals("title"))) {
          val2 = SolrUtils.stripNonCharCodepoints((String) val);
        }

        inputDoc.addField(solrMapping.mapKey(key), val2, field.getWeight());

        // the copy field receives the original, unnormalised value;
        // compare by content, not by reference
        String sCopy = solrMapping.mapCopyKey(key);
        if (!key.equals(sCopy)) {
          inputDoc.addField(sCopy, val);
        }
      }
    }

    inputDoc.setDocumentBoost(doc.getWeight());
    inputDocs.add(inputDoc);
    totalAdds++;

    // count pending deletions as well so the combined buffer stays bounded
    if (inputDocs.size() + deleteIds.size() >= batchSize) {
      push();
    }
  }

  /**
   * Commits pending changes and closes every Solr client. The clients are
   * closed even when the commit fails.
   *
   * @throws IOException
   *           if committing or closing fails
   */
  public void close() throws IOException {
    try {
      commit();
    } finally {
      for (SolrClient solrClient : solrClients) {
        solrClient.close();
      }
    }
  }

  /**
   * Flushes the buffers and asks every Solr client to commit. A
   * {@link SolrServerException} during the commit is logged and not
   * propagated.
   *
   * @throws IOException
   *           if flushing fails or a client reports an I/O error
   */
  @Override
  public void commit() throws IOException {
    push();
    try {
      for (SolrClient solrClient : solrClients) {
        solrClient.commit();
      }
    } catch (final SolrServerException e) {
      // FIXME: the failure is only logged; callers are not notified
      LOG.error("Failed to commit solr connection: " + e.getMessage(), e);
    }
  }

  /**
   * Sends the buffered additions and deletions to every Solr client and
   * clears the buffers that were sent successfully.
   *
   * @throws IOException
   *           if a client fails to execute a request
   */
  public void push() throws IOException {
    if (!inputDocs.isEmpty()) {
      try {
        LOG.info("Indexing {}/{} documents", inputDocs.size(), totalAdds);

        UpdateRequest req = new UpdateRequest();
        req.add(inputDocs);
        // NOTE(review): setParams() is called after setAction(); verify
        // against the SolrJ version in use whether the action survives.
        req.setAction(AbstractUpdateRequest.ACTION.OPTIMIZE, false, false);
        req.setParams(params);
        for (SolrClient solrClient : solrClients) {
          solrClient.request(req);
        }
      } catch (final SolrServerException e) {
        throw makeIOException(e);
      }
      inputDocs.clear();
    }

    if (!deleteIds.isEmpty()) {
      try {
        LOG.info("SolrIndexer: deleting {}/{} documents", deleteIds.size(),
            totalDeletes);
        for (SolrClient solrClient : solrClients) {
          solrClient.deleteById(deleteIds);
        }
      } catch (final SolrServerException e) {
        LOG.error("Error deleting: " + deleteIds);
        throw makeIOException(e);
      }
      deleteIds.clear();
    }
  }

  /**
   * Wraps a {@link SolrServerException} into an {@link IOException}.
   *
   * @param e
   *          exception to wrap
   * @return an {@link IOException} whose cause is {@code e}
   */
  public static IOException makeIOException(SolrServerException e) {
    return new IOException(e);
  }

  @Override
  public Configuration getConf() {
    return config;
  }

  /**
   * Stores the configuration and verifies that either the Solr server URL or
   * the Zookeeper hosts are configured.
   *
   * @param conf
   *          configuration to use
   * @throws RuntimeException
   *           if neither {@link SolrConstants#SERVER_URL} nor
   *           {@link SolrConstants#ZOOKEEPER_HOSTS} is set
   */
  @Override
  public void setConf(Configuration conf) {
    config = conf;
    String serverURL = conf.get(SolrConstants.SERVER_URL);
    String zkHosts = conf.get(SolrConstants.ZOOKEEPER_HOSTS);
    if (serverURL == null && zkHosts == null) {
      String message = "Missing SOLR URL and Zookeeper URL. Either on should be set via -D "
          + SolrConstants.SERVER_URL + " or -D " + SolrConstants.ZOOKEEPER_HOSTS;
      message += "\n" + describe();
      LOG.error(message);
      throw new RuntimeException(message);
    }
  }

  /**
   * Returns a human-readable list of the configuration properties this
   * writer understands.
   *
   * @return one line per property, preceded by a header line
   */
  public String describe() {
    StringBuilder sb = new StringBuilder("SOLRIndexWriter\n");
    sb.append("\t").append(SolrConstants.SERVER_URL)
        .append(" : URL of the SOLR instance\n");
    sb.append("\t").append(SolrConstants.ZOOKEEPER_HOSTS)
        .append(" : URL of the Zookeeper quorum\n");
    sb.append("\t").append(SolrConstants.COMMIT_SIZE)
        .append(" : buffer size when sending to SOLR (default 1000)\n");
    sb.append("\t")
        .append(SolrConstants.MAPPING_FILE)
        .append(
            " : name of the mapping file for fields (default solrindex-mapping.xml)\n");
    sb.append("\t").append(SolrConstants.USE_AUTH)
        .append(" : use authentication (default false)\n");
    sb.append("\t").append(SolrConstants.USERNAME)
        .append(" : username for authentication\n");
    sb.append("\t").append(SolrConstants.PASSWORD)
        .append(" : password for authentication\n");
    return sb.toString();
  }
}