/**
* Copyright 2012-2013 The MITRE Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
*
* **************************************************************************
* NOTICE This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*
*/
package org.opensextant.extraction;
import java.util.Map;
import org.apache.solr.client.solrj.StreamingResponseCallback;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.SolrParams;
import org.opensextant.ConfigException;
import org.opensextant.util.SolrProxy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
*
* Connects to a Solr server via HTTP and tags place names in a document. The
* <code>SOLR_HOME</code> environment variable must be set to the location of
* the Solr server.
* <p>
* This class is not thread-safe. It could be made to be with little effort.
*
* @author David Smiley - dsmiley@mitre.org
* @author Marc Ubaldino - ubaldino@mitre.org
*/
public abstract class SolrMatcherSupport {

    /** Logger named after the concrete subclass (resolved via {@code getClass()}). */
    protected Logger log = LoggerFactory.getLogger(getClass());

    /** Path of the Solr request handler used for tagging requests; defaults to {@code /tag}. */
    protected String requestHandler = "/tag";
    // protected String coreName = null;

    /** Solr access wrapper; stays {@code null} until {@link #initialize()} has been called. */
    protected SolrProxy solr = null;

    // Timing metrics, meant to be refreshed after each tagging call.
    // NOTE(review): nothing in this class assigns them; presumably subclasses
    // do so around their tagging calls -- confirm against implementations.
    protected int tagNamesTime = 0;
    protected int getNamesTime = 0;
    protected int totalTime = 0;

    /**
     * Use this if you intend to set a non-default tagger path. E.g.,
     * /tag1
     * /tag-lang1
     * etc.
     *
     * @param nonDefault path of tagger.
     */
    public void setTaggerHandler(String nonDefault) {
        requestHandler = nonDefault;
    }

    /**
     * Close solr resources. Does nothing if {@link #initialize()} was never
     * called (i.e. no Solr proxy exists yet).
     */
    public void shutdown() {
        if (solr != null) {
            solr.close();
        }
    }

    /**
     * Be explicit about the solr core to use for tagging.
     *
     * @return the core name
     */
    public abstract String getCoreName();

    /**
     * Return the Solr Parameters for the tagger op.
     *
     * @return SolrParams
     */
    public abstract SolrParams getMatcherParameters();

    /**
     * Caller must implement their domain objects, POJOs... this callback
     * handler only hashes them. Returning {@code null} means the record is
     * not stored in the reference data map.
     *
     * @param doc record to convert to Place record
     * @return object representing a Place, or null to skip the record
     */
    public abstract Object createTag(SolrDocument doc);

    /**
     * Initialize: creates the {@link SolrProxy} for the core named by
     * {@link #getCoreName()}. This capability is not supporting
     * taggers/matchers using HTTP server. For now it is intended to be an
     * in-memory, local embedded solr server.
     *
     * @throws ConfigException if solr server cannot be established from local index or from http server
     */
    public void initialize() throws ConfigException {
        solr = new SolrProxy(getCoreName());
    }

    /**
     * Ephemeral metric for the current tagText() call. Caller must get these
     * numbers immediately after call.
     *
     * @return time to tag
     */
    public int getTaggingNamesTime() {
        return tagNamesTime;
    }

    /**
     * @return time to get reference records.
     */
    public int getRetrievingNamesTime() {
        return getNamesTime;
    }

    /**
     * @return total time recorded for the most recent tagging call.
     */
    public int getTotalTime() {
        return totalTime;
    }

    /**
     * Solr call: tag input buffer, returning all candidate reference data that
     * matched during tagging.
     * <p>
     * Every Solr document delivered for the request is converted with
     * {@link #createTag(SolrDocument)} and, when the result is non-null,
     * stored in <code>refDataMap</code> under the document's integer
     * <code>id</code> field. Any failure while issuing the request or while
     * converting documents is reported as an {@link ExtractionException}.
     *
     * @param buffer text to tag
     * @param docid id for text, only for tracking purposes
     * @param refDataMap
     *            - a map of reference data in solr, It will store caller's
     *            domain objects. e.g., rec.id =&gt; domain(rec)
     * @return solr response
     * @throws ExtractionException tagger error
     */
    protected QueryResponse tagTextCallSolrTagger(String buffer, String docid,
            final Map<Integer, Object> refDataMap) throws ExtractionException {

        SolrTaggerRequest tagRequest = new SolrTaggerRequest(getMatcherParameters(), buffer);
        tagRequest.setPath(requestHandler);

        // Stream the response to avoid serialization and to save memory by
        // only keeping one SolrDocument materialized at a time
        tagRequest.setStreamingResponseCallback(new StreamingResponseCallback() {
            @Override
            public void streamDocListInfo(long numFound, long start, Float maxScore) {
                // Result-set metadata is not needed here.
            }

            // Future optimization: it would be nice if Solr could give us the
            // doc id without giving us a SolrDocument, allowing us to
            // conditionally get it. It would save disk IO & speed, at the
            // expense of putting ids into memory.
            @Override
            public void streamSolrDocument(final SolrDocument solrDoc) {
                // The "id" field is expected to be an Integer; any other type
                // fails the cast with a ClassCastException.
                Integer id = (Integer) solrDoc.getFirstValue("id");

                // create a domain object for the given tag;
                // this callback handler caches such domain obj in simple k/v
                // map.
                Object domainObj = createTag(solrDoc);
                if (domainObj != null) {
                    refDataMap.put(id, domainObj);
                }
            }
        });

        try {
            QueryResponse response = tagRequest.process(solr.getInternalSolrServer());

            // see https://issues.apache.org/jira/browse/SOLR-5154
            // If the response itself carries documents, replay them through
            // the same callback. This is done inside the try block so that a
            // failure while converting a document is wrapped in an
            // ExtractionException exactly as it is when the documents are
            // streamed during process().
            SolrDocumentList docList = response.getResults();
            if (docList != null) {
                // log.debug("Not streaming docs from Solr (not supported)");
                StreamingResponseCallback callback = tagRequest.getStreamingResponseCallback();
                callback.streamDocListInfo(docList.getNumFound(), docList.getStart(),
                        docList.getMaxScore());
                // NOTE(review): an earlier comment observed that this list
                // appears to be empty in practice; the loop is kept as a
                // fallback for responses that are not streamed.
                for (SolrDocument solrDoc : docList) {
                    callback.streamSolrDocument(solrDoc);
                }
            }
            return response;
        } catch (Exception err) {
            throw new ExtractionException("Failed to tag document=" + docid, err);
        }
    }
}