package uk.ac.shef.dcs.jate.solr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Pair;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.CopyField;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.update.CommitUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.app.App;
import uk.ac.shef.dcs.jate.app.AppParams;
import uk.ac.shef.dcs.jate.model.JATETerm;
import uk.ac.shef.dcs.jate.util.SolrUtil;
/**
* Scans the solr-indexed, TR-aware content field and performs terminology
* recognition (ranking + filtering + indexing) for whole index.
* <p>
* Term candidates are extracted and stored in index-time,
* which can be triggered by document indexing (e.g., by HTTP POST TOOL)
* or setting 'extraction' to true (as an option) in this request handler.
* <p>
* TR-AWARE SOLR FIELDS MUST ALSO BE CONFIGURED AND INDEXED FOR TERM RANKING ALGORITHMS
* (SEE EXAMPLES SETTING OF schema.xml IN $JATE_HOME/testdata/solr-testbed).
* <p>
* Example configuration in solrconfig.xml
* <p>
* 1. configure JATE library (jar file) to solr classpath
* <p>
* <lib path=
* "${solr.install.dir:../../..}/contrib/jate/lib/jate-2.0-*-with-dependencies.jar"/>
* <p>
* 2. configure request handler for term recognition and indexing
* <p>
* <pre>
* {@code
* <requestHandler name="/termRecogniser" class="uk.ac.shef.dcs.jate.solr.TermRecognitionRequestHandler">
* <lst name="defaults">
* <str name="algorithm">CValue</str>
* <bool name="extraction">false</bool>
* <bool name="indexTerm">true</bool>
* <bool name="boosting">false</bool>
* <str name="-prop"><YOUR_PATH>/resource/jate.properties</str>
* <float name="-cf.t">0</float>
* <str name="-o"><YOUR_PATH>/industry_terms.json</str>
* </lst>
* </requestHandler>
* }
* </pre>
*/
public class TermRecognitionRequestHandler extends RequestHandlerBase {
private final Logger log = LoggerFactory.getLogger(getClass());
/**
 * Request parameter: the supported term ranking algorithms, keyed by the
 * public (request-facing) name each one is selected with.
 */
public static enum Algorithm {
    C_VALUE("CValue"),
    ATTF("ATTF"),
    CHI_SQUARE("ChiSquare"),
    GLOSSEX("GlossEx"),
    RAKE("RAKE"),
    RIDF("RIDF"),
    TERM_EX("TermEx"),
    TF_IDF("TTF-IDF"),
    TTF("TTF"),
    WEIRDNESS("Weirdness");

    /** External name used in the 'algorithm' request parameter. */
    private final String name;

    Algorithm(String name) {
        this.name = name;
    }

    /** @return the request-facing name of this algorithm */
    public String getAlgorithmName() {
        return name;
    }
}
/**
 * Solr field where content ngram info is indexed and stored by means of
 * {@code solr.ShingleFilterFactory}.
 * <p>
 * See the example cores in /testbed/ for the recommended setting for this
 * field in the solr schema.xml.
 */
public static final String FIELD_CONTENT_NGRAM = "solr_field_content_ngrams";

/**
 * Solr field where the final filtered terms will be indexed and stored. This
 * field should be a multi-valued field.
 * <p>
 * Recommended configuration in solr schema.xml:
 * <p>
 * <pre>
 * {@code
 * <field name="jate_domain_terms" type="string" indexed="true"
 *        stored="true" required="false" omitNorms="false" multiValued="true"/>
 * }
 * </pre>
 */
@Deprecated
public static final String FIELD_DOMAIN_TERMS = "field_domain_terms";

/**
 * Term ranking (unithood/termhood) algorithm.
 *
 * @see uk.ac.shef.dcs.jate.solr.TermRecognitionRequestHandler for
 * all the supported ATR algorithms
 */
public static final String TERM_RANKING_ALGORITHM = "algorithm";

/**
 * Boolean flag indicating whether to extract candidate terms (i.e. re-index
 * the whole corpus) before ranking.
 */
public static final String CANDIDATE_EXTRACTION = "extraction";

/**
 * Boolean flag indicating whether the term score will be used as the boost
 * value for the indexed term.
 */
public static final String BOOSTING = "boosting";

/**
 * Boolean flag indicating whether filtered candidate terms will be indexed
 * and stored. This requires the corresponding solr field to be configured in
 * the schema if set to true.
 */
public static final String INDEX_TERM = "indexTerm";

/**
 * JATE property file is a required run-time setting file.
 * <p>
 * The property file must provide the configuration of pre-processed data
 * (e.g., solr content field, solr content ngram field for term vector
 * statistics) in index-time when term candidates are extracted.
 */
public static final String JATE_PROPERTY_FILE = AppParams.JATE_PROPERTIES_FILE.getParamKey();

/**
 * Minimum total frequency allowed for term candidates. Increase for better
 * precision.
 */
public static final String PREFILTER_MIN_TERM_TOTAL_FREQUENCY = AppParams.
        PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey();

/**
 * Optional.
 * <p>
 * Min frequency of a term appearing in different contexts.
 * <p>
 * BUG FIX: this constant previously resolved to
 * {@code AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY} (copy-paste error),
 * making this request-parameter key identical to
 * {@link #PREFILTER_MIN_TERM_TOTAL_FREQUENCY}, so the context-frequency
 * threshold could never be set independently.
 *
 * @see uk.ac.shef.dcs.jate.app.AppChiSquare
 */
public static final String PREFILTER_MIN_TERM_CONTEXT_FREQUENCY = AppParams.
        PREFILTER_MIN_TERM_CONTEXT_FREQUENCY.getParamKey();

/**
 * Cut-off threshold (exclusive) for filtering and indexing ranked term
 * candidates by term weight. Any term with weight less than or equal to this
 * value will be filtered. The value defaults to 0.
 */
public static final String CUTOFF_THRESHOLD = AppParams.CUTOFF_THRESHOLD.getParamKey();

/**
 * Top N (inclusive) threshold for choosing the top N ranked term candidates.
 * <p>
 * This threshold is an alternative to the default
 * @see uk.ac.shef.dcs.jate.solr.TermRecognitionRequestHandler#CUTOFF_THRESHOLD
 */
public static final String CUTOFF_TOP_K = AppParams.CUTOFF_TOP_K.getParamKey();

/**
 * Top percentage of total ranked term candidates.
 * <p>
 * This threshold is an alternative to the default
 * @see uk.ac.shef.dcs.jate.solr.TermRecognitionRequestHandler#CUTOFF_THRESHOLD
 */
public static final String CUTOFF_TOP_K_PERCENT = AppParams.CUTOFF_TOP_K_PERCENT.getParamKey();

/**
 * Unigram frequency distribution file required by a few termhood calculations.
 *
 * @see uk.ac.shef.dcs.jate.app.AppTermEx
 * @see uk.ac.shef.dcs.jate.app.AppGlossEx
 */
public static final String REFERENCE_FREQUENCY_FILE = AppParams.REFERENCE_FREQUENCY_FILE.getParamKey();

/** Neutral boost applied to fields when ATE-score boosting is disabled. */
public static final Float DEFAULT_BOOST_VALUE = 1.0F;
/** Processor that performs candidate extraction, ranking/filtering and export. */
private final TermRecognitionProcessor generalTRProcessor;

/**
 * Creates the handler, wiring in the default term recognition processor.
 */
public TermRecognitionRequestHandler() {
    this.generalTRProcessor = TermRecognitionProcessorFactory.createTermRecognitionProcessor();
}
/**
 * Handles a term recognition request: optional candidate extraction
 * (re-indexing of the whole corpus), ranking + filtering, optional export to
 * file, and optional indexing of the final terms.
 *
 * @param req solr request; see the class javadoc for supported parameters
 * @param rsp solr response (not modified by this handler)
 * @throws Exception when property loading, ranking or indexing fails
 */
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
    log.info("Term recognition request handler...");
    setTopInitArgsAsInvariants(req);

    final String jatePropertyFile = req.getParams().get(JATE_PROPERTY_FILE);
    final String algorithmName = req.getParams().get(TERM_RANKING_ALGORITHM);
    // BUG FIX: getBool(String) returns a nullable Boolean; the previous code
    // auto-unboxed it, throwing a NullPointerException whenever one of these
    // optional switches was absent from the request. Default them to false.
    final boolean isExtraction = req.getParams().getBool(CANDIDATE_EXTRACTION, false);
    final String outFilePath = req.getParams().get(AppParams.OUTPUT_FILE.getParamKey());
    final boolean isIndexTerms = req.getParams().getBool(INDEX_TERM, false);
    final boolean isBoosted = req.getParams().getBool(BOOSTING, false);

    final Algorithm algorithm = getAlgorithm(algorithmName);
    JATEProperties properties = App.getJateProperties(jatePropertyFile);

    final SolrIndexSearcher searcher = req.getSearcher();
    try {
        if (isExtraction) {
            log.info("start candidate extraction (i.e., re-index of whole corpus) ...");
            generalTRProcessor.candidateExtraction(searcher.getCore(), jatePropertyFile);
            log.info("complete candidate terms indexing.");
        }

        Map<String, String> trRunTimeParams = initialiseTRRunTimeParams(req);
        List<JATETerm> termList = generalTRProcessor.rankingAndFiltering(searcher.getCore(), jatePropertyFile, trRunTimeParams,
                algorithm);
        log.info(String.format("complete term recognition extraction! Finalized Term size [%s]", termList.size()));

        if (isExport(outFilePath)) {
            generalTRProcessor.export(termList);
        }

        if (isIndexTerms) {
            log.info("start to index filtered candidate terms ...");
            indexTerms(termList, properties, searcher, isBoosted, isExtraction);
            //trigger 'optimise' to build new index
            searcher.getCore().getUpdateHandler().commit(new CommitUpdateCommand(req, true));
            log.info("complete the indexing of candidate terms.");
        }
    } finally {
        // release the searcher reference acquired via req.getSearcher()
        searcher.close();
    }
}
/**
 * Whether the ranked terms should be exported to an output file.
 * <p>
 * Simplified: {@code StringUtils.isNotEmpty} is already null-safe, so the
 * explicit {@code != null} check was redundant.
 *
 * @param outFilePath output file path from the request, may be null
 * @return true when a non-empty output path was supplied
 */
private boolean isExport(String outFilePath) {
    return StringUtils.isNotEmpty(outFilePath);
}
/**
 * Initialises the Term Recognition (TR) runtime parameters from the request.
 * <p>
 * Parameter keys are kept compatible with the TR command line tool; see also
 * {@code uk.ac.shef.dcs.jate.app.App} and
 * {@code uk.ac.shef.dcs.jate.JATEProperties}.
 * <p>
 * TODO: need to have a naming convention to make it consistent for both ends.
 * Only parameters actually present in the request are copied into the map.
 *
 * @param req, Container for a request to execute a query
 * @return initialisation parameter map for the ATE algorithm
 */
private Map<String, String> initialiseTRRunTimeParams(SolrQueryRequest req) {
    Map<String, String> runtimeParams = new HashMap<>();
    SolrParams reqParams = req.getParams();

    Float cutOffThreshold = reqParams.getFloat(CUTOFF_THRESHOLD);
    if (cutOffThreshold != null) {
        runtimeParams.put(AppParams.CUTOFF_THRESHOLD.getParamKey(), String.valueOf(cutOffThreshold));
    }

    Integer topN = reqParams.getInt(CUTOFF_TOP_K);
    if (topN != null) {
        runtimeParams.put(AppParams.CUTOFF_TOP_K.getParamKey(), String.valueOf(topN));
    }

    Float topPercentage = reqParams.getFloat(CUTOFF_TOP_K_PERCENT);
    if (topPercentage != null) {
        runtimeParams.put(AppParams.CUTOFF_TOP_K_PERCENT.getParamKey(), String.valueOf(topPercentage));
    }

    Integer minTotalFreq = reqParams.getInt(PREFILTER_MIN_TERM_TOTAL_FREQUENCY);
    if (minTotalFreq != null) {
        runtimeParams.put(AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey(), String.valueOf(minTotalFreq));
    }

    Integer minContextFreq = reqParams.getInt(PREFILTER_MIN_TERM_CONTEXT_FREQUENCY);
    if (minContextFreq != null) {
        runtimeParams.put(AppParams.PREFILTER_MIN_TERM_CONTEXT_FREQUENCY.getParamKey(), String.valueOf(minContextFreq));
    }

    String referenceFreqFile = reqParams.get(REFERENCE_FREQUENCY_FILE);
    if (referenceFreqFile != null) {
        runtimeParams.put(AppParams.REFERENCE_FREQUENCY_FILE.getParamKey(), referenceFreqFile);
    }

    String outputFile = reqParams.get(AppParams.OUTPUT_FILE.getParamKey());
    if (outputFile != null) {
        runtimeParams.put(AppParams.OUTPUT_FILE.getParamKey(), outputFile);
    }

    Boolean collectTermInfo = reqParams.getBool(AppParams.COLLECT_TERM_INFO.getParamKey());
    if (collectTermInfo != null) {
        runtimeParams.put(AppParams.COLLECT_TERM_INFO.getParamKey(), String.valueOf(collectTermInfo));
    }
    return runtimeParams;
}
/**
 * One-line handler description shown by Solr's admin/registry views.
 */
@Override
public String getDescription() {
    return "Automatic term recognition and indexing by whole corpus/index analysis.";
}
/**
 * Index weighted & filtered final terms back into Solr.
 * <p>
 * For each document, the candidate-term vector is loaded, intersected with
 * the filtered term list, and the resulting weighted terms are written into
 * the domain-terms field (optionally boosted with the ATE score) before the
 * document is re-written through the index writer.
 *
 * @param filteredTerms filtered JATE terms
 * @param jateProperties jate properties for integration config between jate2.0 and solr instance
 * @param indexSearcher solr index searcher
 * @param isBoosted true or false to indicate whether term will be boosted with ATE score
 * @param isExtraction when true, schema copy-fields are re-applied to each document
 * @throws JATEException wrapping any I/O failure while reading or writing the index
 */
public void indexTerms(List<JATETerm> filteredTerms, JATEProperties jateProperties,
SolrIndexSearcher indexSearcher, boolean isBoosted, boolean isExtraction)
throws JATEException {
// NOTE(review): maxDoc() counts all docIDs including deleted ones -- confirm
// deleted documents are acceptable in the loop below.
int numDocs = indexSearcher.maxDoc();
String domainTermsFieldName = jateProperties.getSolrFieldNameJATEDomainTerms();
String candidateTermFieldName = jateProperties.getSolrFieldNameJATECTerms();
log.info(String.format("indexing [%s] terms into field [%s] for total [%s] documents ...", filteredTerms.size(),
domainTermsFieldName, numDocs));
// nothing to index when the filtered term list is empty
if (filteredTerms.size() == 0) {
return;
}
SolrCore core = indexSearcher.getCore();
IndexSchema indexSchema = core.getLatestSchema();
IndexWriter writerIn = null;
try {
// NOTE(review): this obtains the core's shared index writer; closing it in
// the finally block below may interfere with Solr's own writer management --
// verify against the SolrCoreState contract.
writerIn = core.getSolrCoreState().getIndexWriter(core).get();
Map<String, List<CopyField>> copyFields = indexSchema.getCopyFieldsMap();
for (int docID = 0; docID < numDocs; docID++) {
try {
Document doc = indexSearcher.doc(docID);
if (isExtraction) {
//TODO: may consider to avoid to index those intermediate values again
SolrUtil.copyFields(copyFields, DEFAULT_BOOST_VALUE, doc);
}
// term vector of the candidate-terms field; documents without one are skipped
Terms indexedCandidateTermsVectors = SolrUtil.getTermVector(docID,
candidateTermFieldName, indexSearcher);
if (indexedCandidateTermsVectors == null) {
continue;
}
List<String> candidateTerms = SolrUtil.getNormalisedTerms(indexedCandidateTermsVectors);
// keep only the candidates that survived ranking/filtering, with their weights
List<Pair<String, Double>> filteredCandidateTerms = getSelectedWeightedCandidates(filteredTerms,
candidateTerms);
iterateAddDomainTermFields(isBoosted, domainTermsFieldName, indexSchema, doc, filteredCandidateTerms);
log.debug(String.format("document [%s] version before debugging: %s", doc.get("id"),
doc.get("_version_")));
// workaround: doc version is not automatically indexed after the document is updated in this way
String currentVersionNo = doc.get("_version_");
doc.removeField("_version_");
doc.add(indexSchema.getField("_version_").createField(versionIncrement(currentVersionNo),
DEFAULT_BOOST_VALUE));
// replace the stored document, keyed by its "id" field -- assumes "id" is
// the unique key; TODO confirm against the schema
writerIn.updateDocument(new Term("id", doc.get("id")), doc);
} catch (IOException e) {
throw new JATEException(
String.format("Failed to retrieve current document (docId: [%s]) due to " +
"an unexpected I/O exception: %s", docID, e.toString()));
}
}
// merge down to a single segment and persist the updates
writerIn.forceMerge(1, false);
writerIn.commit();
} catch (IOException ioe) {
throw new JATEException(String.format("Failed to index filtered domain terms due to I/O exception when " +
"loading solr index writer: %s", ioe.toString()));
} finally {
if (writerIn != null) {
try {
writerIn.close();
} catch (IOException e) {
log.error(e.toString());
}
}
// NOTE(review): getCore() increments the core's reference count, so this
// close() releases that reference rather than shutting the core down.
if (core != null) {
core.close();
}
}
log.info(String.format("finalised terms have been indexed into [%s] field for all documents",
domainTermsFieldName));
}
/**
 * Increments a numeric document version string by one.
 * <p>
 * BUG FIX: the previous implementation gated the parse with
 * {@code NumberUtils.isNumber}, which also accepts fractional and hex
 * strings (e.g. "1.5", "0x1F"); those then made {@code Long.parseLong}
 * throw an uncaught {@link NumberFormatException}. Any value that is not a
 * plain long (including null) is now returned unchanged.
 *
 * @param currentVersionNo current value of the "_version_" field, may be null
 * @return the incremented version, or the input unchanged when it is not a
 *         plain decimal long
 */
private String versionIncrement(String currentVersionNo) {
    try {
        return String.valueOf(Long.parseLong(currentVersionNo) + 1);
    } catch (NumberFormatException e) {
        // not a plain decimal long (null, empty, fractional, hex, ...): pass through
        return currentVersionNo;
    }
}
/**
 * Adds the selected weighted terms to the document's domain-terms field,
 * replacing any values from a previous run.
 *
 * @param isBoosted              when true, use the ATE score as field boost;
 *                               otherwise apply the neutral default boost
 * @param domainTermsFieldName   target solr field for the domain terms
 * @param indexSchema            schema used to create the field instances
 * @param doc                    document being updated in place
 * @param filteredCandidateTerms weighted terms to add (null entries skipped)
 */
private void iterateAddDomainTermFields(boolean isBoosted, String domainTermsFieldName,
                                        IndexSchema indexSchema, Document doc,
                                        List<Pair<String, Double>> filteredCandidateTerms) {
    // drop domain-term values left over from a previous run, if any
    doc.removeFields(domainTermsFieldName);

    for (Pair<String, Double> term : filteredCandidateTerms) {
        if (term == null) {
            continue;
        }
        float boost = isBoosted ? term.getValue().floatValue() : DEFAULT_BOOST_VALUE;
        doc.add(indexSchema.getField(domainTermsFieldName).createField(term.getKey(), boost));
    }
}
/**
 * Selects, from one document's candidate terms, those that survived ranking
 * and filtering, pairing each with its ATE score.
 * <p>
 * BUG FIX: the previous implementation added matches to a plain
 * {@code ArrayList} from nested {@code parallelStream().forEach(...)}
 * pipelines. {@code ArrayList} is not thread-safe, so concurrent adds could
 * corrupt the list or silently drop elements, and the result order was
 * non-deterministic. Sequential iteration is safe and deterministic.
 *
 * @param filteredTerms  final (ranked and filtered) terms with scores
 * @param candidateTerms normalised candidate terms found in one document
 * @return weighted terms of this document present in the filtered list
 */
private List<Pair<String, Double>> getSelectedWeightedCandidates(List<JATETerm> filteredTerms, List<String> candidateTerms) {
    List<Pair<String, Double>> filteredCandidateTerms = new ArrayList<>();
    for (String candidateTerm : candidateTerms) {
        if (candidateTerm == null) {
            continue;
        }
        for (JATETerm filteredTerm : filteredTerms) {
            if (filteredTerm != null && filteredTerm.getString() != null
                    && filteredTerm.getString().equalsIgnoreCase(candidateTerm)) {
                filteredCandidateTerms.add(new Pair<>(filteredTerm.getString(), filteredTerm.getScore()));
            }
        }
    }
    return filteredCandidateTerms;
}
/**
 * This request handler supports configuration options defined at the top
 * level as well as those in typical Solr 'defaults', 'appends', and
 * 'invariants'. The top level ones are treated as invariants.
 */
private void setTopInitArgsAsInvariants(SolrQueryRequest req) {
    // Flatten top-level initArgs into a param map; nested NamedLists (e.g.
    // 'defaults'/'appends'/'invariants') are handled by Solr itself.
    HashMap<String, String> flattened = new HashMap<String, String>(initArgs.size());
    for (int i = 0; i < initArgs.size(); i++) {
        Object value = initArgs.getVal(i);
        if (value != null && !(value instanceof NamedList)) {
            flattened.put(initArgs.getName(i), value.toString());
        }
    }
    if (flattened.isEmpty()) {
        return; // nothing configured at the top level; leave request params alone
    }
    // Putting the top-level params in the first argument makes them override
    // the request's own parameters.
    req.setParams(SolrParams.wrapDefaults(new MapSolrParams(flattened), req.getParams()));
}
/**
 * Resolves the request's algorithm name (case-insensitive) to a supported
 * {@link Algorithm}.
 *
 * @param algName algorithm name from the request, e.g. "CValue"
 * @return the matching algorithm
 * @throws JATEException declared for API compatibility; invalid input is
 *         reported as a BAD_REQUEST {@link SolrException}
 */
private Algorithm getAlgorithm(String algName) throws JATEException {
    if (StringUtils.isEmpty(algName)) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                "ATE algorithm is not specified. " +
                        "Please check API documentation for all the supported ATR algorithms.");
    }
    // Algorithm names are unique, so a case-insensitive scan over the enum
    // replaces the original if/else-if chain without changing behaviour.
    for (Algorithm algorithm : Algorithm.values()) {
        if (algName.equalsIgnoreCase(algorithm.getAlgorithmName())) {
            log.debug(String.format("[%s] algorithm is set to rank term candidates. ",
                    algorithm.getAlgorithmName()));
            return algorithm;
        }
    }
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
            String.format("Current algorithm [%s] is not supported. Please check API documentation for all " +
                    "the supported ATR algorithms.",
                    algName));
}
}