/**
* Copyright (c) Codice Foundation
* <p>
* This is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser
* General Public License as published by the Free Software Foundation, either version 3 of the
* License, or any later version.
* <p>
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details. A copy of the GNU Lesser General Public License
* is distributed along with this program and can be found at
* <http://www.gnu.org/licenses/lgpl.html>.
*/
package ddf.catalog.pubsub.criteria.contextual;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Attr;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import ddf.util.XPathHelper;
/**
 * Static helpers for contextual (free-text) evaluation of an XML metadata document.
 * <p>
 * {@link #buildIndex(String)} / {@link #buildIndex(String, String[])} extract text from the
 * document with XPath selectors and store it in an in-memory Lucene index under two fields: one
 * analyzed with {@code ContextualAnalyzer} and one analyzed with
 * {@code CaseSensitiveContextualAnalyzer}. {@link #evaluate(ContextualEvaluationCriteria)} then
 * runs a Lucene query against the appropriate field of such an index.
 * <p>
 * This class cannot be instantiated.
 */
public final class ContextualEvaluator {

    /** Name of the index field analyzed with {@code ContextualAnalyzer}. */
    private static final String FIELD_NAME = "Resource";

    /** Name of the index field analyzed with {@code CaseSensitiveContextualAnalyzer}. */
    private static final String CASE_SENSITIVE_FIELD_NAME = "cs_Resource";

    private static final Logger LOGGER =
            LoggerFactory.getLogger(ContextualEvaluator.class);

    /**
     * Selects every child of the root {@code Resource} element except a fixed set of excluded
     * element names.
     */
    private static final String DEFAULT_XPATH_1 =
            "/*[local-name()=\"Resource\"]/*" + "[local-name() != \"identifier\" and "
                    + "local-name() != \"language\" and " + "local-name() != \"dates\" and "
                    + "local-name() != \"rights\" and " + "local-name() != \"format\" and "
                    + "local-name() != \"subjectCoverage\" and "
                    + "local-name() != \"temporalCoverage\" and "
                    + "local-name() != \"geospatialCoverage\" " + "] ";

    /**
     * Selects the children of {@code geospatialCoverage/GeospatialExtent} that are not (and do
     * not contain, and are not contained in) a {@code boundingGeometry} node.
     */
    private static final String DEFAULT_XPATH_2 = "/*[local-name()=\"Resource\"]"
            + "/*[local-name()=\"geospatialCoverage\"]/*[local-name()=\"GeospatialExtent\"]"
            + "/*[not(ancestor::node()[local-name()=\"boundingGeometry\"] or descendant-or-self::node()[local-name()=\"boundingGeometry\"])] ";

    private static final String[] DEFAULT_XPATH_SELECTORS =
            new String[] {DEFAULT_XPATH_1, DEFAULT_XPATH_2};

    /**
     * Matches a default-namespace declaration, i.e. {@code xmlns="..."} or {@code xmlns='...'}.
     * Compiled once because {@link #getIndexableText(String, String[])} applies it on every call.
     */
    private static final Pattern DEFAULT_NAMESPACE_PATTERN =
            Pattern.compile("xmlns=['\"].*?['\"]");

    private ContextualEvaluator() {
        throw new UnsupportedOperationException(
                "This is a utility class - it should never be instantiated");
    }

    /**
     * Evaluate the contextual criteria.
     * <p>
     * When the criteria carry no search phrase, the result is {@code true} only if text paths and
     * metadata are both supplied and the text paths select some non-empty text in the metadata.
     * Otherwise the search phrase is parsed as a Lucene query and run against the criteria's
     * index, using the case-sensitive field when a case-sensitive search was requested.
     *
     * @param cec the criteria supplying the index, search phrase, case-sensitivity flag and
     *            (for the no-phrase case) text paths and metadata
     * @return {@code true} if the criteria are satisfied, {@code false} otherwise
     * @throws IOException    if the index cannot be opened, searched or closed
     * @throws ParseException if the search phrase is not a valid Lucene query
     */
    public static boolean evaluate(ContextualEvaluationCriteria cec)
            throws IOException, ParseException {
        Directory index = cec.getIndex();
        String searchPhrase = cec.getCriteria();

        // Handle case where no search phrase is specified. Contextual criteria should then
        // specify text path(s) and be used to determine if an element or attribute exists.
        if (searchPhrase == null || searchPhrase.isEmpty()) {
            String[] textPaths = cec.getTextPaths();
            String fullDocument = cec.getMetadata();
            if (textPaths != null && textPaths.length > 0 && fullDocument != null) {
                String indexableText = getIndexableText(fullDocument, textPaths);
                if (indexableText != null && !indexableText.isEmpty()) {
                    LOGGER.trace("Found element/attribute for textPaths");
                    return true;
                }
            }
            LOGGER.trace(
                    "No search phrase specified and could not find element/attribute based on textPaths");
            return false;
        }

        // a. query
        QueryParser queryParser;
        if (cec.isCaseSensitiveSearch()) {
            LOGGER.debug("Doing case-sensitive search ...");
            queryParser = new QueryParser(Version.LUCENE_30,
                    CASE_SENSITIVE_FIELD_NAME,
                    new CaseSensitiveContextualAnalyzer(Version.LUCENE_30));
            // Make Wildcard, Prefix, Fuzzy, and Range queries *not* be automatically
            // lower-cased, i.e., make them be case-sensitive
            queryParser.setLowercaseExpandedTerms(false);
        } else {
            LOGGER.debug("Doing case-insensitive search ...");
            queryParser = new QueryParser(Version.LUCENE_30,
                    FIELD_NAME,
                    new ContextualAnalyzer(Version.LUCENE_30));
        }

        // Configures Lucene query parser to allow a wildcard as first character in the
        // contextual search phrase
        queryParser.setAllowLeadingWildcard(true);
        Query q = queryParser.parse(searchPhrase);

        // b. search - only whether at least one hit exists matters, so one hit per page suffices
        int hitsPerPage = 1;
        IndexSearcher searcher = new IndexSearcher(index, true);
        try {
            TopDocs topDocs = searcher.search(q, hitsPerPage);
            LOGGER.debug("Found {} hits.", topDocs.totalHits);
            return topDocs.totalHits > 0;
        } finally {
            // Always release the searcher, even when the search itself fails.
            searcher.close();
        }
    }

    /**
     * Create a field with the specified field name and value, and add it to a Lucene Document to
     * be added to the specified IndexWriter.
     *
     * @param indexWriter the writer the new single-field document is added to
     * @param fieldName   the name of the field to create
     * @param value       the text to store and analyze in the field
     * @throws IOException if the writer fails to add the document
     */
    private static void addDoc(IndexWriter indexWriter, String fieldName, String value)
            throws IOException {
        Document doc = new Document();
        doc.add(new Field(fieldName,
                value,
                Field.Store.YES,
                Field.Index.ANALYZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        indexWriter.addDocument(doc);
    }

    /**
     * Build one Lucene index for the specified XML Document that contains both case-insensitive
     * and case-sensitive indexed text. Use the default XPath selectors to extract the indexable
     * text from the specified XML document.
     *
     * @param fullDocument the XML document to be indexed
     * @return the Lucene index for the indexed text from the XML document
     * @throws IOException if the index cannot be written
     */
    public static Directory buildIndex(String fullDocument) throws IOException {
        return buildIndex(fullDocument, DEFAULT_XPATH_SELECTORS);
    }

    /**
     * Build one Lucene index for the specified XML Document that contains both case-insensitive
     * and case-sensitive indexed text. Use the provided XPath selectors to extract the indexable
     * text from the specified XML document.
     *
     * @param fullDocument   the XML document to be indexed
     * @param xpathSelectors the XPath selectors to use to extract the indexable text from the XML
     *                       document
     * @return the Lucene index for the indexed text from the XML document
     * @throws IOException if the index cannot be written
     */
    public static Directory buildIndex(String fullDocument, String[] xpathSelectors)
            throws IOException {
        // 0. Specify the analyzer for tokenizing text.
        // The same analyzer should be used for indexing and searching
        ContextualAnalyzer contextualAnalyzer = new ContextualAnalyzer(Version.LUCENE_30);

        // 1. create the index
        Directory index = new RAMDirectory();

        // Retrieve the text from the document that can be indexed using the specified XPath
        // selectors
        String indexableText = getIndexableText(fullDocument, xpathSelectors);

        // First writer: the boolean arg means "create a new index, overwriting any existing
        // one". It populates the case-insensitive field.
        IndexWriter indexWriter = new IndexWriter(index,
                contextualAnalyzer,
                true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            // NOTE(review): the tokens logged here come from the full document, not from the
            // extracted indexable text - confirm that is the intended debug output.
            logTokens(indexWriter.getAnalyzer(), FIELD_NAME, fullDocument, "ContextualAnalyzer");
            addDoc(indexWriter, FIELD_NAME, indexableText);
        } finally {
            // Close even on failure so the writer does not stay open on the directory.
            indexWriter.close();
        }

        CaseSensitiveContextualAnalyzer caseSensitiveStandardAnalyzer =
                new CaseSensitiveContextualAnalyzer(Version.LUCENE_30);

        // Second writer: boolean set to false to append the case-sensitive indexed text to the
        // existing index (populated by the first writer).
        IndexWriter csIndexWriter = new IndexWriter(index,
                caseSensitiveStandardAnalyzer,
                false,
                IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            addDoc(csIndexWriter, CASE_SENSITIVE_FIELD_NAME, indexableText);
        } finally {
            csIndexWriter.close();
        }

        return index;
    }

    /**
     * Log, at debug level, every token the analyzer produces for the given text. Does nothing
     * when debug logging is disabled.
     *
     * @param analyzer     the analyzer used to tokenize the text
     * @param fieldName    the field name handed to the analyzer
     * @param fullDocument the text to tokenize
     * @param analyzerName a label for the analyzer, used only in the log output
     * @throws IOException if reading from or closing the token stream fails
     */
    private static void logTokens(Analyzer analyzer, String fieldName, String fullDocument,
            String analyzerName) throws IOException {
        if (!LOGGER.isDebugEnabled()) {
            return;
        }

        TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
        try {
            TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
            LOGGER.debug("----- {} tokens -----", analyzerName);
            while (tokenStream.incrementToken()) {
                // Pass the term as an argument so it is never treated as a message pattern.
                LOGGER.debug("{}", termAttribute.term());
            }
            LOGGER.debug("----- END: {} tokens -----", analyzerName);
        } finally {
            tokenStream.close();
        }
    }

    /**
     * Extract the text from the specified XML Document that is to be indexed using the specified
     * XPath selectors.
     * <p>
     * Default-namespace declarations are stripped from the document before the selectors are
     * evaluated. For each selected attribute node its value is collected; for each selected
     * element node the values of all descendant Text nodes are collected. Every collected value
     * is followed by a single space. In the returned string all attribute values come first,
     * followed by all element text. If an XPath expression cannot be evaluated, the failure is
     * logged at debug level and whatever text was collected up to that point is returned.
     *
     * @param document       the XML document to extract text from
     * @param xpathSelectors the XPath expressions selecting the nodes whose text is wanted
     * @return the collected text, possibly empty
     */
    private static String getIndexableText(String document, String[] xpathSelectors) {
        LOGGER.debug("xpathSelectors.size = {}", xpathSelectors.length);

        StringBuilder attributeText = new StringBuilder();
        StringBuilder elementText = new StringBuilder();
        try {
            // TODO Is this safe for all cases? Can there be multiple default namespaces such
            // that this would screw up the metadata?
            // Treat the "default namespace" (i.e., xmlns="http://some.namespace") the same as
            // the "no namespace" (i.e., xmlns="") so that user-specified XPath selectors do not
            // need to specify a namespace for expressions in the default namespace.
            // (For example, user can specify //fileTitle vs. //namespace:fileTitle, where a
            // NamespaceContext/NamespaceResolver would try to resolve the namespace they
            // specified.)
            String strippedDocument = DEFAULT_NAMESPACE_PATTERN.matcher(document).replaceAll("");
            XPathHelper xHelper = new XPathHelper(strippedDocument);

            for (String xpath : xpathSelectors) {
                LOGGER.debug("Processing xpath selector: {}", xpath);
                NodeList nodeList = (NodeList) xHelper.evaluate(xpath, XPathConstants.NODESET);
                LOGGER.debug("nodeList length = {}", nodeList.getLength());

                for (int i = 0; i < nodeList.getLength(); i++) {
                    Node node = nodeList.item(i);
                    if (node.getNodeType() == Node.ATTRIBUTE_NODE) {
                        Attr attribute = (Attr) node;
                        LOGGER.debug("Adding text [{}]", attribute.getNodeValue());
                        attributeText.append(attribute.getNodeValue()).append(' ');
                    } else if (node.getNodeType() == Node.ELEMENT_NODE) {
                        // Walk the element's subtree and collect each Text node separately.
                        // getTextContent() is deliberately not used: it concatenates all
                        // descendant text without any white space between the Text nodes'
                        // values, e.g., JohnDoe vs. John Doe.
                        traverse((Element) node, elementText);
                    } else {
                        LOGGER.debug("Unsupported node type: {}, node name = {}",
                                node.getNodeType(),
                                node.getNodeName());
                    }
                }
            }
        } catch (XPathExpressionException e1) {
            // Best effort: keep whatever text was gathered before the failure.
            LOGGER.debug("Unable to evaluate XPath", e1);
        }

        // Attribute values first, then the text gathered from element subtrees.
        return attributeText.append(elementText).toString();
    }

    /**
     * Walk the subtree rooted at the given node in depth-first order and append the value of
     * every Text node, followed by a single space, to the given builder.
     *
     * @param n           the root of the subtree to walk
     * @param indexedText the builder receiving the text values
     */
    private static void traverse(Node n, StringBuilder indexedText) {
        if (n.getNodeType() == Node.TEXT_NODE) {
            indexedText.append(n.getNodeValue()).append(' ');
        }

        if (n.hasChildNodes()) {
            NodeList children = n.getChildNodes();
            for (int i = 0; i < children.getLength(); i++) {
                // Recursively traverse each of the children.
                traverse(children.item(i), indexedText);
            }
        }
    }
}