/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2013-2015 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.request;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import javax.xml.xpath.XPathExpressionException;
import org.apache.lucene.search.Query;
import org.w3c.dom.DOMException;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.Analyzer;
import com.jaeksoft.searchlib.analysis.FilterFactory;
import com.jaeksoft.searchlib.analysis.filter.DeduplicateTokenPositionsFilter;
import com.jaeksoft.searchlib.analysis.filter.IndexLookupFilter;
import com.jaeksoft.searchlib.analysis.filter.RemoveIncludedTermFilter;
import com.jaeksoft.searchlib.analysis.filter.ShingleFilter;
import com.jaeksoft.searchlib.analysis.filter.StopFilter;
import com.jaeksoft.searchlib.analysis.tokenizer.TokenizerEnum;
import com.jaeksoft.searchlib.analysis.tokenizer.TokenizerFactory;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.index.ReaderInterface;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.result.AbstractResult;
import com.jaeksoft.searchlib.result.ResultNamedEntityExtraction;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.XPathParser;
import com.jaeksoft.searchlib.util.XmlWriter;
import com.jaeksoft.searchlib.web.ServletTransaction;
/**
 * Request that runs a text through an analyzer chain (shingles, token position
 * de-duplication, optional stop word lists, index lookup, removal of included
 * terms) and collects the outcome in a {@link ResultNamedEntityExtraction}.
 * <p>
 * The request is configured by:
 * <ul>
 * <li>the text to analyze,</li>
 * <li>the name of the tokenizer used to split the text,</li>
 * <li>the name of the search request handed to the {@link IndexLookupFilter},</li>
 * <li>the name of the field holding the named entity,</li>
 * <li>the set of fields to return,</li>
 * <li>a map of stop word list names,</li>
 * <li>the maximum size passed to the {@link ShingleFilter}.</li>
 * </ul>
 * <p>
 * NOTE(review): the instance fields intentionally have no initializers; they
 * are assigned in {@link #setDefaultValues()}, which is presumably invoked by
 * the {@link AbstractRequest} constructor - confirm before adding initializers.
 */
public class NamedEntityExtractionRequest extends AbstractRequest {

    /** Default value of {@link #maxNumberOfWords}. */
    private static final int DEFAULT_MAX_NUMBER_OF_WORDS = 5;

    /** Name of the tokenizer used when none is configured. */
    private static final String DEFAULT_TOKENIZER = TokenizerEnum.LetterOrDigitTokenizerFactory.name();

    // Names of the XML attributes and nodes used by fromXmlConfigNoLock and
    // writeXmlConfig.
    private static final String ATTR_SEARCH_REQUEST = "searchRequest";
    private static final String ATTR_NAMED_ENTITY_FIELD = "namedEntityField";
    private static final String NODE_NAME_STOPWORDS_LIST = "stopWords";
    private static final String ATTR_MAX_NUMBER_OF_WORDS = "maxNumberOfWords";
    private static final String ATTR_TOKENIZER = "tokenizer";
    private static final String ATTR_STOPWORDS_LISTNAME = "listName";
    private static final String ATTR_STOPWORDS_CASESENSITIVE = "caseSensitive";
    private static final String NODE_TEXT = "text";
    private static final String NODE_RETURNED_FIELD = "returnedField";
    private static final String ATTR_NAME_FIELD = "name";

    /** The text to analyze. */
    private String text;

    /** Name of the tokenizer applied to the text. */
    private String tokenizer;

    /** Name of the search request handed to the index lookup filter. */
    private String searchRequest;

    /** Name of the field holding the named entity. */
    private String namedEntityField;

    /** Names of the returned fields; {@code null} until a field is added. */
    private Set<String> returnedFields;

    /**
     * Stop word list names mapped to the flag handed to the {@link StopFilter};
     * {@code null} when no list is registered.
     */
    private Map<String, Boolean> stopWordsMap;

    /** Maximum size passed to the {@link ShingleFilter}. */
    private int maxNumberOfWords;

    /**
     * Creates a request which is not bound to any configuration.
     */
    public NamedEntityExtractionRequest() {
        super(null, RequestTypeEnum.NamedEntityExtractionRequest);
    }

    /**
     * Creates a request bound to the given configuration.
     *
     * @param config
     *            the configuration passed to the parent class
     */
    public NamedEntityExtractionRequest(Config config) {
        super(config, RequestTypeEnum.NamedEntityExtractionRequest);
    }

    /**
     * Resets every field of this request to its default value: no text, no
     * search request, no named entity field, no returned field, no stop word
     * list, {@value #DEFAULT_MAX_NUMBER_OF_WORDS} words and the letter-or-digit
     * tokenizer.
     */
    @Override
    protected void setDefaultValues() {
        super.setDefaultValues();
        this.text = null;
        this.searchRequest = null;
        this.namedEntityField = null;
        this.returnedFields = null;
        this.stopWordsMap = null;
        this.maxNumberOfWords = DEFAULT_MAX_NUMBER_OF_WORDS;
        this.tokenizer = DEFAULT_TOKENIZER;
    }

    /**
     * Copies the state of another request into this one. The returned fields
     * and the stop word map are copied into new collections, so later changes
     * on one request do not affect the other.
     *
     * @param request
     *            the request to copy; it is cast to
     *            {@link NamedEntityExtractionRequest}
     */
    @Override
    public void copyFrom(AbstractRequest request) {
        super.copyFrom(request);
        NamedEntityExtractionRequest source = (NamedEntityExtractionRequest) request;
        this.text = source.text;
        this.searchRequest = source.searchRequest;
        this.namedEntityField = source.namedEntityField;
        if (source.returnedFields == null) {
            this.returnedFields = null;
        } else {
            this.returnedFields = new TreeSet<String>(source.returnedFields);
        }
        if (source.stopWordsMap == null) {
            this.stopWordsMap = null;
        } else {
            this.stopWordsMap = new TreeMap<String, Boolean>(source.stopWordsMap);
        }
        this.maxNumberOfWords = source.maxNumberOfWords;
        this.tokenizer = source.tokenizer;
    }

    /**
     * Adds a field to the set of returned fields. Empty names are ignored.
     *
     * @param returnedField
     *            the name of the field to add
     */
    public void addReturnedField(String returnedField) {
        if (StringUtils.isEmpty(returnedField)) {
            return;
        }
        if (returnedFields == null) {
            returnedFields = new TreeSet<String>();
        }
        returnedFields.add(returnedField);
    }

    /**
     * Removes a field from the set of returned fields. Nothing happens when
     * the name is empty or when no field has been added yet.
     *
     * @param returnedField
     *            the name of the field to remove
     */
    public void removeReturnedField(String returnedField) {
        if (StringUtils.isEmpty(returnedField) || returnedFields == null) {
            return;
        }
        returnedFields.remove(returnedField);
    }

    /**
     * Registers a stop word list, replacing the flag of a list already
     * registered under the same name.
     * <p>
     * NOTE(review): the flag is named {@code ignoreCase} here but is read from
     * and written to the XML attribute {@code caseSensitive}; confirm which
     * meaning the {@link StopFilter} expects.
     *
     * @param listName
     *            the name of the stop word list; {@code null} is ignored
     * @param ignoreCase
     *            the flag stored with the list and handed to the stop filter
     */
    public void addStopWords(String listName, boolean ignoreCase) {
        if (listName == null) {
            return;
        }
        if (stopWordsMap == null) {
            stopWordsMap = new TreeMap<String, Boolean>();
        }
        stopWordsMap.put(listName, ignoreCase);
    }

    /**
     * Unregisters a stop word list. The map is dropped (set to {@code null})
     * once its last entry has been removed.
     *
     * @param listName
     *            the name of the stop word list; {@code null} is ignored
     */
    public void removeStopWords(String listName) {
        // The map is null by default and after the last removal: there is
        // nothing to remove in that case.
        if (listName == null || stopWordsMap == null) {
            return;
        }
        stopWordsMap.remove(listName);
        if (stopWordsMap.isEmpty()) {
            stopWordsMap = null;
        }
    }

    /**
     * @return the internal set of returned fields, or {@code null} when no
     *         field has been added
     */
    public Collection<String> getReturnedFields() {
        return returnedFields;
    }

    /**
     * Replaces the returned fields by the given names. Empty names are
     * skipped. A {@code null} argument leaves the request without any returned
     * field.
     *
     * @param returnedFields
     *            the names of the fields to return
     */
    public void setReturnedFields(Collection<String> returnedFields) {
        // The new set is built apart from the current one: the current set is
        // null by default, and the argument may be the very collection
        // returned by getReturnedFields().
        Set<String> replacement = new TreeSet<String>();
        if (returnedFields != null) {
            for (String returnedField : returnedFields) {
                if (!StringUtils.isEmpty(returnedField)) {
                    replacement.add(returnedField);
                }
            }
        }
        this.returnedFields = replacement;
    }

    /**
     * Replaces the returned fields by the given names. Empty names are
     * skipped. A {@code null} argument leaves the request without any returned
     * field.
     *
     * @param returnedFields
     *            the names of the fields to return
     */
    public void setReturnedFields(String[] returnedFields) {
        // Same approach as the Collection variant: never touch the current
        // set, which is null by default.
        Set<String> replacement = new TreeSet<String>();
        if (returnedFields != null) {
            for (String returnedField : returnedFields) {
                if (!StringUtils.isEmpty(returnedField)) {
                    replacement.add(returnedField);
                }
            }
        }
        this.returnedFields = replacement;
    }

    /**
     * This request type does not expose a Lucene query.
     *
     * @return always {@code null}
     */
    @Override
    public final Query getQuery() throws SearchLibException, IOException {
        return null;
    }

    /**
     * Loads the request from its XML node: the search request, named entity
     * field, maximum number of words and tokenizer attributes, the text, the
     * {@code returnedField} nodes and the {@code stopWords} nodes.
     * <p>
     * The text is read from the {@code text} child node when there is one,
     * otherwise from the request node itself.
     *
     * @param config
     *            the configuration passed to the parent implementation
     * @param xpp
     *            the XPath parser passed to the parent implementation
     * @param requestNode
     *            the XML node describing the request
     */
    @Override
    public void fromXmlConfigNoLock(Config config, XPathParser xpp, Node requestNode) throws XPathExpressionException,
            DOMException, ParseException, InstantiationException, IllegalAccessException, ClassNotFoundException {
        super.fromXmlConfigNoLock(config, xpp, requestNode);

        searchRequest = DomUtils.getAttributeText(requestNode, ATTR_SEARCH_REQUEST);
        namedEntityField = DomUtils.getAttributeText(requestNode, ATTR_NAMED_ENTITY_FIELD);
        maxNumberOfWords = DomUtils.getAttributeInteger(requestNode, ATTR_MAX_NUMBER_OF_WORDS,
                DEFAULT_MAX_NUMBER_OF_WORDS);
        tokenizer = DomUtils.getAttributeText(requestNode, ATTR_TOKENIZER, DEFAULT_TOKENIZER);

        Node textNode = DomUtils.getFirstNode(requestNode, NODE_TEXT);
        text = DomUtils.getText(textNode != null ? textNode : requestNode);

        List<Node> returnedFieldNodes = DomUtils.getNodes(requestNode, NODE_RETURNED_FIELD);
        if (returnedFieldNodes != null) {
            for (Node returnedFieldNode : returnedFieldNodes) {
                addReturnedField(DomUtils.getAttributeText(returnedFieldNode, ATTR_NAME_FIELD));
            }
        }

        List<Node> stopWordsNodes = DomUtils.getNodes(requestNode, NODE_NAME_STOPWORDS_LIST);
        if (stopWordsNodes != null) {
            for (Node stopWordsNode : stopWordsNodes) {
                String listName = DomUtils.getAttributeText(stopWordsNode, ATTR_STOPWORDS_LISTNAME);
                boolean flag = DomUtils.getAttributeBoolean(stopWordsNode, ATTR_STOPWORDS_CASESENSITIVE, true);
                addStopWords(listName, flag);
            }
        }
    }

    /**
     * Writes the request as an XML element, under the read lock: the
     * attributes first, then one element per returned field, one element per
     * stop word list and, when the text is not empty, a text element.
     *
     * @param xmlWriter
     *            the writer receiving the XML
     * @throws SAXException
     *             propagated from the writer
     */
    @Override
    public void writeXmlConfig(XmlWriter xmlWriter) throws SAXException {
        rwl.r.lock();
        try {
            xmlWriter.startElement(XML_NODE_REQUEST, XML_ATTR_NAME, getRequestName(), XML_ATTR_TYPE, getType().name(),
                    ATTR_SEARCH_REQUEST, searchRequest, ATTR_NAMED_ENTITY_FIELD, namedEntityField,
                    ATTR_MAX_NUMBER_OF_WORDS, Integer.toString(maxNumberOfWords), ATTR_TOKENIZER, tokenizer);

            if (returnedFields != null) {
                for (String returnedField : returnedFields) {
                    xmlWriter.startElement(NODE_RETURNED_FIELD, ATTR_NAME_FIELD, returnedField);
                    xmlWriter.endElement();
                }
            }

            if (stopWordsMap != null) {
                for (Map.Entry<String, Boolean> stopWords : stopWordsMap.entrySet()) {
                    xmlWriter.startElement(NODE_NAME_STOPWORDS_LIST, ATTR_STOPWORDS_LISTNAME, stopWords.getKey(),
                            ATTR_STOPWORDS_CASESENSITIVE, stopWords.getValue().toString());
                    xmlWriter.endElement();
                }
            }

            if (!StringUtils.isEmpty(text)) {
                xmlWriter.startElement(NODE_TEXT);
                xmlWriter.textNode(text);
                xmlWriter.endElement();
            }

            // Closes the request element opened at the top.
            xmlWriter.endElement();
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Overrides the request with the parameters found in the transaction.
     * Each parameter name is the given prefix followed by one of
     * {@code text}, {@code searchRequest}, {@code namedEntityField},
     * {@code stopWordList} or {@code maxNumberOfWords}. A parameter which is
     * absent leaves the matching setting untouched.
     *
     * @param transaction
     *            the transaction providing the parameters
     * @param prefix
     *            the prefix prepended to each parameter name
     */
    @Override
    public final void setFromServletNoLock(final ServletTransaction transaction, final String prefix) {
        String textParam = transaction.getParameterString(StringUtils.fastConcat(prefix, "text"));
        if (textParam != null) {
            text = textParam;
        }

        String searchRequestParam = transaction.getParameterString(StringUtils.fastConcat(prefix, "searchRequest"));
        if (searchRequestParam != null) {
            searchRequest = searchRequestParam;
        }

        String namedEntityFieldParam = transaction
                .getParameterString(StringUtils.fastConcat(prefix, "namedEntityField"));
        if (namedEntityFieldParam != null) {
            namedEntityField = namedEntityFieldParam;
        }

        String stopWordListParam = transaction.getParameterString(StringUtils.fastConcat(prefix, "stopWordList"));
        if (stopWordListParam != null) {
            // Goes through addStopWords because the map is null until a first
            // list is registered.
            addStopWords(stopWordListParam, true);
        }

        Integer maxNumberOfWordsParam = transaction
                .getParameterInteger(StringUtils.fastConcat(prefix, "maxNumberOfWords"));
        if (maxNumberOfWordsParam != null) {
            maxNumberOfWords = maxNumberOfWordsParam;
        }
    }

    /**
     * Nothing to reset for this request type.
     */
    @Override
    protected void resetNoLock() {
    }

    /**
     * Builds the filter chain applied after the tokenizer, in this order: a
     * {@link ShingleFilter}, the token position de-duplication filter, one
     * {@link StopFilter} per registered stop word list, an
     * {@link IndexLookupFilter} and a {@link RemoveIncludedTermFilter}.
     * <p>
     * Side effect: the named entity field is added to the returned fields of
     * this request before they are handed to the index lookup filter.
     *
     * @param dtpf
     *            the de-duplication filter to put in the chain, or
     *            {@code null} to have a new one created
     * @return the ordered list of filters
     * @throws SearchLibException
     *             propagated from the filter creation
     */
    public List<FilterFactory> getFilterList(DeduplicateTokenPositionsFilter dtpf) throws SearchLibException {
        List<FilterFactory> filters = new ArrayList<FilterFactory>(10);

        ShingleFilter shingleFilter = FilterFactory.create(config, ShingleFilter.class);
        shingleFilter.setProperties(" ", 1, maxNumberOfWords);
        filters.add(shingleFilter);

        if (dtpf == null) {
            dtpf = FilterFactory.create(config, DeduplicateTokenPositionsFilter.class);
        }
        filters.add(dtpf);

        if (stopWordsMap != null) {
            for (Map.Entry<String, Boolean> stopWords : stopWordsMap.entrySet()) {
                StopFilter stopFilter = FilterFactory.create(config, StopFilter.class);
                stopFilter.setProperties(stopWords.getKey(), stopWords.getValue());
                filters.add(stopFilter);
            }
        }

        IndexLookupFilter indexLookupFilter = FilterFactory.create(config, IndexLookupFilter.class);
        // The named entity field must be part of the returned fields; it is
        // not added when the field was loaded from XML or from the servlet.
        addReturnedField(namedEntityField);
        indexLookupFilter.setProperties(config.getIndexName(), searchRequest, namedEntityField,
                StringUtils.join(returnedFields, '|'));
        filters.add(indexLookupFilter);

        RemoveIncludedTermFilter removeIncludedTermFilter = FilterFactory.create(config,
                RemoveIncludedTermFilter.class);
        removeIncludedTermFilter.setProperties(namedEntityField, true);
        filters.add(removeIncludedTermFilter);

        return filters;
    }

    /**
     * Executes the extraction: checks that the configured search request
     * exists, builds an analyzer from the tokenizer and the filter chain of
     * {@link #getFilterList(DeduplicateTokenPositionsFilter)}, feeds the text
     * to the query analyzer and resolves the positions in the result.
     *
     * @param reader
     *            not used by this implementation
     * @return the populated {@link ResultNamedEntityExtraction}
     * @throws SearchLibException
     *             if the search request is not found, or wrapping any
     *             {@link IOException} or {@link ClassNotFoundException}
     */
    @Override
    public AbstractResult<AbstractRequest> execute(ReaderInterface reader) throws SearchLibException {
        try {
            AbstractSearchRequest abstractSearchRequest = (AbstractSearchRequest) config.getNewRequest(searchRequest);
            if (abstractSearchRequest == null) {
                throw new SearchLibException("Request not found: " + searchRequest);
            }
            // NOTE(review): this set is populated but never read afterwards;
            // the call is kept as is to preserve the existing behavior.
            LinkedHashSet<String> fieldNameSet = new LinkedHashSet<String>();
            abstractSearchRequest.getReturnFieldList().populate(fieldNameSet);

            ResultNamedEntityExtraction result = new ResultNamedEntityExtraction(this);

            // The same filter instance is put in the chain and queried for its
            // last token map once the text has been processed.
            DeduplicateTokenPositionsFilter dtpf = FilterFactory.create(config, DeduplicateTokenPositionsFilter.class);

            Analyzer analyzer = new Analyzer(config);
            analyzer.setIndexTokenizer(TokenizerFactory.create(config, tokenizer));
            analyzer.setQueryTokenizer(TokenizerFactory.create(config, tokenizer));
            analyzer.add(getFilterList(dtpf));
            analyzer.getQueryAnalyzer().populate(text, result);

            result.resolvePositions(namedEntityField, dtpf.getLastTokenMap(), text);
            return result;
        } catch (IOException e) {
            throw new SearchLibException(e);
        } catch (ClassNotFoundException e) {
            throw new SearchLibException(e);
        }
    }

    /**
     * Builds, under the read lock, a one line summary made of the search
     * request name, the named entity field and the number of stop word lists.
     *
     * @return the summary
     */
    @Override
    public String getInfo() {
        rwl.r.lock();
        try {
            StringBuilder info = new StringBuilder();
            info.append("SearchRequest:");
            if (searchRequest != null) {
                info.append(searchRequest);
            }
            info.append(" - NamedEntityField:");
            if (namedEntityField != null) {
                info.append(namedEntityField);
            }
            info.append(" - StopWordsList:");
            if (stopWordsMap != null) {
                info.append(stopWordsMap.size());
            }
            return info.toString();
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * @return the text
     */
    public String getText() {
        return text;
    }

    /**
     * @param text
     *            the text to set
     */
    public void setText(String text) {
        this.text = text;
    }

    /**
     * @return the searchRequest
     */
    public String getSearchRequest() {
        return searchRequest;
    }

    /**
     * @param searchRequest
     *            the searchRequest to set
     */
    public void setSearchRequest(String searchRequest) {
        this.searchRequest = searchRequest;
    }

    /**
     * @return the namedEntityField
     */
    public String getNamedEntityField() {
        return namedEntityField;
    }

    /**
     * Sets the named entity field and also adds it to the returned fields.
     *
     * @param namedEntityField
     *            the namedEntityField to set
     */
    public void setNamedEntityField(String namedEntityField) {
        this.namedEntityField = namedEntityField;
        addReturnedField(namedEntityField);
    }

    /**
     * @return the internal stopWordsMap, or {@code null} when no stop word
     *         list is registered
     */
    public Map<String, Boolean> getStopWordsMap() {
        return stopWordsMap;
    }

    /**
     * @return the maxNumberOfWords
     */
    public int getMaxNumberOfWords() {
        return maxNumberOfWords;
    }

    /**
     * @param maxNumberOfWords
     *            the maxNumberOfWords to set
     */
    public void setMaxNumberOfWords(int maxNumberOfWords) {
        this.maxNumberOfWords = maxNumberOfWords;
    }

    /**
     * @return the tokenizer
     */
    public String getTokenizer() {
        return tokenizer;
    }

    /**
     * @param tokenizer
     *            the tokenizer to set
     */
    public void setTokenizer(String tokenizer) {
        this.tokenizer = tokenizer;
    }
}