/**
* OpenSearchConnector
* Copyright 2012 by Michael Peter Christen
* First released 03.11.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.opensearch;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map.Entry;
import java.util.Properties;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.AbstractFederateSearchConnector;
import net.yacy.cora.federate.FederateSearchConnector;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
/**
* Handles queries to remote OpenSearch systems. Iterates over a list of
* configured systems until the needed number of results is available.
*/
public class OpenSearchConnector extends AbstractFederateSearchConnector implements FederateSearchConnector {
/**
* HTML mapping properties used to retrieve results from HTML when the results
* are not provided as a standard RSS/Atom feed but as simple HTML.
* The "_result" and "_sku" entries hold the selectors used to locate each result
* node and its link. Every other entry whose key does not start with "_" maps a
* schema field name (key) to the selector (value) applied to a result node.
*/
private Properties htmlMapping;
/**
 * Builds the name of the HTML mapping configuration file of an OpenSearch instance.
 *
 * @param instanceName open search instance name
 * @return the instance name followed by the {@code .html.map.properties} suffix
 */
public static String htmlMappingFileName(final String instanceName) {
    final String mappingFileSuffix = ".html.map.properties";
    return instanceName + mappingFileSuffix;
}
/**
 * Creates a connector for the given OpenSearch URL template, starting with
 * an empty HTML mapping.
 *
 * @param urlTemplate OpenSearch URL template
 */
public OpenSearchConnector(final String urlTemplate) {
    super();
    this.htmlMapping = new Properties();
    this.baseurl = urlTemplate;
}
/**
 * Initializes this connector with its instance name and, optionally, an HTML
 * mapping file. Any previously loaded mapping is discarded first. A mapping
 * file that cannot be read or parsed is reported in the log and does not make
 * the initialization fail.
 *
 * @param name instance name of this connector
 * @param cfgFileName path of the HTML mapping properties file. May be null or
 *            empty : in that case no mapping is loaded.
 * @return always true
 */
@Override
public boolean init(final String name, final String cfgFileName) {
    this.instancename = name;
    this.localcfg = null;
    this.htmlMapping.clear();
    if (cfgFileName == null || cfgFileName.isEmpty()) {
        return true;
    }
    /* try-with-resources guarantees the stream is closed, even when loading fails */
    try (InputStream cfgFileStream = new BufferedInputStream(new FileInputStream(cfgFileName))) {
        this.htmlMapping.load(cfgFileStream);
    } catch (IOException | IllegalArgumentException e) {
        /*
         * Properties.load() throws an IllegalArgumentException on a malformed
         * Unicode escape : a broken mapping file must not break the connector.
         */
        ConcurrentLog.config("OpenSearchConnector." + this.instancename, "Error reading html mapping file : " + cfgFileName, e);
    }
    return true;
}
/**
 * Replaces the OpenSearch description search template parameters with actual values.
 * Recognized parameters are substituted whether or not they carry the optional
 * marker ('{param?}'). Optional parameters that are not recognized are replaced
 * with the empty string, so that no raw placeholder is left in the request URL.
 *
 * @param searchurltemplate OpenSearch URL template. Must not be null.
 * @param query search terms, inserted as is. Must not be null.
 * @param start value of the {startIndex} parameter
 * @param rows value of the {count} parameter
 * @return the search URL with its template parameters replaced
 */
private String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) {
    final String[] knownParameters = { "searchTerms", "startIndex", "startPage", "count", "language",
            "inputEncoding", "outputEncoding" };
    String tmps = searchurltemplate;
    /* some optional parameters may include question mark '{param?}=' : normalize the ones we can fill */
    for (final String parameter : knownParameters) {
        tmps = tmps.replace("{" + parameter + "?}", "{" + parameter + "}");
    }
    /* remaining optional parameters are unknown to us : replace them with the empty string */
    tmps = tmps.replaceAll("\\{[^{}]*\\?\\}", "");
    tmps = tmps.replace("{startIndex}", Integer.toString(start));
    tmps = tmps.replace("{startPage}", "");
    tmps = tmps.replace("{count}", Integer.toString(rows));
    tmps = tmps.replace("{language}", "");
    tmps = tmps.replace("{inputEncoding}", StandardCharsets.UTF_8.name());
    tmps = tmps.replace("{outputEncoding}", StandardCharsets.UTF_8.name());
    /* search terms go last so that placeholders appearing in the terms themselves are never substituted */
    return tmps.replace("{searchTerms}", query);
}
/**
 * Converts an html link element to a result document.
 *
 * @param linkElement html link result node. Must not be null.
 * @return a {@link URIMetadataNode} instance built from the html link element,
 *         or null when the absolute URL of its href attribute is empty or malformed
 */
protected URIMetadataNode htmlLinkToMetadataNode(Element linkElement) {
    final String absoluteURL = linkElement.absUrl("href");
    if (absoluteURL.isEmpty()) {
        return null;
    }
    URIMetadataNode node = null;
    try {
        final DigestURL url = new DigestURL(absoluteURL);
        node = new URIMetadataNode(url);

        /* The link text is only a default title : an explicit "title" mapping takes precedence. */
        final boolean titleIsMapped = this.htmlMapping.containsKey("title");
        if (!titleIsMapped && linkElement.hasText()) {
            node.setField(CollectionSchema.title.getSolrFieldName(), linkElement.text());
        }

        final String linkLanguage = linkElement.attr("hreflang");
        if (!(linkLanguage == null || linkLanguage.isEmpty())) {
            node.setField(CollectionSchema.language_s.getSolrFieldName(), linkLanguage);
        }

        final String mimeType = TextParser.mimeOf(url);
        if (mimeType != null) {
            node.setField(CollectionSchema.content_type.getSolrFieldName(), mimeType);
        }

        /* The "dht" collection differentiates metadata from full crawl data in the index. */
        node.setField(CollectionSchema.collection_sxt.getSolrFieldName(), "dht");
    } catch (MalformedURLException e) {
        ConcurrentLog.fine("OpenSearchConnector." + this.instancename, "Malformed url : " + absoluteURL);
    }
    return node;
}
/**
 * Extracts results from an HTML result stream, using the html mapping properties.
 * The "_result" selector locates each result node, the "_sku" selector locates
 * the link of a result inside its node.
 * Important : it is the responsibility of the caller to close the stream.
 *
 * @param resultStream HTML stream containing OpenSearch results. Must not be null.
 * @param charsetName character set name. May be null : in that case the eventual {@code http-equiv} meta tag will be used.
 * @return a list of URI nodes, eventually empty.
 * @throws IOException when a read/write exception occurred
 */
protected List<URIMetadataNode> parseHTMLResult(InputStream resultStream, String charsetName) throws IOException {
    final List<URIMetadataNode> found = new ArrayList<>();
    final String resultSelector = this.htmlMapping.getProperty("_result");
    final String skuSelector = this.htmlMapping.getProperty("_sku");
    if (resultSelector == null || skuSelector == null) {
        ConcurrentLog.warn("OpenSearchConnector." + this.instancename, "HTML mapping is incomplete!");
        return found;
    }

    final Document page = Jsoup.parse(resultStream, charsetName, this.baseurl);
    for (final Element resultNode : page.select(resultSelector)) {
        final Elements skuCandidates = resultNode.select(skuSelector);
        if (skuCandidates.isEmpty()) {
            continue;
        }

        Element linkNode = skuCandidates.first();
        if (!"a".equals(linkNode.tagName())) {
            /* The selector may refer to a node with link(s) inside */
            final Elements innerLinks = linkNode.select("a[href]");
            if (!innerLinks.isEmpty()) {
                linkNode = innerLinks.first();
            }
        }
        if (!linkNode.hasAttr("href")) {
            continue;
        }

        final URIMetadataNode resultDoc = htmlLinkToMetadataNode(linkNode);
        if (resultDoc == null) {
            continue;
        }
        /* Let's handle other field mappings */
        htmlResultToFields(resultNode, resultDoc);
        found.add(resultDoc);
    }
    return found;
}
/**
 * Performs the mapping from an HTML result node to YaCy fields using the htmlMapping configuration.
 * Entries whose key starts with "_" are skipped, as well as entries whose key is
 * not a {@link CollectionSchema} field or, when a Switchboard index is available,
 * is not contained in its default configuration.
 * Multi-valued fields receive the text of every selected node, single-valued
 * fields the text of the first one. Empty texts are ignored.
 *
 * @param resultNode html single result node
 * @param newdoc result document to fill
 */
private void htmlResultToFields(Element resultNode, URIMetadataNode newdoc) {
    final Switchboard sb = Switchboard.getSwitchboard();
    for (final Entry<Object, Object> entry : this.htmlMapping.entrySet()) {
        if (!(entry.getKey() instanceof String) || !(entry.getValue() instanceof String)) {
            continue;
        }
        final String yacyFieldName = (String) entry.getKey();
        final String selector = (String) entry.getValue();
        if (yacyFieldName.startsWith("_")) {
            continue;
        }
        /* If Switchboard environment is set, check the index configuration has this field enabled */
        if (sb != null && sb.index != null
                && !sb.index.fulltext().getDefaultConfiguration().contains(yacyFieldName)) {
            continue;
        }

        /* Validate the field name before evaluating the selector : unknown fields cost nothing */
        final SchemaDeclaration est;
        try {
            est = CollectionSchema.valueOf(yacyFieldName);
        } catch (IllegalArgumentException e) {
            ConcurrentLog.config("OpenSearchConnector." + this.instancename,
                    "Ignored " + yacyFieldName + " field mapping : not a field of this schema.");
            continue;
        }
        /* Use the Solr field name of the schema entry, as done everywhere else in this class */
        final String solrFieldName = est.getSolrFieldName();

        final Elements nodes = resultNode.select(selector);
        if (nodes.isEmpty()) {
            continue;
        }

        if (est.isMultiValued()) {
            for (final Element node : nodes) {
                final String value = node.text();
                if (!value.isEmpty()) {
                    newdoc.addField(solrFieldName, value);
                }
            }
            continue;
        }

        final String value = nodes.first().text();
        if (value.isEmpty()) {
            continue;
        }
        /* Perform eventual type conversion */
        if (est.getType() == SolrType.num_integer) {
            try {
                newdoc.setField(solrFieldName, Integer.parseInt(value));
            } catch (NumberFormatException ex) {
                ConcurrentLog.fine("OpenSearchConnector." + this.instancename,
                        "Ignored " + yacyFieldName + " value : not an integer : " + value);
            }
        } else {
            newdoc.setField(solrFieldName, value);
        }
    }
}
/**
 * Queries the remote system with the search terms of the given query and
 * returns the result list. Delegates to {@link #query(String, int, int)},
 * starting at index 0 and asking for as many results as the query has items per page.
 *
 * @param query the search query parameters. Must not be null.
 * @return query results (metadata) with fields according to YaCy schema
 */
@Override
public List<URIMetadataNode> query(QueryParams query) {
    final String searchTerms = query.getQueryGoal().getQueryString(false);
    final int firstIndex = 0;
    return query(searchTerms, firstIndex, query.itemsPerPage);
}
/**
 * Queries the remote system at baseurl with the specified search terms.
 * A response received with the text/html mime type is parsed with the
 * configured HTML mapping. Any other response is parsed as an RSS or Atom feed.
 * The last access time of this connector is updated just before the request is sent.
 *
 * @param searchTerms search terms
 * @param startIndex index offset
 * @param count maximum results number
 * @return a result list, eventually empty when no results were found or when an error occurred
 */
public List<URIMetadataNode> query(final String searchTerms, final int startIndex, final int count) {
    List<URIMetadataNode> docs = new ArrayList<URIMetadataNode>();
    // see http://www.loc.gov/standards/sru/
    final String searchurl = this.parseSearchTemplate(baseurl, searchTerms, startIndex, count);

    final DigestURL aurl;
    try {
        aurl = new DigestURL(searchurl);
    } catch (MalformedURLException ee) {
        ConcurrentLog.warn("OpenSearchConnector." + this.instancename, "malformed url " + searchurl);
        return docs;
    }

    try {
        this.lastaccesstime = System.currentTimeMillis();
        final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
        final byte[] result = httpClient.GETbytes(aurl, null, null, false);
        if (result == null) {
            String details = "";
            if (httpClient.getHttpResponse() != null && httpClient.getHttpResponse().getStatusLine() != null) {
                details = " HTTP status code : " + httpClient.getStatusCode();
            }
            throw new IOException("Could not get a response." + details);
        }

        if ("text/html".equals(httpClient.getMimeType())) {
            if (this.htmlMapping.isEmpty()) {
                ConcurrentLog.warn("OpenSearchConnector." + this.instancename, "Received HTML result but mapping is not configured!");
            } else {
                /*
                 * Result was received as html : let's try to use the
                 * provided mapping to retrieve results from HTML
                 */
                docs = parseHTMLResult(new ByteArrayInputStream(result), httpClient.getCharacterEncoding());
            }
        } else {
            /* Other mime types or unknown : let's try to parse the result as RSS or Atom Feed */
            final RSSReader rssReader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
            final RSSFeed feed = rssReader == null ? null : rssReader.getFeed();
            if (feed != null) {
                for (final RSSMessage item : feed) {
                    try {
                        docs.add(rssItemToMetadataNode(item));
                    } catch (final MalformedURLException e) {
                        /* One unusable link must not discard the other results : report and go on */
                        ConcurrentLog.fine("OpenSearchConnector." + this.instancename, "Ignored result with malformed url : " + item.getLink());
                    }
                }
                ConcurrentLog.info("OpenSearchConnector." + this.instancename, "received " + docs.size() + " results from " + this.instancename);
            }
        }
    } catch (IOException ex) {
        ConcurrentLog.logException(ex);
        ConcurrentLog.info("OpenSearchConnector." + this.instancename, "no connection to " + searchurl);
    }
    return docs;
}

/**
 * Converts one RSS/Atom feed item to a result document.
 *
 * @param item feed item. Must not be null.
 * @return a new document filled with the item metadata
 * @throws MalformedURLException when the item link is not a valid URL
 */
private URIMetadataNode rssItemToMetadataNode(final RSSMessage item) throws MalformedURLException {
    final DigestURL uri = new DigestURL(item.getLink());
    final URIMetadataNode doc = new URIMetadataNode(uri);
    doc.setField(CollectionSchema.charset_s.getSolrFieldName(), StandardCharsets.UTF_8.name());
    doc.setField(CollectionSchema.author.getSolrFieldName(), item.getAuthor());
    doc.setField(CollectionSchema.title.getSolrFieldName(), item.getTitle());
    doc.setField(CollectionSchema.language_s.getSolrFieldName(), item.getLanguage());
    doc.setField(CollectionSchema.last_modified.getSolrFieldName(), item.getPubDate());

    final String mime = TextParser.mimeOf(uri);
    if (mime != null) {
        doc.setField(CollectionSchema.content_type.getSolrFieldName(), mime);
    }

    String keywords = Arrays.toString(item.getSubject());
    if (!item.getCategory().isEmpty()) {
        keywords = keywords + " " + item.getCategory();
    }
    doc.setField(CollectionSchema.keywords.getSolrFieldName(), keywords);

    doc.setField(CollectionSchema.publisher_t.getSolrFieldName(), item.getCopyright());
    // we likely got only a search related snippet (take it as text content)
    doc.setField(CollectionSchema.text_t.getSolrFieldName(), item.getDescriptions());
    // add collection "dht" which is used to differentiate metadata from full crawl data in the index
    doc.setField(CollectionSchema.collection_sxt.getSolrFieldName(), "dht");

    if (item.getLat() != 0.0 && item.getLon() != 0.0) {
        doc.setField(CollectionSchema.coordinate_p.getSolrFieldName(), item.getLat() + "," + item.getLon());
    }
    if (item.getSize() > 0) {
        doc.setField(CollectionSchema.size_i.getSolrFieldName(), item.getSize());
    }
    return doc;
}
/**
 * Main procedure : can be used to test results retrieval from an open search system.
 * Prints the title, sku and description of up to 20 results, then shuts down
 * the resources started while querying.
 *
 * @param args main arguments list:
 * <ol>
 * <li>OpenSearch URL template (required)</li>
 * <li>Search term (required)</li>
 * <li>Html mapping file path (optional)</li>
 * </ol>
 */
public static void main(String[] args) {
    try {
        if (args.length < 2) {
            System.out.println("Usage : java " + OpenSearchConnector.class.getCanonicalName()
                    + " <templateURL> <\"searchTerms\"> [htmlMappingFile]");
            return;
        }
        final OpenSearchConnector connector = new OpenSearchConnector(args[0]);
        final String htmlMappingFile = args.length > 2 ? args[2] : null;
        connector.init("testConnector", htmlMappingFile);

        String searchTerms = args[1];
        /* Strip the surrounding double quotes eventually kept by the shell */
        if (searchTerms.length() > 2 && searchTerms.startsWith("\"") && searchTerms.endsWith("\"")) {
            searchTerms = searchTerms.substring(1, searchTerms.length() - 1);
        }

        final List<URIMetadataNode> docs = connector.query(searchTerms, 0, 20);
        if (docs.isEmpty()) {
            System.out.println("No results");
        } else {
            for (final URIMetadataNode doc : docs) {
                System.out.println("title : " + doc.getFieldValue(CollectionSchema.title.getSolrFieldName()));
                System.out.println("sku : " + doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
                System.out.println(
                        "Description : " + doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()) + "\n");
            }
        }
    } finally {
        /* Shutdown running threads */
        Domains.close();
        boolean interrupted = false;
        try {
            HTTPClient.closeConnectionManager();
        } catch (final InterruptedException e) {
            interrupted = true;
        }
        ConcurrentLog.shutdown();
        if (interrupted) {
            /* Restore the interrupt status only once the cleanup is complete */
            Thread.currentThread().interrupt();
        }
    }
}
}