/** * OpenSearchConnector * Copyright 2012 by Michael Peter Christen * First released 03.11.2012 at http://yacy.net * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see <http://www.gnu.org/licenses/>. */ package net.yacy.cora.federate.opensearch; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map.Entry; import java.util.Properties; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.document.feed.RSSReader; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.AbstractFederateSearchConnector; import net.yacy.cora.federate.FederateSearchConnector; import net.yacy.cora.federate.solr.SchemaDeclaration; import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.search.Switchboard; import net.yacy.search.query.QueryParams; import net.yacy.search.schema.CollectionSchema; /** * Handling of queries to remote OpenSearch systems. Iterates to a list of * configured systems until number of needed results are available. */ public class OpenSearchConnector extends AbstractFederateSearchConnector implements FederateSearchConnector { /** * HTML mapping properties used to retrieve result from HTML when the results * are not provided as a standard RSS/Atom feed but as simple HTML. */ private Properties htmlMapping; /** * @param instanceName open search instance name * @return the html mapping configuration file name derived from the instance name */ public static String htmlMappingFileName(final String instanceName) { return instanceName + ".html.map.properties"; } /** * @param urlTemplate OpenSearch URL template */ public OpenSearchConnector(final String urlTemplate) { super(); this.baseurl = urlTemplate; this.htmlMapping = new Properties(); } @Override public boolean init(final String name, final String cfgFileName) { this.instancename = name; this.localcfg = null; this.htmlMapping.clear(); if (cfgFileName != null && !cfgFileName.isEmpty()) { BufferedInputStream cfgFileStream = null; try { cfgFileStream = new BufferedInputStream(new FileInputStream(cfgFileName)); this.htmlMapping.load(cfgFileStream); } catch (IOException e) { ConcurrentLog.config("OpenSearchConnector." + this.instancename, "Error reading html mapping file : " + cfgFileName, e); } finally { if (cfgFileStream != null) { try { cfgFileStream.close(); } catch (IOException e) { ConcurrentLog.config("OpenSearchConnector." + this.instancename, "Error closing html mapping file : " + cfgFileName, e); } } } } return true; } /** * replace Opensearchdescription search template parameter with actual values */ private String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) { String tmps = searchurltemplate.replaceAll("\\?}", "}"); // some optional parameters may include question mark '{param?}=' tmps = tmps.replace("{startIndex}", Integer.toString(start)); tmps = tmps.replace("{startPage}", ""); tmps = tmps.replace("{count}", Integer.toString(rows)); tmps = tmps.replace("{language}", ""); tmps = tmps.replace("{inputEncoding}", StandardCharsets.UTF_8.name()); tmps = tmps.replace("{outputEncoding}", StandardCharsets.UTF_8.name()); return tmps.replace("{searchTerms}", query); } /** * @param linkElement html link result node. Must not be null. * @return and {@link URIMetadataNode} instance from the html link element or null when minimum required information is missing or malformed */ protected URIMetadataNode htmlLinkToMetadataNode(Element linkElement) { URIMetadataNode doc = null; String absoluteURL = linkElement.absUrl("href"); try { if (!absoluteURL.isEmpty()) { DigestURL uri = new DigestURL(absoluteURL); doc = new URIMetadataNode(uri); if(linkElement.hasText() && !this.htmlMapping.containsKey("title")) { /* Let's use the link text as default title when no mapping is defined.*/ doc.setField(CollectionSchema.title.getSolrFieldName(), linkElement.text()); } String targetLang = linkElement.attr("hreflang"); if(targetLang != null && !targetLang.isEmpty()) { doc.setField(CollectionSchema.language_s.getSolrFieldName(), targetLang); } final String mime = TextParser.mimeOf(uri); if (mime != null) { doc.setField(CollectionSchema.content_type.getSolrFieldName(), mime); } /* * add collection "dht" which is used to differentiate metadata * from full crawl data in the index */ doc.setField(CollectionSchema.collection_sxt.getSolrFieldName(), "dht"); } } catch (MalformedURLException e) { ConcurrentLog.fine("OpenSearchConnector." + this.instancename, "Malformed url : " + absoluteURL); } return doc; } /** * Extract results from the HTML result stream, using the html mapping properties. * Important : it is the responsibility of the caller to close the stream. * @param resultStream HTML stream containing OpenSearch results. Must not be null. * @param charsetName characters set name. May be null : in that case the eventual {@code http-equiv} meta tag will be used. * @return a list of URI nodes, eventually empty. * @throws IOException when a read/write exception occurred */ protected List<URIMetadataNode> parseHTMLResult(InputStream resultStream, String charsetName) throws IOException { List<URIMetadataNode> docs = new ArrayList<>(); String resultSelector = this.htmlMapping.getProperty("_result"); String skuSelector = this.htmlMapping.getProperty("_sku"); if (resultSelector == null || skuSelector == null) { ConcurrentLog.warn("OpenSearchConnector." + this.instancename, "HTML mapping is incomplete!"); return docs; } Document jsoupDoc = Jsoup.parse(resultStream, charsetName, this.baseurl); Elements results = jsoupDoc.select(resultSelector); for (Element result : results) { Elements skuNodes = result.select(skuSelector); if (!skuNodes.isEmpty()) { Element skuNode = skuNodes.first(); if (!"a".equals(skuNode.tagName())) { /* * The selector may refer to a node with link(s) inside */ Elements links = skuNode.select("a[href]"); if (!links.isEmpty()) { skuNode = links.first(); } } if (skuNode.hasAttr("href")) { URIMetadataNode newDoc = htmlLinkToMetadataNode(skuNode); if (newDoc != null) { /* Let's handle other field mappings */ htmlResultToFields(result, newDoc); docs.add(newDoc); } } } } return docs; } /** * Perform mapping from an HTML result node to YaCy fields using the htmlMapping configuration. * @param resultNode html single result node * @param newdoc result document to fill */ private void htmlResultToFields(Element resultNode, URIMetadataNode newdoc) { for (Entry<Object, Object> entry : this.htmlMapping.entrySet()) { if (entry.getKey() instanceof String && entry.getValue() instanceof String) { String yacyFieldName = (String) entry.getKey(); String selector = (String) entry.getValue(); if (!yacyFieldName.startsWith("_")) { /* If Switchboard environment is set, check the index configuration has this field enabled */ if (Switchboard.getSwitchboard() == null || Switchboard.getSwitchboard().index == null || Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration() .contains(yacyFieldName)) { Elements nodes = resultNode.select(selector); SchemaDeclaration est; try { est = CollectionSchema.valueOf(yacyFieldName); } catch(IllegalArgumentException e) { ConcurrentLog.config("OpenSearchConnector." + this.instancename, "Ignored " + yacyFieldName + " field mapping : not a field of this schema."); continue; } if (est.isMultiValued()) { if (!nodes.isEmpty()) { for (Element node : nodes) { String value = node.text(); if (!value.isEmpty()) { newdoc.addField(yacyFieldName, value); } } } } else { if (!nodes.isEmpty()) { Element node = nodes.first(); String value = node.text(); if (!value.isEmpty()) { /* Perform eventual type conversion */ try { if (est.getType() == SolrType.num_integer) { newdoc.setField(yacyFieldName, Integer.parseInt(value)); } else { newdoc.setField(yacyFieldName, value); } } catch (NumberFormatException ex) { continue; } } } } } } } } } /** * queries remote system and returns the resultlist (waits until results * transmitted or timeout) This is the main access routine used for the * search and query operation For internal access delay time, also the * this.lastaccessed time needs to be set here. * * @return query results (metadata) with fields according to YaCy schema */ @Override public List<URIMetadataNode> query(QueryParams query) { return query(query.getQueryGoal().getQueryString(false), 0, query.itemsPerPage); } /** * Query the remote system at baseurl with the specified search terms * @param searchTerms search terms * @param startIndex index offset * @param count maximum results number * @return a result list eventually empty when no results where found or when an error occured */ public List<URIMetadataNode> query(final String searchTerms, final int startIndex, final int count) { List<URIMetadataNode> docs = new ArrayList<URIMetadataNode>(); // see http://www.loc.gov/standards/sru/ String searchurl = this.parseSearchTemplate(baseurl, searchTerms, startIndex, count); try { DigestURL aurl = new DigestURL(searchurl); try { this.lastaccesstime = System.currentTimeMillis(); final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent); byte[] result = httpClient.GETbytes(aurl, null, null, false); if(result == null) { String details; if(httpClient.getHttpResponse() != null && httpClient.getHttpResponse().getStatusLine() != null) { details = " HTTP status code : " + httpClient.getStatusCode(); } else { details = ""; } throw new IOException("Could not get a response." + details); } if("text/html".equals(httpClient.getMimeType())) { if (this.htmlMapping.isEmpty()) { ConcurrentLog.warn("OpenSearchConnector." + this.instancename, "Received HTML result but mapping is not configured!"); } else { /* * Result was received as html : let's try to use the * provided mapping to retrieve results from HTML */ docs = parseHTMLResult(new ByteArrayInputStream(result), httpClient.getCharacterEncoding()); } } else { /* Other mime types or unknown : let's try to parse the result as RSS or Atom Feed */ RSSReader rssReader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result); if (rssReader != null) { final RSSFeed feed = rssReader.getFeed(); if (feed != null) { for (final RSSMessage item : feed) { try { DigestURL uri = new DigestURL(item.getLink()); URIMetadataNode doc = new URIMetadataNode(uri); doc.setField(CollectionSchema.charset_s.getSolrFieldName(), StandardCharsets.UTF_8.name()); doc.setField(CollectionSchema.author.getSolrFieldName(), item.getAuthor()); doc.setField(CollectionSchema.title.getSolrFieldName(), item.getTitle()); doc.setField(CollectionSchema.language_s.getSolrFieldName(), item.getLanguage()); doc.setField(CollectionSchema.last_modified.getSolrFieldName(), item.getPubDate()); final String mime = TextParser.mimeOf(uri); if (mime != null) { doc.setField(CollectionSchema.content_type.getSolrFieldName(), mime); } if (item.getCategory().isEmpty()) { doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject())); } else { doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()) + " " + item.getCategory()); } doc.setField(CollectionSchema.publisher_t.getSolrFieldName(), item.getCopyright()); doc.setField(CollectionSchema.text_t.getSolrFieldName(), item.getDescriptions()); // we likely got only a search related snippet (take is as text content) // add collection "dht" which is used to differentiate metadata from full crawl data in the index doc.setField(CollectionSchema.collection_sxt.getSolrFieldName(), "dht"); if (item.getLat() != 0.0 && item.getLon() != 0.0) { doc.setField(CollectionSchema.coordinate_p.getSolrFieldName(), item.getLat() + "," + item.getLon()); } if (item.getSize() > 0) { doc.setField(CollectionSchema.size_i.getSolrFieldName(), item.getSize()); } docs.add(doc); } catch (final MalformedURLException e) { } } ConcurrentLog.info("OpenSearchConnector." + this.instancename, "received " + docs.size() + " results from " + this.instancename); } } } } catch (IOException ex) { ConcurrentLog.logException(ex); ConcurrentLog.info("OpenSearchConnector." + this.instancename, "no connection to " + searchurl); } } catch (MalformedURLException ee) { ConcurrentLog.warn("OpenSearchConnector." + this.instancename, "malformed url " + searchurl); } return docs; } /** * Main procedure : can be used to test results retrieval from an open search system * @param args main arguments list: * <ol> * <li>OpenSearch URL template (required)</li> * <li>Search term (required)</li> * <li>Html mapping file path (optional)</li> * </ol> */ public static void main(String args[]) { try { if (args.length < 2) { System.out.println("Usage : java " + OpenSearchConnector.class.getCanonicalName() + " <templateURL> <\"searchTerms\"> [htmlMappingFile]"); return; } OpenSearchConnector connector = new OpenSearchConnector(args[0]); String htmlMappingFile; if (args.length > 2) { htmlMappingFile = args[2]; } else { htmlMappingFile = null; } connector.init("testConnector", htmlMappingFile); String searchTerms = args[1]; if(searchTerms.length() > 2 && searchTerms.startsWith("\"") && searchTerms.endsWith("\"")) { searchTerms = searchTerms.substring(1, searchTerms.length() - 1); } List<URIMetadataNode> docs = connector.query(searchTerms, 0, 20); if (docs.isEmpty()) { System.out.println("No results"); } else { for (URIMetadataNode doc : docs) { System.out.println("title : " + doc.getFieldValue(CollectionSchema.title.getSolrFieldName())); System.out.println("sku : " + doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); System.out.println( "Description : " + doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()) + "\n"); } } } finally { /* Shutdown running threads */ Domains.close(); try { HTTPClient.closeConnectionManager(); } catch (final InterruptedException e) { } ConcurrentLog.shutdown(); } } }