/** * AbstractFederateSearchConnector.java * Copyright 2015 by Burkhard Buelte * First released 19.01.2015 at http://yacy.net * * This library is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt If not, see * <http://www.gnu.org/licenses/>. */ package net.yacy.cora.federate; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import javax.servlet.http.HttpServletResponse; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.SchemaDeclaration; import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.storage.Configuration; import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.search.Switchboard; import net.yacy.search.query.SearchEvent; import net.yacy.search.schema.CollectionSchema; import org.apache.solr.common.SolrDocument; /** * Base implementation class for Federated Search Connectors providing the basic * funcitonality to search none YaCy systems * <ul> * <li> init() to read config file * <li> toYaCySchema() to convert remote schema fields to YaCy internal schema * names, called by query() * <li> query() needs to be implemented in specific connectors * <li> search() call's query() in a thread and adds results to internal search request. * </ul> * Subclasses should/need to override query() and maybe toYaCySchema() if more * is needed as a basic field mapping */ abstract public class AbstractFederateSearchConnector implements FederateSearchConnector { public String instancename; // just a identifying name protected SchemaConfiguration localcfg; // the schema conversion cfg for each fieldname, yacyname = remote fieldname public long lastaccesstime = -1; // last time accessed, used for search delay calculation protected String baseurl; /** * Inits the connector with the remote field names and matches to yacy * schema and other specific settings from config file. Every connector * needs at least a query target (where to query) and some definition to * convert the remote serch result to the internal result presentation * (field mapping) * * @param instanceName internal name * @param cfgFileName e.g. DATA/SETTINGS/FEDERATECFG/instanceName.SCHEMA * @return true if success false if not */ @Override public boolean init(String instance, String cfgFileName) { this.instancename = instance; File instanceCfgFile = new File(cfgFileName); if (instanceCfgFile.exists()) { try { this.localcfg = new SchemaConfiguration(instanceCfgFile); } catch (IOException ex) { ConcurrentLog.config(this.instancename, "error reading schema " + cfgFileName); return false; } // mandatory to contain a mapping for "sku" or alternatively "cfg_skufieldname" for a conversion to a final url if (this.localcfg.contains(CollectionSchema.sku) || this.localcfg.contains("_skufieldname")) { return true; } ConcurrentLog.config(this.instancename, "mandatory mapping for sku or _skufieldname missing in " + cfgFileName); return false; } this.localcfg = null; return false; } /** * queries a remote system and adds the results to the searchevent and to * the crawler if addResultsToLocalIndex is true * * @param theSearch receiving the results */ @Override public void search(final SearchEvent theSearch) { final Thread job = new Thread() { @Override public void run() { Thread.currentThread().setName("heuristic:" + instancename); ConcurrentLog.info("YACY SEARCH (federated)", "Send search query to " + instancename); theSearch.oneFeederStarted(); List<URIMetadataNode> doclist = query(theSearch.getQuery()); if (doclist != null) { ConcurrentLog.info("YACY SEARCH (federated)", "Got " + doclist.size() + " documents from " + instancename); Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); // add nodes doesn't allow null Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(); // add nodes doesn't allow null theSearch.addNodes(doclist, facets, snippets, false, instancename, doclist.size()); for (URIMetadataNode doc : doclist) { theSearch.addHeuristic(doc.hash(), instancename, false); } } else { ConcurrentLog.info("YACY SEARCH (federated)", "Got no results from " + instancename); } // that's all we need to display serach result theSearch.oneFeederTerminated(); // optional: add to crawler to get the full resource (later) if (doclist != null && !doclist.isEmpty() && theSearch.addResultsToLocalIndex) { Collection<DigestURL> urls = new ArrayList<DigestURL>(); for (URIMetadataNode doc : doclist) { urls.add(doc.url()); } Switchboard.getSwitchboard().addToCrawler(urls, false); } } }; job.start(); } /** * Converts a remote schema result to YaCy schema using the fieldname * mapping provided as config file * * @param remote result (with remote fieldnames) * @return SolrDocument with field names according to the YaCy schema */ protected URIMetadataNode toYaCySchema(final SolrDocument doc) throws MalformedURLException { // set YaCy id String urlstr; if (localcfg.contains("sku")) { urlstr = (String) doc.getFieldValue(localcfg.get("sku").getValue()); } else { urlstr = (String) doc.getFieldValue(localcfg.get("_skufieldname").getValue()); if (this.localcfg.contains("_skuprefix")) { String skuprefix = this.localcfg.get("_skuprefix").getValue(); urlstr = skuprefix + urlstr; } } final DigestURL url = new DigestURL(urlstr); URIMetadataNode newdoc = new URIMetadataNode(url); Iterator<Configuration.Entry> it = localcfg.entryIterator(); while (it.hasNext()) { Configuration.Entry et = it.next(); String yacyfieldname = et.key(); // config defines yacyfieldname = remotefieldname String remotefieldname = et.getValue(); if (remotefieldname != null && !remotefieldname.isEmpty()) { if (Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration().contains(yacyfieldname)) { // check if in local config SchemaDeclaration est = CollectionSchema.valueOf(yacyfieldname); if (est.isMultiValued()) { if (doc.getFieldValues(remotefieldname) != null) { newdoc.addField(yacyfieldname, doc.getFieldValues(remotefieldname)); // } } else { if (doc.getFieldValue(remotefieldname) != null) { Object val = doc.getFirstValue(remotefieldname); // watch out for type conversion try { if (est.getType() == SolrType.num_integer && val instanceof String) { newdoc.setField(yacyfieldname, Integer.parseInt((String) val)); } else { newdoc.setField(yacyfieldname, val); } } catch (Exception ex) { continue; // catch possible parse or type mismatch, skip the field } } } } } } newdoc.addField(CollectionSchema.httpstatus_i.name(), HttpServletResponse.SC_OK); // yacy required return newdoc; } }