/** * OpensearchResponseWriter * Copyright 2012 by Michael Peter Christen * First released 06.08.2012 at http://yacy.net * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see <http://www.gnu.org/licenses/>. */ package net.yacy.cora.federate.solr.responsewriter; import java.io.IOException; import java.io.Writer; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.lod.vocabulary.DublinCore; import net.yacy.cora.lod.vocabulary.Geo; import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.search.schema.CollectionSchema; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexableField; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.XML; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.QueryResponseWriter; import org.apache.solr.response.ResultContext; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.search.DocIterator; import org.apache.solr.search.DocList; import org.apache.solr.search.SolrIndexSearcher; public class OpensearchResponseWriter implements QueryResponseWriter { // define a list of simple YaCySchema -> RSS Token matchings private static final Map<String, String> field2tag = new HashMap<String, String>(); // pre-select a set of YaCy schema fields for the solr searcher which should cause a better caching private static final CollectionSchema[] extrafields = new CollectionSchema[]{ CollectionSchema.id, CollectionSchema.title, CollectionSchema.description_txt, CollectionSchema.text_t, CollectionSchema.h1_txt, CollectionSchema.h2_txt, CollectionSchema.h3_txt, CollectionSchema.h4_txt, CollectionSchema.h5_txt, CollectionSchema.h6_txt, }; static final Set<String> SOLR_FIELDS = new HashSet<String>(); static { field2tag.put(CollectionSchema.coordinate_p.getSolrFieldName() + "_0_coordinate", Geo.Lat.getURIref()); field2tag.put(CollectionSchema.coordinate_p.getSolrFieldName() + "_1_coordinate", Geo.Long.getURIref()); field2tag.put(CollectionSchema.publisher_t.getSolrFieldName(), DublinCore.Publisher.getURIref()); field2tag.put(CollectionSchema.author.getSolrFieldName(), DublinCore.Creator.getURIref()); SOLR_FIELDS.addAll(field2tag.keySet()); for (CollectionSchema field: extrafields) SOLR_FIELDS.add(field.getSolrFieldName()); } private String title; public static class ResHead { public int offset, rows, numFound; //public int status, QTime; //public String df, q, wt; //public float maxScore; } public OpensearchResponseWriter() { super(); } public void setTitle(String searchPageTitle) { this.title = searchPageTitle; } @Override public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) { return CONTENT_TYPE_XML_UTF8; } @Override public void init(@SuppressWarnings("rawtypes") NamedList n) { } @Override public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException { NamedList<?> values = rsp.getValues(); assert values.get("responseHeader") != null; assert values.get("response") != null; SimpleOrderedMap<Object> responseHeader = (SimpleOrderedMap<Object>) rsp.getResponseHeader(); DocList response = ((ResultContext) values.get("response")).docs; @SuppressWarnings("unchecked") SimpleOrderedMap<Object> facetCounts = (SimpleOrderedMap<Object>) values.get("facet_counts"); @SuppressWarnings("unchecked") SimpleOrderedMap<Object> facetFields = facetCounts == null || facetCounts.size() == 0 ? null : (SimpleOrderedMap<Object>) facetCounts.get("facet_fields"); @SuppressWarnings("unchecked") SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) values.get("highlighting"); Map<String, LinkedHashSet<String>> snippets = highlighting(highlighting); // parse response header ResHead resHead = new ResHead(); NamedList<?> val0 = (NamedList<?>) responseHeader.get("params"); resHead.rows = Integer.parseInt((String) val0.get("rows")); resHead.offset = response.offset(); // equal to 'start' resHead.numFound = response.matches(); //resHead.df = (String) val0.get("df"); //resHead.q = (String) val0.get("q"); //resHead.wt = (String) val0.get("wt"); //resHead.status = (Integer) responseHeader.get("status"); //resHead.QTime = (Integer) responseHeader.get("QTime"); //resHead.maxScore = response.maxScore(); // write header writer.write(( "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>\n" + "<rss version=\"2.0\"\n" + " xmlns:yacy=\"http://www.yacy.net/\"\n" + " xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\"\n" + " xmlns:media=\"http://search.yahoo.com/mrss/\"\n" + " xmlns:atom=\"http://www.w3.org/2005/Atom\"\n" + " xmlns:dc=\"" + DublinCore.NAMESPACE + "\"\n" + " xmlns:geo=\"" + Geo.NAMESPACE + "\"\n" + ">\n").toCharArray()); openTag(writer, "channel"); solitaireTag(writer, "opensearch:totalResults", Integer.toString(resHead.numFound)); solitaireTag(writer, "opensearch:startIndex", Integer.toString(resHead.offset)); solitaireTag(writer, "opensearch:itemsPerPage", Integer.toString(resHead.rows)); solitaireTag(writer, RSSMessage.Token.title.name(), this.title); writer.write("<atom:link rel=\"search\" href=\"/opensearchdescription.xml\" type=\"application/opensearchdescription+xml\"/>"); solitaireTag(writer, "description", "Search Result"); //solitaireTag(writer, "link", ""); //solitaireTag(writer, "image", ""); // parse body final int responseCount = response.size(); SolrIndexSearcher searcher = request.getSearcher(); DocIterator iterator = response.iterator(); String urlhash = null; for (int i = 0; i < responseCount; i++) { openTag(writer, "item"); int id = iterator.nextDoc(); Document doc = searcher.doc(id, SOLR_FIELDS); List<IndexableField> fields = doc.getFields(); int fieldc = fields.size(); List<String> texts = new ArrayList<String>(); List<String> descriptions = new ArrayList<String>(); String title = ""; for (int j = 0; j < fieldc; j++) { IndexableField value = fields.get(j); String fieldName = value.name(); // apply generic matching rule String stag = field2tag.get(fieldName); if (stag != null) { solitaireTag(writer, stag, value.stringValue()); continue; } // take apart the url if (CollectionSchema.sku.getSolrFieldName().equals(fieldName)) { String u = value.stringValue(); solitaireTag(writer, RSSMessage.Token.link.name(), u); try { MultiProtocolURL url = new MultiProtocolURL(u); solitaireTag(writer, YaCyMetadata.host.getURIref(), url.getHost()); solitaireTag(writer, YaCyMetadata.path.getURIref(), url.getPath()); solitaireTag(writer, YaCyMetadata.file.getURIref(), url.getFileName()); } catch (final MalformedURLException e) {} continue; } // if the rule is not generic, use the specific here if (CollectionSchema.id.getSolrFieldName().equals(fieldName)) { urlhash = value.stringValue(); solitaireTag(writer, RSSMessage.Token.guid.name(), urlhash, "isPermaLink=\"false\""); continue; } if (CollectionSchema.title.getSolrFieldName().equals(fieldName)) { title = value.stringValue(); texts.add(title); continue; } if (CollectionSchema.last_modified.getSolrFieldName().equals(fieldName)) { Date d = new Date(Long.parseLong(value.stringValue())); solitaireTag(writer, RSSMessage.Token.pubDate.name(), HeaderFramework.formatRFC1123(d)); continue; } if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) { String description = value.stringValue(); descriptions.add(description); solitaireTag(writer, DublinCore.Description.getURIref(), description); texts.add(description); continue; } if (CollectionSchema.text_t.getSolrFieldName().equals(fieldName)) { texts.add(value.stringValue()); continue; } if (CollectionSchema.size_i.getSolrFieldName().equals(fieldName)) { int size = value.numericValue().intValue(); solitaireTag(writer, YaCyMetadata.size.getURIref(), Integer.toString(size)); solitaireTag(writer, YaCyMetadata.sizename.getURIref(), RSSMessage.sizename(size)); continue; } if (CollectionSchema.h1_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h2_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h3_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h4_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h5_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h6_txt.getSolrFieldName().equals(fieldName)) { // because these are multi-valued fields, there can be several of each texts.add(value.stringValue()); continue; } } // compute snippet from texts solitaireTag(writer, RSSMessage.Token.title.name(), title.length() == 0 ? (texts.size() == 0 ? "" : texts.get(0)) : title); LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash); String tagname = RSSMessage.Token.description.name(); if (snippet == null || snippet.size() == 0) { writer.write("<"); writer.write(tagname); writer.write('>'); for (String d: descriptions) { XML.escapeCharData(d, writer); } writer.write("</"); writer.write(tagname); writer.write(">\n"); } else { removeSubsumedTitle(snippet, title); solitaireTag(writer, tagname, getLargestSnippet(snippet)); // snippet may be size=0 } solitaireTag(writer, DublinCore.Subject.getURIref(), doc.get(CollectionSchema.keywords.getSolrFieldName())); closeTag(writer, "item"); } openTag(writer, "yacy:navigation"); // the facets can be created with the options &facet=true&facet.mincount=1&facet.field=host_s&facet.field=url_file_ext_s&facet.field=url_protocol_s&facet.field=author_sxt @SuppressWarnings("unchecked") NamedList<Integer> domains = facetFields == null ? null : (NamedList<Integer>) facetFields.get(CollectionSchema.host_s.getSolrFieldName()); @SuppressWarnings("unchecked") NamedList<Integer> filetypes = facetFields == null ? null : (NamedList<Integer>) facetFields.get(CollectionSchema.url_file_ext_s.getSolrFieldName()); @SuppressWarnings("unchecked") NamedList<Integer> protocols = facetFields == null ? null : (NamedList<Integer>) facetFields.get(CollectionSchema.url_protocol_s.getSolrFieldName()); @SuppressWarnings("unchecked") NamedList<Integer> authors = facetFields == null ? null : (NamedList<Integer>) facetFields.get(CollectionSchema.author_sxt.getSolrFieldName()); @SuppressWarnings("unchecked") NamedList<Integer> collections = facetFields == null ? null : (NamedList<Integer>) facetFields.get(CollectionSchema.collection_sxt.getSolrFieldName()); if (domains != null) { openTag(writer, "yacy:facet name=\"domains\" displayname=\"Domains\" type=\"String\" min=\"0\" max=\"0\" mean=\"0\""); for (Map.Entry<String, Integer> entry: domains) facetEntry(writer, "site", entry.getKey(), Integer.toString(entry.getValue())); closeTag(writer, "yacy:facet"); } if (filetypes != null) { openTag(writer, "yacy:facet name=\"filetypes\" displayname=\"Filetypes\" type=\"String\" min=\"0\" max=\"0\" mean=\"0\""); for (Map.Entry<String, Integer> entry: filetypes) facetEntry(writer, "filetype", entry.getKey(), Integer.toString(entry.getValue())); closeTag(writer, "yacy:facet"); } if (protocols != null) { openTag(writer, "yacy:facet name=\"protocols\" displayname=\"Protocols\" type=\"String\" min=\"0\" max=\"0\" mean=\"0\""); for (Map.Entry<String, Integer> entry: protocols) facetEntry(writer, "protocol", entry.getKey(), Integer.toString(entry.getValue())); closeTag(writer, "yacy:facet"); } if (authors != null) { openTag(writer, "yacy:facet name=\"authors\" displayname=\"Authors\" type=\"String\" min=\"0\" max=\"0\" mean=\"0\""); for (Map.Entry<String, Integer> entry: authors) facetEntry(writer, "author", entry.getKey(), Integer.toString(entry.getValue())); closeTag(writer, "yacy:facet"); } if (collections != null) { openTag(writer, "yacy:facet name=\"collections\" displayname=\"Collections\" type=\"String\" min=\"0\" max=\"0\" mean=\"0\""); for (Map.Entry<String, Integer> entry: collections) facetEntry(writer, "collection", entry.getKey(), Integer.toString(entry.getValue())); closeTag(writer, "yacy:facet"); } closeTag(writer, "yacy:navigation"); closeTag(writer, "channel"); writer.write("</rss>\n".toCharArray()); } /** * produce snippets from solr (they call that 'highlighting') * @param val * @return a map from urlhashes to a list of snippets for that url */ @SuppressWarnings("unchecked") public static Map<String, LinkedHashSet<String>> highlighting(final SimpleOrderedMap<Object> val) { Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); if (val == null) return snippets; int sz = val.size(); Object v, vv; for (int i = 0; i < sz; i++) { String n = val.getName(i); v = val.getVal(i); if (v instanceof SimpleOrderedMap) { int sz1 = ((SimpleOrderedMap<Object>) v).size(); LinkedHashSet<String> t = new LinkedHashSet<String>(); for (int j = 0; j < sz1; j++) { vv = ((SimpleOrderedMap<Object>) v).getVal(j); if (vv instanceof String[]) { for (String t0: ((String[]) vv)) t.add(t0); } } snippets.put(n, t); } } return snippets; } final static Pattern keymarks = Pattern.compile("<b>|</b>"); public static void removeSubsumedTitle(LinkedHashSet<String> snippets, String title) { if (title == null || title.length() == 0 || snippets == null || snippets.size() == 0) return; snippets.remove(title); String tlc = title.toLowerCase(); Iterator<String> i = snippets.iterator(); while (i.hasNext()) { String s = i.next().toLowerCase(); s = keymarks.matcher(s).replaceAll(""); if (tlc.toLowerCase().indexOf(s) >= 0 || s.toLowerCase().indexOf(tlc) >= 0) i.remove(); } return; } /** * @param snippets snippets list eventually empty * @return the largest snippet containing at least a space character among the list, or null */ public static String getLargestSnippet(LinkedHashSet<String> snippets) { if (snippets == null || snippets.size() == 0) return null; String l = null; for (String s: snippets) { if ((l == null || s.length() > l.length()) && s.indexOf(' ') > 0) l = s; } if(l != null) { l = l.replaceAll("\"", "'"); } return l; } public static void openTag(final Writer writer, final String tag) throws IOException { writer.write('<'); writer.write(tag); writer.write(">\n"); } public static void closeTag(final Writer writer, final String tag) throws IOException { writer.write("</"); writer.write(tag); writer.write(">\n"); } public static void solitaireTag(final Writer writer, final String tagname, String value) throws IOException { if (value == null || value.length() == 0) return; writer.write("<"); writer.write(tagname); writer.write('>'); XML.escapeCharData(value, writer); writer.write("</"); writer.write(tagname); writer.write(">\n"); } public static void solitaireTag(final Writer writer, final String tagname, String value, String attr) throws IOException { if (value == null || value.length() == 0) return; writer.write("<"); writer.write(tagname); if (attr.charAt(0) != ' ') writer.write(' '); writer.write(attr); writer.write('>'); writer.write(value); writer.write("</"); writer.write(tagname); writer.write(">\n"); } private static void facetEntry(final Writer writer, final String modifier, final String propname, String value) throws IOException { writer.write("<yacy:element name=\""); XML.escapeCharData(propname, writer); writer.write("\" count=\""); XML.escapeCharData(value, writer); writer.write("\" modifier=\""); writer.write(modifier); writer.write("%3A"); XML.escapeCharData(propname, writer); writer.write("\" />\n"); } }