// OpensearchResponseWriter.java example
//
// Explorer: yacy_search_server-master
/**
 *  OpensearchResponseWriter
 *  Copyright 2012 by Michael Peter Christen
 *  First released 06.08.2012 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.cora.federate.solr.responsewriter;

import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.lod.vocabulary.DublinCore;
import net.yacy.cora.lod.vocabulary.Geo;
import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.search.schema.CollectionSchema;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.XML;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.QueryResponseWriter;
import org.apache.solr.response.ResultContext;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;

public class OpensearchResponseWriter implements QueryResponseWriter {

    // simple one-to-one matchings: Solr field name -> name of the tag that receives the field value
    private static final Map<String, String> field2tag = new HashMap<String, String>();

    // YaCy schema fields that are handed to the solr searcher in addition to the mapped ones;
    // pre-selecting them is meant to give a better caching
    private static final CollectionSchema[] extrafields = {
        CollectionSchema.id,
        CollectionSchema.title,
        CollectionSchema.description_txt,
        CollectionSchema.text_t,
        CollectionSchema.h1_txt,
        CollectionSchema.h2_txt,
        CollectionSchema.h3_txt,
        CollectionSchema.h4_txt,
        CollectionSchema.h5_txt,
        CollectionSchema.h6_txt
    };

    // names of all fields passed to the searcher when a result document is loaded
    static final Set<String> SOLR_FIELDS = new HashSet<String>();

    static {
        // the two components of the coordinate field are addressed by suffixed field names
        field2tag.put(CollectionSchema.coordinate_p.getSolrFieldName() + "_0_coordinate", Geo.Lat.getURIref());
        field2tag.put(CollectionSchema.coordinate_p.getSolrFieldName() + "_1_coordinate", Geo.Long.getURIref());
        field2tag.put(CollectionSchema.publisher_t.getSolrFieldName(), DublinCore.Publisher.getURIref());
        field2tag.put(CollectionSchema.author.getSolrFieldName(), DublinCore.Creator.getURIref());

        for (final CollectionSchema extra : extrafields) {
            SOLR_FIELDS.add(extra.getSolrFieldName());
        }
        SOLR_FIELDS.addAll(field2tag.keySet());
    }

    // text of the channel-level title tag; null until setTitle is called, in which case the tag is omitted
    private String title;

    /**
     * Plain holder for the paging figures that are written into the channel header.
     */
    public static class ResHead {
        /** position of the first returned hit within the complete result (the 'start' value) */
        public int offset;
        /** number of results per page */
        public int rows;
        /** total number of matches */
        public int numFound;
    }

    public OpensearchResponseWriter() {
        super();
    }

    /**
     * Sets the text that is written as title of the channel.
     * @param searchPageTitle the channel title; with null or an empty string no title tag is written
     */
    public void setTitle(String searchPageTitle) {
        title = searchPageTitle;
    }

    /**
     * @param request not inspected
     * @param response not inspected
     * @return always the XML/UTF-8 content type constant declared by QueryResponseWriter
     */
    @Override
    public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) {
        return QueryResponseWriter.CONTENT_TYPE_XML_UTF8;
    }

    /**
     * Does nothing: this writer reads no configuration.
     * @param n ignored
     */
    @Override
    public void init(@SuppressWarnings("rawtypes") final NamedList n) {
        // intentionally empty
    }

    /**
     * Writes the search result as an RSS 2.0 document with OpenSearch and YaCy extensions.
     * <p>
     * The output consists of the channel header (total results, start index, items per page, title),
     * one <code>item</code> element per document of the result list and a <code>yacy:navigation</code>
     * element that holds one <code>yacy:facet</code> per facet field found in the response.
     *
     * @param writer the target of the generated XML
     * @param request provides the searcher that loads the documents of the result list
     * @param rsp the response with the result list and, optionally, facet counts and highlighting
     * @throws IOException if writing to the writer or loading a document fails
     */
    @Override
    public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp) throws IOException {

        final NamedList<?> values = rsp.getValues();

        assert values.get("responseHeader") != null;
        assert values.get("response") != null;

        // only get(..) is needed from the header, so no narrowing cast to a concrete list type
        final NamedList<?> responseHeader = rsp.getResponseHeader();
        final DocList response = ((ResultContext) values.get("response")).docs;
        @SuppressWarnings("unchecked")
        final SimpleOrderedMap<Object> facetCounts = (SimpleOrderedMap<Object>) values.get("facet_counts");
        @SuppressWarnings("unchecked")
        final SimpleOrderedMap<Object> facetFields = facetCounts == null || facetCounts.size() == 0 ? null : (SimpleOrderedMap<Object>) facetCounts.get("facet_fields");
        @SuppressWarnings("unchecked")
        final SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) values.get("highlighting");
        final Map<String, LinkedHashSet<String>> snippets = highlighting(highlighting);

        // parse response header
        final ResHead resHead = new ResHead();
        resHead.rows = requestedRows(responseHeader, response.size());
        resHead.offset = response.offset(); // equal to 'start'
        resHead.numFound = response.matches();

        // write header
        writer.write(
                        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
                        "<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>\n" +
                        "<rss version=\"2.0\"\n" +
                        "    xmlns:yacy=\"http://www.yacy.net/\"\n" +
                        "    xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\"\n" +
                        "    xmlns:media=\"http://search.yahoo.com/mrss/\"\n" +
                        "    xmlns:atom=\"http://www.w3.org/2005/Atom\"\n" +
                        "    xmlns:dc=\"" + DublinCore.NAMESPACE + "\"\n" +
                        "    xmlns:geo=\"" + Geo.NAMESPACE + "\"\n" +
                        ">\n");
        openTag(writer, "channel");
        solitaireTag(writer, "opensearch:totalResults", Integer.toString(resHead.numFound));
        solitaireTag(writer, "opensearch:startIndex", Integer.toString(resHead.offset));
        solitaireTag(writer, "opensearch:itemsPerPage", Integer.toString(resHead.rows));
        solitaireTag(writer, RSSMessage.Token.title.name(), this.title);
        writer.write("<atom:link rel=\"search\" href=\"/opensearchdescription.xml\" type=\"application/opensearchdescription+xml\"/>");
        solitaireTag(writer, "description", "Search Result");

        // write one item per document of the result list
        final int responseCount = response.size();
        final SolrIndexSearcher searcher = request.getSearcher();
        final DocIterator iterator = response.iterator();
        for (int i = 0; i < responseCount; i++) {
            openTag(writer, "item");
            final Document doc = searcher.doc(iterator.nextDoc(), SOLR_FIELDS);
            writeItemContent(writer, doc, snippets);
            closeTag(writer, "item");
        }

        // the facets can be created with the options &facet=true&facet.mincount=1&facet.field=host_s&facet.field=url_file_ext_s&facet.field=url_protocol_s&facet.field=author_sxt
        openTag(writer, "yacy:navigation");
        writeFacet(writer, facetFields, CollectionSchema.host_s, "domains", "Domains", "site");
        writeFacet(writer, facetFields, CollectionSchema.url_file_ext_s, "filetypes", "Filetypes", "filetype");
        writeFacet(writer, facetFields, CollectionSchema.url_protocol_s, "protocols", "Protocols", "protocol");
        writeFacet(writer, facetFields, CollectionSchema.author_sxt, "authors", "Authors", "author");
        writeFacet(writer, facetFields, CollectionSchema.collection_sxt, "collections", "Collections", "collection");
        closeTag(writer, "yacy:navigation");

        closeTag(writer, "channel");
        writer.write("</rss>\n");
    }

    /**
     * Reads the 'rows' value from the 'params' entry of the response header.
     * @param responseHeader the response header, may be null
     * @param fallback the value to use when the header, its params or the rows value is missing or not a number
     * @return the parsed rows value or the fallback
     */
    private static int requestedRows(final NamedList<?> responseHeader, final int fallback) {
        final Object params = responseHeader == null ? null : responseHeader.get("params");
        final Object rows = params instanceof NamedList ? ((NamedList<?>) params).get("rows") : null;
        if (rows == null) {
            return fallback;
        }
        try {
            return Integer.parseInt(rows.toString());
        } catch (final NumberFormatException e) {
            // a malformed value must not abort the whole response
            return fallback;
        }
    }

    /**
     * Writes the tags of a single result item; the enclosing item element is written by the caller.
     * @param writer the target of the generated XML
     * @param doc the result document
     * @param snippets snippets per url hash as produced by the highlighting
     * @throws IOException if writing fails
     */
    private static void writeItemContent(final Writer writer, final Document doc, final Map<String, LinkedHashSet<String>> snippets) throws IOException {
        final List<String> texts = new ArrayList<String>();
        final List<String> descriptions = new ArrayList<String>();
        String docTitle = "";
        // local to the item: a document without an id field must not pick up the snippets of the previous one
        String urlhash = null;

        for (final IndexableField field : doc.getFields()) {
            final String fieldName = field.name();

            // apply generic matching rule
            final String stag = field2tag.get(fieldName);
            if (stag != null) {
                solitaireTag(writer, stag, field.stringValue());
                continue;
            }

            // take apart the url
            if (CollectionSchema.sku.getSolrFieldName().equals(fieldName)) {
                final String u = field.stringValue();
                solitaireTag(writer, RSSMessage.Token.link.name(), u);
                try {
                    final MultiProtocolURL url = new MultiProtocolURL(u);
                    solitaireTag(writer, YaCyMetadata.host.getURIref(), url.getHost());
                    solitaireTag(writer, YaCyMetadata.path.getURIref(), url.getPath());
                    solitaireTag(writer, YaCyMetadata.file.getURIref(), url.getFileName());
                } catch (final MalformedURLException ignored) {
                    // best effort: the link itself is already written, only the url parts are left out
                }
                continue;
            }

            // if the rule is not generic, use the specific here
            if (CollectionSchema.id.getSolrFieldName().equals(fieldName)) {
                urlhash = field.stringValue();
                solitaireTag(writer, RSSMessage.Token.guid.name(), urlhash, "isPermaLink=\"false\"");
                continue;
            }
            if (CollectionSchema.title.getSolrFieldName().equals(fieldName)) {
                docTitle = field.stringValue();
                texts.add(docTitle);
                continue;
            }
            if (CollectionSchema.last_modified.getSolrFieldName().equals(fieldName)) {
                final Date d = new Date(Long.parseLong(field.stringValue()));
                solitaireTag(writer, RSSMessage.Token.pubDate.name(), HeaderFramework.formatRFC1123(d));
                continue;
            }
            if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) {
                final String description = field.stringValue();
                descriptions.add(description);
                solitaireTag(writer, DublinCore.Description.getURIref(), description);
                texts.add(description);
                continue;
            }
            if (CollectionSchema.text_t.getSolrFieldName().equals(fieldName)) {
                texts.add(field.stringValue());
                continue;
            }
            if (CollectionSchema.size_i.getSolrFieldName().equals(fieldName)) {
                final int size = field.numericValue().intValue();
                solitaireTag(writer, YaCyMetadata.size.getURIref(), Integer.toString(size));
                solitaireTag(writer, YaCyMetadata.sizename.getURIref(), RSSMessage.sizename(size));
                continue;
            }
            if (isHeadlineField(fieldName)) {
                // because these are multi-valued fields, there can be several of each
                texts.add(field.stringValue());
            }
        }

        // the title falls back to the first collected text if the document has no title
        solitaireTag(writer, RSSMessage.Token.title.name(), docTitle.length() == 0 ? (texts.size() == 0 ? "" : texts.get(0)) : docTitle);

        // compute snippet from texts
        final LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
        final String tagname = RSSMessage.Token.description.name();
        if (snippet == null || snippet.size() == 0) {
            // no snippet: the description tag is always written and holds all descriptions
            writer.write("<"); writer.write(tagname); writer.write('>');
            for (final String d : descriptions) {
                XML.escapeCharData(d, writer);
            }
            writer.write("</"); writer.write(tagname); writer.write(">\n");
        } else {
            removeSubsumedTitle(snippet, docTitle);
            solitaireTag(writer, tagname, getLargestSnippet(snippet)); // snippet may be size=0
        }

        solitaireTag(writer, DublinCore.Subject.getURIref(), doc.get(CollectionSchema.keywords.getSolrFieldName()));
    }

    /**
     * @param fieldName a Solr field name
     * @return true if the name denotes one of the headline fields h1_txt to h6_txt
     */
    private static boolean isHeadlineField(final String fieldName) {
        return CollectionSchema.h1_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h2_txt.getSolrFieldName().equals(fieldName) ||
               CollectionSchema.h3_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h4_txt.getSolrFieldName().equals(fieldName) ||
               CollectionSchema.h5_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h6_txt.getSolrFieldName().equals(fieldName);
    }

    /**
     * Writes one yacy:facet element with an entry per facet value; writes nothing if the facet is not available.
     * @param writer the target of the generated XML
     * @param facetFields the facet fields of the response, may be null
     * @param field the schema field whose counts are written
     * @param name value of the name attribute of the facet
     * @param displayname value of the displayname attribute of the facet
     * @param modifier the search modifier that is combined with each facet value
     * @throws IOException if writing fails
     */
    private static void writeFacet(final Writer writer, final SimpleOrderedMap<Object> facetFields, final CollectionSchema field,
            final String name, final String displayname, final String modifier) throws IOException {
        if (facetFields == null) {
            return;
        }
        @SuppressWarnings("unchecked")
        final NamedList<Integer> counts = (NamedList<Integer>) facetFields.get(field.getSolrFieldName());
        if (counts == null) {
            return;
        }
        openTag(writer, "yacy:facet name=\"" + name + "\" displayname=\"" + displayname + "\" type=\"String\" min=\"0\" max=\"0\" mean=\"0\"");
        for (final Map.Entry<String, Integer> entry : counts) {
            facetEntry(writer, modifier, entry.getKey(), Integer.toString(entry.getValue()));
        }
        closeTag(writer, "yacy:facet");
    }
    
    /**
     * Collects the snippets that solr delivers as 'highlighting'.
     * Entries whose value is not a SimpleOrderedMap are skipped; within an entry only
     * values of type String[] contribute snippets.
     * @param val the highlighting part of the response, may be null
     * @return a map from the entry names (the url hashes) to the snippets of that entry in encounter order, never null
     */
    @SuppressWarnings("unchecked")
    public static Map<String, LinkedHashSet<String>> highlighting(final SimpleOrderedMap<Object> val) {
        final Map<String, LinkedHashSet<String>> result = new HashMap<String, LinkedHashSet<String>>();
        if (val == null) {
            return result;
        }
        for (int docIdx = 0, docCount = val.size(); docIdx < docCount; docIdx++) {
            final Object perDocument = val.getVal(docIdx);
            if (!(perDocument instanceof SimpleOrderedMap)) {
                continue;
            }
            final SimpleOrderedMap<Object> perField = (SimpleOrderedMap<Object>) perDocument;
            final LinkedHashSet<String> collected = new LinkedHashSet<String>();
            for (int fieldIdx = 0, fieldCount = perField.size(); fieldIdx < fieldCount; fieldIdx++) {
                final Object fragments = perField.getVal(fieldIdx);
                if (fragments instanceof String[]) {
                    for (final String fragment : (String[]) fragments) {
                        collected.add(fragment);
                    }
                }
            }
            result.put(val.getName(docIdx), collected);
        }
        return result;
    }

    // the bold marks that surround matched words in a snippet; applied to lower-cased text only
    final static Pattern keymarks = Pattern.compile("<b>|</b>");

    /**
     * Removes all snippets that only repeat the title: the title itself and every snippet that,
     * compared case-insensitively and without the bold marks, is contained in the title or contains it.
     * The comparison uses locale-independent case folding so that the result does not depend on the
     * default locale of the JVM.
     * @param snippets the snippets to clean up in place; nothing happens if null or empty
     * @param title the title of the document; nothing happens if null or empty
     */
    public static void removeSubsumedTitle(LinkedHashSet<String> snippets, String title) {
        if (title == null || title.length() == 0 || snippets == null || snippets.size() == 0) return;
        snippets.remove(title);
        final String lowerTitle = title.toLowerCase(Locale.ROOT);
        final Iterator<String> i = snippets.iterator();
        while (i.hasNext()) {
            final String plain = keymarks.matcher(i.next().toLowerCase(Locale.ROOT)).replaceAll("");
            if (lowerTitle.indexOf(plain) >= 0 || plain.indexOf(lowerTitle) >= 0) i.remove();
        }
    }

    /**
     * Selects the longest snippet that has a space character at any position after the first one;
     * of several snippets with the same length the first one wins. Double quotes in the selected
     * snippet are replaced by single quotes.
     * @param snippets the candidate snippets, may be null or empty
     * @return the selected snippet, or null if there are no snippets or none of them qualifies
     */
    public static String getLargestSnippet(LinkedHashSet<String> snippets) {
        if (snippets == null || snippets.isEmpty()) {
            return null;
        }
        String best = null;
        for (final String candidate : snippets) {
            if (candidate.indexOf(' ') <= 0) {
                continue; // no space, or only a leading one
            }
            if (best == null || candidate.length() > best.length()) {
                best = candidate;
            }
        }
        return best == null ? null : best.replace('"', '\'');
    }
    
    /**
     * Writes an opening tag followed by a line break.
     * @param writer the target of the generated XML
     * @param tag the text between the angle brackets, written as it is
     * @throws IOException if writing fails
     */
    public static void openTag(final Writer writer, final String tag) throws IOException {
        writer.write("<");
        writer.write(tag);
        writer.write(">\n");
    }

    /**
     * Writes a closing tag followed by a line break.
     * @param writer the target of the generated XML
     * @param tag the name of the tag to close, written as it is
     * @throws IOException if writing fails
     */
    public static void closeTag(final Writer writer, final String tag) throws IOException {
        writer.write('<');
        writer.write('/');
        writer.write(tag);
        writer.write('>');
        writer.write('\n');
    }

    /**
     * Writes a complete element with escaped character data as content, followed by a line break.
     * Nothing is written if there is no content.
     * @param writer the target of the generated XML
     * @param tagname the name of the element, written as it is
     * @param value the content of the element; null or an empty string suppresses the element
     * @throws IOException if writing fails
     */
    public static void solitaireTag(final Writer writer, final String tagname, String value) throws IOException {
        final boolean hasContent = value != null && value.length() > 0;
        if (!hasContent) {
            return;
        }
        writer.write('<');
        writer.write(tagname);
        writer.write('>');
        XML.escapeCharData(value, writer);
        writer.write("</");
        writer.write(tagname);
        writer.write(">\n");
    }

    /**
     * Writes a complete element with attributes, followed by a line break.
     * Nothing is written if there is no content.
     * <p>
     * Unlike the variant without attributes this method writes the value as it is, without
     * any XML escaping; the same holds for the attribute text.
     * @param writer the target of the generated XML
     * @param tagname the name of the element, written as it is
     * @param value the content of the element; null or an empty string suppresses the element
     * @param attr the attribute text, e.g. <code>isPermaLink="false"</code>; a separating space is
     *        inserted unless the text starts with one; null or an empty string writes no attributes
     * @throws IOException if writing fails
     */
    public static void solitaireTag(final Writer writer, final String tagname, String value, String attr) throws IOException {
        if (value == null || value.length() == 0) return;
        writer.write("<"); writer.write(tagname);
        // guard before charAt(0): an empty or missing attribute text must not raise an exception
        if (attr != null && attr.length() > 0) {
            if (attr.charAt(0) != ' ') writer.write(' ');
            writer.write(attr);
        }
        writer.write('>');
        writer.write(value);
        writer.write("</"); writer.write(tagname); writer.write(">\n");
    }

    /**
     * Writes one yacy:element entry of a facet.
     * All dynamic parts end up inside double-quoted attribute values and are therefore escaped as
     * attribute values, so that a double quote inside a facet value cannot end the attribute early.
     * @param writer the target of the generated XML
     * @param modifier the search modifier; written as it is, followed by an url-encoded colon and the facet value
     * @param propname the facet value, used for the name attribute and as part of the modifier attribute
     * @param value the count of the facet value
     * @throws IOException if writing fails
     */
    private static void facetEntry(final Writer writer, final String modifier, final String propname, String value) throws IOException {
        writer.write("<yacy:element name=\""); XML.escapeAttributeValue(propname, writer);
        writer.write("\" count=\""); XML.escapeAttributeValue(value, writer);
        writer.write("\" modifier=\""); writer.write(modifier); writer.write("%3A"); XML.escapeAttributeValue(propname, writer);
        writer.write("\" />\n");
    }

}