/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cocoon.components.search; import org.apache.avalon.framework.configuration.Configurable; import org.apache.avalon.framework.configuration.Configuration; import org.apache.avalon.framework.configuration.ConfigurationException; import org.apache.avalon.framework.logger.AbstractLogEnabled; import org.apache.avalon.framework.service.ServiceException; import org.apache.avalon.framework.service.ServiceManager; import org.apache.avalon.framework.service.Serviceable; import org.apache.avalon.framework.thread.ThreadSafe; import org.apache.cocoon.ProcessingException; import org.apache.commons.lang.StringUtils; import org.apache.excalibur.xml.sax.SAXParser; import org.apache.lucene.document.DateField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.net.URLConnection; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.List; /** * A simple class building lucene documents from xml content. * * <p>It has two parameters that effect the way it works:</p> * <p> * <tt><store-fields/></tt> * Sets which tags in your content are stored in Lucene as fields, * during the indexing process. Allows them to be output with search hits. * </p><p> * <tt><content-view-query/></tt> * Sets the view the indexer will request for indexing content. * </p><p> * Example configuration (goes in cocoon.xconf) * <pre><tt> * <lucene-xml-indexer logger="core.search.lucene"> * <store-fields>title, summary</store-fields> * <content-view-query>cocoon-view=search</content-view-query> * </lucene-xml-indexer> * </tt></pre></p> * * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a> * @author <a href="mailto:jeremy@apache.org">Jeremy Quinn</a> * @version CVS $Id$ */ public class SimpleLuceneXMLIndexerImpl extends AbstractLogEnabled implements LuceneXMLIndexer, Configurable, Serviceable, ThreadSafe { /** * The service manager instance * * @since */ protected ServiceManager manager = null; /** * Config element name specifying query-string appendend for requesting links * of an URL. * <p> * Its value is <code>link-view-query</code>. * </p> * * @since */ public final static String CONTENT_VIEW_QUERY_CONFIG = "content-view-query"; /** * append this string to the url in order to get the * content view of the url * * @since */ final static String CONTENT_VIEW_QUERY_DEFAULT = "cocoon-view=content"; /** * Config element name specifying the tags to be added as Stored, Untokenised, Unindexed Fields. * <p> * Its value is <code>field-tags</code>. * </p> * * @since */ public final static String FIELDTAGS_CONFIG = "store-fields"; /** * set of allowed content types * * @since */ final HashSet allowedContentType; /** * @since */ public SimpleLuceneXMLIndexerImpl() { allowedContentType = new HashSet(); allowedContentType.add("text/xml"); allowedContentType.add("text/xhtml"); fieldTags = new HashSet(); } private String contentViewQuery = CONTENT_VIEW_QUERY_DEFAULT; private HashSet fieldTags; /** * configure * * @param configuration * @exception ConfigurationException * @since */ public void configure(Configuration configuration) throws ConfigurationException { Configuration[] children; children = configuration.getChildren(FIELDTAGS_CONFIG); if (children != null && children.length > 0) { fieldTags = new HashSet(); for (int i = 0; i < children.length; i++) { String pattern = children[i].getValue(); String params[] = StringUtils.split(pattern, ", "); for (int index = 0; index < params.length; index++) { String tokenized_pattern = params[index]; if (!tokenized_pattern.equals("")) { this.fieldTags.add(tokenized_pattern); if (getLogger().isDebugEnabled()) { getLogger().debug("add field: " + tokenized_pattern); } } } } } else { if (getLogger().isDebugEnabled()) { getLogger().debug("Do not add any fields"); } } this.contentViewQuery = configuration.getChild(CONTENT_VIEW_QUERY_CONFIG, true).getValue(CONTENT_VIEW_QUERY_DEFAULT); if (getLogger().isDebugEnabled()) { getLogger().debug("content view: " + this.contentViewQuery); } } /** * Set the current <code>ServiceManager</code> instance used by this * <code>Serviceable</code>. * * @param manager Description of Parameter * @exception ServiceException Description of Exception * @since */ public void service(ServiceManager manager) throws ServiceException { this.manager = manager; } /** * Build lucenen documents from a URL * * @param url the content of this url gets indexed. * @exception ProcessingException Description of Exception * @since */ public List build(URL url) throws ProcessingException { try { URL contentURL = new URL(url, url.getFile() + ((url.getFile().indexOf("?") == -1) ? "?" : "&") + contentViewQuery); URLConnection contentURLConnection = contentURL.openConnection(); if (contentURLConnection == null) { throw new ProcessingException("Can not open connection to URL " + contentURL + " (null connection)"); } String contentType = contentURLConnection.getContentType(); if (contentType == null) { if (getLogger().isDebugEnabled()) { getLogger().debug("Ignoring " + contentURL + " (no content type)"); } return Collections.EMPTY_LIST; } int index = contentType.indexOf(';'); if (index != -1) { contentType = contentType.substring(0, index); } if (allowedContentType.contains(contentType)) { if (getLogger().isDebugEnabled()) { getLogger().debug("Indexing " + contentURL + " (" + contentType + ")"); } LuceneIndexContentHandler luceneIndexContentHandler = new LuceneIndexContentHandler(); luceneIndexContentHandler.setFieldTags(fieldTags); indexDocument(contentURLConnection, luceneIndexContentHandler); // // document is parsed // Iterator it = luceneIndexContentHandler.iterator(); while (it.hasNext()) { Document d = (Document) it.next(); d.add(Field.UnIndexed(URL_FIELD, url.toString())); // store ... false, index ... true, token ... false d.add(new Field(UID_FIELD, uid(contentURLConnection), false, true, false)); } return luceneIndexContentHandler.allDocuments(); } else { if (getLogger().isDebugEnabled()) { getLogger().debug("Ignoring " + contentURL + " (" + contentType + ")"); } return Collections.EMPTY_LIST; } } catch (IOException ioe) { throw new ProcessingException("Cannot read URL " + url, ioe); } } /** * index input stream producing lucene Documents * * @param contentURLConnection the xml content which should get indexed. * @param luceneIndexContentHandler ContentHandler for generating * a lucene Document from XML content. * @exception ProcessingException Description of Exception * @since */ private void indexDocument(URLConnection contentURLConnection, LuceneIndexContentHandler luceneIndexContentHandler) throws ProcessingException { InputStream is = null; InputSource in = null; SAXParser parser = null; try { is = contentURLConnection.getInputStream(); in = new InputSource(is); // get an XML parser parser = (SAXParser) this.manager.lookup(SAXParser.ROLE); //reader.setErrorHandler(new CocoonErrorHandler()); parser.parse(in, luceneIndexContentHandler); // // document is parsed // } catch (IOException ioe) { throw new ProcessingException("Cannot read!", ioe); } catch (SAXException saxe) { throw new ProcessingException("Cannot parse!", saxe); } catch (ServiceException se) { throw new ProcessingException("Cannot lookup xml parser!", se); } finally { if (parser != null) { this.manager.release(parser); } } } /** * return a unique uid of a url connection * * @param urlConnection Description of Parameter * @return String unique uid of a urlConnection * @since */ private String uid(URLConnection urlConnection) { // Append path and date into a string in such a way that lexicographic // sorting gives the same results as a walk of the file hierarchy. Thus // null (\u0000) is used both to separate directory components and to // separate the path from the date. return urlConnection.toString().replace('/', '\u0000') + "\u0000" + DateField.timeToString(urlConnection.getLastModified()); } }