/**********************************************************************************
 * $URL: https://source.sakaiproject.org/svn/search/trunk/search-impl/impl/src/java/org/sakaiproject/search/component/dao/impl/SearchIndexBuilderWorkerDaoJdbcImpl.java $
 * $Id: SearchIndexBuilderWorkerDaoJdbcImpl.java 103115 2012-01-13 14:05:23Z david.horwitz@uct.ac.za $
 ***********************************************************************************
 *
 * Copyright (c) 2003, 2004, 2005, 2006, 2007, 2008 The Sakai Foundation
 *
 * Licensed under the Educational Community License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.opensource.org/licenses/ECL-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************************/

package org.sakaiproject.search.util;

import java.io.Reader;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.sakaiproject.search.api.EntityContentProducer;
import org.sakaiproject.search.api.SearchService;
import org.sakaiproject.search.api.StoredDigestContentProducer;

/**
 * Utilities for indexing documents.
 *
 * @author dhorwitz
 * @since 1.5.0
 */
public class DocumentIndexingUtils
{
	private static final Log log = LogFactory.getLog(DocumentIndexingUtils.class);

	/**
	 * Index a Sakai entity as a Lucene document.
	 *
	 * @param ref the reference of the entity
	 * @param digestStorageUtil the {@link DigestStorageUtil} used to store digests of indexed content
	 * @param sep the {@link EntityContentProducer} that supplies the entity's content
	 * @param serverURL the URL of the server, stripped from entity URLs before they are stored
	 * @param contentReader not read by this method: it is overwritten with the
	 *            producer's own reader when the producer supplies content as a reader
	 * @return the Lucene document, suitable for adding to the index
	 */
	public static Document createIndexDocument(String ref,
			DigestStorageUtil digestStorageUtil, EntityContentProducer sep,
			String serverURL, Reader contentReader)
	{
		Document doc = new Document();

		String container = sep.getContainer(ref);
		if (container == null) container = ""; //$NON-NLS-1$

		// The reference of the Sakai object: stored compressed, and indexed unanalyzed
		doc.add(new Field(SearchService.FIELD_REFERENCE,
				CompressionTools.compressString(filterNull(ref)), Field.Store.YES));
		doc.add(new Field(SearchService.FIELD_REFERENCE, filterNull(ref),
				Field.Store.NO, Field.Index.NOT_ANALYZED));
		if (log.isDebugEnabled())
		{
			log.debug("added " + ref + " as the reference field");
			log.debug("container is:" + container);
		}
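		// Most values below follow the same two-field pattern: one field is
		// indexed but not stored, so it can be searched, and a second field
		// holds the same value compressed (CompressionTools) and stored, so it
		// can be returned with results without bloating the stored side of the
		// index. This is the pre-4.0 Lucene Field API (Field.Index enums and
		// binary stored fields).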
		// The date of indexing
		String timeStamp = String.valueOf(System.currentTimeMillis());
		doc.add(new Field(SearchService.DATE_STAMP, timeStamp, Field.Store.NO,
				Field.Index.NOT_ANALYZED));
		doc.add(new Field(SearchService.DATE_STAMP,
				CompressionTools.compressString(timeStamp), Field.Store.YES));

		// The container
		// TODO: check if this is ever used
		doc.add(new Field(SearchService.FIELD_CONTAINER, filterNull(container),
				Field.Store.NO, Field.Index.NOT_ANALYZED));
		doc.add(new Field(SearchService.FIELD_CONTAINER,
				CompressionTools.compressString(filterNull(container)),
				Field.Store.YES));

		// The type of the object
		// TODO: check if this is ever used
		/*
		doc.add(new Field(SearchService.FIELD_TYPE, filterNull(sep.getType(ref)),
				Field.Store.COMPRESS, Field.Index.NOT_ANALYZED));
		*/

		// The object subtype
		// TODO: check if this is ever used
		/*
		doc.add(new Field(SearchService.FIELD_SUBTYPE, filterNull(sep.getSubType(ref)),
				Field.Store.COMPRESS, Field.Index.NOT_ANALYZED));
		*/

		// Add the last path segment of the id (the filename) to the searchable
		// contents; the leading "/" it keeps is turned into a space by
		// filterPunctuation()
		String idIndex = sep.getId(ref);
		if (idIndex != null && idIndex.indexOf("/") > 0)
		{
			idIndex = idIndex.substring(idIndex.lastIndexOf("/"));
		}
		idIndex = filterPunctuation(idIndex);
		doc.add(new Field(SearchService.FIELD_CONTENTS, idIndex, Field.Store.NO,
				Field.Index.ANALYZED, Field.TermVector.YES));

		// Add the title to the searchable contents
		String title = filterPunctuation(sep.getTitle(ref));
		doc.add(new Field(SearchService.FIELD_CONTENTS, title, Field.Store.NO,
				Field.Index.ANALYZED, Field.TermVector.YES));

		if (sep.isContentFromReader(ref))
		{
			contentReader = sep.getContentReader(ref);
			if (log.isDebugEnabled())
			{
				log.debug("Adding Content for " + ref + " using " + contentReader);
			}
			doc.add(new Field(SearchService.FIELD_CONTENTS, contentReader,
					Field.TermVector.YES));
		}
		else
		{
			String content = sep.getContent(ref);
			// it's possible that there is no content to index
			if (content != null && content.trim().length() > 0)
			{
				if (log.isDebugEnabled())
				{
					log.debug("Adding Content for " + ref + " as [" + content + "]");
				}
				doc.add(new Field(SearchService.FIELD_CONTENTS, filterNull(content),
						Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
				if (sep instanceof StoredDigestContentProducer)
				{
					int docCount = digestStorageUtil.getDocCount(ref) + 1;
					doc.add(new Field(SearchService.FIELD_DIGEST_COUNT,
							Integer.toString(docCount), Field.Store.YES,
							Field.Index.NO, Field.TermVector.NO));
					digestStorageUtil.saveContentToStore(ref, content, docCount);
					if (docCount > 2)
					{
						digestStorageUtil.cleanOldDigests(ref);
					}
				}
			}
		}

		// The document title
		String docTitle = filterNull(sep.getTitle(ref));
		doc.add(new Field(SearchService.FIELD_TITLE, docTitle, Field.Store.NO,
				Field.Index.ANALYZED, Field.TermVector.YES));
		doc.add(new Field(SearchService.FIELD_TITLE,
				CompressionTools.compressString(docTitle), Field.Store.YES));

		// The tool, used for limiting searches; does not need to be analyzed
		doc.add(new Field(SearchService.FIELD_TOOL, filterNull(sep.getTool()),
				Field.Store.NO, Field.Index.NOT_ANALYZED));
		doc.add(new Field(SearchService.FIELD_TOOL,
				CompressionTools.compressString(filterNull(sep.getTool())),
				Field.Store.YES));

		// The document URL: stored only, should not be indexed
		doc.add(new Field(SearchService.FIELD_URL,
				CompressionTools.compressString(
						filterUrl(filterNull(sep.getUrl(ref)), serverURL)),
				Field.Store.YES));

		// The owning site
		String siteId = filterNull(sep.getSiteId(ref));
		doc.add(new Field(SearchService.FIELD_SITEID, siteId, Field.Store.NO,
				Field.Index.NOT_ANALYZED));
		doc.add(new Field(SearchService.FIELD_SITEID,
				CompressionTools.compressString(siteId), Field.Store.YES));
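		// Add the custom properties supplied by the producer. By the
		// convention used here, a key prefixed with "T" marks the property as
		// tokenised: the prefix is stripped and the value analyzed; all other
		// values are indexed unanalyzed. Values may be a String or a String[].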
		Map<String, ?> m = sep.getCustomProperties(ref);
		if (m != null)
		{
			for (Entry<String, ?> entry : m.entrySet())
			{
				String key = entry.getKey();
				Object value = entry.getValue();
				String[] values = null;
				if (value instanceof String)
				{
					values = new String[] { (String) value };
				}
				else if (value instanceof String[])
				{
					values = (String[]) value;
				}
				if (values == null)
				{
					log.info("Null custom properties value has been supplied by " //$NON-NLS-1$
							+ sep + " in index " + key); //$NON-NLS-1$
					continue;
				}
				// Strip the "T" prefix once, before the value loop, so that
				// every value is analyzed (stripping it inside the loop, as an
				// earlier revision did, tokenised only the first value)
				boolean tokenise = key.startsWith("T"); //$NON-NLS-1$
				if (tokenise)
				{
					key = key.substring(1);
				}
				for (int i = 0; i < values.length; i++)
				{
					String val = filterNull(values[i]);
					if (tokenise)
					{
						doc.add(new Field(key, val, Field.Store.NO,
								Field.Index.ANALYZED, Field.TermVector.YES));
					}
					else
					{
						doc.add(new Field(key, val, Field.Store.NO,
								Field.Index.NOT_ANALYZED));
					}
					doc.add(new Field(key, CompressionTools.compressString(val),
							Field.Store.YES));
				}
			}
		}

		if (log.isDebugEnabled())
		{
			log.debug("Indexing Document " + doc); //$NON-NLS-1$
		}
		return doc;
	}

	/**
	 * @param s the string to filter
	 * @return the string itself, or the empty string when it is <code>null</code>
	 */
	private static String filterNull(String s)
	{
		if (s == null)
		{
			return ""; //$NON-NLS-1$
		}
		return s;
	}

	/**
	 * Replaces every character that is not a letter or digit with a space, so
	 * that values such as file names tokenise on punctuation.
	 *
	 * @param term the term to filter
	 * @return the filtered term, or the empty string when the term is <code>null</code>
	 */
	private static String filterPunctuation(String term)
	{
		if (term == null)
		{
			return ""; //$NON-NLS-1$
		}
		char[] endTerm = term.toCharArray();
		for (int i = 0; i < endTerm.length; i++)
		{
			if (!Character.isLetterOrDigit(endTerm[i]))
			{
				endTerm[i] = ' ';
			}
		}
		return new String(endTerm);
	}

	/**
	 * Makes a URL relative to the server by stripping the server URL prefix.
	 *
	 * @param url the URL to filter
	 * @param serverURL the server URL prefix to strip
	 * @return the server-relative URL (with a leading "/"), or the URL
	 *         unchanged when it does not start with the server URL
	 */
	private static String filterUrl(String url, String serverURL)
	{
		if (url != null && url.startsWith(serverURL))
		{
			String relativeUrl = url.substring(serverURL.length());
			if (!relativeUrl.startsWith("/")) //$NON-NLS-1$
			{
				relativeUrl = "/" + relativeUrl; //$NON-NLS-1$ //$NON-NLS-2$
			}
			return relativeUrl;
		}
		return url;
	}
}
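/*
 * Minimal usage sketch (illustrative only: "producer", "digestUtil", the
 * reference string, the server URL, and "indexWriter" below are placeholders
 * for values the caller already has; none of them are defined in this class):
 *
 *   EntityContentProducer producer = ...;
 *   DigestStorageUtil digestUtil = ...;
 *   Document doc = DocumentIndexingUtils.createIndexDocument(
 *           "/content/group/site1/file.txt", digestUtil, producer,
 *           "http://localhost:8080", null);
 *   indexWriter.addDocument(doc); // an org.apache.lucene.index.IndexWriter
 */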