/* * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * This is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this software; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA, or see the FSF site: http://www.fsf.org. */ package com.xpn.xwiki.plugin.lucene; import java.util.HashMap; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import com.xpn.xwiki.XWikiContext; import com.xpn.xwiki.doc.XWikiAttachment; import com.xpn.xwiki.doc.XWikiDocument; /** * Holds all data but the content of an attachment to be indexed. The content is retrieved at indexing time, which * should save us some memory especially when rebuilding an index for a big wiki. * * @version $Id: $ */ public class AttachmentData extends IndexData { /** * Mapping from common file name endings to mime types. This is uses as a fallback when text extraction by using the * mime type delivered by xwiki doesn't work. */ static final Map<String, String> MIMETYPES = new HashMap<String, String>(); static { MIMETYPES.put("pdf", "application/pdf"); MIMETYPES.put("doc", "application/msword"); MIMETYPES.put("sxw", "application/vnd.sun.xml.writer"); MIMETYPES.put("xml", "text/xml"); MIMETYPES.put("txt", "text/plain"); MIMETYPES.put("ppt", "application/ms-powerpoint"); MIMETYPES.put("xls", "application/ms-excel"); } private static final Log LOG = LogFactory.getLog(AttachmentData.class); private int size; private String filename; /** * @param attachment * @param context */ public AttachmentData(final XWikiDocument document, final XWikiAttachment attachment, final XWikiContext context) { super(attachment.getDoc(), context); setModificationDate(attachment.getDate()); setAuthor(attachment.getAuthor()); setSize(attachment.getFilesize()); setFilename(attachment.getFilename()); } public void addDataToLuceneDocument(Document luceneDoc, XWikiDocument doc, XWikiContext context) { super.addDataToLuceneDocument(luceneDoc, doc, context); if (filename != null) { luceneDoc.add(new Field(IndexFields.FILENAME, filename, Field.Store.YES, Field.Index.TOKENIZED)); luceneDoc.add(new Field(IndexFields.FILENAME + IndexFields.UNTOKENIZED, filename.toUpperCase(), Field.Store.NO, Field.Index.UN_TOKENIZED)); } } /** * @param size The size to set. */ public void setSize(int size) { this.size = size; } /** * @see IndexData#getType() */ public String getType() { return LucenePlugin.DOCTYPE_ATTACHMENT; } /** * @return Returns the filename. */ public String getFilename() { return filename; } /** * @param filename The filename to set. */ public void setFilename(String filename) { this.filename = filename; } /** * overridden to append the filename * * @see IndexData#getId() */ public String getId() { return new StringBuffer(super.getId()).append(".file.").append(filename).toString(); } /** * @return a string containing the result of {@link IndexData#getFullText} plus the full text content of this * attachment, as far as it could be extracted. */ public String getFullText(XWikiDocument doc, XWikiContext context) { StringBuffer retval = new StringBuffer(super.getFullText(doc, context)); String contentText = null; contentText = getContentAsText(doc, context); if (contentText != null) { retval.append(" ").append(contentText).toString(); } return retval.toString(); } private String getContentAsText(XWikiDocument doc, XWikiContext context) { String contentText = null; try { XWikiAttachment att = doc.getAttachment(filename); LOG.debug("have attachment for filename " + filename + ": " + att); byte[] content = att.getContent(context); if (filename != null) { String[] nameParts = filename.split("\\."); if (nameParts.length > 1) { contentText = TextExtractor.getText(content, MIMETYPES.get(nameParts[nameParts.length - 1].toLowerCase())); } } } catch (Exception e) { LOG.error("error getting content of attachment", e); } return contentText; } }