/** * OLAT - Online Learning and Training<br> * http://www.olat.org * <p> * Licensed under the Apache License, Version 2.0 (the "License"); <br> * you may not use this file except in compliance with the License.<br> * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing,<br> * software distributed under the License is distributed on an "AS IS" BASIS, <br> * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> * See the License for the specific language governing permissions and <br> * limitations under the License. * <p> * Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br> * University of Zurich, Switzerland. * <hr> * <a href="http://www.openolat.org"> * OpenOLAT - Online Learning and Training</a><br> * This file has been modified by the OpenOLAT community. Changes are licensed * under the Apache 2.0 license as the original file. */ package org.olat.search.service.document.file; import java.io.IOException; import java.util.Date; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.queryparser.classic.ParseException; import org.olat.core.CoreSpringFactory; import org.olat.core.commons.modules.bc.meta.MetaInfo; import org.olat.core.commons.modules.bc.meta.tagged.MetaTagged; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; import org.olat.core.util.vfs.LocalImpl; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.QueryException; import org.olat.search.SearchModule; import org.olat.search.SearchService; import org.olat.search.ServiceNotAvailableException; import org.olat.search.model.AbstractOlatDocument; import org.olat.search.model.OlatDocument; import org.olat.search.service.SearchResourceContext; import org.olat.search.service.SearchServiceImpl; /** * Lucene document mapper. * <p>Supported file-types : * <lu> * <li>pdf => PDF document</li> * <li>xls => Excel document</li> * <li>doc => Word document</li> * <li>ppt => Power-point document</li> * <li>odt, ods, odp, odf, odg => OpenDocument document</li> * <li>htm, html, xhtml, xml => HTML document</li> * <li>txt, tex, README, csv => Text document</li> * @author Christian Guretzki */ public class FileDocumentFactory { private static OLog log = Tracing.createLoggerFor(FileDocumentFactory.class); private final static String PDF_SUFFIX = "pdf"; private final static String EXCEL_SUFFIX = "xls"; private final static String WORD_SUFFIX = "doc"; private final static String POWERPOINT_SUFFIX = "ppt"; private final static String EXCEL_X_SUFFIX = "xlsx"; private final static String WORD_X_SUFFIX = "docx"; private final static String POWERPOINT_X_SUFFIX = "pptx"; private final static String OD_TEXT_SUFFIX = "odt"; private final static String OD_SPREADSHEET_SUFFIX = "ods"; private final static String OD_PRESENTATION_SUFFIX = "odp"; private final static String OD_FORMULA_SUFFIX = "odf"; private final static String OD_GRAPHIC_SUFFIX = "odg"; private final static String HTML_SUFFIX = "htm html xhtml"; private final static String XML_SUFFIX = "xml"; private final static String TEXT_SUFFIX = "txt tex readme csv"; //as a special parser; private static final String IMS_MANIFEST_FILE = "imsmanifest.xml"; private int excludedFileSizeCount = 0; private static SearchModule searchModule; /** * [used by spring] * @param searchModule */ public FileDocumentFactory(SearchModule module) { searchModule = module; } public static int getMaxFileSize() { return searchModule == null ? 120000 : (int)searchModule.getMaxFileSize(); } private Document getDocumentFromCurrentIndex(SearchResourceContext leafResourceContext, VFSLeaf leaf) { try { String resourceUrl = leafResourceContext.getResourceUrl(); SearchService searchService = CoreSpringFactory.getImpl(SearchServiceImpl.class); Document indexedDoc = searchService.doSearch(resourceUrl); if(indexedDoc != null) { String timestamp = indexedDoc.get(AbstractOlatDocument.TIME_STAMP_NAME); if(timestamp != null) { Date indexLastModification = DateTools.stringToDate(timestamp); Date docLastModificationDate = new Date(leaf.getLastModified()); if(leaf instanceof MetaTagged) { MetaInfo metaInfo = ((MetaTagged)leaf).getMetaInfo(); Date metaDate = metaInfo.getMetaLastModified(); if(metaDate != null && metaDate.after(docLastModificationDate)) { docLastModificationDate = metaDate; } } if(docLastModificationDate.compareTo(indexLastModification) < 0) { OlatDocument olatDoc = new OlatDocument(indexedDoc); return olatDoc.getLuceneDocument(); } } } } catch (ServiceNotAvailableException | ParseException | QueryException | java.text.ParseException e) { log.error("", e); } return null; } public Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException, DocumentAccessException { Document indexedDocument = getDocumentFromCurrentIndex(leafResourceContext, leaf); if(indexedDocument != null) { return indexedDocument; } try { Document doc = null; String fileName = leaf.getName(); String suffix = FileTypeDetector.getSuffix(leaf); if (log.isDebug()) log.debug("suffix=" + suffix); if (PDF_SUFFIX.indexOf(suffix) >= 0) { if(searchModule.getPdfFileEnabled()) { doc = PdfDocument.createDocument(leafResourceContext, leaf); } } else if (HTML_SUFFIX.indexOf(suffix) >= 0) { doc = HtmlDocument.createDocument(leafResourceContext, leaf); } else if (XML_SUFFIX.indexOf(suffix) >= 0) { if(IMS_MANIFEST_FILE.equals(fileName)) { doc = IMSMetadataDocument.createDocument(leafResourceContext, leaf); } else { doc = XmlDocument.createDocument(leafResourceContext, leaf); } } else if (TEXT_SUFFIX.indexOf(suffix) >= 0) { doc = TextDocument.createDocument(leafResourceContext, leaf); //microsoft openxml } else if (suffix.indexOf(WORD_X_SUFFIX) >= 0) { doc = WordOOXMLDocument.createDocument(leafResourceContext, leaf); } else if (suffix.indexOf(EXCEL_X_SUFFIX) >= 0) { if (searchModule.getExcelFileEnabled()) { doc = ExcelOOXMLDocument.createDocument(leafResourceContext, leaf); } } else if (suffix.indexOf(POWERPOINT_X_SUFFIX) >= 0) { if(searchModule.getPptFileEnabled()) { doc = PowerPointOOXMLDocument.createDocument(leafResourceContext, leaf); } //microsoft } else if (WORD_SUFFIX.indexOf(suffix) >= 0) { doc = WordDocument.createDocument(leafResourceContext, leaf); } else if (POWERPOINT_SUFFIX.indexOf(suffix) >= 0) { if(searchModule.getPptFileEnabled()) { doc = PowerPointDocument.createDocument(leafResourceContext, leaf); } } else if (EXCEL_SUFFIX.indexOf(suffix) >= 0) { if (searchModule.getExcelFileEnabled()) { doc = ExcelDocument.createDocument(leafResourceContext, leaf); } //open document } else if (OD_TEXT_SUFFIX.indexOf(suffix) >= 0 || OD_SPREADSHEET_SUFFIX.indexOf(suffix) >= 0 || OD_PRESENTATION_SUFFIX.indexOf(suffix) >= 0 || OD_FORMULA_SUFFIX.indexOf(suffix) >= 0 || OD_GRAPHIC_SUFFIX.indexOf(suffix) >= 0) { doc = OpenDocument.createDocument(leafResourceContext, leaf); } if(doc == null) { doc = createUnkownDocument(leafResourceContext, leaf); } return doc; } catch(DocumentNotImplementedException e) { log.warn("Cannot index document (no indexer for it):" + leaf, e); return createUnkownDocument(leafResourceContext, leaf); } catch (DocumentException e) { log.warn("Cannot index document:" + leaf, e); return createUnkownDocument(leafResourceContext, leaf); } } private Document createUnkownDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) { try { return UnkownDocument.createDocument(leafResourceContext, leaf); } catch (Exception e) { return null; } } /** * Check if certain file is supported. * @param fileName * @return */ public boolean isFileSupported(VFSLeaf leaf) { String fileName = leaf.getName(); if (fileName == null || fileName.startsWith(".")) { //don't index all mac os x hidden files return false; } long fileSize = leaf.getSize(); if(fileSize == 0) { return false;// don't index empty files } // 1. Check if file is not on fileBlackList if (searchModule.getFileBlackList().contains(fileName)) { // File name is on blacklist return false; } if(leaf instanceof LocalImpl) { String path = ((LocalImpl)leaf).getBasefile().getAbsolutePath(); if (!isFileSupported(path)) { return false; } } String suffix; try { suffix = FileTypeDetector.getSuffix(leaf); } catch (DocumentNotImplementedException e) { return false; } // 2. Check for certain file-type the file size if (searchModule.getFileSizeSuffixes().contains(suffix)) { long maxFileSize = searchModule.getMaxFileSize(); if ( (maxFileSize != 0) && (fileSize > maxFileSize) ) { log.info("File too big, exlude from search index. filename=" + fileName); excludedFileSizeCount++; return false; } } return true; } public boolean isFileSupported(String path) { return !searchModule.getFileBlackList().contains(path); } public int getExcludedFileSizeCount( ) { return excludedFileSizeCount; } public void resetExcludedFileSizeCount( ) { excludedFileSizeCount = 0; } }