/** * OLAT - Online Learning and Training<br> * http://www.olat.org * <p> * Licensed under the Apache License, Version 2.0 (the "License"); <br> * you may not use this file except in compliance with the License.<br> * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing,<br> * software distributed under the License is distributed on an "AS IS" BASIS, <br> * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> * See the License for the specific language governing permissions and <br> * limitations under the License. * <p> * Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br> * University of Zurich, Switzerland. * <hr> * <a href="http://www.openolat.org"> * OpenOLAT - Online Learning and Training</a><br> * This file has been modified by the OpenOLAT community. Changes are licensed * under the Apache 2.0 license as the original file. */ package org.olat.search.service.document.file; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import org.apache.lucene.document.Document; import org.olat.core.CoreSpringFactory; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; import org.olat.search.service.SearchServiceFactory; import org.olat.search.service.document.file.pdf.PdfExtractor; /** * Lucene document mapper. * @author Christian Guretzki */ public class PdfDocument extends FileDocument { private static final long serialVersionUID = 6432923202585881794L; private static final OLog log = Tracing.createLoggerFor(PdfDocument.class); public final static String FILE_TYPE = "type.file.pdf"; private boolean externalIndexer; private String pdfTextBufferPath; private String filePath; public PdfDocument() { this(SearchServiceFactory.getService().getSearchModuleConfig().getPdfTextBufferPath(), SearchServiceFactory.getService().getSearchModuleConfig().isPdfExternalIndexer()); } public PdfDocument(String pdfTextBufferPath, boolean externalIndexer) { this.pdfTextBufferPath = pdfTextBufferPath; this.externalIndexer = externalIndexer; } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException,DocumentException,DocumentAccessException { PdfDocument textDocument = new PdfDocument(); textDocument.setFilePath(getPdfTextTmpFilePath(leafResourceContext)); textDocument.init(leafResourceContext,leaf); textDocument.setFileType(FILE_TYPE); textDocument.setCssIcon(CSSHelper.createFiletypeIconCssClassFor(leaf.getName())); if (log.isDebug() ) log.debug(textDocument.toString()); return textDocument.getLuceneDocument(); } private void setFilePath(String filePath2) { this.filePath = filePath2; } /** * Create a file-path for certain SearchResourceContext. * E.g. '04\1601914104anuale_print.pdf' */ private static String getPdfTextTmpFilePath(SearchResourceContext leafResourceContext) { int hashCode = Math.abs(leafResourceContext.getResourceUrl().hashCode()); String hashCodeAsString = Integer.toString(hashCode); String splitDirName = hashCodeAsString.substring(hashCodeAsString.length()-2); String pdfTextTmpFilePath = splitDirName + File.separator + hashCodeAsString + leafResourceContext.getFilePath(); if (log.isDebug()) log.debug("PdfTextTmpFilePath=" + pdfTextTmpFilePath); return pdfTextTmpFilePath; } @Override protected FileContent readContent(VFSLeaf leaf) throws DocumentException, DocumentAccessException { try { String bean = externalIndexer ? "pdfExternalIndexer" : "pdfInternalIndexer"; PdfExtractor extractor = (PdfExtractor)CoreSpringFactory.getBean(bean); File pdfTextFile = new File(pdfTextBufferPath, getFilePath() + ".tmp"); if (isNewPdfFile(leaf, pdfTextFile)) { //prepare dirs if(!pdfTextFile.getParentFile().exists()) { pdfTextFile.getParentFile().mkdirs(); } extractor.extract(leaf, pdfTextFile); } // text file with extracted text exist => read pdf text from there return getPdfTextFromBuffer(pdfTextFile); } catch (DocumentAccessException ex) { // pass exception throw ex; } catch (Exception ex) { throw new DocumentException("Can not read PDF content. File=" + leaf.getName(), ex); } } private FileContent getPdfTextFromBuffer(File pdfTextFile) throws IOException { if (log.isDebug()) log.debug("readContent from text file start..."); try(BufferedReader br = new BufferedReader(new FileReader(pdfTextFile)); LimitedContentWriter sb = new LimitedContentWriter(5000, FileDocumentFactory.getMaxFileSize())) { //search the title char[] cbuf = new char[4096]; int length = br.read(cbuf); int indexSep = 0; String title = ""; if(length > 0) { String firstChunk = new String(cbuf, 0, length); indexSep = firstChunk.indexOf("\u00A0|\u00A0"); if(indexSep > 0) { title = firstChunk.substring(0, indexSep); sb.append(firstChunk.substring(indexSep + 3)); } else { sb.append(firstChunk); } while((length = br.read(cbuf)) > 0) { sb.write(cbuf, 0, length); } } return new FileContent(title, sb.toString()); } catch(IOException e) { throw e; } } private String getFilePath() { return filePath; } private boolean isNewPdfFile(VFSLeaf leaf, File pdfTextFile) { if (pdfTextFile == null) { return true; } if (!pdfTextFile.exists()) { return true; } if (leaf.getLastModified() > pdfTextFile.lastModified() ) { // pdf file is newer => delete it pdfTextFile.delete(); return true; } return false; } }