/** * OLAT - Online Learning and Training<br> * http://www.olat.org * <p> * Licensed under the Apache License, Version 2.0 (the "License"); <br> * you may not use this file except in compliance with the License.<br> * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing,<br> * software distributed under the License is distributed on an "AS IS" BASIS, <br> * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br> * See the License for the specific language governing permissions and <br> * limitations under the License. * <p> * Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br> * University of Zurich, Switzerland. * <hr> * <a href="http://www.openolat.org"> * OpenOLAT - Online Learning and Training</a><br> * This file has been modified by the OpenOLAT community. Changes are licensed * under the Apache 2.0 license as the original file. */ package org.olat.search.service.document.file; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Writer; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.document.Document; import org.apache.poi.poifs.eventfilesystem.POIFSReader; import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.util.LittleEndian; import org.olat.core.gui.util.CSSHelper; import org.olat.core.logging.OLog; import org.olat.core.logging.Tracing; import org.olat.core.util.io.LimitedContentWriter; import org.olat.core.util.vfs.VFSLeaf; import org.olat.search.service.SearchResourceContext; /** * Lucene document mapper. * @author Christian Guretzki */ public class PowerPointDocument extends FileDocument { private static final long serialVersionUID = -6107766953370631805L; private static final OLog log = Tracing.createLoggerFor(PowerPointDocument.class); public final static String FILE_TYPE = "type.file.ppt"; public PowerPointDocument() { super(); } public static Document createDocument(SearchResourceContext leafResourceContext, VFSLeaf leaf) throws IOException,DocumentException,DocumentAccessException { PowerPointDocument powerPointDocument = new PowerPointDocument(); powerPointDocument.init(leafResourceContext,leaf); powerPointDocument.setFileType(FILE_TYPE); powerPointDocument.setCssIcon(CSSHelper.createFiletypeIconCssClassFor(leaf.getName())); if (log.isDebug()) log.debug(powerPointDocument.toString()); return powerPointDocument.getLuceneDocument(); } @Override public FileContent readContent(VFSLeaf leaf) throws IOException,DocumentException { if (log.isDebug()) log.debug("read PPT Content of leaf=" + leaf.getName()); try (BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream())) { LimitedContentWriter oStream = new LimitedContentWriter(100000, FileDocumentFactory.getMaxFileSize()); extractText(bis, oStream); return new FileContent(oStream.toString()); } catch (Exception e) { throw new DocumentException("Can not read PPT Content. File=" + leaf.getName(), e); } } private void extractText(InputStream inStream, Writer outWriter) throws IOException { POIFSReader r = new POIFSReader(); /* Register a listener for *all* documents. */ r.registerListener(new MyPOIFSReaderListener(outWriter)); r.read(inStream); } private static class MyPOIFSReaderListener implements POIFSReaderListener { private final Writer oStream; public MyPOIFSReaderListener(Writer oStream) { this.oStream = oStream; } @Override public void processPOIFSReaderEvent(POIFSReaderEvent event) { int errorCounter = 0; try { DocumentInputStream dis = event.getStream(); byte btoWrite[] = new byte[dis.available()]; dis.read(btoWrite, 0, dis.available()); for (int i = 0; i < btoWrite.length - 20; i++) { long type = LittleEndian.getUShort(btoWrite, i + 2); long size = LittleEndian.getUInt(btoWrite, i + 4); if (type == 4008) { try { String chunk = new String(btoWrite, i + 4 + 1, (int)(size + 3)); oStream.write(removeUnvisibleChars(chunk)); } catch( IndexOutOfBoundsException ex) { errorCounter++; } } } } catch (Exception ex) { // Remove general Exception later, for now make it run log.warn("Can not read PPT content.", ex); } if (errorCounter > 0 && log.isDebug()) { log.debug("Could not parse ppt properly. There were " + errorCounter + " IndexOutOfBoundsException"); } } /** * Remove unvisible chars form input string. * * @param inputString * @return Return filtered string */ private String removeUnvisibleChars(String inputString) { Pattern p = Pattern.compile("[^a-zA-Z0-9\n\r!&#<>{}]"); Matcher m = p.matcher(inputString); String output = m.replaceAll(" "); return output; } } }