/* * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * This is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this software; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA, or see the FSF site: http://www.fsf.org. */ package org.xwiki.officeimporter.internal.builder; import java.io.InputStream; import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.Collections; import java.util.HashMap; import java.util.Map; import javax.inject.Inject; import javax.inject.Named; import javax.inject.Provider; import javax.inject.Singleton; import org.apache.commons.lang3.StringUtils; import org.w3c.dom.Document; import org.xwiki.bridge.DocumentAccessBridge; import org.xwiki.component.annotation.Component; import org.xwiki.component.manager.ComponentManager; import org.xwiki.model.reference.DocumentReference; import org.xwiki.model.reference.EntityReferenceSerializer; import org.xwiki.officeimporter.OfficeImporterException; import org.xwiki.officeimporter.builder.PresentationBuilder; import org.xwiki.officeimporter.converter.OfficeConverterException; import org.xwiki.officeimporter.document.XDOMOfficeDocument; import org.xwiki.officeimporter.server.OfficeServer; import org.xwiki.rendering.block.Block; import org.xwiki.rendering.block.ExpandedMacroBlock; import org.xwiki.rendering.block.XDOM; import org.xwiki.rendering.listener.MetaData; import org.xwiki.rendering.parser.Parser; import org.xwiki.rendering.renderer.BlockRenderer; import org.xwiki.xml.html.HTMLCleaner; import org.xwiki.xml.html.HTMLCleanerConfiguration; import org.xwiki.xml.html.HTMLUtils; /** * Default implementation of {@link PresentationBuilder}. * * @version $Id: df9806d2a7ed07446dab63f6c1ae6b020f8ae713 $ * @since 2.1M1 */ @Component @Singleton public class DefaultPresentationBuilder implements PresentationBuilder { /** * Provides the component manager used by {@link XDOMOfficeDocument}. */ @Inject @Named("context") private Provider<ComponentManager> contextComponentManagerProvider; /** * Used to obtain document converter. */ @Inject private OfficeServer officeServer; /** * Used to access current context document. */ @Inject private DocumentAccessBridge documentAccessBridge; /** * Used to serialize the reference document name. */ @Inject private EntityReferenceSerializer<String> entityReferenceSerializer; /** * Office HTML cleaner. */ @Inject @Named("openoffice") private HTMLCleaner officeHTMLCleaner; /** * The component used to parse the XHTML obtained after cleaning. */ @Inject @Named("xhtml/1.0") private Parser xhtmlParser; @Override public XDOMOfficeDocument build(InputStream officeFileStream, String officeFileName, DocumentReference documentReference) throws OfficeImporterException { // Invoke the office document converter. Map<String, byte[]> artifacts = importPresentation(officeFileStream, officeFileName); // Create presentation HTML. String html = buildPresentationHTML(artifacts, StringUtils.substringBeforeLast(officeFileName, ".")); // Clear and adjust presentation HTML (slide image URLs are updated to point to the corresponding attachments). html = cleanPresentationHTML(html, documentReference); // Create the XDOM. XDOM xdom = buildPresentationXDOM(html, documentReference); return new XDOMOfficeDocument(xdom, artifacts, this.contextComponentManagerProvider.get()); } /** * Invokes the Office Server to convert the given input stream. The result is a map of artifacts including slide * images. * * @param officeFileStream the office presentation byte stream * @param officeFileName the name of the office presentation that is being imported * @return the map of artifacts created by the Office Server * @throws OfficeImporterException if converting the office presentation fails */ protected Map<String, byte[]> importPresentation(InputStream officeFileStream, String officeFileName) throws OfficeImporterException { Map<String, InputStream> inputStreams = new HashMap<String, InputStream>(); inputStreams.put(officeFileName, officeFileStream); try { // The office converter uses the output file name extension to determine the output format/syntax. // The returned artifacts are of three types: imgX.jpg (slide screen shot), imgX.html (HTML page that // display the corresponding slide screen shot) and textX.html (HTML page that display the text extracted // from the corresponding slide). We use "img0.html" as the output file name because the corresponding // artifact displays a screen shot of the first presentation slide. return this.officeServer.getConverter().convert(inputStreams, officeFileName, "img0.html"); } catch (OfficeConverterException e) { String message = "Error while converting document [%s] into html."; throw new OfficeImporterException(String.format(message, officeFileName), e); } } /** * Builds the presentation HTML from the presentation artifacts. There are two types of presentation artifacts: * slide image and slide text. The returned HTML will display all the slide images. Slide text is currently ignored. * All artifacts except slide images are removed from {@code presentationArtifacts}. Slide images names are prefixed * with the given {@code nameSpace} to avoid name conflicts. * * @param presentationArtifacts the map of presentation artifacts; this method removes some of the presentation * artifacts and renames others so be aware of the side effects * @param nameSpace the prefix to add in front of all slide image names to prevent name conflicts * @return the presentation HTML */ protected String buildPresentationHTML(Map<String, byte[]> presentationArtifacts, String nameSpace) { StringBuilder presentationHTML = new StringBuilder(); // Iterate all the slides. int i = 0; String slideImageKeyFormat = "img%s.jpg"; byte[] slideImage = presentationArtifacts.remove(String.format(slideImageKeyFormat, i)); while (slideImage != null) { // Remove unused artifacts. // imgX.html is an HTML page that displays the corresponding slide image. presentationArtifacts.remove(String.format("img%s.html", i)); // textX.html is an HTML page that displays the text extracted from the corresponding slide. presentationArtifacts.remove(String.format("text%s.html", i)); // Rename slide image to prevent name conflicts when it will be attached to the target document. String slideImageName = String.format("%s-slide%s.jpg", nameSpace, i); presentationArtifacts.put(slideImageName, slideImage); // Append slide image to the presentation HTML. String slideImageURL = null; try { // We need to encode the slide image name in case it contains special URL characters. slideImageURL = URLEncoder.encode(slideImageName, "UTF-8"); } catch (UnsupportedEncodingException e) { // This should never happen. } presentationHTML.append(String.format("<p><img src=\"%s\"/></p>", slideImageURL)); // Move to the next slide. slideImage = presentationArtifacts.remove(String.format(slideImageKeyFormat, ++i)); } return presentationHTML.toString(); } /** * Cleans the presentation HTML. This method must be called mainly to ensure that the slide image URLs are updated * to point to the corresponding attachments. * * @param dirtyHTML the HTML to be cleaned * @param targetDocumentReference the document where the slide images will be attached * @return the cleaned HTML */ protected String cleanPresentationHTML(String dirtyHTML, DocumentReference targetDocumentReference) { HTMLCleanerConfiguration configuration = this.officeHTMLCleaner.getDefaultConfiguration(); configuration.setParameters(Collections.singletonMap("targetDocument", this.entityReferenceSerializer.serialize(targetDocumentReference))); Document xhtmlDocument = this.officeHTMLCleaner.clean(new StringReader(dirtyHTML), configuration); HTMLUtils.stripHTMLEnvelope(xhtmlDocument); return HTMLUtils.toString(xhtmlDocument); } /** * Parses the given HTML text into an XDOM tree. * * @param html the HTML text to parse * @param targetDocumentReference specifies the document where the presentation will be imported; we use the target * document reference to get the syntax of the target document and to set the {@code BASE} meta data on * the created XDOM * @return a XDOM tree * @throws OfficeImporterException if parsing the given HTML fails */ protected XDOM buildPresentationXDOM(String html, DocumentReference targetDocumentReference) throws OfficeImporterException { try { ComponentManager contextComponentManager = this.contextComponentManagerProvider.get(); String syntaxId = this.documentAccessBridge.getDocument(targetDocumentReference).getSyntax().toIdString(); BlockRenderer renderer = contextComponentManager.getInstance(BlockRenderer.class, syntaxId); Map<String, String> galleryParameters = Collections.emptyMap(); ExpandedMacroBlock gallery = new ExpandedMacroBlock("gallery", galleryParameters, renderer, false, contextComponentManager); gallery.addChild(this.xhtmlParser.parse(new StringReader(html))); XDOM xdom = new XDOM(Collections.singletonList((Block) gallery)); // Make sure (image) references are resolved relative to the target document reference. xdom.getMetaData().addMetaData(MetaData.BASE, entityReferenceSerializer.serialize(targetDocumentReference)); return xdom; } catch (Exception e) { throw new OfficeImporterException("Failed to build presentation XDOM.", e); } } }