/* * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * This is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this software; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA, or see the FSF site: http://www.fsf.org. */ package org.xwiki.officeimporter.internal.builder; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; import javax.inject.Inject; import javax.inject.Named; import javax.inject.Singleton; import org.apache.commons.lang3.StringUtils; import org.apache.tika.parser.html.HtmlEncodingDetector; import org.w3c.dom.Document; import org.xwiki.component.annotation.Component; import org.xwiki.model.reference.DocumentReference; import org.xwiki.model.reference.EntityReferenceSerializer; import org.xwiki.officeimporter.OfficeImporterException; import org.xwiki.officeimporter.builder.XHTMLOfficeDocumentBuilder; import org.xwiki.officeimporter.converter.OfficeConverterException; import org.xwiki.officeimporter.document.XHTMLOfficeDocument; import org.xwiki.officeimporter.server.OfficeServer; import org.xwiki.xml.html.HTMLCleaner; import org.xwiki.xml.html.HTMLCleanerConfiguration; /** * Default implementation of {@link XHTMLOfficeDocumentBuilder}. * * @version $Id: 1bc0fa620a75b41020df2d4bb444ad95834f12b4 $ * @since 2.1M1 */ @Component @Singleton public class DefaultXHTMLOfficeDocumentBuilder implements XHTMLOfficeDocumentBuilder { /** * Used to serialize the reference document name. */ @Inject private EntityReferenceSerializer<String> entityReferenceSerializer; /** * Used to obtain document converter. */ @Inject private OfficeServer officeServer; /** * Office HTML cleaner. */ @Inject @Named("openoffice") private HTMLCleaner officeHtmlCleaner; /** * Used to determine the encoding of the HTML byte array produced by the office server. */ private HtmlEncodingDetector htmlEncodingDetector = new HtmlEncodingDetector(); @Override public XHTMLOfficeDocument build(InputStream officeFileStream, String officeFileName, DocumentReference reference, boolean filterStyles) throws OfficeImporterException { // Invoke the office document converter. Map<String, InputStream> inputStreams = new HashMap<String, InputStream>(); inputStreams.put(officeFileName, officeFileStream); Map<String, byte[]> artifacts; // The office converter uses the output file name extension to determine the output format/syntax. String outputFileName = StringUtils.substringBeforeLast(officeFileName, ".") + ".html"; try { artifacts = this.officeServer.getConverter().convert(inputStreams, officeFileName, outputFileName); } catch (OfficeConverterException ex) { String message = "Error while converting document [%s] into html."; throw new OfficeImporterException(String.format(message, officeFileName), ex); } // Prepare the parameters for HTML cleaning. Map<String, String> params = new HashMap<String, String>(); params.put("targetDocument", this.entityReferenceSerializer.serialize(reference)); // Extract the images that are embedded through the Data URI scheme and add them to the other artifacts so that // they end up as attachments. params.put("attachEmbeddedImages", "true"); if (filterStyles) { params.put("filterStyles", "strict"); } // Parse and clean the HTML output. HTMLCleanerConfiguration configuration = this.officeHtmlCleaner.getDefaultConfiguration(); configuration.setParameters(params); Reader html = getReader(artifacts.remove(outputFileName)); Document xhtmlDoc = this.officeHtmlCleaner.clean(html, configuration); @SuppressWarnings("unchecked") Map<String, byte[]> embeddedImages = (Map<String, byte[]>) xhtmlDoc.getUserData("embeddedImages"); if (embeddedImages != null) { artifacts.putAll(embeddedImages); } // Return a new XHTMLOfficeDocument instance. return new XHTMLOfficeDocument(xhtmlDoc, artifacts); } /** * Detects the proper encoding of the given byte array and returns a reader. * * @param html HTML text as a byte array * @return a reader for the given HTML byte array, that has the proper encoding */ private Reader getReader(byte[] html) { InputStream htmlInputStream = new ByteArrayInputStream(html); Charset charset = null; try { charset = htmlEncodingDetector.detect(htmlInputStream, null); } catch (IOException e) { // Shouldn't happen. } if (charset == null) { charset = Charset.forName("UTF-8"); } return new InputStreamReader(htmlInputStream, charset); } }