/* * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * This is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this software; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA, or see the FSF site: http://www.fsf.org. */ package org.xwiki.officeimporter.internal.filter; import java.net.URLDecoder; import java.nio.charset.Charset; import java.util.HashMap; import java.util.List; import java.util.Map; import javax.inject.Inject; import javax.inject.Named; import javax.inject.Singleton; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; import org.slf4j.Logger; import org.w3c.dom.Attr; import org.w3c.dom.Comment; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.xwiki.bridge.DocumentAccessBridge; import org.xwiki.component.annotation.Component; import org.xwiki.model.reference.AttachmentReference; import org.xwiki.model.reference.DocumentReference; import org.xwiki.model.reference.DocumentReferenceResolver; import org.xwiki.rendering.listener.reference.ResourceReference; import org.xwiki.rendering.listener.reference.ResourceType; import org.xwiki.rendering.renderer.reference.ResourceReferenceSerializer; import org.xwiki.xml.XMLUtils; import 
org.xwiki.xml.html.filter.AbstractHTMLFilter;

import com.github.ooxi.jdatauri.DataUri;

/**
 * This filter performs the following transformations on the {@code <img>} tags:
 * <ul>
 * <li>Changes the image source to point to the attached file and adds the XHTML markers (comments) required in order to
 * convert the XHTML to the right wiki syntax. For this you need to specify the "targetDocument" cleaning
 * parameter.</li>
 * <li>Collects the images embedded through the Data URI scheme when the "attachEmbeddedImages" cleaning parameter is
 * set to true. The result can be accessed from the user data associated with the filtered document, under the
 * "embeddedImages" key. When the parameter is not set to true, images using the Data URI scheme are left
 * untouched.</li>
 * <li>Removes the "align" attribute as it can cause problems. First, the office server has a problem with center
 * aligning images (it aligns them to left). Then, the office server uses {@code <br clear="xxx">} to avoid content
 * wrapping around images which is not valid XHTML.</li>
 * </ul>
 *
 * @version $Id: cddbdba11f4af7911c719bebd1bcec5783c66712 $
 * @since 1.8M1
 */
@Component
@Named("officeimporter/image")
@Singleton
public class ImageFilter extends AbstractHTMLFilter
{
    /**
     * The name of the character encoding used to parse Data URIs and to decode image file names.
     */
    private static final String UTF_8 = "UTF-8";

    /**
     * The key under which the map of collected embedded images (file name to image data) is stored in the user data
     * of the filtered document.
     */
    private static final String EMBEDDED_IMAGES = "embeddedImages";

    /**
     * The prefix of an image source that uses the Data URI scheme.
     */
    private static final String DATA_URI_PREFIX = "data:";

    @Inject
    private Logger logger;

    /**
     * The {@link DocumentAccessBridge} component.
     */
    @Inject
    private DocumentAccessBridge documentAccessBridge;

    /**
     * Used to serialize the image reference as XHTML comment.
     */
    @Inject
    @Named("xhtmlmarker")
    private ResourceReferenceSerializer xhtmlMarkerSerializer;

    /**
     * The component used to parse string document references.
     */
    @Inject
    @Named("currentmixed")
    private DocumentReferenceResolver<String> stringDocumentReferenceResolver;

    /**
     * Filters all the {@code <img>} elements of the given document: rewrites their source when a "targetDocument"
     * cleaning parameter is provided and always removes their "align" attribute.
     *
     * @param htmlDocument the document to filter
     * @param cleaningParams the cleaning parameters; "targetDocument" and "attachEmbeddedImages" are read
     */
    @Override
    public void filter(Document htmlDocument, Map<String, String> cleaningParams)
    {
        String targetDocumentName = cleaningParams.get("targetDocument");
        DocumentReference targetDocumentReference =
            targetDocumentName == null ? null : this.stringDocumentReferenceResolver.resolve(targetDocumentName);

        // parseBoolean() maps a missing (null) parameter to false without creating a boxed Boolean.
        boolean attachEmbeddedImages = Boolean.parseBoolean(cleaningParams.get("attachEmbeddedImages"));
        if (attachEmbeddedImages) {
            // The presence of this map is what later enables the collection of embedded images.
            htmlDocument.setUserData(EMBEDDED_IMAGES, new HashMap<String, byte[]>(), null);
        }

        List<Element> images = filterDescendants(htmlDocument.getDocumentElement(), new String[] {TAG_IMG});
        for (Element image : images) {
            Attr source = image.getAttributeNode(ATTRIBUTE_SRC);
            if (source != null && targetDocumentReference != null) {
                filterImageSource(source, targetDocumentReference);
            }

            // The 'align' attribute of images creates a lot of problems. First, the office server has a problem with
            // center aligning images (it aligns them to left). Next, the office server uses <br clear="xxx"> for
            // avoiding content wrapping around images which is not valid XHTML. Therefore, to be consistent and
            // simple we will remove the 'align' attribute of all the images so that they are all left aligned.
            image.removeAttribute(ATTRIBUTE_ALIGN);
        }
    }

    /**
     * Makes the given image source point to an attachment of the target document and surrounds the image element with
     * the "startimage" / "stopimage" XHTML comments. Nothing is changed when no file name can be determined.
     *
     * @param source the {@code src} attribute of the image
     * @param targetDocumentReference the document the image is expected to be attached to
     */
    private void filterImageSource(Attr source, DocumentReference targetDocumentReference)
    {
        String fileName = null;
        try {
            fileName = getFileName(source);
        } catch (Exception e) {
            this.logger.warn("Failed to extract the image file name. Root cause is [{}]",
                ExceptionUtils.getRootCauseMessage(e));
            this.logger.debug("Full stacktrace is: ", e);
        }
        if (StringUtils.isEmpty(fileName)) {
            // Either the extraction failed or the source is not something we can map to an attachment.
            return;
        }

        // Set image source attribute relative to the reference document.
        AttachmentReference attachmentReference = new AttachmentReference(fileName, targetDocumentReference);
        source.setValue(this.documentAccessBridge.getAttachmentURL(attachmentReference, false));

        ResourceReference imageReference = new ResourceReference(fileName, ResourceType.ATTACHMENT);
        imageReference.setTyped(false);

        Document ownerDocument = source.getOwnerDocument();
        Comment beforeComment = ownerDocument.createComment(
            XMLUtils.escapeXMLComment("startimage:" + this.xhtmlMarkerSerializer.serialize(imageReference)));
        Comment afterComment = ownerDocument.createComment("stopimage");

        Element image = source.getOwnerElement();
        image.getParentNode().insertBefore(beforeComment, image);
        // Inserting before the next sibling places the comment right after the image (appends when there is none).
        image.getParentNode().insertBefore(afterComment, image.getNextSibling());
    }

    /**
     * Determines the name of the image file targeted by the given source attribute.
     *
     * @param source the {@code src} attribute of the image
     * @return the image file name, or {@code null} when the source is neither a collected embedded image nor a
     *         relative path (e.g. an absolute URL, or a Data URI when embedded images are not collected)
     * @throws MimeTypeException if the media type of an embedded image cannot be resolved
     */
    private String getFileName(Attr source) throws MimeTypeException
    {
        String value = source.getValue();

        if (value.startsWith(DATA_URI_PREFIX)) {
            @SuppressWarnings("unchecked")
            Map<String, byte[]> embeddedImages =
                (Map<String, byte[]>) source.getOwnerDocument().getUserData(EMBEDDED_IMAGES);
            // A Data URI is never a relative path: when embedded images are not collected we must leave it alone
            // instead of extracting a bogus file name from its payload.
            return embeddedImages == null ? null : collectEmbeddedImage(value, embeddedImages);
        }

        if (!value.contains("://")) {
            return getRelativeFileName(value);
        }

        return null;
    }

    /**
     * Parses an image embedded using the Data URI scheme, stores its data in the given map and returns the file name
     * it was stored under.
     *
     * @param value the Data URI
     * @param embeddedImages the map (file name to image data) collecting the embedded images
     * @return the file name declared by the Data URI, or a generated one when none is declared
     * @throws MimeTypeException if the media type of the Data URI cannot be resolved
     */
    private String collectEmbeddedImage(String value, Map<String, byte[]> embeddedImages) throws MimeTypeException
    {
        DataUri dataURI = DataUri.parse(value, Charset.forName(UTF_8));
        String fileName = dataURI.getFilename();
        if (StringUtils.isEmpty(fileName)) {
            // Widen to long before taking the absolute value: Math.abs(Integer.MIN_VALUE) is still negative and
            // would produce a file name starting with '-'.
            fileName = String.valueOf(Math.abs((long) dataURI.hashCode()));
            if (!StringUtils.isEmpty(dataURI.getMime())) {
                String extension = MimeTypes.getDefaultMimeTypes().forName(dataURI.getMime()).getExtension();
                fileName += extension;
            }
        }
        embeddedImages.put(fileName, dataURI.getData());
        return fileName;
    }

    /**
     * Extracts the (URL-decoded) file name from a relative image path.
     *
     * @param value the relative path of the image
     * @return the last path segment, URL-decoded when possible and as found in the path otherwise
     */
    private String getRelativeFileName(String value)
    {
        int separator = value.lastIndexOf('/');
        String fileName = separator < 0 ? value : value.substring(separator + 1);
        try {
            // We have to decode the image file name in case it contains URL special characters.
            return URLDecoder.decode(fileName, UTF_8);
        } catch (Exception e) {
            // Happens for instance when the name contains a malformed '%' escape. Use the encoded image file name.
            this.logger.debug("Failed to decode the image file name [{}]. Using the encoded name.", fileName, e);
            return fileName;
        }
    }
}