/* * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * This is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this software; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA, or see the FSF site: http://www.fsf.org. */ package org.xwiki.officeimporter.internal.cleaner; import java.io.Reader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import javax.inject.Inject; import javax.inject.Named; import javax.inject.Singleton; import org.w3c.dom.Document; import org.xwiki.component.annotation.Component; import org.xwiki.xml.html.HTMLCleaner; import org.xwiki.xml.html.HTMLCleanerConfiguration; import org.xwiki.xml.html.filter.HTMLFilter; /** * {@link HTMLCleaner} for cleaning HTML generated from an office server. * * @version $Id: 6a1d4314372915b173afb416ecc6156ae9b87dc0 $ * @since 1.8M1 */ @Component @Named("openoffice") @Singleton public class OfficeHTMLCleaner implements HTMLCleaner { /** * Default html cleaner component used internally. */ @Inject private HTMLCleaner defaultHtmlCleaner; /** * {@link HTMLFilter} for stripping various tags. */ @Inject @Named("officeimporter/stripper") private HTMLFilter stripperFilter; /** * {@link HTMLFilter} filtering styles. */ @Inject @Named("officeimporter/style") private HTMLFilter styleFilter; /** * {@link HTMLFilter} for stripping redundant tags. */ @Inject @Named("officeimporter/redundancy") private HTMLFilter redundancyFilter; /** * {@link HTMLFilter} for cleaning empty paragraphs. */ @Inject @Named("officeimporter/paragraph") private HTMLFilter paragraphFilter; /** * {@link HTMLFilter} for filtering image tags. */ @Inject @Named("officeimporter/image") private HTMLFilter imageFilter; /** * {@link HTMLFilter} for filtering HTML anchors. */ @Inject @Named("officeimporter/anchor") private HTMLFilter anchorFilter; /** * {@link HTMLFilter} for filtering lists. */ @Inject @Named("officeimporter/list") private HTMLFilter listFilter; /** * {@link HTMLFilter} for filtering tables. */ @Inject @Named("officeimporter/table") private HTMLFilter tableFilter; /** * {@link HTMLFilter} for filtering line breaks. */ @Inject @Named("officeimporter/linebreak") private HTMLFilter lineBreakFilter; @Override public Document clean(Reader originalHtmlContent) { // Add special parameters used in filters HTMLCleanerConfiguration configuration = getDefaultConfiguration(); configuration.setParameters(Collections.singletonMap("filterStyles", "strict")); return clean(originalHtmlContent, configuration); } @Override public Document clean(Reader originalHtmlContent, HTMLCleanerConfiguration configuration) { return this.defaultHtmlCleaner.clean(originalHtmlContent, configuration); } @Override public HTMLCleanerConfiguration getDefaultConfiguration() { HTMLCleanerConfiguration configuration = this.defaultHtmlCleaner.getDefaultConfiguration(); // Add office cleaning filters after the default filters. List<HTMLFilter> filters = new ArrayList<HTMLFilter>(configuration.getFilters()); filters.addAll(Arrays.asList( this.stripperFilter, this.styleFilter, this.redundancyFilter, this.paragraphFilter, this.imageFilter, this.anchorFilter, this.listFilter, this.tableFilter, this.lineBreakFilter)); configuration.setFilters(filters); return configuration; } }