/*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package org.xwiki.officeimporter.internal.filter;
import java.util.List;
import java.util.Map;
import javax.inject.Named;
import javax.inject.Singleton;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xwiki.component.annotation.Component;
import org.xwiki.xml.html.filter.AbstractHTMLFilter;
import org.xwiki.xml.html.filter.ElementSelector;
/**
* This filter is used to remove those tags that doesn't play any role with the representation of information. This type
* of tags can result from other filters (like the style filter) or Open Office specific formatting choices (like
* newlines being represented by empty paragraphs). For an example, empty {@code <span/>} or {@code <div/>} tags will be
* ripped off within this filter.
*
* @version $Id: d44cf476ea96ac774cc2aa4d03bba8a1fc46f78a $
* @since 1.8M1
*/
@Component
@Named("officeimporter/redundancy")
@Singleton
public class RedundancyFilter extends AbstractHTMLFilter
{
/**
* List of those tags which will be filtered if no attributes are present.
*/
private static final String[] FILTERED_IF_NO_ATTRIBUTES_TAGS = new String[] {TAG_SPAN, TAG_DIV};
/**
* List of those tags which will be filtered if no textual content is present inside them.
*/
private static final String[] FILTERED_IF_NO_CONTENT_TAGS = new String[] {
TAG_EM, TAG_STRONG, TAG_DFN, TAG_CODE, TAG_SAMP, TAG_KBD, TAG_VAR, TAG_CITE, TAG_ABBR,
TAG_ACRONYM, TAG_ADDRESS, TAG_BLOCKQUOTE, TAG_Q, TAG_PRE, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6};
@Override
public void filter(Document document, Map<String, String> cleaningParams)
{
List<Element> elementsWithNoAttributes =
filterDescendants(document.getDocumentElement(), FILTERED_IF_NO_ATTRIBUTES_TAGS, new ElementSelector()
{
@Override
public boolean isSelected(Element element)
{
return !element.hasAttributes();
}
});
for (Element element : elementsWithNoAttributes) {
replaceWithChildren(element);
}
List<Element> elementsWithNoContent =
filterDescendants(document.getDocumentElement(), FILTERED_IF_NO_CONTENT_TAGS, new ElementSelector()
{
@Override
public boolean isSelected(Element element)
{
return element.getTextContent().trim().equals("");
}
});
for (Element element : elementsWithNoContent) {
String textContent = element.getTextContent();
if (textContent.equals("")) {
element.getParentNode().removeChild(element);
} else {
element.setTextContent(textContent.replaceAll(" ", " "));
}
}
}
}