//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentmanipulators;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import uk.gov.dstl.baleen.contentmanipulators.helpers.ContentManipulator;
/**
* Recursively remove empty HTML tags to clean the document.
*
* This will not remove the body tag, but everything else will be removed if it is empty (or only
* holds empty elements).
*
*/
public class RemoveEmptyText implements ContentManipulator {

  /**
   * Remove every element under the document body that matches jsoup's {@code :empty} selector,
   * repeating until a pass finds nothing left to remove.
   *
   * <p>The body element itself is never removed. If the document has no body element this method
   * does nothing.
   *
   * @param document the document to clean in place
   */
  @Override
  public void manipulate(Document document) {
    Element body = document.body();
    if (body == null) {
      // Some jsoup versions return null here when the document has no body element
      // (for example a frameset page); there is nothing to clean in that case.
      return;
    }

    // Removing empty elements can leave their parents empty in turn, so keep sweeping
    // until a pass removes nothing.
    boolean removedAny;
    do {
      removedAny = removeEmpty(body);
    } while (removedAny);
  }

  /**
   * Perform a single sweep, removing all currently empty elements under (but not including) the
   * body.
   *
   * @param root the element to search from
   * @return true if at least one element was removed, false if nothing matched
   */
  private boolean removeEmpty(Element root) {
    // select() can match the root itself, so explicitly exclude the body tag.
    Elements emptyNodes = root.select(":empty").not("body");
    if (emptyNodes.isEmpty()) {
      return false;
    }

    emptyNodes.remove();
    return true;
  }
}