/*
 * This is eMonocot, a global online biodiversity information resource.
 *
 * Copyright © 2011–2015 The Board of Trustees of the Royal Botanic Gardens, Kew and The University of Oxford
 *
 * eMonocot is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * eMonocot is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * The complete text of the GNU Affero General Public License is in the source repository as the file
 * ‘COPYING’. It is also available from <http://www.gnu.org/licenses/>.
 */
package org.emonocot.harvest.common;

import java.io.InputStream;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringEscapeUtils;
import org.owasp.validator.html.AntiSamy;
import org.owasp.validator.html.CleanResults;
import org.owasp.validator.html.Policy;
import org.owasp.validator.html.PolicyException;
import org.owasp.validator.html.ScanException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;

/**
 * Cleans harvested text that may contain HTML markup.
 *
 * <p>Input containing at least one literal tag is entity-decoded and passed through an
 * AntiSamy scan using the configured policy. Input without a literal tag is treated as plain
 * text: it is entity-decoded and NUL characters are removed.
 *
 * <p>{@link #afterPropertiesSet()} loads the policy; until it has been called the policy field
 * is {@code null}.
 */
public class HtmlSanitizer {

    private static final Logger logger = LoggerFactory.getLogger(HtmlSanitizer.class);

    /**
     * Matches a single literal tag-like token. Used with {@code find()} so that a tag is
     * detected anywhere in the input, including input that spans several lines.
     */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /** Maximum number of input characters echoed to the log when a scan fails. */
    private static final int LOG_SNIPPET_LENGTH = 36;

    private Resource policyFile = new ClassPathResource("/META-INF/antisamy-policy.xml");

    private Policy policy = null;

    private AntiSamy antiSamy = new AntiSamy();

    /**
     * Overrides the default policy location.
     *
     * @param policyFile the resource to read the AntiSamy policy from
     */
    public void setPolicyFile(Resource policyFile) {
        this.policyFile = policyFile;
    }

    /**
     * Loads the AntiSamy policy from the configured resource.
     *
     * @throws Exception if the resource cannot be opened or the policy cannot be loaded
     */
    public void afterPropertiesSet() throws Exception {
        InputStream policyStream = policyFile.getInputStream();
        try {
            policy = Policy.getInstance(policyStream);
        } finally {
            // The stream is only needed while the policy is being read.
            policyStream.close();
        }
    }

    /**
     * Sanitizes a possibly HTML-bearing string.
     *
     * @param unclean the raw input; may be {@code null} or empty
     * @return the input itself when it is {@code null} or empty; the AntiSamy-cleaned HTML when
     *         the input contains a literal tag; the entity-decoded text with NUL characters
     *         removed otherwise; {@code null} when the scan fails for a reason other than a
     *         policy error
     * @throws RuntimeException wrapping a {@link PolicyException} raised by the scan
     */
    public String sanitize(String unclean) {
        if (unclean == null || unclean.isEmpty()) {
            return unclean;
        }

        String unescaped = StringEscapeUtils.unescapeHtml(unclean);

        // The tag check runs on the raw input. find() is used instead of String.matches with
        // ".*" padding because "." does not match line terminators, which let multi-line
        // markup skip the scan entirely.
        if (!TAG_PATTERN.matcher(unclean).find()) {
            // NOTE(review): entity-encoded markup (e.g. "&lt;b&gt;") has no literal tag, so it
            // is decoded here and returned without a scan - confirm callers escape on output.
            return unescaped.replace("\0", "");
        }

        try {
            CleanResults cleanResults = antiSamy.scan(unescaped, policy);
            return cleanResults.getCleanHTML();
        } catch (PolicyException pe) {
            throw new RuntimeException(pe);
        } catch (ScanException se) {
            logger.error("Could not sanitize html " + logSnippet(unclean), se);
            return null;
        } catch (Exception e) {
            logger.error("Could not sanitize html " + unclean, e);
            return null;
        }
    }

    /** Returns at most the first {@link #LOG_SNIPPET_LENGTH} characters of {@code text}. */
    private static String logSnippet(String text) {
        if (text.length() > LOG_SNIPPET_LENGTH) {
            return text.substring(0, LOG_SNIPPET_LENGTH);
        }
        return text;
    }
}