/** * Copyright 2008 - 2009 Pro-Netics S.P.A. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.pronetics.madstore.crawler.impl; import it.pronetics.madstore.crawler.Stage; import it.pronetics.madstore.crawler.model.Page; import org.htmlcleaner.CleanerProperties; import org.htmlcleaner.CompactXmlSerializer; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.htmlcleaner.XmlSerializer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Preprocessing {@link it.pronetics.madstore.crawler.Stage} implementation for cleaning up (X)HTML pages * and forcing UTF-8 charset encoding. * * @author Salvatore Incandela * @author Sergio Bossa */ public class PreprocessingStage implements Stage { private static final Logger LOG = LoggerFactory.getLogger(PreprocessingStage.class); public Page execute(Page page) { try { LOG.info("Cleaning up page: {}", page.getLink()); HtmlCleaner htmlCleaner = new HtmlCleaner(); CleanerProperties cleanerProperties = htmlCleaner.getProperties(); cleanerProperties.setOmitComments(true); cleanerProperties.setTranslateSpecialEntities(false); cleanerProperties.setRecognizeUnicodeChars(false); cleanerProperties.setOmitUnknownTags(true); cleanerProperties.setOmitDoctypeDeclaration(false); cleanerProperties.setOmitXmlDeclaration(false); cleanerProperties.setUseCdataForScriptAndStyle(true); TagNode tagNode = htmlCleaner.clean(page.getData()); tagNode.removeAttribute("xmlns:xml"); XmlSerializer xmlSerializer = new CompactXmlSerializer(cleanerProperties); String cleanedPage = xmlSerializer.getXmlAsString(tagNode, "UTF-8"); LOG.debug("Cleaned page: {}", cleanedPage); return new Page(page.getLink(), cleanedPage); } catch (Exception e) { LOG.warn(e.getMessage(), e); return null; } } }