/******************************************************************************* * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ package tml.storage.importers; import org.apache.log4j.Logger; import org.htmlparser.Parser; import org.htmlparser.beans.StringBean; import org.htmlparser.util.ParserException; /** * This importer uses org.htmlpraser to obtain plain text from an HTML file. * * @author Jorge Villalon * */ public class HtmlImporter extends AbstractImporter implements Importer { private static Logger logger = Logger.getLogger(HtmlImporter.class); @Override public String getCleanContent(String content) { String clean = null; try { Parser parser = new Parser(); parser.setInputHTML(content); StringBean bean = new StringBean(); parser.visitAllNodesWith(bean); clean = bean.getStrings(); } catch (ParserException e) { logger.error(e); } return clean; } @Override protected String[] getFileExtensions() { String[] extensions = new String[3]; extensions[0] = "xhtml"; extensions[1] = "html"; extensions[2] = "htm"; return extensions; } }