/* * Copyright (c) 2013 Websquared, Inc. * All rights reserved. This program and the accompanying materials * are made available under the terms of the GNU Public License v2.0 * which accompanies this distribution, and is available at * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html * * Contributors: * swsong - initial API and implementation */ package org.fastcatsearch.util; import java.util.regex.Pattern; import org.fastcatsearch.ir.common.IRException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class HTMLTagRemover { private static Logger logger = LoggerFactory.getLogger(HTMLTagRemover.class); private static final Pattern ptnTagScript = Pattern.compile("<[\\s]*script[\\s]*[^>]*>[\\s\\S]*?<[\\s]*[/][\\s]*script[\\s]*>", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); private static final Pattern ptnTagStyle = Pattern.compile("<[\\s]*style[\\s]*[^>]*>[\\s\\S]*?<[\\s]*[/][\\s]*style[\\s]*>", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); private static final Pattern ptnTagTitle = Pattern.compile("<[\\s]*?title[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?title[\\s]*?>", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); private static final Pattern ptnContentHtml = Pattern.compile("<[/]?[a-z0-9]+([^>]*)[/]?>", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); private static final Pattern ptnContentComment = Pattern.compile("<![-]*[\\s\\S가-힣ㄱ-ㅎ]+?[-]*>", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); //<!뒤에 -- 가 없을수도.. <!DOCTYPE , >앞에 -- 가 없을수도있다. private static final Pattern ptnChrSpecial = Pattern.compile("([&][a-zA-Z]{1,8}[;])", Pattern.CASE_INSENSITIVE); private static final Pattern ptnChrSpecial2 = Pattern.compile("([&][#][0-9]{1,4}[;])", Pattern.CASE_INSENSITIVE); private static final Pattern ptnSpaces = Pattern.compile("[ \\t]{2,}"); // 빈 공백 긴것 삭제 private static final Pattern ptnMultiLine1 = Pattern.compile("^\\s*[\\n\\r]"); // 여러 공백삭제. private static final Pattern ptnMultiLine2 = Pattern.compile("[\\n\\r\\t]{2,}"); // 여러줄 공백삭제. public static String clean(String targetString) throws IRException { String htmlStr = targetString.replaceAll("<", "<").replaceAll(">", ">"); try { htmlStr = ptnTagScript.matcher(htmlStr).replaceAll(""); // clean script htmlStr = ptnTagStyle.matcher(htmlStr).replaceAll(""); // clean style htmlStr = ptnTagTitle.matcher(htmlStr).replaceAll(""); // clean style htmlStr = ptnContentHtml.matcher(htmlStr).replaceAll(""); // clean html htmlStr = ptnContentComment.matcher(htmlStr).replaceAll(""); // clean add1 htmlStr = htmlStr.replaceAll(""", "\""); htmlStr = htmlStr.replaceAll("'", "'"); htmlStr = ptnChrSpecial.matcher(htmlStr).replaceAll(" "); htmlStr = ptnChrSpecial2.matcher(htmlStr).replaceAll(" "); htmlStr = ptnSpaces.matcher(htmlStr).replaceAll(" "); htmlStr = ptnMultiLine1.matcher(htmlStr).replaceAll(""); htmlStr = ptnMultiLine2.matcher(htmlStr).replaceAll("\n"); return htmlStr; } catch (Exception e) { logger.error("HTML Tag clean Error:" + e.getMessage(), e); throw new IRException(e); } } }