/*******************************************************************************
 * Copyright (c) 2014 Red Hat, Inc.
 * Distributed under license by Red Hat, Inc. All rights reserved.
 * This program is made available under the terms of the
 * Eclipse Public License v1.0 which accompanies this distribution,
 * and is available at http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Red Hat, Inc. - initial API and implementation
 ******************************************************************************/
package org.jboss.tools.cdi.seam.core.test.international;

/**
 * Helper that reduces an HTML fragment to its printable text.
 *
 * @author Victor Rubezhny
 */
public class HTML2TextUtil {

    private static final String STYLE_OPEN = "<style";
    private static final String STYLE_CLOSE = "/style>";
    private static final String COMMENT_OPEN = "<!--";
    private static final String COMMENT_CLOSE = "-->";

    /**
     * Cuts all the html tags/comments/styles from the html-text and returns
     * only the printable text.
     * <p>
     * Processing happens in three steps: every {@code <style ... /style>}
     * section is removed (markers matched ignoring ASCII case), then every
     * {@code <!-- ... -->} section is removed, and finally everything between
     * a {@code '<'} and the next {@code '>'} is dropped. The result is trimmed.
     * Character entities are not decoded.
     *
     * @param html the html text to convert, must not be {@code null}
     * @return plain text
     * @throws NullPointerException if {@code html} is {@code null}
     */
    public static String html2Text(String html) {
        // JBIDE-16135: the CSS part contains font names that are OS and setup
        // dependent, so it has to be excluded from the compared text.
        String cleaned = removeSections(html, STYLE_OPEN, STYLE_CLOSE);

        // JBIDE-16135: pragmas and comments should be removed as well.
        cleaned = removeSections(cleaned, COMMENT_OPEN, COMMENT_CLOSE);

        StringBuilder text = new StringBuilder(cleaned.length());
        boolean insideTag = false;
        for (int i = 0; i < cleaned.length(); i++) {
            char ch = cleaned.charAt(i);
            if (insideTag) {
                // Skip everything up to and including the closing '>'.
                if (ch == '>') {
                    insideTag = false;
                }
            } else if (ch == '<') {
                insideTag = true;
            } else {
                text.append(ch);
            }
        }
        return text.toString().trim();
    }

    /**
     * Removes every section that starts with {@code open} and ends with the
     * first {@code close} found after that start, both markers included.
     * Removal stops at the first opening marker that has no closing marker
     * after it.
     */
    private static String removeSections(String text, String open, String close) {
        StringBuilder buf = new StringBuilder(text);
        int from = 0;
        while (true) {
            int start = indexOfIgnoreAsciiCase(buf, open, from);
            if (start < 0) {
                break;
            }
            // Look for the closing marker only after the opening one, so that
            // a stray closing marker earlier in the text cannot block removal.
            int end = indexOfIgnoreAsciiCase(buf, close, start + 1);
            if (end < 0) {
                break;
            }
            buf.delete(start, end + close.length());
            // Joining both sides of the cut may form a new opening marker that
            // begins up to open.length() - 1 characters before the cut.
            from = Math.max(0, start - open.length() + 1);
        }
        return buf.toString();
    }

    /**
     * Returns the index of the first occurrence of {@code needle} in
     * {@code text} at or after {@code from}, comparing 'A'-'Z' as 'a'-'z', or
     * -1 if there is none. {@code needle} must not contain upper-case letters.
     * Working on the text itself (instead of a lower-cased copy) keeps the
     * returned index valid even when lower-casing would change the length.
     */
    private static int indexOfIgnoreAsciiCase(CharSequence text, String needle, int from) {
        int last = text.length() - needle.length();
        for (int i = Math.max(from, 0); i <= last; i++) {
            int j = 0;
            while (j < needle.length() && toAsciiLower(text.charAt(i + j)) == needle.charAt(j)) {
                j++;
            }
            if (j == needle.length()) {
                return i;
            }
        }
        return -1;
    }

    /** Maps 'A'-'Z' to 'a'-'z' and returns any other character unchanged. */
    private static char toAsciiLower(char ch) {
        return (ch >= 'A' && ch <= 'Z') ? (char) (ch + ('a' - 'A')) : ch;
    }
}