/* * Created on 30-Nov-2004 * Created by Paul Gardner * Copyright (C) 2004, 2005, 2006 Aelitis, All Rights Reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * AELITIS, SAS au capital de 46,603.30 euros * 8 Allee Lenotre, La Grille Royale, 78600 Le Mesnil le Roi, France. * */ package org.gudy.azureus2.core3.html; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.StringTokenizer; import org.gudy.azureus2.core3.internat.MessageText; import org.gudy.azureus2.core3.xml.util.XUXmlWriter; /** * @author parg * */ public class HTMLUtils { /** * returns a list of strings for each line in a basic text representation * @param indent * @param text * @return */ public static List convertHTMLToText( String indent, String text ) { int pos = 0; text = text.replaceAll("<ol>",""); text = text.replaceAll("</ol>",""); text = text.replaceAll("<ul>",""); text = text.replaceAll("</ul>",""); text = text.replaceAll("</li>",""); text = text.replaceAll("<li>","\n\t*"); String lc_text = text.toLowerCase( MessageText.LOCALE_ENGLISH ); List lines = new ArrayList(); while( true ){ String line; String[] tokens = new String[]{ "<br>", "<p>" }; String token = null; int p1 = -1; for (int i=0;i<tokens.length;i++){ int x = lc_text.indexOf( tokens[i], pos ); if ( x != -1 ){ if ( p1 == -1 || x < p1 ){ token = tokens[i]; p1 = x; } } } if ( p1 == -1 ){ line = text.substring(pos); }else{ line = text.substring(pos,p1); pos = p1+token.length(); } lines.add( indent + line ); if ( p1 == -1 ){ break; } } return( lines ); } public static String convertListToString(List list) { StringBuffer result = new StringBuffer(); String separator = ""; Iterator iter = list.iterator(); while(iter.hasNext()) { String line = iter.next().toString(); result.append(separator); result.append(line); separator = "\n"; } return result.toString(); } public static String convertHTMLToText2( String content ) { int pos = 0; String res = ""; content = removeTagPairs( content, "script" ); content = content.replaceAll( " ", " " ); content = content.replaceAll( "[\\s]+", " " ); while(true){ int p1 = content.indexOf( "<", pos ); if ( p1 == -1 ){ res += content.substring(pos); break; } int p2 = content.indexOf( ">", p1 ); if ( p2 == -1 ){ res += content.substring(pos); break; } String tag = content.substring(p1+1,p2).toLowerCase( MessageText.LOCALE_ENGLISH ); res += content.substring(pos,p1); if ( tag.equals("p") || tag.equals("br")){ if ( res.length() > 0 && res.charAt(res.length()-1) != '\n' ){ res += "\n"; } } pos = p2+1; } res = res.replaceAll( "[ \\t\\x0B\\f\\r]+", " " ); res = res.replaceAll( "[ \\t\\x0B\\f\\r]+\\n", "\n" ); res = res.replaceAll( "\\n[ \\t\\x0B\\f\\r]+", "\n" ); if ( res.length() > 0 && Character.isWhitespace(res.charAt(0))){ res = res.substring(1); } return( res ); } public static String splitWithLineLength( String str, int length ) { String res = ""; StringTokenizer tok = new StringTokenizer(str, "\n"); while( tok.hasMoreTokens()){ String line = tok.nextToken(); while( line.length() > length ){ if ( res.length() > 0 ){ res += "\n"; } boolean done = false; for (int i=length-1;i>=0;i--){ if ( Character.isWhitespace( line.charAt(i))){ done = true; res += line.substring(0,i); line = line.substring(i+1); break; } } if ( !done ){ res += line.substring(0,length); line = line.substring( length ); } } if ( res.length() > 0 && line.length() > 0 ){ res += "\n"; res += line; } } return( res ); } public static String removeTagPairs( String content, String tag_name ) { tag_name = tag_name.toLowerCase( MessageText.LOCALE_ENGLISH ); String lc_content = content.toLowerCase( MessageText.LOCALE_ENGLISH ); int pos = 0; String res = ""; int level = 0; int start_pos = -1; while(true){ int start_tag_start = lc_content.indexOf( "<" + tag_name, pos ); int end_tag_start = lc_content.indexOf( "</" + tag_name, pos ); if ( level == 0 ){ if ( start_tag_start == -1 ){ res += content.substring(pos); break; } res += content.substring(pos,start_tag_start); start_pos = start_tag_start; level = 1; pos = start_pos+1; }else{ if ( end_tag_start == -1 ){ res += content.substring(pos); break; } if ( start_tag_start == -1 || end_tag_start < start_tag_start ){ level--; int end_end = lc_content.indexOf( '>', end_tag_start ); if( end_end == -1 ){ break; } pos = end_end + 1; }else{ level++; pos = start_tag_start+1; } } } return( res ); } public static Object[] getLinks( String content_in ) { int pos = 0; List urls = new ArrayList(); String content_out = ""; String current_url = null; int current_url_start = -1; while(true){ int p1 = content_in.indexOf( "<", pos ); if ( p1 == -1 ){ break; } p1++; int p2 = content_in.indexOf( ">", p1 ); if ( p2 == -1 ){ break; } if ( p1 > pos ){ content_out += content_in.substring( pos, p1-1 ); } pos = p2+1; String tag = content_in.substring( p1, p2 ).trim(); String lc_tag = tag.toLowerCase( MessageText.LOCALE_ENGLISH ); if ( lc_tag.startsWith("a " )){ int hr_start = lc_tag.indexOf( "href"); if ( hr_start == -1 ){ continue; } hr_start = lc_tag.indexOf("=", hr_start); if ( hr_start == -1 ){ continue; } hr_start += 1; while( hr_start < lc_tag.length() && Character.isWhitespace(lc_tag.charAt(hr_start))){ hr_start++; } int hr_end = lc_tag.length()-1; while( hr_end >= lc_tag.length() && Character.isWhitespace(lc_tag.charAt(hr_end))){ hr_end--; } String href = tag.substring(hr_start, hr_end+1 ).trim(); if ( href.startsWith("\"")){ int endQuotePos = href.indexOf('\"', 1); if (endQuotePos == -1) { href = href.substring(1,href.length()-1); } else { href = href.substring(1,endQuotePos); } } current_url = href; current_url_start = content_out.length(); }else if ( lc_tag.startsWith( "/" ) && lc_tag.substring(1).trim().equals( "a" )){ if ( current_url != null ){ int len = content_out.length() - current_url_start; urls.add( new Object[]{ current_url, new int[]{ current_url_start, len }}); } current_url = null; } } if ( pos < content_in.length()){ content_out += content_in.substring( pos ); } return( new Object[]{ content_out, urls }); } public static String expand( String str ) { str = XUXmlWriter.unescapeXML( str ); str = str.replaceAll( " ", " " ); return( str ); } public static void main( String[] args ) { Object[] obj = getLinks( "aaaaaaa <a href=\"http://here/parp \">link< / a > prute <a href=\"http://here/pa\">klink</a>" ); System.out.println( obj[0] ); List urls = (List)obj[1]; for (int i=0;i<urls.size();i++){ Object[] entry = (Object[])urls.get(i); System.out.println( " " + entry[0] + ((int[])entry[1])[0] + "," + ((int[])entry[1])[1] ); } } }