/**
MIT License
Copyright (c) 2009
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
package sw4j.util.web;
import java.net.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import sw4j.util.Sw4jException;
import sw4j.util.ToolSafe;
import sw4j.util.ToolURI;
/**
* Provides utility functions for processing URLs and HTML/XML text,
* including HTML character escape encoding.
*
* @author Li Ding
*
* Feb 22, 2007, li updated escape HTML code
*/
public class ToolWeb {
/** Length (in characters) at or above which isLongURL and isLongURI report their argument as long. */
public final static int MAX_LEN_URL = 250;
/** Length (in characters) at or above which isLongHostURL reports its argument as long. */
public final static int MAX_LEN_HOST_URL = 200;
/** Longest suffix (in characters) that getSuffix will return; longer candidates are rejected. */
public final static int MAX_LEN_SUFFIX = 10;
/**
 * Command-line entry point; runs the suffix-extraction demo only.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args)
{
    testSuffix();
}
/**
 * Runs every demo routine of this class: first the HTML escaping demo,
 * then the suffix-extraction demo. Results are printed to standard output.
 */
public static void testAll()
{
    testEscape();
    testSuffix();
}
/**
 * Demo of the HTML escaping round trip: prints a sample name, its escaped
 * form, and the result of unescaping that escaped form again.
 */
public static void testEscape(){
// NOTE(review): the sample contains a character that does not survive the
// file's current encoding; presumably an accented letter -- confirm and
// consider replacing it with a Unicode escape.
String temp = "Holly Br�gge Jimison";
System.out.println(temp);
// escape: characters found in the escape table become entity references
temp = ToolWeb.escapeHTML(temp);
System.out.println(temp);
// unescape: should print the original text again
temp = ToolWeb.unescapeHTML(temp);
System.out.println(temp);
}
/*
public static void testValidateURI(){
String [] szURIs = new String []{
//bad
"http",
"http://",
"xyz://da.bb/asdf",
"http://da.bb/asdf\"asdfas",
"http:///aa",
"http://high_g.ciao.jp/blog/index.rdf", // cannot have _ in host name according to RFC
"http://:980/sadf",
"irc://localhost/foo#SHA",
//correct
"http://daf:8080/aa",
"http://aa.d/%7edaf%3baads",
"http://sa.d?http://dfa.d#dafse",
"http://sa.d?http://dfa.d",
"http://dev.w3.org/cvsweb/2000/10/swap/Attic/logic.n3?rev=1.2",
"http://sw.deri.org/~aharth/2004/11/rdfquery-perf/univ20/University18_2.nt",
"mailto://da.bb/asdf",
"HTTp://aa.bb", //
"http://aa.bb",
};
for (int i=0; i<szURIs.length; i++){
System.out.println("testing: " + szURIs[i]);
boolean ret =validateAbsoluteURI(szURIs[i]);
System.out.println("====> "+ret);
System.out.println();
}
}
*/
/**
 * Demo of {@link #getSuffix(String)}: prints each sample URI followed by
 * the suffix extracted from it.
 */
public static void testSuffix() {
    final String[] samples = {
        "http://dev.w3.org/cvsweb/2000/10/swap/Attic/logic.n3?rev=1.2",
        "http://sw.deri.org/~aharth/2004/11/rdfquery-perf/univ20/University18_2.nt",
        "http://onohiroki.cycling.jp/tb/tb.cgi",
        "http://dev.w3.org/cvsweb/2000/10/swap/grammar/n3.n3?rev=1.21",
        "http://jip.kwark.org/Gfx/2000/09/edin.lela.jpg.html?tmpl=image-foaf",
        "http://www.Department14.University12.edu/FullProfessor4",
        "http://www.livejournal.com/users/idoj_nilbog/data/foaf",
        "http://orlando.openguides.org/index.cgi?id=Text_Formatting_Examples;format=rdf",
        "http://www.communityprogrammes.org.uk/events/cgfl-london/.meta.rdf",
    };
    for (String sample : samples) {
        System.out.println("testing: " + sample);
        System.out.println("====> " + getSuffix(sample));
        System.out.println();
    }
}
/////////////////////////////////////////////
// validate URL
/////////////////////////////////////////////
/////////////////////////////////////////////
// process URL
/////////////////////////////////////////////
/**
 * Tells whether a URL counts as long.
 *
 * @param szURL the URL text; must not be null
 * @return true when the text has MAX_LEN_URL or more characters
 */
public static boolean isLongURL(String szURL) {
    final int length = szURL.length();
    return length >= MAX_LEN_URL;
}
/**
 * Tells whether a URI counts as long; uses the same threshold as isLongURL.
 *
 * @param szURL the URI text; must not be null
 * @return true when the text has MAX_LEN_URL or more characters
 */
public static boolean isLongURI(String szURL) {
    final boolean withinLimit = szURL.length() < MAX_LEN_URL;
    return !withinLimit;
}
/**
 * Tells whether a host URL counts as long.
 *
 * @param szURL the host URL text; must not be null
 * @return true when the text has MAX_LEN_HOST_URL or more characters
 */
public static boolean isLongHostURL(String szURL) {
    final int length = szURL.length();
    return length >= MAX_LEN_HOST_URL;
}
/**
 * Decodes a URL-encoded (application/x-www-form-urlencoded) string as UTF-8.
 * This is a best-effort operation: when the input cannot be decoded, the
 * stack trace is printed and the input is handed back unchanged.
 *
 * @param szURL the encoded text
 * @return the decoded text, or szURL itself when decoding fails
 */
public static final String decodeURL(String szURL) {
    String decoded = szURL; // fallback: cannot decode it
    try {
        decoded = URLDecoder.decode(szURL, "UTF8");
    } catch (java.io.UnsupportedEncodingException e) {
        e.printStackTrace();
    } catch (IllegalArgumentException e) {
        // malformed percent-escape in the input
        e.printStackTrace();
    }
    return decoded;
}
/**
 * Extracts the file-name suffix (extension) from the path of a URI.
 *
 * The marker "---" is returned instead of a suffix when:
 * <ul>
 * <li>the text cannot be converted to a URI by ToolURI.string2uri,</li>
 * <li>the URI has a query component,</li>
 * <li>the path is missing or empty,</li>
 * <li>the path has no '.', starts with its only '.', or ends with '.',</li>
 * <li>the last '.' is followed by a '/',</li>
 * <li>the candidate suffix does not start with a letter, or</li>
 * <li>the candidate suffix is longer than MAX_LEN_SUFFIX characters.</li>
 * </ul>
 *
 * @param szURI the URI text to inspect
 * @return the lower-cased suffix without the leading '.', or "---"
 */
public static String getSuffix(String szURI) {
    final String NO_SUFFIX = "---";

    final URI uri;
    try {
        uri = ToolURI.string2uri(szURI);
    } catch (Sw4jException e) {
        // best effort: report the problem and treat the URI as suffix-less
        e.printStackTrace();
        return NO_SUFFIX;
    }

    // any query component disqualifies the URI
    if (null != uri.getQuery()) {
        return NO_SUFFIX;
    }

    // empty path
    final String szFileName = uri.getPath();
    if ((null == szFileName) || (szFileName.length() == 0)) {
        return NO_SUFFIX;
    }

    // find separator '.'; a trailing '.' leaves nothing behind it
    final int index = szFileName.lastIndexOf(".");
    if ((index <= 0) || szFileName.endsWith(".")) {
        return NO_SUFFIX;
    }

    // the '.' must belong to the last path segment (no '/' after it)
    if (index < szFileName.lastIndexOf("/")) {
        return NO_SUFFIX;
    }

    // validate: must start with a letter and stay within the length limit
    final String suffix = szFileName.substring(index + 1);
    if (!Character.isLetter(suffix.charAt(0))) {
        return NO_SUFFIX;
    }
    if (suffix.length() > MAX_LEN_SUFFIX) {
        return NO_SUFFIX;
    }

    // Locale.ROOT keeps the result independent of the JVM default locale
    // (under a Turkish locale "GIF" would otherwise lower-case to a dotless i).
    return suffix.toLowerCase(Locale.ROOT);
}
/////////////////////////////////////////////
// encode and decode HTML with escape chars
/////////////////////////////////////////////
// source: http://www.w3.org/MarkUp/html-spec/html-spec_13.html
// http://www.theukwebdesigncompany.com/articles/entity-escape-characters.php
// source: http://www.thesauruslex.com/typo/eng/enghtml.htm
private static HashMap<String,String> map = new HashMap<String,String>();
static {
map.put("�","€");
// ! &33;
map.put("\"",""");
// # &35;
// $ &36;
// % &37;
map.put("&","&");
/*
' &39;
( &40;
) &41;
* &42;
+ &43;
, &44;
- &45;
. &46;
/ &47;
// 0-9
: &58;
; &59;
*/
map.put("<","<");
// = &61;
map.put(">",">");
/*
? &63;
@ &64;
// A-Z
[ &91;
\ &92;
] &93;
^ &94;
_ &95;
` &96;
// a-z
{ &123;
| &124;
} &125;
~ &126;
// Non-breaking space &160; ,"
*/
map.put("�","¡");
map.put("�","¢");
map.put("�","£");
map.put("�","¤");
map.put("�","¥");
map.put("�","¦");
map.put("�","§");
map.put("�","¨");
map.put("�","©");
map.put("�","ª");
// � &171;
map.put("�","¬");
map.put("�","");
map.put("�","®");
map.put("�","¯");
map.put("�","°");
map.put("�","±");
map.put("�","²");
map.put("�","³");
map.put("�","´");
map.put("�","µ");
map.put("�","¶");
map.put("�","·");
map.put("�","¸");
map.put("�","¹");
map.put("�","º");
map.put("�","»");
map.put("�","¼");
map.put("�","½");
map.put("�","¾");
map.put("�","¿");
map.put("�","À");
map.put("�","Á");
// � Â �
map.put("�","Ã");
map.put("�","Ä");
map.put("�","Å");
map.put("�","Æ");
map.put("�","Ç");
map.put("�","È");
map.put("�","É");
map.put("�","Ê");
// � �
map.put("�","Ì");
map.put("�","Í");
map.put("�","Î");
map.put("�","Ï");
map.put("�","Ð");
map.put("�","Ñ");
map.put("�","Ò");
map.put("�","Ó");
map.put("�","Ô");
map.put("�","Õ");
map.put("�","Ö");
map.put("�","×");
map.put("�","Ø");
map.put("�","Ù");
map.put("�","Ú");
map.put("�","Û");
map.put("�","Ü");
map.put("�","Ý");
map.put("�","Þ");
map.put("�","ß");
map.put("�","à");
map.put("�","á");
map.put("�","â");
map.put("�","ã");
map.put("�","ä");
map.put("�","å");
map.put("�","æ");
map.put("�","ç");
map.put("�","è");
map.put("�","é");
map.put("�","ê");
map.put("�","ë");
map.put("�","ì");
map.put("�","í");
map.put("�","î");
map.put("�","ï");
map.put("�","ð");
map.put("�","ñ");
map.put("�","ò");
map.put("�","ó");
map.put("�","ô");
map.put("�","õ");
map.put("�","ö");
map.put("�","÷");
map.put("�","ø");
map.put("�","ù");
map.put("�","ú");
map.put("�","û");
map.put("�","ü");
map.put("�","ý");
map.put("�","þ");
/*
� &255;
A &256;
a &257;
A &258;
a &259;
A &260;
a &261;
C &262;
c &263;
C &264;
c &265;
C &266;
c &267;
C &268;
c &269;
D &270;
d &271;
� &272;
d &273;
E &274;
e &275;
E &276;
e &277
E &278;
e &279;
E &280;
e &281;
E &282;
e &283;
G &284;
g &285;
G &286;
g &287;
G &288;
g &289;
G &290;
g &291;
H &292;
h &293;
H &294;
h &295;
I &296;
i &297;
I &298;
i &299;
I &300;
i &301;
I &302;
i &303;
I &304;
i &305;
? &306;
? &307;
J &308;
j &309;
K &310;
k &311;
? &312;
L &313;
l &314;
L &315;
l &316;
L &317
l &318;
? &319;
? &320;
L &321;
l &322;
N &323;
n &324;
N &325;
n &326;
N &327;
n &328;
? &329;
? &330;
? &331;
O &332;
o &333;
O &334;
o &335;
O &336;
o &337;
� &338;
� &339;
R &340;
r &341;
R &342;
r &343;
R &344;
r &345;
S &346;
s &347;
S &348;
s &349;
S &350;
s &351;
� &352;
� &353;
T &354;
t &355;
T &356;
t &357
T &358;
t &359;
U &360;
u &361;
U &362;
u &363;
U &364;
u &365;
U &366;
u &367;
U &368;
u &369;
U &370;
u &371;
W &372;
w &373;
Y &374;
y &375;
� &376;
Z &377;
z &378;
Z &379;
z &380;
� &381;
� &382;
? &383;
R &340;
r &341;
R &342;
r &343;
R &344;
r &345;
S &346;
s &347;
S &348;
s &349;
S &350;
s &351;
� &352;
� &353;
T &354;
t &355;
T &356;
t &577;
T &358;
t &359;
U &360;
u &361;
U &362;
u &363;
U &364;
u &365;
U &366;
u &367;
U &368;
u &369;
U &370;
u &371;
W &372;
w &373;
Y &374;
y &375;
� &376;
Z &377
z &378;
Z &379;
z &380;
� &381;
� &382;
? &383;
*/
/*
Albanian
map.put("�","Ç");map.put("�","ç");
map.put("�","Ë");map.put("�","ë");
// Catalan
map.put("�","À");map.put("�","à");
map.put("�","Ç");map.put("�","ç");
map.put("�","È");map.put("�","è");
map.put("�","É");map.put("�","é");
map.put("�","Í");map.put("�","í");
map.put("�","Ï");map.put("�","ï");
map.put("�","Ò");map.put("�","ò");
map.put("�","Ó");map.put("�","ó");
map.put("�","Ú");map.put("�","ú");
map.put("�","Ü");map.put("�","ü");
map.put("�","ª");map.put("�","º");
map.put("�","·");map.put("","");
// Croatian
map.put("?","Ć");map.put("?","ć");
map.put("?","Č");map.put("?","č");
map.put("?","Đ");map.put("?","đ");
map.put("�","Š");map.put("�","š");
map.put("�","Ž");map.put("�","ž");
// Czech
map.put("�","Á");map.put("�","á");
map.put("?","Č");map.put("?","č");
map.put("?","Ď");map.put("?","ď");
map.put("�","É");map.put("�","é");
map.put("?","Ě");map.put("?","ě");
map.put("�","Í");map.put("�","í");
map.put("?","Ň");map.put("?","ň");
map.put("�","Ó");map.put("�","ó");
map.put("?","Ř");map.put("?","ř");
map.put("�","Š");map.put("�","š");
map.put("?","Ť");map.put("?","ť");
map.put("�","Ú");map.put("�","ú");
map.put("?","Ů");map.put("?","ů");
map.put("�","Ý");map.put("�","ý");
map.put("�","Ž");map.put("�","ž");
// Danish
map.put("�","Æ");map.put("�","æ");
map.put("�","Ø");map.put("�","ø");
map.put("�","Å");map.put("�","å");
// Dutch
map.put("�","É");map.put("�","é");
map.put("�","Ë");map.put("�","ë");
map.put("�","Ó");map.put("�","ó");
// Esperanto
map.put("?","Ĉ");map.put("?","ĉ");
map.put("?","Ĝ");map.put("?","ĝ");
map.put("?","Ĥ");map.put("?","ĥ");
map.put("?","Ĵ,");map.put("?","ĵ");
map.put("?","Ŝ");map.put("?","ŝ");
map.put("?","Ŭ");map.put("?","ŭ");
// Estonian
map.put("�","Ä");map.put("�","ä");
map.put("�","Ö");map.put("�","ö");
map.put("�","Õ");map.put("�","õ");
map.put("�","Ü");map.put("�","ü");
// Faroese
map.put("�","Á");map.put("�","á");
map.put("�","Ð");map.put("�","ð");
map.put("�","Í");map.put("�","í");
map.put("�","Ó");map.put("�","ó");
map.put("�","Ú");map.put("�","ú");
map.put("�","Ý");map.put("�","ý");
map.put("�","Æ");map.put("�","æ");
map.put("�","Ø");map.put("�","ø");
// Finnish
map.put("�","Ä");map.put("�","ä");
map.put("�","Ö");map.put("�","ö");
// French
map.put("�","À");map.put("�","à");
map.put("�","Â");map.put("�","â");
map.put("�","Ç");map.put("�","ç");
map.put("�","È");map.put("�","è");
map.put("�","É");map.put("�","é");
map.put("�","Ê");map.put("�","ê");
map.put("�","Ë");map.put("�","ë");
map.put("�","Î");map.put("�","î");
map.put("�","Ï");map.put("�","ï");
map.put("�","Ô");map.put("�","ô");
map.put("�","Œ");map.put("�","œ");
map.put("�","Ù");map.put("�","ù");
map.put("�","Û");map.put("�","û");
map.put("�","Ü");map.put("�","ü");
map.put("�","Ÿ");map.put("�","ÿ");
// German
map.put("�","Ä");map.put("�","ä");
map.put("�","Ö");map.put("�","ö");
map.put("�","Ü");map.put("�","ü");
map.put("�","ß");
// Hungarian
map.put("�","Á");map.put("�","á");
map.put("�","É");map.put("�","é");
map.put("�","Í");map.put("�","í");
map.put("�","Ó");map.put("�","ó");
map.put("?","Ő");map.put("?","ő");
map.put("�","Ú");map.put("�","ú");
map.put("�","Ü");map.put("�","ü");
map.put("?","Ű");map.put("?","ű");
// Icelandic
map.put("�","Á");map.put("�","á");
map.put("�","Ð");map.put("�","ð");
map.put("�","É");map.put("�","é");
map.put("�","Í");map.put("�","í");
map.put("�","Ó");map.put("�","ó");
map.put("�","Ú");map.put("�","ú");
map.put("�","Ý");map.put("�","ý");
map.put("�","Þ");map.put("�","þ");
map.put("�","Æ");map.put("�","æ");
map.put("�","Ö");map.put("�","¨");
// Italian
map.put("�","À");map.put("�","à");
map.put("�","Â");map.put("�","â");
map.put("�","È");map.put("�","è");
map.put("�","É");map.put("�","é");
map.put("�","Ê");map.put("�","ê");
map.put("�","Ì");map.put("�","ì");
map.put("�","Í");map.put("�","í");
map.put("�","Î");map.put("�","î");
map.put("�","Ï");map.put("�","ï");
map.put("�","Ò");map.put("�","ò");
map.put("�","Ô");map.put("�","ô");
map.put("�","Ù");map.put("�","ù");
map.put("�","Û");map.put("�","û");
// Latvian
map.put("?","Ā");map.put("?","ā");
map.put("?","Č");map.put("?","č");
map.put("?","Ē");map.put("?","ē");
map.put("?","Ģ");map.put("?","ģ");
map.put("?","Ī,");map.put("?","ī");
map.put("?","Ķ");map.put("?","ķ");
map.put("?","Ļ");map.put("?","ļ");
map.put("?","Ņ");map.put("?","ņ");
map.put("?","Ŗ");map.put("?","ŗ");
map.put("�","Š");map.put("�","š");
map.put("?","Ū");map.put("?","ū");
map.put("�","Ž");map.put("�","ž");
// Norwegian
map.put("�","Æ");map.put("�","æ");
map.put("�","Ø");map.put("�","ø");
map.put("�","Å");map.put("�","å");
// Polish
map.put("?","Ą");map.put("?","ą");
map.put("?","Ć");map.put("?","ć");
map.put("�","É");map.put("�","é");
map.put("?","Ę");map.put("?","ę");
map.put("?","Ł");map.put("?","ł");
map.put("?","Ń");map.put("?","ń");
map.put("�","Ó");map.put("�","ó");
map.put("?","Ś");map.put("?","ś");
map.put("?","Ź");map.put("?","ź");
map.put("?","Ż");map.put("?","ż");
// Portuguese
map.put("�","À");map.put("�","à");
map.put("�","Á");map.put("�","á");
map.put("�","Â");map.put("�","â");
map.put("�","Ã");map.put("�","ã");
map.put("�","Ç");map.put("�","ç");
map.put("�","È");map.put("�","è");
map.put("�","É");map.put("�","é");
map.put("�","Ê");map.put("�","ê");
map.put("�","Ì");map.put("�","ì");
map.put("�","Í");map.put("�","í");
map.put("�","Ï");map.put("�","ï");
map.put("�","Ò");map.put("�","ò");
map.put("�","Ó");map.put("�","ó");
map.put("�","Õ");map.put("�","õ");
map.put("�","Ù");map.put("�","ù");
map.put("�","Ú");map.put("�","ú");
map.put("�","Ü");map.put("�","ü");
map.put("�","ª");
map.put("�","º");
// Romanian
map.put("?","Ă");map.put("?","ă");
map.put("�","Â");map.put("�","â");
map.put("�","Î");map.put("�","î");
map.put("?","Ş");map.put("?","ş");
map.put("?","Ţ");map.put("?","ţ");
// Slovak
map.put("�","Á");map.put("�","á");
map.put("�","Ä");map.put("�","ä");
map.put("?","Č");map.put("?","č");
map.put("?","Ď");map.put("?","ď");
map.put("�","É");map.put("�","é");
map.put("?","Ĺ");map.put("?","ĺ");
map.put("?","Ľ");map.put("?","ľ");
map.put("?","Ň");map.put("?","ň");
map.put("�","Ó");map.put("�","ó");
map.put("�","Ô");map.put("�","ô");
map.put("?","Ŕ");map.put("?","ŕ");
map.put("�","Š");map.put("�","š");
map.put("?","Ť");map.put("?","ť");
map.put("�","Ú");map.put("�","ú");
map.put("�","Ý");map.put("�","ý");
map.put("�","Ž");map.put("�","ž");
// Slovene
map.put("?","Č");map.put("?","č");
map.put("�","Š");map.put("�","š");
map.put("�","Ž");map.put("�","ž");
// Spanish
map.put("�","Á");map.put("�","á");
map.put("�","É");map.put("�","é");
map.put("�","Í");map.put("�","í");
map.put("�","Ó");map.put("�","ó");
map.put("�","Ñ");map.put("�","ñ");
map.put("�","Ú");map.put("�","ú");
map.put("�","Ü");map.put("�","ü");
map.put("�","¡");map.put("�","ª");
map.put("�","¿");map.put("�","º");
// Swedish
map.put("�","Å");map.put("�","å");
map.put("�","Ä");map.put("�","ä");
map.put("�","Ö");map.put("�","ö");
// Turkish
map.put("�","Ç");map.put("�","ç");
map.put("?","Ğ");map.put("?","ğ");
map.put("?","İ");map.put("?","ı");
map.put("�","Ö");map.put("�","ö");
map.put("?","Ş");map.put("?","ş");
map.put("�","Ü");map.put("�","ü");
*/
};
public static final String escapeHTML(String s){
StringBuffer sb = new StringBuffer();
int n = s.length();
for (int i = 0; i < n; i++) {
String c = s.substring(i,i+1);
Object escaped = map.get(c);
if (null!=escaped)
sb.append(escaped);
else
sb.append(c);
}
return sb.toString();
}
public static final String unescapeHTML(String s){
String temp =s;
Iterator<Map.Entry<String, String>> iter = map.entrySet().iterator();
while(iter.hasNext()){
Map.Entry<String, String> entry = iter.next();
String key = (String) entry.getKey();
String value = (String) entry.getValue();
temp = temp.replaceAll(value,key);
}
return temp;
}
/** Matches one complete meta tag, in any letter case. */
private static final Pattern META_TAG_PATTERN =
        Pattern.compile("<meta[^>]+>", Pattern.CASE_INSENSITIVE);

/**
 * Finds the target of an HTML "meta refresh" redirect, i.e. the URL in a
 * tag of the form
 * {@code <meta http-equiv="refresh" content="0;url=http://example.org/">}.
 *
 * Every meta tag in the text is examined in document order; tags that are
 * not refresh tags are skipped. Within a tag, the http-equiv attribute is
 * expected before the content attribute, and both attribute values are
 * expected in double quotes.
 *
 * @param text the HTML text to search; may be null
 * @return the redirect URL of the first refresh tag that carries one, or
 *         null when there is none
 */
public static String extractHtmlHeadMetaRedirectedURL(String text) {
    if (text == null) {
        return null;
    }
    final Matcher matcher = META_TAG_PATTERN.matcher(text);
    while (matcher.find()) {
        final String url = parseMetaRefreshTarget(matcher.group());
        if (url != null) {
            return url;
        }
        // not a refresh tag (e.g. a charset or keywords tag): try the next one
    }
    return null;
}

/** Returns the URL of a single meta tag if it is a refresh tag with a "url=" part, otherwise null. */
private static String parseMetaRefreshTarget(String metaTag) {
    final StringTokenizer st = new StringTokenizer(metaTag, "\t\n\r \"<>;");
    final boolean isRefresh = nextTokenEquals(st, "meta")
            && nextTokenEquals(st, "http-equiv=")
            && nextTokenEquals(st, "refresh")
            && nextTokenEquals(st, "content=");
    if (!isRefresh) {
        return null;
    }
    final String prefix = "url=";
    while (st.hasMoreTokens()) {
        final String token = st.nextToken();
        // the "url" keyword is commonly written in upper case as well
        if (token.regionMatches(true, 0, prefix, 0, prefix.length())) {
            return token.substring(prefix.length());
        }
    }
    return null;
}

/** Consumes the next token, if any, and tells whether it equals the expected text ignoring case. */
private static boolean nextTokenEquals(StringTokenizer st, String expected) {
    return st.hasMoreTokens() && st.nextToken().equalsIgnoreCase(expected);
}
/*
//source http://www.rgagnon.com/javadetails/java-0306.html
public static final String escapeHTML(String s){
StringBuffer sb = new StringBuffer();
int n = s.length();
for (int i = 0; i < n; i++) {
char c = s.charAt(i);
switch (c) {
case '�': sb.append("à");break;
case '�': sb.append("À");break;
case '�': sb.append("â");break;
case '�': sb.append("Â");break;
case '�': sb.append("ä");break;
case '�': sb.append("Ä");break;
case '�': sb.append("å");break;
case '�': sb.append("Å");break;
case '�': sb.append("æ");break;
case '�': sb.append("Æ");break;
case '�': sb.append("ç");break;
case '�': sb.append("Ç");break;
case '�': sb.append("é");break;
case '�': sb.append("É");break;
case '�': sb.append("è");break;
case '�': sb.append("È");break;
case '�': sb.append("ê");break;
case '�': sb.append("Ê");break;
case '�': sb.append("ë");break;
case '�': sb.append("Ë");break;
case '�': sb.append("ï");break;
case '�': sb.append("Ï");break;
case '�': sb.append("ô");break;
case '�': sb.append("Ô");break;
case '�': sb.append("ö");break;
case '�': sb.append("Ö");break;
case '�': sb.append("ø");break;
case '�': sb.append("Ø");break;
case '�': sb.append("ß");break;
case '�': sb.append("ù");break;
case '�': sb.append("Ù");break;
case '�': sb.append("û");break;
case '�': sb.append("Û");break;
case '�': sb.append("ü");break;
case '�': sb.append("Ü");break;
default: sb.append(c); break;
}
}
return sb.toString();
}
*/
/**
 * Strips HTML comments from a text by delegating to removeMarkup with the
 * comment delimiters.
 *
 * @param szContent the HTML text
 * @return the text as returned by removeMarkup for the comment delimiters
 */
public static String removeHtmlComment(String szContent) {
    final String commentOpen = "<!--";
    final String commentClose = "-->";
    return removeMarkup(szContent, commentOpen, commentClose);
}
/**
 * Removes every section that starts with szBegin and ends with szEnd
 * (both delimiters included) and trims the result.
 *
 * The end delimiter is searched from the position of the begin delimiter.
 * A begin delimiter without a following end delimiter is left in place
 * together with everything after it.
 *
 * Plain index scanning is used on purpose: an earlier regular-expression
 * implementation was abandoned because it took too much time.
 *
 * @param szContent the text to clean; must not be null
 * @param szBegin the opening delimiter; must not be null
 * @param szEnd the closing delimiter; must not be null. When it is empty
 *        nothing is removed, because a section could never be closed
 * @return the text without the delimited sections, with leading and
 *         trailing whitespace removed
 */
public static String removeMarkup(String szContent, String szBegin, String szEnd) {
    if (szEnd.length() == 0) {
        // an empty end delimiter would never let the scan advance
        return szContent.trim();
    }

    final StringBuilder kept = new StringBuilder(szContent.length());
    int cursor = 0; // start of the text not yet examined
    while (cursor < szContent.length()) {
        final int begin = szContent.indexOf(szBegin, cursor);
        if (begin < 0) {
            break;
        }
        final int end = szContent.indexOf(szEnd, begin);
        if (end < 0) {
            break; // unterminated section: keep the remainder as it is
        }
        kept.append(szContent, cursor, begin);
        // continue after the end delimiter, even when it closes the text
        cursor = end + szEnd.length();
    }
    kept.append(szContent, cursor, szContent.length());

    // remove whitespace
    return kept.toString().trim();
}
/**
 * Collects every section that starts with szBegin and ends with szEnd,
 * delimiters included, in the order they occur. The search for the next
 * section resumes right after the previous end delimiter; the end delimiter
 * is searched from the position of the begin delimiter.
 *
 * @param szContent the text to scan; must not be null
 * @param szBegin the opening delimiter; must not be null
 * @param szEnd the closing delimiter; must not be null. When it is empty
 *        the result is empty, because a section could never be closed
 * @return the sections found; empty when there are none
 */
public static ArrayList<String> extractMarkup(String szContent, String szBegin, String szEnd) {
    final ArrayList<String> data = new ArrayList<String>();
    if (szEnd.length() == 0) {
        // an empty end delimiter would never let the scan advance
        return data;
    }

    int cursor = 0; // start of the text not yet examined
    while (true) {
        final int begin = szContent.indexOf(szBegin, cursor);
        if (begin < 0) {
            break;
        }
        final int end = szContent.indexOf(szEnd, begin);
        if (end < 0) {
            break; // unterminated section: not reported
        }
        final int stop = end + szEnd.length();
        data.add(szContent.substring(begin, stop));
        cursor = stop;
    }
    return data;
}
/**
 * Rewrites the whitespace of a text: first every tab, then every character
 * matched by the regular expression \s, is replaced with the replacement
 * literal given below.
 *
 * NOTE(review): given the method name, the replacement literals presumably
 * held HTML non-breaking-space entities at some point and may have been
 * altered by an encoding conversion -- confirm against version history.
 *
 * @param szText the text to convert; must not be null
 * @return the text with its whitespace replaced
 */
public static String string2htmlStringWhitespace(String szText){
String temp =szText;
// tabs are handled first, before the general whitespace rule
temp = temp.replaceAll("\t"," ");
// \s covers space, tab, newline, carriage return, form feed and vertical tab
temp = temp.replaceAll("\\s"," ");
return temp;
}
/**
 * Wraps a text in an XML CDATA section so that characters such as '&' or
 * '<' in the text do not cause parse errors.
 *
 * A CDATA section ends at the first "]]>", so any "]]>" inside the text is
 * split across two adjacent CDATA sections; a parser reads the result back
 * as the original text.
 *
 * @param szText the text to wrap; null is rendered as the word "null"
 * @return the text enclosed in one or more CDATA sections
 */
public static String encloseXmlText(String szText) {
    // String.valueOf keeps the historical rendering of null as "null"
    final String text = String.valueOf(szText);
    return "<![CDATA[" + text.replace("]]>", "]]]]><![CDATA[>") + "]]>";
}
/**
 * Builds an XML element without attributes whose content is the given text
 * passed through encloseXmlText; this avoids parse errors such as '&' in
 * the text.
 *
 * @param szMarkup the element name
 * @param szText the element content
 * @return the serialized element
 */
public static String writeXmlNode(String szMarkup, String szText) {
    final StringBuilder node = new StringBuilder();
    node.append('<').append(szMarkup).append('>');
    node.append(encloseXmlText(szText));
    node.append("</").append(szMarkup).append('>');
    return node.toString();
}
/**
 * Builds an XML element with an attribute string whose content is the given
 * text passed through encloseXmlText; this avoids parse errors such as '&'
 * in the text.
 *
 * @param szMarkup the element name
 * @param szText the element content
 * @param szAttribute the attribute text placed verbatim after the element
 *        name in the start tag
 * @return the serialized element
 */
public static String writeXmlNode(String szMarkup, String szText, String szAttribute) {
    final StringBuilder node = new StringBuilder();
    node.append('<').append(szMarkup).append(' ').append(szAttribute).append('>');
    node.append(encloseXmlText(szText));
    node.append("</").append(szMarkup).append('>');
    return node.toString();
}
}