// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Somik Raha // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/ParserUtils.java,v $ // $Author: derrickoswald $ // $Date: 2005/05/15 11:49:05 $ // $Revision: 1.47 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.util; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.Tag; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; import org.htmlparser.tags.CompositeTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class ParserUtils { public static String removeChars(String s, char occur) { StringBuffer newString = new StringBuffer(); char ch; for (int i = 0; i < s.length(); i++) { ch = s.charAt(i); if (ch != occur) newString.append(ch); } return newString.toString(); } public static String removeEscapeCharacters(String inputString) { inputString = ParserUtils.removeChars(inputString, '\r'); inputString = ParserUtils.removeChars(inputString, '\n'); inputString = ParserUtils.removeChars(inputString, '\t'); return inputString; } public static String removeTrailingBlanks(String text) { char ch = ' '; while (ch == ' ') { ch = text.charAt(text.length() - 1); if (ch == ' ') text = text.substring(0, text.length() - 1); } return text; } /** * Search given node and pick up any objects of given type. * @param node The node to search. * @param type The class to search for. * @return A node array with the matching nodes. */ public static Node[] findTypeInNode(Node node, Class type) { NodeFilter filter; NodeList ret; ret = new NodeList (); filter = new NodeClassFilter (type); node.collectInto (ret, filter); return (ret.toNodeArray ()); } /** * Split the input string considering as string separator * all the not numerical characters * with the only exception of the characters specified in charsDoNotBeRemoved param. * <BR>For example if you call splitButDigits("<DIV> +12.5, +3.4 </DIV>", "+."), * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (1,2,3,4 and 5 are digits and +,. are chars that do not be removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The array of strings as output. */ public static String[] splitButDigits (String input, String charsDoNotBeRemoved) { ArrayList output = new ArrayList(); int minCapacity = 0; StringBuffer str = new StringBuffer(); boolean charFound = false; boolean toBeAdd = false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if ((Character.isDigit(input.charAt(index))) || (charFound)) { str.append(input.charAt(index)); toBeAdd=false; } else if (!toBeAdd) toBeAdd=true; // finished to parse one string if (toBeAdd && (str.length()!=0)) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } } // add the last string if (str.length()!=0) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } output.trimToSize(); Object[] outputObj = output.toArray(); String[] outputStr = new String[output.size()]; for (int i=0; i<output.size(); i++) outputStr[i] = new String((String) outputObj[i]); return outputStr; } /** * Remove from the input string all the not numerical characters * with the only exception of the characters specified in charsDoNotBeRemoved param. * <BR>For example if you call trimButDigits("<DIV> +12.5 </DIV>", "+."), * <BR>you obtain a string "+12.5" as output (1,2 and 5 are digits and +,. are chars that do not be removed). * <BR>For example if you call trimButDigits("<DIV> +1 2 . 5 </DIV>", "+."), * <BR>you obtain a string "+12.5" as output (the spaces between 1 and 2, 2 and ., . and 5 are removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The string as output. */ public static String trimButDigits (String input, String charsDoNotBeRemoved) { StringBuffer output = new StringBuffer(); boolean charFound=false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if ((Character.isDigit(input.charAt(index))) || (charFound)) output.append(input.charAt(index)); } return output.toString(); } /** * Remove from the beginning and the end of the input string all the not numerical characters * with the only exception of the characters specified in charsDoNotBeRemoved param. * <BR>The removal process removes only chars at the beginning and at the end of the string. * <BR>For example if you call trimButDigitsBeginEnd("<DIV> +12.5 </DIV>", "+."), * <BR>you obtain a string "+12.5" as output (1,2 and 5 are digits and +,. are chars that do not be removed). * <BR>For example if you call trimButDigitsBeginEnd("<DIV> +1 2 . 5 </DIV>", "+."), * <BR>you obtain a string "+1 2 . 5" as output (the spacess inside the string are not removed). * @param input - The string in input. * @param charsDoNotBeRemoved - The chars that do not be removed. * @return The string as output. */ public static String trimButDigitsBeginEnd (String input, String charsDoNotBeRemoved) { String output = new String(); int begin=0; int end=input.length()-1; boolean charFound=false; boolean ok=true; for (int index=begin; (index<input.length()) && ok; index++) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if ( (Character.isDigit(input.charAt(index))) || (charFound) ) { begin=index; ok=false; } } ok=true; for (int index=end; (index>=0) && ok; index--) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if ( (Character.isDigit(input.charAt(index))) || (charFound) ) { end=index; ok=false; } } output=input.substring(begin,end+1); return output; } /** * Split the input string considering as string separator * all the spaces and tabs like chars and * the chars specified in the input variable charsToBeRemoved. * <BR>For example if you call splitSpaces("<DIV> +12.5, +3.4 </DIV>", "<>DIV/,"), * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (space chars and <,>,D,I,V,/ and the comma are chars that must be removed). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The array of strings as output. */ public static String[] splitSpaces (String input, String charsToBeRemoved) { ArrayList output = new ArrayList(); int minCapacity = 0; StringBuffer str = new StringBuffer(); boolean charFound = false; boolean toBeAdd = false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (!((Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound))) { str.append(input.charAt(index)); toBeAdd=false; } else if (!toBeAdd) toBeAdd=true; // finished to parse one string if (toBeAdd && (str.length()!=0)) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } } // add the last string if (str.length()!=0) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } output.trimToSize(); Object[] outputObj = output.toArray(); String[] outputStr = new String[output.size()]; for (int i=0; i<output.size(); i++) outputStr[i] = new String((String) outputObj[i]); return outputStr; } /** * Remove from the input string all the spaces and tabs like chars. * Remove also the chars specified in the input variable charsToBeRemoved. * <BR>For example if you call trimSpaces("<DIV> +12.5 </DIV>", "<>DIV/"), * <BR>you obtain a string "+12.5" as output (space chars and <,>,D,I,V,/ are chars that must be removed). * <BR>For example if you call trimSpaces("<DIV> Trim All Spaces Also The Ones Inside The String </DIV>", "<>DIV/"), * <BR>you obtain a string "TrimAllSpacesAlsoTheOnesInsideTheString" as output (all the spaces inside the string are removed). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The string as output. */ public static String trimSpaces (String input, String charsToBeRemoved) { StringBuffer output = new StringBuffer(); boolean charFound=false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (!((Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound))) output.append(input.charAt(index)); } return output.toString(); } /** * Remove from the beginning and the end of the input string all the spaces and tabs like chars. * Remove also the chars specified in the input variable charsToBeRemoved. * <BR>The removal process removes only chars at the beginning and at the end of the string. * <BR>For example if you call trimSpacesBeginEnd("<DIV> +12.5 </DIV>", "<>DIV/"), * <BR>you obtain a string "+12.5" as output (space chars and <,>,D,I,V,/ are chars that must be removed). * <BR>For example if you call trimSpacesBeginEnd("<DIV> Trim all spaces but not the ones inside the string </DIV>", "<>DIV/"), * <BR>you obtain a string "Trim all spaces but not the ones inside the string" as output (all the spaces inside the string are preserved). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The string as output. */ public static String trimSpacesBeginEnd (String input, String charsToBeRemoved) { String output = new String(); int begin=0; int end=input.length()-1; boolean charFound=false; boolean ok=true; for (int index=begin; (index<input.length()) && ok; index++) { charFound=false; for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (!( (Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound) )) { begin=index; ok=false; } } ok=true; for (int index=end; (index>=0) && ok; index--) { charFound=false; for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (!( (Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound) )) { end=index; ok=false; } } output=input.substring(begin,end+1); return output; } /** * Split the input string considering as string separator * all the characters * with the only exception of the characters specified in charsDoNotBeRemoved param. * <BR>For example if you call splitButChars("<DIV> +12.5, +3.4 </DIV>", "+.1234567890"), * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The array of strings as output. */ public static String[] splitButChars (String input, String charsDoNotBeRemoved) { ArrayList output = new ArrayList(); int minCapacity = 0; StringBuffer str = new StringBuffer(); boolean charFound = false; boolean toBeAdd = false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (charFound) { str.append(input.charAt(index)); toBeAdd=false; } else if (!toBeAdd) toBeAdd=true; // finished to parse one string if (toBeAdd && (str.length()!=0)) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } } // add the last string if (str.length()!=0) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } output.trimToSize(); Object[] outputObj = output.toArray(); String[] outputStr = new String[output.size()]; for (int i=0; i<output.size(); i++) outputStr[i] = new String((String) outputObj[i]); return outputStr; } /** * Remove from the input string all the characters * with the only exception of the characters specified in charsDoNotBeRemoved param. * <BR>For example if you call trimButChars("<DIV> +12.5 </DIV>", "+.1234567890"), * <BR>you obtain a string "+12.5" as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed). * <BR>For example if you call trimButChars("<DIV> +1 2 . 5 </DIV>", "+.1234567890"), * <BR>you obtain a string "+12.5" as output (the spaces between 1 and 2, 2 and ., . and 5 are removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The string as output. */ public static String trimButChars (String input, String charsDoNotBeRemoved) { StringBuffer output = new StringBuffer(); boolean charFound=false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (charFound) output.append(input.charAt(index)); } return output.toString(); } /** * Remove from the beginning and the end of the input string all the characters * with the only exception of the characters specified in charsDoNotBeRemoved param. * <BR>The removal process removes only chars at the beginning and at the end of the string. * <BR>For example if you call trimButCharsBeginEnd("<DIV> +12.5 </DIV>", "+.1234567890"), * <BR>you obtain a string "+12.5" as output (+,.,1,2,3,4,5,6,7,8,9,0 are chars that do not be removed). * <BR>For example if you call trimButCharsBeginEnd("<DIV> +1 2 . 5 </DIV>", "+.1234567890"), * <BR>you obtain a string "+1 2 . 5" as output (the spaces inside the string are not removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The string as output. */ public static String trimButCharsBeginEnd (String input, String charsDoNotBeRemoved) { String output = new String(); int begin=0; int end=input.length()-1; boolean charFound=false; boolean ok=true; for (int index=begin; (index<input.length()) && ok; index++) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (charFound) { begin=index; ok=false; } } ok=true; for (int index=end; (index>=0) && ok; index--) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (charFound) { end=index; ok=false; } } output=input.substring(begin,end+1); return output; } /** * Split the input string considering as string separator * the chars specified in the input variable charsToBeRemoved. * <BR>For example if you call splitChars("<DIV> +12.5, +3.4 </DIV>", " <>DIV/,"), * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (space chars and <,>,D,I,V,/ and the comma are chars that must be removed). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The array of strings as output. */ public static String[] splitChars (String input, String charsToBeRemoved) { ArrayList output = new ArrayList(); int minCapacity = 0; StringBuffer str = new StringBuffer(); boolean charFound = false; boolean toBeAdd = false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (!(charFound)) { str.append(input.charAt(index)); toBeAdd=false; } else if (!toBeAdd) toBeAdd=true; // finished to parse one string if (toBeAdd && (str.length()!=0)) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } } // add the last string if (str.length()!=0) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } output.trimToSize(); Object[] outputObj = output.toArray(); String[] outputStr = new String[output.size()]; for (int i=0; i<output.size(); i++) outputStr[i] = new String((String) outputObj[i]); return outputStr; } /** * Remove from the input string all the chars specified in the input variable charsToBeRemoved. * <BR>For example if you call trimChars("<DIV> +12.5 </DIV>", "<>DIV/ "), * <BR>you obtain a string "+12.5" as output (<,>,D,I,V,/ and space char are chars that must be removed). * <BR>For example if you call trimChars("<DIV> Trim All Chars Also The Ones Inside The String </DIV>", "<>DIV/ "), * <BR>you obtain a string "TrimAllCharsAlsoTheOnesInsideTheString" as output (all the spaces inside the string are removed). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The string as output. */ public static String trimChars (String input, String charsToBeRemoved) { StringBuffer output = new StringBuffer(); boolean charFound=false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (!(charFound)) output.append(input.charAt(index)); } return output.toString(); } /** * Remove from the beginning and the end of the input string all the chars specified in the input variable charsToBeRemoved. * <BR>The removal process removes only chars at the beginning and at the end of the string. * <BR>For example if you call trimCharsBeginEnd("<DIV> +12.5 </DIV>", "<>DIV/ "), * <BR>you obtain a string "+12.5" as output (' ' is a space char and <,>,D,I,V,/ are chars that must be removed). * <BR>For example if you call trimCharsBeginEnd("<DIV> Trim all spaces but not the ones inside the string </DIV>", "<>DIV/ "), * <BR>you obtain a string "Trim all spaces but not the ones inside the string" as output (all the spaces inside the string are preserved). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The string as output. */ public static String trimCharsBeginEnd (String input, String charsToBeRemoved) { String output = new String(); int begin=0; int end=input.length()-1; boolean charFound=false; boolean ok=true; for (int index=begin; (index<input.length()) && ok; index++) { charFound=false; for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (!(charFound)) { begin=index; ok=false; } } ok=true; for (int index=end; (index>=0) && ok; index--) { charFound=false; for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (!(charFound)) { end=index; ok=false; } } output=input.substring(begin,end+1); return output; } /** * Split the input string in a string array, * considering the tags as delimiter for splitting. * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String[] splitTags (String input, String[] tags) throws ParserException, UnsupportedEncodingException { return splitTags (input, tags, true, true); } /** * Split the input string in a string array, * considering the tags as delimiter for splitting. * <BR>For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}), * <BR>you obtain a string array {"Begin ", " ALL OK"} as output (splitted <DIV> tags and their content recursively). * <BR>For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, false), * <BR>you obtain a string array {"Begin ", "<DIV> +12.5 </DIV>", " ALL OK"} as output (splitted <DIV> tags and not their content and no recursively). * <BR>For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, true, false), * <BR>you obtain a string array {"Begin ", " +12.5 ", " ALL OK"} as output (splitted <DIV> tags and not their content recursively). * <BR>For example if you call splitTags("Begin <DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, true), * <BR>you obtain a string array {"Begin ", " ALL OK"} as output (splitted <DIV> tags and their content). * @param input The string in input. * @param tags The tags to be used as splitting delimiter. * @param recursive Optional parameter (true if not present), if true delete all the tags recursively. * @param insideTag Optional parameter (true if not present), if true delete also the content of the tags. * @return The string array containing the strings delimited by tags. */ public static String[] splitTags (String input, String[] tags, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { ArrayList outputArrayList = new ArrayList(); int minCapacity = 0; String output = new String(); String inputModified = new String(input); String[] outputStr = new String[] {}; String dummyString = createDummyString (' ', input.length()); // loop inside the different tags to be trimmed for (int i=0; i<tags.length; i++) { // loop inside the tags of the same type NodeList links = getLinks (inputModified, tags[i], recursive); for (int j=0; j<links.size(); j++) { CompositeTag beginTag = (CompositeTag)links.elementAt(j); Tag endTag = beginTag.getEndTag(); // positions of begin and end tags int beginTagBegin = beginTag.getStartPosition (); int endTagBegin = beginTag.getEndPosition (); int beginTagEnd = endTag.getStartPosition (); int endTagEnd = endTag.getEndPosition (); if (insideTag) { dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagEnd); } else { dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagBegin); dummyString = modifyDummyString (new String(dummyString), beginTagEnd, endTagEnd); } } for (int k=dummyString.indexOf(' '); (k<dummyString.length()) && (k!=-1);) { int kNew = dummyString.indexOf('*',k); if (kNew!=-1) { output = inputModified.substring(k,kNew); k = dummyString.indexOf(' ',kNew); minCapacity++; outputArrayList.ensureCapacity(minCapacity); if (outputArrayList.add(output)) output = new String(); else minCapacity--; } else { output = inputModified.substring(k,dummyString.length()); k = kNew; minCapacity++; outputArrayList.ensureCapacity(minCapacity); if (outputArrayList.add(output)) output = new String(); else minCapacity--; } } StringBuffer outputStringBuffer = new StringBuffer(); outputArrayList.trimToSize(); Object[] outputObj = outputArrayList.toArray(); outputStr = new String[outputArrayList.size()]; for (int j=0; j<outputArrayList.size(); j++) { outputStr[j] = new String((String) outputObj[j]); outputStringBuffer.append(outputStr[j]); } outputArrayList = new ArrayList(); inputModified = new String(outputStringBuffer.toString()); dummyString = createDummyString (' ', inputModified.length()); } return outputStr; } /** * Split the input string in a string array, * considering the tags as delimiter for splitting. * <BR>Use Class class as input parameter * instead of tags[] string array. * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String[] splitTags (String input, Class nodeType) throws ParserException, UnsupportedEncodingException { return splitTags (input, new NodeClassFilter (nodeType), true, true); } /** * Split the input string in a string array, * considering the tags as delimiter for splitting. * <BR>Use Class class as input parameter * instead of tags[] string array. * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String[] splitTags (String input, Class nodeType, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { return splitTags (input, new NodeClassFilter (nodeType), recursive, insideTag); } /** * Split the input string in a string array, * considering the tags as delimiter for splitting. * <BR>Use NodeFilter class as input parameter * instead of tags[] string array. * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String[] splitTags (String input, NodeFilter filter) throws ParserException, UnsupportedEncodingException { return splitTags (input, filter, true, true); } /** * Split the input string in a string array, * considering the tags as delimiter for splitting. * <BR>Use NodeFilter class as input parameter * instead of tags[] string array. * @see ParserUtils#splitTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String[] splitTags (String input, NodeFilter filter, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { ArrayList outputArrayList = new ArrayList(); int minCapacity = 0; String output = new String(); String dummyString = createDummyString (' ', input.length()); // loop inside the tags of the same type NodeList links = getLinks (input, filter, recursive); for (int j=0; j<links.size(); j++) { CompositeTag beginTag = (CompositeTag)links.elementAt(j); Tag endTag = beginTag.getEndTag(); // positions of begin and end tags int beginTagBegin = beginTag.getStartPosition (); int endTagBegin = beginTag.getEndPosition (); int beginTagEnd = endTag.getStartPosition (); int endTagEnd = endTag.getEndPosition (); if (insideTag) { dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagEnd); } else { dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagBegin); dummyString = modifyDummyString (new String(dummyString), beginTagEnd, endTagEnd); } } for (int k=dummyString.indexOf(' '); (k<dummyString.length()) && (k!=-1);) { int kNew = dummyString.indexOf('*',k); if (kNew!=-1) { output = input.substring(k,kNew); k = dummyString.indexOf(' ',kNew); minCapacity++; outputArrayList.ensureCapacity(minCapacity); if (outputArrayList.add(output)) output = new String(); else minCapacity--; } else { output = input.substring(k,dummyString.length()); k = kNew; minCapacity++; outputArrayList.ensureCapacity(minCapacity); if (outputArrayList.add(output)) output = new String(); else minCapacity--; } } outputArrayList.trimToSize(); Object[] outputObj = outputArrayList.toArray(); String[] outputStr = new String[outputArrayList.size()]; for (int i=0; i<outputArrayList.size(); i++) outputStr[i] = new String((String) outputObj[i]); return outputStr; } /** * Trim the input string, removing all the tags in the input string. * <BR>The method trims all the substrings included in the input string of the following type: * "<XXX>", where XXX could be a string of any type. * <BR>If you set to true the inside parameter, the method deletes also the YYY string in the following input string: * "<XXX>YYY<ZZZ>", note that ZZZ is not necessary the closing tag of XXX. * @param input The string in input. * @param inside If true, it forces the method to delete also what is inside the tags. * @return The string without tags. */ public static String trimAllTags (String input, boolean inside) { StringBuffer output = new StringBuffer(); if (inside) { if ((input.indexOf('<')==-1) || (input.lastIndexOf('>')==-1) || (input.lastIndexOf('>')<input.indexOf('<'))) { output.append(input); } else { output.append(input.substring(0, input.indexOf('<'))); output.append(input.substring(input.lastIndexOf('>')+1, input.length())); } } else { boolean write = true; for (int index=0; index<input.length(); index++) { if (input.charAt(index)=='<' && write) write = false; if (write) output.append(input.charAt(index)); if (input.charAt(index)=='>' && (!write)) write = true; } } return output.toString(); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content. * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String trimTags (String input, String[] tags) throws ParserException, UnsupportedEncodingException { return trimTags (input, tags, true, true); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content (optional). * <BR>For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}), * <BR>you obtain a string " ALL OK" as output (trimmed <DIV> tags and their content recursively). * <BR>For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, false), * <BR>you obtain a string "<DIV> +12.5 </DIV> ALL OK" as output (trimmed <DIV> tags and not their content and no recursively). * <BR>For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, true, false), * <BR>you obtain a string " +12.5 ALL OK" as output (trimmed <DIV> tags and not their content recursively). * <BR>For example if you call trimTags("<DIV><DIV> +12.5 </DIV></DIV> ALL OK", new String[] {"DIV"}, false, true), * <BR>you obtain a string " ALL OK" as output (trimmed <DIV> tags and their content). * @param input The string in input. * @param tags The tags to be removed. * @param recursive Optional parameter (true if not present), if true delete all the tags recursively. * @param insideTag Optional parameter (true if not present), if true delete also the content of the tags. * @return The string without tags. */ public static String trimTags (String input, String[] tags, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { StringBuffer output = new StringBuffer(); String inputModified = new String(input); String dummyString = createDummyString (' ', input.length()); // loop inside the different tags to be trimmed for (int i=0; i<tags.length; i++) { output = new StringBuffer(); // loop inside the tags of the same type NodeList links = getLinks (inputModified, tags[i], recursive); for (int j=0; j<links.size(); j++) { CompositeTag beginTag = (CompositeTag)links.elementAt(j); Tag endTag = beginTag.getEndTag(); // positions of begin and end tags int beginTagBegin = beginTag.getStartPosition (); int endTagBegin = beginTag.getEndPosition (); int beginTagEnd = endTag.getStartPosition (); int endTagEnd = endTag.getEndPosition (); if (insideTag) { dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagEnd); } else { dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagBegin); dummyString = modifyDummyString (new String(dummyString), beginTagEnd, endTagEnd); } } for (int k=dummyString.indexOf(' '); (k<dummyString.length()) && (k!=-1);) { int kNew = dummyString.indexOf('*',k); if (kNew!=-1) { output = output.append(inputModified.substring(k,kNew)); k = dummyString.indexOf(' ',kNew); } else { output = output.append(inputModified.substring(k,dummyString.length())); k = kNew; } } inputModified = new String(output); dummyString = createDummyString (' ', inputModified.length()); } return output.toString(); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content. * <BR>Use Class class as input parameter * instead of tags[] string array. * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String trimTags (String input, Class nodeType) throws ParserException, UnsupportedEncodingException { return trimTags (input, new NodeClassFilter (nodeType), true, true); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content (optional). * <BR>Use Class class as input parameter * instead of tags[] string array. * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String trimTags (String input, Class nodeType, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { return trimTags (input, new NodeClassFilter (nodeType), recursive, insideTag); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content. * <BR>Use NodeFilter class as input parameter * instead of tags[] string array. * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String trimTags (String input, NodeFilter filter) throws ParserException, UnsupportedEncodingException { return trimTags (input, filter, true, true); } /** * Trim all tags in the input string and * return a string like the input one * without the tags and their content (optional). * <BR>Use NodeFilter class as input parameter * instead of tags[] string array. * @see ParserUtils#trimTags (String input, String[] tags, boolean recursive, boolean insideTag). */ public static String trimTags (String input, NodeFilter filter, boolean recursive, boolean insideTag) throws ParserException, UnsupportedEncodingException { StringBuffer output = new StringBuffer(); String dummyString = createDummyString (' ', input.length()); // loop inside the tags of the same type NodeList links = getLinks (input, filter, recursive); for (int j=0; j<links.size(); j++) { CompositeTag beginTag = (CompositeTag)links.elementAt(j); Tag endTag = beginTag.getEndTag(); // positions of begin and end tags int beginTagBegin = beginTag.getStartPosition (); int endTagBegin = beginTag.getEndPosition (); int beginTagEnd = endTag.getStartPosition (); int endTagEnd = endTag.getEndPosition (); if (insideTag) { dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagEnd); } else { dummyString = modifyDummyString (new String(dummyString), beginTagBegin, endTagBegin); dummyString = modifyDummyString (new String(dummyString), beginTagEnd, endTagEnd); } } for (int k=dummyString.indexOf(' '); (k<dummyString.length()) && (k!=-1);) { int kNew = dummyString.indexOf('*',k); if (kNew!=-1) { output = output.append(input.substring(k,kNew)); k = dummyString.indexOf(' ',kNew); } else { output = output.append(input.substring(k,dummyString.length())); k = kNew; } } return output.toString(); } /** * Create a Parser Object having a String Object as input (instead of a url or a string representing the url location). * <BR>The string will be parsed as it would be a file. * @param input The string in input. * @return The Parser Object with the string as input stream. */ public static Parser createParserParsingAnInputString (String input) throws ParserException, UnsupportedEncodingException { Parser parser = new Parser(); Lexer lexer = new Lexer(); Page page = new Page(input); lexer.setPage(page); parser.setLexer(lexer); return parser; } private static NodeList getLinks (String output, String tag, boolean recursive) throws ParserException, UnsupportedEncodingException { Parser parser = new Parser(); NodeFilter filterLink = new TagNameFilter (tag); NodeList links = new NodeList (); parser = createParserParsingAnInputString(output); links = parser.extractAllNodesThatMatch(filterLink); // loop to remove tags added recursively // so if you have selected 'not recursive option' // you have only the tag container and not the contained tags. if (!recursive) { for (int j=0; j<links.size(); j++) { CompositeTag jStartTag = (CompositeTag)links.elementAt(j); Tag jEndTag = jStartTag.getEndTag(); int jStartTagBegin = jStartTag.getStartPosition (); int jEndTagEnd = jEndTag.getEndPosition (); for (int k=0; k<links.size(); k++) { CompositeTag kStartTag = (CompositeTag)links.elementAt(k); Tag kEndTag = kStartTag.getEndTag(); int kStartTagBegin = kStartTag.getStartPosition (); int kEndTagEnd = kEndTag.getEndPosition (); if ((k!=j) && (kStartTagBegin>jStartTagBegin) && (kEndTagEnd<jEndTagEnd)) { links.remove(k); k--; j--; } } } } return links; } private static NodeList getLinks (String output, NodeFilter filter, boolean recursive) throws ParserException, UnsupportedEncodingException { Parser parser = new Parser(); NodeList links = new NodeList (); parser = createParserParsingAnInputString(output); links = parser.extractAllNodesThatMatch(filter); // loop to remove tags added recursively // so if you have selected 'not recursive option' // you have only the tag container and not the contained tags. if (!recursive) { for (int j=0; j<links.size(); j++) { CompositeTag jStartTag = (CompositeTag)links.elementAt(j); Tag jEndTag = jStartTag.getEndTag(); int jStartTagBegin = jStartTag.getStartPosition (); int jEndTagEnd = jEndTag.getEndPosition (); for (int k=0; k<links.size(); k++) { CompositeTag kStartTag = (CompositeTag)links.elementAt(k); Tag kEndTag = kStartTag.getEndTag(); int kStartTagBegin = kStartTag.getStartPosition (); int kEndTagEnd = kEndTag.getEndPosition (); if ((k!=j) && (kStartTagBegin>jStartTagBegin) && (kEndTagEnd<jEndTagEnd)) { links.remove(k); k--; j--; } } } } return links; } private static String createDummyString (char fillingChar, int length) { StringBuffer dummyStringBuffer = new StringBuffer(); for (int j=0; j<length; j++) dummyStringBuffer = dummyStringBuffer.append(fillingChar); return new String(dummyStringBuffer); } private static String modifyDummyString (String dummyString, int beginTag, int endTag) { String dummyStringInterval = createDummyString ('*', endTag-beginTag); return new String(dummyString.substring(0, beginTag) + dummyStringInterval + dummyString.substring(endTag, dummyString.length())); } }