/* ****************************************************************************** * * Copyright 2008-2010 Hans Dijkema * * JRichTextEditor is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * JRichTextEditor is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with JRichTextEditor. If not, see <http://www.gnu.org/licenses/>. * * ******************************************************************************/ package nl.dykema.jxmlnote.html; import java.util.Iterator; import java.util.Stack; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; class XMLNoteReplacer { private String _m; private String _r; private String _me; private String _re; private String _mnotAllowedInNesting; private String _mnotAllowedOutsideNesting; private boolean _d=false; private boolean _stopper=false; private Type _type=Type.STANDARD; private Vector<XMLNoteReplacer> _replacers; private Repl _repl; enum Type { STANDARD, NESTED, REPL }; public interface Repl { String process(String in); } Pattern pattern(String re) { Pattern p; if (_d) { p=Pattern.compile(re,Pattern.DOTALL|Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE); } else { p=Pattern.compile(re,Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE); } return p; } Pattern pattern() { return pattern(_m); } String replace() { return _r; } public String p() { return _m; } public boolean stopper() { return _stopper; } public XMLNoteReplacer getEndTag(String match,Vector<XMLNoteReplacer> r) { Iterator<XMLNoteReplacer> g=r.iterator(); while (g.hasNext()) { XMLNoteReplacer rr=g.next(); Pattern p=rr.pattern(); Matcher m=p.matcher(match); if (m.find()) { return rr; } } return null; } @SuppressWarnings("unused") public String recursiveReplace(StringBuffer sb,Matcher m,int depth) { Pattern pb=pattern(_m); Pattern pe=pattern(_me); Pattern pn=pattern(_mnotAllowedInNesting); Pattern pa=pattern(_mnotAllowedOutsideNesting); // allowed in nesting String match,begin,end,notAllowedInside,notAllowedOutside; Stack<String> endBlocks=new Stack<String>(); // Add stack with nested tag. // Make one match and (match,replacers,closers ) combinations for individual nestings. while(m.find()) { match=m.group(); begin=null;end=null;notAllowedInside=null;notAllowedOutside=null; if (pb.matcher(match).find()) { begin=match; } else if (pe.matcher(match).find()) { end=match; } else if (pa.matcher(match).find()) { notAllowedOutside=match; // allowed inside before allowed outside. } else { notAllowedInside=match; } if (begin!=null) { XMLNoteReplacer q=getEndTag(match,_replacers); m.appendReplacement(sb, q._r); depth+=1; endBlocks.push(q._re); } else if (end!=null) { depth-=1; if (depth<0) { //end tag outside nesting; skip. depth=0; m.appendReplacement(sb,""); } else { m.appendReplacement(sb,endBlocks.pop()); } } else if (notAllowedInside!=null) { if (depth>0) { // a not allowed tag inside a nesting! int i; String rrr=""; for(i=0;i<depth;i++) { rrr+=endBlocks.pop(); } m.appendReplacement(sb,rrr); depth=0; } else { // nothing wrong, outside nesting. m.appendReplacement(sb,notAllowedInside); } } else { if (depth>0) { // nothing wrong, inside nesting. m.appendReplacement(sb, notAllowedOutside); } else { // not allowed outside the nesting. means: we dispose of the tag. However, // this means we must make sure that we process al outer possibilities and // match those somehow. And we make sure that we are being nice to simple // faults. m.appendReplacement(sb, ""); } } } m.appendTail(sb); return sb.toString(); } public String toString() { return "{"+_m+","+_me+","+_r+","+_re+","+((_replacers==null) ? "" : _replacers.toString())+"}"; } public String recursiveReplace(String str) { String re="("+_m+")|("+_me+")|("+_mnotAllowedInNesting+")|("+_mnotAllowedOutsideNesting+")"; Pattern p=pattern(re); Matcher m=p.matcher(str); StringBuffer sb=new StringBuffer(); return recursiveReplace(sb,m,0); } public String doReplace(String str) { if (_type==Type.STANDARD) { Pattern p=pattern(); if (stopper()) { stopper(); //System.out.println(p()); } Matcher m=p.matcher(str); StringBuffer sb=new StringBuffer(); while(m.find()) { m.appendReplacement(sb,replace()); } m.appendTail(sb); return sb.toString(); } else if (_type==Type.REPL) { return _repl.process(str); } else { return recursiveReplace(str); } } public XMLNoteReplacer(Repl r) { _type=Type.REPL; _repl=r; } public XMLNoteReplacer(String mb,String me, String mnot, String mnotout, Vector<XMLNoteReplacer> replacers,boolean dotall,boolean stopper) { _m=mb; _me=me; _mnotAllowedInNesting=mnot; _mnotAllowedOutsideNesting=mnotout; _r=null; _re=null; _replacers=replacers; _d=dotall; _stopper=stopper; _type=Type.NESTED; } public XMLNoteReplacer(String mb,String me, String mnot, String mnotout,Vector<XMLNoteReplacer> replacers,boolean dotall) { _m=mb; _me=me; _mnotAllowedInNesting=mnot; _mnotAllowedOutsideNesting=mnotout; _r=null; _re=null; _replacers=replacers; _d=dotall; _type=Type.NESTED; } public XMLNoteReplacer(String mb,String me,String rb,String re,boolean dotall) { _m=mb; _me=me; _r=rb; _re=re; _d=dotall; _type=Type.REPL; } public XMLNoteReplacer(String m,String repl,boolean dotall,boolean stopper) { _m=m;_r=repl; _d=dotall; _stopper=stopper; } public XMLNoteReplacer(String m,String repl,boolean dotall) { _m=m;_r=repl; _d=dotall; } public XMLNoteReplacer(String m,String repl) { _m=m;_r=repl; } } public class HtmlStripper { static HtmlStripper _stripper=null; private String correctNested(String in,String open,String close) { String regex="("+open+"|"+close+")"; String[] parts=in.split(regex); return in; } private Vector<XMLNoteReplacer> getReplacers() { Vector<XMLNoteReplacer> r=new Vector<XMLNoteReplacer>(); r.add(new XMLNoteReplacer("[<]head[^>]*[>].*?[<][/]head[>]","",true)); r.add(new XMLNoteReplacer("[<]p[>](.*?)[<][/]p[>]","::P:PAR:P::$1::P:EPAR:P::",true)); r.add(new XMLNoteReplacer("[<]p\\s[^>]+[>](.*?)[<][/]p[>]","::P:PAR:P::$1::P:EPAR:P::",true)); r.add(new XMLNoteReplacer("[<]br[^>]*[>]","::E:BR:E::",true)); r.add(new XMLNoteReplacer("[<]li[>]::P:PAR:P::(.*?)::P:EPAR:P::","<li>$1</li>",true)); // Correct OpenOffice behaviour. { Vector<XMLNoteReplacer> ulr=new Vector<XMLNoteReplacer>(); ulr.add(new XMLNoteReplacer("[<]ul(\\s[^>]+){0,1}[>]","[<][/][uo]l[>]","::P:UL:P::","::P:EUL:P::",true)); ulr.add(new XMLNoteReplacer("[<]ol(\\s[^>]+){0,1}[>]","[<][/][uo]l[>]","::P:OL:P::","::P:EOL:P::",true)); ulr.add(new XMLNoteReplacer("[<]li(\\s[^>]+){0,1}[>]","[<][/]li[>]","::P:LI:P::","::P:ELI:P::",true)); r.add(new XMLNoteReplacer("[<](ul|ol|li)(\\s[^>]+){0,1}[>]","[<][/](ul|ol|li)[>]", "::P:(PAR|EPAR):P::","([<][/]?li(\\s[^>]+){0,1}[>])",ulr,true,true ) ); } r.add(new XMLNoteReplacer("[&]nbsp[;]"," ",true)); //r.add(new XMLNoteReplacer("[<]li[>](.*?)[<][/]li[>]","::P:LI:P::$1::P:ELI:P::",true)); //r.add(new XMLNoteReplacer("[<]li\\s[^>]+[>](.*?)[<][/]li[>]","::P:LI:P::$1::P:ELI:P::",true)); r.add(new XMLNoteReplacer("[<]b(\\sclass=[\"]h[0-9][\"])\\s*[>](.*?)[<][/]b[>]","::EL:B:$1:EL::$2::EL:EB:EL::",true)); r.add(new XMLNoteReplacer("[<]b[>](.*?)[<][/]b[>]","::EL:B:EL::$1::EL:EB:EL::",true)); r.add(new XMLNoteReplacer("[<]b\\s[^>]+[>](.*?)[<][/]b[>]","::EL:B:EL::$1::EL:EB:EL::",true)); r.add(new XMLNoteReplacer("[<]i[>](.*?)[<][/]i[>]","::EL:I:EL::$1::EL:EI:EL::",true)); r.add(new XMLNoteReplacer("[<]i\\s[^>]+[>](.*?)[<][/]i[>]","::EL:I:EL::$1::EL:EI:EL::",true)); r.add(new XMLNoteReplacer("[<]u[>](.*?)[<][/]u[>]","::EL:U:EL::$1::EL:EU:EL::",true)); r.add(new XMLNoteReplacer("[<]u\\s[^>]+[>](.*?)[<][/]u[>]","::EL:U:EL::$1::EL:EU:EL::",true)); r.add(new XMLNoteReplacer("[<]h1[>](.*?)[<][/]h1[>]","::EL:H1:EL::$1::EL:EH1:EL::",true)); r.add(new XMLNoteReplacer("[<]h1\\s[^>]+[>](.*?)[<][/]h1[>]","::EL:H1:EL::$1::EL:EH1:EL::",true)); r.add(new XMLNoteReplacer("[<]h2[>](.*?)[<][/]h2[>]","::EL:H2:EL::$1::EL:EH2:EL::",true)); r.add(new XMLNoteReplacer("[<]h2\\s[^>]+[>](.*?)[<][/]h2[>]","::EL:H2:EL::$1::EL:EH2:EL::",true)); r.add(new XMLNoteReplacer("[<]h3[>](.*?)[<][/]h3[>]","::EL:H3:EL::$1::EL:EH3:EL::",true)); r.add(new XMLNoteReplacer("[<]h3\\s[^>]+[>](.*?)[<][/]h3[>]","::EL:H3:EL::$1::EL:EH3:EL::",true)); r.add(new XMLNoteReplacer("[<]h4[>](.*?)[<][/]h4[>]","::EL:H4:EL::$1::EL:EH4:EL::",true)); r.add(new XMLNoteReplacer("[<]h4\\s[^>]+[>](.*?)[<][/]h4[>]","::EL:H4:EL::$1::EL:EH4:EL::",true)); r.add(new XMLNoteReplacer("[<]h5[>](.*?)[<][/]h5[>]","::EL:B:EL::$1::EL:EB:EL::",true)); r.add(new XMLNoteReplacer("[<]h5\\s[^>]+[>](.*?)[<][/]h5[>]","::EL:B:EL::$1::EL:EB:EL::",true)); r.add(new XMLNoteReplacer("[<][^>]+[>]","",true,true)); // General tag cleaner. r.add(new XMLNoteReplacer("::E:BR:E::","<br>",true)); r.add(new XMLNoteReplacer("::P:PAR:P::(.*?)::P:EPAR:P::","<p>$1</p>",true)); // If we still have (we don't want that) nested paragraphs, we correct that r.add(new XMLNoteReplacer("::P:(EPAR|PAR):P::","",true)); // For the nested stuff we already have made sure they match, no problem there. r.add(new XMLNoteReplacer("::P:UL:P::","<ul>",true)); r.add(new XMLNoteReplacer("::P:EUL:P::","</ul>",true)); r.add(new XMLNoteReplacer("::P:OL:P::","<ol>",true)); r.add(new XMLNoteReplacer("::P:EOL:P::","</ol>",true)); r.add(new XMLNoteReplacer("::P:LI:P::(.*?)::P:ELI:P::","<li>$1</li>",true)); r.add(new XMLNoteReplacer("::P:LI:P::","<li>",true)); r.add(new XMLNoteReplacer("::P:ELI:P::","</li>",true)); // Correct <li>...<ul> to <li>...</li><ul> //r.add(new XMLNoteReplacer("[<]")) // Now correct nested <li>'s in the html, we can't have nested <li>'s. The outer nesting needs to go. r.add(new XMLNoteReplacer("[<]li[>]\\s*[<]([ou])l[>]","<$1l>",true)); r.add(new XMLNoteReplacer("[<]li[>]([^<]+)[<]([ou])l[>]","<li>$1</li><$2l>",true)); r.add(new XMLNoteReplacer("[<][/]([ou])l[>]\\s*[<][/]li[>]","</$1l>",true)); r.add(new XMLNoteReplacer("[<][/](ul|ol)[>]([^<]+)[<][/]p[>]","</$1><p>$2</p>",true)); // No start paragraph tag for a given paragraph tag. // (other tags have been cleaned with the general tag cleaner). // ul and ol are the only ones left. r.add(new XMLNoteReplacer("::EL:B:([^:]+):EL::(.*?)::EL:EB:EL::","<b$1>$2</b>",true)); r.add(new XMLNoteReplacer("::EL:B:EL::(.*?)::EL:EB:EL::","<b>$1</b>",true)); r.add(new XMLNoteReplacer("::EL:I:EL::(.*?)::EL:EI:EL::","<i>$1</i>",true)); r.add(new XMLNoteReplacer("::EL:U:EL::(.*?)::EL:EU:EL::","<u>$1</u>",true)); r.add(new XMLNoteReplacer("::EL:H1:EL::(.*?)::EL:EH1:EL::","<p><b class=\"h1\">$1</b></p>",true)); r.add(new XMLNoteReplacer("::EL:H2:EL::(.*?)::EL:EH2:EL::","<p><b class=\"h2\">$1</b></p>",true)); r.add(new XMLNoteReplacer("::EL:H3:EL::(.*?)::EL:EH3:EL::","<p><b class=\"h3\">$1</b></p>",true)); r.add(new XMLNoteReplacer("::EL:H4:EL::(.*?)::EL:EH4:EL::","<p><b class=\"h4\">$1</b></p>",true)); // correction r.add(new XMLNoteReplacer("[>]\\s*[<]","><",true,true)); // cleanup empty space between tags, surrounding text //r.add(new XMLNoteReplacer("[>]\\s*",">",true)); r.add(new XMLNoteReplacer("^\\s+","",true)); r.add(new XMLNoteReplacer("\\s+$","",true)); //r.add(new XMLNoteReplacer("[<]p[>]","<p>$1</p>",true)); // correct html that has no paragraphs at all. r.add(new XMLNoteReplacer(new XMLNoteReplacer.Repl() { public String process(String in) { if (in.substring(0,3).equals("<p>")) { return in; } else { int i1=in.indexOf("<p>"); int i2=in.indexOf("</p>"); if (i1<i2) { return "<p>"+in.substring(0,i1)+"</p>"+in.substring(i1); } else { return "<p>"+in; } } } })); // correct the html from the beginning to the first <p> tag r.add(new XMLNoteReplacer(new XMLNoteReplacer.Repl() { public String process(String in) { if (in.substring(in.length()-4).equals("</p>")) { return in; } else { int i; for(i=in.length()-4;i>=0 && !in.substring(i,i+4).equals("</p>");i--); if (i<0) { for(i=in.length()-3;i>=0 && !in.substring(i,i+3).equals("<p>");i--); if (i<0) { return in; } else { return in+"</p>"; } } else { return in.substring(0,i+4)+"<p>"+in.substring(i+4)+"</p>"; } } } })); // correct html from end paragraph to end of html r.add(new XMLNoteReplacer("[<][/]p[>](.*?)[<]p[>]","</p><p>$1</p><p>",true)); // correct lines in between that have no paragraph boundaries r.add(new XMLNoteReplacer("[<]p[>][<][/]p[>]","",true)); // correct empty paragraphs. //This one doesn't work. with </ul></ul></p> //r.add(new XMLNoteReplacer("[<][/](ul|ol)[>](.+?)[<][/]p[>]","</$1><p>$2</p>",true)); // No start paragraph tag for a given paragraph tag. // (other tags have been cleaned with the general tag cleaner). // ul and ol are the only ones left. // I've moved this cleaner up. r.add(new XMLNoteReplacer("(.*)","<p>$1</p>",true)); // enclose html in paragraph tags, in case there were no paragraph tags r.add(new XMLNoteReplacer("[<]p[>][<]p[>]","<p>",true)); // remove double <p><p> r.add(new XMLNoteReplacer("[<][/]p[>][<][/]p[>]","</p>",true)); // remove double </p></p> r.add(new XMLNoteReplacer("([<]p[>][<][/]p[>])+$","",true)); // remove ending (<p></p>)+ r.add(new XMLNoteReplacer(new XMLNoteReplacer.Repl() { public String process(String in) { return correctNested(in,"[<]li[>]","[<][/]li[>]"); } })); return r; } private String stripHtml(String str) { Vector<XMLNoteReplacer> R=getReplacers(); Iterator<XMLNoteReplacer> I=R.iterator(); while(I.hasNext()) { XMLNoteReplacer r=I.next(); //System.out.println(r.toString()); str=r.doReplace(str); } return str; } static public String strip(String str) { if (_stripper==null) { _stripper=new HtmlStripper(); } return _stripper.stripHtml(str); } }