/* ******************************************************************************
*
* Copyright 2008-2010 Hans Dijkema
*
* JRichTextEditor is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* JRichTextEditor is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with JRichTextEditor. If not, see <http://www.gnu.org/licenses/>.
*
* ******************************************************************************/
package nl.dykema.jxmlnote.html;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Stack;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import nl.dykema.jxmlnote.document.XMLNoteDocument;
import nl.dykema.jxmlnote.document.XMLNoteImageIcon;
import nl.dykema.jxmlnote.exceptions.BadDocumentException;
import nl.dykema.jxmlnote.exceptions.BadStyleException;
import nl.dykema.jxmlnote.exceptions.DefaultXMLNoteErrorHandler;
import nl.dykema.jxmlnote.styles.XMLNoteStyles;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
/**
 * Converts well-formed XHTML to XMLNote.
 * <p>
 * XHTML is a form of XML and the input is handed directly to a SAX parser, so the
 * precondition for using this class is that the source is well formed. Nothing in this
 * class tidies the input; HTML that is not well-formed XHTML has to be cleaned up (for
 * instance with a tool such as JTidy) before it is passed in.
 * <p>
 * The translation itself is done by {@code HTMLContentHandler}. Paragraph-like tags:
 * <ul>
 * <li>{@code h1}, {@code h2}, {@code h3}, {@code h4} become paragraphs with the same tag name</li>
 * <li>{@code p} becomes {@code par}</li>
 * <li>{@code li} becomes {@code par} outside a list, or {@code sp} with style
 *     {@code bullet_list} / {@code numbered_list} inside {@code ul} / {@code ol}</li>
 * <li>{@code tr} becomes {@code sp} with style {@code table}; the end of every
 *     {@code td} / {@code th} adds a {@code tab}</li>
 * <li>the end of {@code br} adds an {@code enter}</li>
 * </ul>
 * Text-level tags (only honoured inside a paragraph):
 * <ul>
 * <li>{@code b}, {@code strong} and {@code th} become {@code b}</li>
 * <li>{@code i} and {@code em} become {@code i}</li>
 * <li>{@code u} becomes {@code u}</li>
 * <li>{@code img} becomes {@code image}</li>
 * </ul>
 * Other tags are not translated; their text is kept only when it appears inside one of
 * the paragraph-like tags above. Text inside {@code script} is dropped.
 * <p>
 * The public API consists of the static {@code convert()} methods.
 *
 * @author Hans Dijkema
 */
public class XHtmlToXMLNote {

    /** Styles passed on to the {@link XMLNoteDocument} created by {@link #getDocument()}. */
    XMLNoteStyles _styles;

    /** Image provider passed on to the {@link XMLNoteDocument} created by {@link #getDocument()}. */
    XMLNoteImageIcon.Provider _prov;

    /** XMLNote XML produced by the last successful parse; {@code null} before that. */
    String _xmlnote;

    /**
     * Builds an {@link XMLNoteDocument} from the XML produced by the last parse.
     *
     * @return a new document constructed from the converted XML, the provider and the styles
     * @throws BadStyleException as declared by the {@link XMLNoteDocument} constructor
     * @throws BadDocumentException as declared by the {@link XMLNoteDocument} constructor
     */
    protected XMLNoteDocument getDocument() throws BadStyleException, BadDocumentException {
        return new XMLNoteDocument(_xmlnote, _prov, _styles);
    }

    /**
     * @return the XMLNote XML produced by the last parse, or {@code null} if nothing was parsed yet
     */
    protected String getXML() {
        return _xmlnote;
    }

    /**
     * Converts XHTML read from a URL to an XMLNoteDocument.
     * <p>
     * The raw byte stream is given to the XML parser, so the character encoding is taken
     * from the document itself (byte order mark / XML declaration) instead of from the
     * platform default charset. The stream is always closed before this method returns.
     *
     * @param url    location of the XHTML source
     * @param prov   image provider for the resulting document
     * @param styles styles for the resulting document
     * @return the converted document
     * @throws IOException if the URL cannot be opened or read
     * @throws ParserConfigurationException if no SAX parser or DOM builder can be created
     * @throws SAXException if the source is not well-formed XML
     * @throws BadStyleException as declared by the {@link XMLNoteDocument} constructor
     * @throws BadDocumentException as declared by the {@link XMLNoteDocument} constructor
     */
    public static XMLNoteDocument convert(URL url,XMLNoteImageIcon.Provider prov,XMLNoteStyles styles) throws IOException, ParserConfigurationException, SAXException, BadStyleException, BadDocumentException {
        XHtmlToXMLNote cvt = new XHtmlToXMLNote(prov, styles);
        java.io.InputStream stream = url.openStream();
        try {
            cvt.parseSource(new InputSource(stream));
        } finally {
            try {
                stream.close();
            } catch (IOException ignored) {
                // Read-only stream: a failed close loses no data and must not
                // hide an exception thrown by the parse above.
            }
        }
        return cvt.getDocument();
    }

    /**
     * Converts XHTML held in a String to an XMLNoteDocument.
     *
     * @param xhtml  the XHTML source text
     * @param prov   image provider for the resulting document
     * @param styles styles for the resulting document
     * @return the converted document
     * @throws ParserConfigurationException if no SAX parser or DOM builder can be created
     * @throws SAXException if the source is not well-formed XML
     * @throws IOException if reading the source fails
     * @throws BadStyleException as declared by the {@link XMLNoteDocument} constructor
     * @throws BadDocumentException as declared by the {@link XMLNoteDocument} constructor
     */
    public static XMLNoteDocument convert(String xhtml,XMLNoteImageIcon.Provider prov,XMLNoteStyles styles) throws ParserConfigurationException, SAXException, IOException, BadStyleException, BadDocumentException {
        XHtmlToXMLNote cvt = new XHtmlToXMLNote(prov, styles);
        cvt.parse(new StringReader(xhtml));
        return cvt.getDocument();
    }

    /**
     * Converts XHTML read from a Reader to an XMLNoteDocument. The reader is not closed
     * by this method.
     *
     * @param in     character source of the XHTML
     * @param prov   image provider for the resulting document
     * @param styles styles for the resulting document
     * @return the converted document
     * @throws ParserConfigurationException if no SAX parser or DOM builder can be created
     * @throws SAXException if the source is not well-formed XML
     * @throws IOException if reading the source fails
     * @throws BadStyleException as declared by the {@link XMLNoteDocument} constructor
     * @throws BadDocumentException as declared by the {@link XMLNoteDocument} constructor
     */
    public static XMLNoteDocument convert(Reader in,XMLNoteImageIcon.Provider prov,XMLNoteStyles styles) throws ParserConfigurationException, SAXException, IOException, BadStyleException, BadDocumentException {
        XHtmlToXMLNote cvt = new XHtmlToXMLNote(prov, styles);
        cvt.parse(in);
        return cvt.getDocument();
    }

    /**
     * Parses XHTML from a character stream and stores the resulting XMLNote XML, which is
     * afterwards available through {@link #getXML()} and {@link #getDocument()}.
     *
     * @param rd character source of the XHTML; not closed by this method
     * @throws ParserConfigurationException if no SAX parser or DOM builder can be created
     * @throws SAXException if the source is not well-formed XML
     * @throws IOException if reading the source fails
     */
    protected void parse(Reader rd) throws ParserConfigurationException, SAXException, IOException {
        parseSource(new InputSource(rd));
    }

    /**
     * Runs the SAX parse for any kind of input source (character or byte stream) and
     * stores the serialized XMLNote result.
     */
    private void parseSource(InputSource source) throws ParserConfigurationException, SAXException, IOException {
        // NOTE(review): the parser runs with its default configuration, so DOCTYPE
        // declarations and external entities in the input are processed. When the input
        // is untrusted this allows XXE-style attacks. It is not switched off here because
        // XHTML named entities (e.g. &nbsp;) are normally supplied by the external DTD;
        // consider installing an EntityResolver that serves local copies of the XHTML DTDs.
        SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
        XMLReader reader = parser.getXMLReader();
        HTMLContentHandler handler = new HTMLContentHandler(this);
        reader.setContentHandler(handler);
        reader.parse(source);
        // Only reached when the parse completed without an exception.
        _xmlnote = handler.getXML();
    }

    /**
     * Creates a converter; instances are created by the static {@code convert()} methods.
     *
     * @param prov   image provider for the resulting document
     * @param styles styles for the resulting document
     */
    protected XHtmlToXMLNote(XMLNoteImageIcon.Provider prov,XMLNoteStyles styles) {
        _styles = styles;
        _prov = prov;
        _xmlnote = null;
    }
}
/**
 * SAX content handler that translates XHTML parse events into an XMLNote DOM tree.
 * <p>
 * The tree starts out as {@code <xmlnote version="2010.1"><meta/><notes/></xmlnote>}.
 * Elements under construction are kept on an element stack whose bottom entry is the
 * {@code notes} element; an element is attached to its parent when it is popped.
 * <p>
 * Paragraph-like XHTML tags ({@code h1}-{@code h4}, {@code p}, {@code li}, {@code tr})
 * may be nested. Only the outermost one opens an XMLNote paragraph and only its end tag
 * closes it; the nesting is tracked on a separate stack of open paragraph tags.
 * Character data is copied only while such a tag is open and contents are not ignored.
 */
class HTMLContentHandler extends DefaultHandler {

    private enum ListType { NONE, NUMBER, BULLET }

    /** Amount added to the paragraph indent for every open {@code ul} / {@code ol}. */
    private static final int LIST_INDENT = 36;

    /** Matches a run of whitespace; every run is collapsed to one space in {@link #addText}. */
    private static final java.util.regex.Pattern WHITESPACE = java.util.regex.Pattern.compile("\\s+");

    /** Tags whose character contents are skipped. */
    private static final String[] ignoreTags = {
        "script",
    };

    /** Converter that created this handler; stored but not read by this class. */
    private XHtmlToXMLNote _converter;
    /** The XMLNote document being built. */
    private org.w3c.dom.Document _xmlnote;
    /** Kind of every open list; the bottom entry is always {@link ListType#NONE}. */
    private Stack<ListType> _list;
    /** Item counter of every open list, parallel to {@link #_list}. */
    private Stack<Integer> _listCount;
    /** While true, character data is dropped (true until {@code body} starts). */
    private boolean _ignoreContents;
    /** Open paragraph-like tags; non-empty means "inside a paragraph". */
    private Stack<String> _parTags;
    /** Current list indent, a multiple of {@link #LIST_INDENT}. */
    private int _indent;
    /** Elements under construction; the bottom entry is the {@code notes} element. */
    private Stack<org.w3c.dom.Element> _elemStack;
    /** True between the start and end of an {@code img} that pushed an {@code image} element. */
    private boolean _imageOpen;
    /** True when the open image also pushed its own {@code par} that must be closed with it. */
    private boolean _imageOwnsPar;

    private boolean ignoreContents() {
        return _ignoreContents;
    }

    private boolean ignoreContents(boolean n) {
        _ignoreContents = n;
        return _ignoreContents;
    }

    /** @return true when {@code s} is one of the tags whose contents are skipped */
    private boolean inIgnores(String s) {
        for (String tag : ignoreTags) {
            if (tag.equals(s)) {
                return true;
            }
        }
        return false;
    }

    /** @return true while at least one paragraph-like tag is open */
    private boolean inParagraph() {
        return !_parTags.isEmpty();
    }

    /** Records that a paragraph-like tag was opened. */
    private boolean inParagraph(String parTag) {
        _parTags.push(parTag);
        return inParagraph();
    }

    /** Records that the innermost paragraph-like tag was closed. */
    private boolean outParagraph() {
        _parTags.pop();
        return inParagraph();
    }

    /**
     * Serializes the XMLNote document built so far.
     *
     * @return the XML text; an {@link IOException} during serialization is reported to
     *         {@link DefaultXMLNoteErrorHandler} and whatever was written is returned
     */
    public String getXML() {
        Writer out = new StringWriter();
        OutputFormat format = new OutputFormat(_xmlnote);
        XMLSerializer serial = new XMLSerializer(out, format);
        try {
            serial.serialize(_xmlnote);
        } catch (IOException e) {
            DefaultXMLNoteErrorHandler.exception(e);
        }
        return out.toString();
    }

    /** Pushes a new paragraph element; it is attached to its parent when popped. */
    private void startPar(String tag) {
        _elemStack.push(_xmlnote.createElement(tag));
    }

    private void startPar(String tag, String style) {
        startPar(tag);
        _elemStack.peek().setAttribute("style", style);
    }

    private void startPar(String tag, String style, int indent) {
        startPar(tag, style);
        _elemStack.peek().setAttribute("indent", Integer.toString(indent));
    }

    /** Pushes a new inline element; it is attached to its parent by {@link #endElem}. */
    private void startElem(String tag) {
        _elemStack.push(_xmlnote.createElement(tag));
    }

    /**
     * Closes one paragraph-like tag. When it was the outermost one, the paragraph element
     * is popped, terminated with an {@code enter} (unless it already ends with one) and
     * attached to its parent.
     *
     * @param e name of the paragraph being closed (informational only)
     */
    private void endPar(String e) {
        outParagraph();
        if (inParagraph()) {
            return; // an enclosing paragraph-like tag is still open
        }
        org.w3c.dom.Element par = _elemStack.pop();
        org.w3c.dom.Node last = par.getLastChild();
        boolean endsWithEnter = (last instanceof org.w3c.dom.Element)
                && ((org.w3c.dom.Element) last).getTagName().equals("enter");
        if (!endsWithEnter) {
            par.appendChild(_xmlnote.createElement("enter"));
        }
        _elemStack.peek().appendChild(par);
    }

    /**
     * Pops the element on top of the stack and attaches it to the element below it.
     *
     * @param e name of the element being closed (informational only)
     */
    private void endElem(String e) {
        org.w3c.dom.Element el = _elemStack.pop();
        _elemStack.peek().appendChild(el);
    }

    /** Appends an empty element with the given tag to the element on top of the stack. */
    private void appendEmptyElem(String tag) {
        startElem(tag);
        endElem(tag);
    }

    /**
     * Appends text to the element on top of the stack. Every run of whitespace (spaces,
     * tabs, line breaks) is collapsed to one space first, and each remaining space is
     * written as a {@code space} element between the text nodes.
     */
    private void addText(String txt) {
        org.w3c.dom.Element target = _elemStack.peek();
        String collapsed = WHITESPACE.matcher(txt).replaceAll(" ");
        int length = collapsed.length();
        int wordStart = 0;
        for (int i = 0; i < length; i++) {
            if (collapsed.charAt(i) == ' ') {
                if (wordStart != i) {
                    target.appendChild(_xmlnote.createTextNode(collapsed.substring(wordStart, i)));
                }
                target.appendChild(_xmlnote.createElement("space"));
                wordStart = i + 1;
            }
        }
        if (wordStart != length) {
            target.appendChild(_xmlnote.createTextNode(collapsed.substring(wordStart, length)));
        }
    }

    /**
     * Converts a CSS absolute length to points. Units that are not recognised are treated
     * as points.
     */
    private static double toPoints(double amount, String unit) {
        if (unit.equalsIgnoreCase("cm")) { return amount * (72.0 / 2.54); }
        if (unit.equalsIgnoreCase("mm")) { return amount * (72.0 / 25.4); }
        if (unit.equalsIgnoreCase("in")) { return amount * 72.0; }
        if (unit.equalsIgnoreCase("pc")) { return amount * 12.0; } // 1 pica = 12 points
        return amount;
    }

    /**
     * Stores one CSS dimension of an image as {@code <dimension>_in_px} (bare number or
     * {@code px}) or {@code <dimension>_in_pt} (any other unit, converted to points).
     * The value is rounded to a whole number. Values that do not start with a
     * non-negative number are ignored, as are percentages.
     *
     * @param image     the {@code image} element to annotate
     * @param dimension {@code "width"} or {@code "height"}
     * @param val       the CSS value, e.g. {@code "100px"}, {@code "2.5cm"} or {@code "80"}
     */
    private void setImageSize(org.w3c.dom.Element image, String dimension, String val) {
        String value = val.trim();
        int split = 0;
        while (split < value.length()) {
            char c = value.charAt(split);
            if ((c < '0' || c > '9') && c != '.') {
                break;
            }
            split++;
        }
        double amount;
        try {
            amount = Double.parseDouble(value.substring(0, split));
        } catch (NumberFormatException e) {
            return; // no usable number: leave the dimension unset instead of aborting the parse
        }
        String unit = value.substring(split).trim();
        if (unit.equals("%")) {
            // A percentage cannot be stored: an attribute name containing '%' is not a
            // legal XML name and would be rejected by the DOM.
            return;
        }
        if (unit.length() == 0 || unit.equalsIgnoreCase("px")) {
            image.setAttribute(dimension + "_in_px", String.valueOf(Math.round(amount)));
        } else {
            image.setAttribute(dimension + "_in_pt", String.valueOf(Math.round(toPoints(amount, unit))));
        }
    }

    /** Opens the paragraph for a list item, with its bullet or number prefix. */
    private void startListItem() {
        ListType type = _list.peek();
        if (!inParagraph()) {
            if (type == ListType.NONE) {
                startPar("par");
            } else if (type == ListType.NUMBER) {
                startPar("sp", "numbered_list", _indent);
                int cnt = _listCount.pop();
                addText(cnt + "."); // plain concatenation keeps the digits locale independent
                appendEmptyElem("tab");
                _listCount.push(Integer.valueOf(cnt + 1));
            } else {
                startPar("sp", "bullet_list", _indent);
                addText("-");
                appendEmptyElem("tab");
            }
        }
        inParagraph("li");
    }

    /**
     * Opens an {@code image} element for an {@code img} tag, wrapped in its own
     * {@code par} when no paragraph is open. An image whose {@code src} is not an
     * absolute URL is skipped entirely.
     */
    private void startImage(Attributes attrs) {
        _imageOpen = false;
        _imageOwnsPar = false;

        // Validate the URL before anything is pushed, so that a rejected image leaves the
        // element stack untouched and the matching end tag has nothing to close.
        String src = attrs.getValue("src");
        String url = null;
        if (src != null) {
            try {
                url = new URL(src).toString();
            } catch (MalformedURLException e) {
                return; // skip this image. It won't work.
            }
        }

        _imageOwnsPar = !inParagraph();
        if (_imageOwnsPar) { startPar("par"); }
        startElem("image");
        _imageOpen = true;

        if (url == null) {
            return; // no src attribute: the image element is emitted without attributes
        }

        org.w3c.dom.Element image = _elemStack.peek();
        image.setAttribute("url", url);
        String alt = attrs.getValue("alt");
        if (alt != null) { image.setAttribute("description", alt); }
        String width = attrs.getValue("width");
        if (width != null) { image.setAttribute("width_in_px", width); }
        String height = attrs.getValue("height");
        if (height != null) { image.setAttribute("height_in_px", height); }

        // Dimensions given in the style attribute are applied after the plain attributes.
        String style = attrs.getValue("style");
        if (style != null) {
            for (String keyval : style.split("[;]")) {
                String[] kv = keyval.split("[:]");
                if (kv.length == 2) {
                    String key = kv[0].trim();
                    if (key.equalsIgnoreCase("width")) {
                        setImageSize(image, "width", kv[1]);
                    } else if (key.equalsIgnoreCase("height")) {
                        setImageSize(image, "height", kv[1]);
                    }
                }
            }
        }
    }

    /** Closes what {@link #startImage} opened; does nothing for a skipped image. */
    private void endImage() {
        if (!_imageOpen) {
            return;
        }
        _imageOpen = false;
        String txt = _elemStack.peek().getTextContent();
        if (txt == null || txt.equals("")) {
            addText("i"); // the image element is given the placeholder text "i" when empty
        }
        endElem("image");
        if (_imageOwnsPar) {
            _imageOwnsPar = false;
            endElem("par");
        }
    }

    private static boolean isHeading(String qName) {
        return qName.equals("h1") || qName.equals("h2") || qName.equals("h3") || qName.equals("h4");
    }

    /**
     * Handles an XHTML start tag: opens paragraphs, lists, inline markup and images.
     * Tags that are not recognised are ignored.
     */
    @Override
    public void startElement(String uri, String localName,String qName,Attributes attrs) throws SAXException {
        if (qName.equals("body")) {
            ignoreContents(false);
        } else if (isHeading(qName)) {
            if (!inParagraph()) { startPar(qName); }
            inParagraph(qName);
        } else if (qName.equals("ul")) {
            _indent += LIST_INDENT;
            _listCount.push(Integer.valueOf(0));
            _list.push(ListType.BULLET);
        } else if (qName.equals("ol")) {
            _indent += LIST_INDENT;
            _listCount.push(Integer.valueOf(1));
            _list.push(ListType.NUMBER);
        } else if (qName.equals("li")) {
            startListItem();
        } else if (qName.equals("b") || qName.equals("strong")) {
            if (inParagraph()) { startElem("b"); }
        } else if (qName.equals("i") || qName.equals("em")) {
            if (inParagraph()) { startElem("i"); }
        } else if (qName.equals("u")) {
            if (inParagraph()) { startElem("u"); }
        } else if (qName.equals("img")) {
            startImage(attrs);
        } else if (qName.equals("p")) {
            if (!inParagraph()) { startPar("par"); }
            inParagraph("par");
        } else if (qName.equals("br")) {
            // Enters are added at the end tag.
        } else if (qName.equals("tr")) {
            if (!inParagraph()) { startPar("sp", "table"); }
            inParagraph("sp");
        } else if (qName.equals("td")) {
            // Tabs are added at the end tag.
        } else if (qName.equals("th")) {
            if (inParagraph()) { startElem("b"); }
        } else if (inIgnores(qName)) {
            ignoreContents(true);
        }
    }

    /** Copies character data into the current element while inside a paragraph. */
    @Override
    public void characters(char [] ch,int start,int length) throws SAXException {
        if (!ignoreContents() && inParagraph()) {
            addText(new String(ch, start, length));
        }
    }

    /**
     * Handles an XHTML end tag: closes whatever the matching start tag opened and adds
     * the {@code enter} / {@code tab} elements for {@code br}, {@code td} and {@code th}.
     */
    @Override
    public void endElement(String uri,String localName,String qName) throws SAXException {
        if (isHeading(qName)) {
            endPar(qName);
        } else if (qName.equals("ul") || qName.equals("ol")) {
            _indent -= LIST_INDENT;
            _listCount.pop();
            _list.pop();
        } else if (qName.equals("li")) {
            endPar(_list.peek() == ListType.NONE ? "par" : "sp");
        } else if (qName.equals("b") || qName.equals("strong")) {
            if (inParagraph()) { endElem("b"); }
        } else if (qName.equals("i") || qName.equals("em")) {
            if (inParagraph()) { endElem("i"); }
        } else if (qName.equals("u")) {
            if (inParagraph()) { endElem("u"); }
        } else if (qName.equals("img")) {
            endImage();
        } else if (qName.equals("p")) {
            endPar("par");
        } else if (qName.equals("br")) {
            if (inParagraph()) { appendEmptyElem("enter"); }
        } else if (qName.equals("tr")) {
            endPar("sp");
        } else if (qName.equals("td")) {
            if (inParagraph()) { appendEmptyElem("tab"); }
        } else if (qName.equals("th")) {
            if (inParagraph()) { endElem("b"); appendEmptyElem("tab"); }
        } else if (inIgnores(qName)) { // TODO: check whether this should also work recursively (nested ignored tags)
            ignoreContents(false);
        }
    }

    /**
     * Creates the handler together with the empty XMLNote document skeleton.
     *
     * @param converter the converter this handler works for
     * @throws ParserConfigurationException if no DOM document builder can be created
     */
    public HTMLContentHandler(XHtmlToXMLNote converter) throws ParserConfigurationException {
        _converter = converter;
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        DocumentBuilder db = dbf.newDocumentBuilder();
        _xmlnote = db.newDocument();
        org.w3c.dom.Element root = _xmlnote.createElement("xmlnote");
        root.setAttribute("version", "2010.1");
        root.appendChild(_xmlnote.createElement("meta"));
        org.w3c.dom.Element notes = _xmlnote.createElement("notes");
        root.appendChild(notes);
        _xmlnote.appendChild(root);
        // Finished paragraphs are attached to <notes>, so it forms the bottom of the stack.
        _elemStack = new Stack<org.w3c.dom.Element>();
        _elemStack.push(notes);
        // Sentinel entries: "not inside any list".
        _list = new Stack<ListType>();
        _listCount = new Stack<Integer>();
        _listCount.push(Integer.valueOf(0));
        _list.push(ListType.NONE);
        // Everything before <body> is ignored.
        _ignoreContents = true;
        _parTags = new Stack<String>();
        _indent = 0;
        _imageOpen = false;
        _imageOwnsPar = false;
    }
}