/* ******************************************************************************
*
* Copyright 2008-2010 Hans Dijkema
*
* JRichTextEditor is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* JRichTextEditor is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with JRichTextEditor. If not, see <http://www.gnu.org/licenses/>.
*
* ******************************************************************************/
package nl.dykema.jxmlnote.html;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import nl.dykema.jxmlnote.exceptions.BadDocumentException;
import nl.dykema.jxmlnote.exceptions.DefaultXMLNoteErrorHandler;
import nl.dykema.jxmlnote.internationalization.DefaultXMLNoteTranslator;
import nl.dykema.jxmlnote.internationalization.XMLNoteTranslator;
import org.w3c.tidy.Tidy;
/**
 * Static helpers that run HTML through JTidy and return the result as an XHTML string.
 * The HTML can come from a URL or from a string that is already in memory.
 */
public class HtmlToXHtml {

    static XMLNoteTranslator translator=new DefaultXMLNoteTranslator();

    /** The only media type that {@link #fromHtml(URL, String, boolean)} accepts. */
    private static final String EXPECTED_CONTENT_TYPE="text/html";

    /**
     * This method converts html to xhtml, using JTidy. It calls <code>fromHtml(url,"Windows-1252",false)</code>.
     * Windows-1252 is a character encoding almost equal to ISO-8859-1 with a few extra characters. Most browsers
     * will treat ISO-8859-1 (latin-1) encoding as Windows-1252.
     *
     * @param url The URL to fetch
     * @return The converted HTML as XHTML (i.e. sanitized HTML with JTidy, readable as XML, in UTF-8)
     * @throws IOException if the URL cannot be opened or read
     * @throws BadDocumentException if the content type of the URL is not text/html
     */
    static public String fromHtml(URL url) throws IOException, BadDocumentException {
        return fromHtml(url,"Windows-1252",false);
    }

    /**
     * This method converts html to xhtml, using JTidy. The character set is taken from the
     * <code>charset</code> parameter of the Content-Type header of the response; if that parameter
     * is absent or empty, or if <code>overrideEncoding</code> is true, <code>defaultEncoding</code> is used.
     * <p>
     * If the connection reports no content type at all, a warning is issued and text/html is assumed.
     * The stream opened on the connection is closed before this method returns.
     *
     * @param url The URL to fetch
     * @param defaultEncoding The default encoding to use
     * @param overrideEncoding true, if the default encoding must always be used
     * @return The converted HTML as XHTML (i.e. sanitized HTML with JTidy, readable as XML, in UTF-8)
     * @throws IOException if the URL cannot be opened or read, or if the selected encoding is not supported
     * @throws BadDocumentException if the content type of the URL is not text/html
     */
    static public String fromHtml(URL url,String defaultEncoding,boolean overrideEncoding) throws IOException, BadDocumentException {
        URLConnection conn=url.openConnection();
        String rawContentType=conn.getContentType();

        // Reduce the header value to the bare media type (everything before the first ';').
        String contentType;
        if (rawContentType==null) { // guess html/text
            DefaultXMLNoteErrorHandler.warning(null, 90190, "Returned content type for url.openConnection() is null" );
            contentType=EXPECTED_CONTENT_TYPE;
        } else {
            int index=rawContentType.indexOf(';');
            contentType=(index>=0 ? rawContentType.substring(0,index) : rawContentType).trim();
        }
        // Media type names are case-insensitive, so "Text/HTML" must be accepted too.
        if (!contentType.equalsIgnoreCase(EXPECTED_CONTENT_TYPE)) {
            String msg=translator.translate("The content type of url '%s' is not '%s', it is '%s'");
            throw new BadDocumentException(String.format(msg,url.toString(),EXPECTED_CONTENT_TYPE,contentType));
        }

        // The character set lives in the Content-Type header. URLConnection.getContentEncoding()
        // returns the Content-Encoding header (gzip, deflate, ...), which is not a character set
        // and is therefore deliberately not consulted here.
        String encoding=null;
        if (!overrideEncoding) {
            encoding=charsetFromContentType(rawContentType);
        }
        if (encoding==null) {
            encoding=defaultEncoding;
        }

        InputStream stream=conn.getInputStream();
        try {
            BufferedReader in=new BufferedReader(new InputStreamReader(stream,encoding));
            return fromHtml(in,encoding);
        } finally {
            // Closing the stream also covers the case where the reader could not be constructed.
            stream.close();
        }
    }

    /**
     * Extracts the value of the <code>charset</code> parameter from a Content-Type header value,
     * e.g. <code>utf-8</code> from <code>text/html; charset="utf-8"</code>. The parameter name is
     * matched case-insensitively and surrounding quotes are removed.
     *
     * @param contentType the complete Content-Type header value, may be null
     * @return the charset name, or null if the header is null or has no non-empty charset parameter
     */
    private static String charsetFromContentType(String contentType) {
        if (contentType==null) {
            return null;
        }
        String[] parts=contentType.split(";");
        // parts[0] is the media type itself; parameters start at index 1.
        for (int i=1;i<parts.length;i++) {
            String param=parts[i].trim();
            int eq=param.indexOf('=');
            if (eq<0) {
                continue;
            }
            if (param.substring(0,eq).trim().equalsIgnoreCase("charset")) {
                String value=param.substring(eq+1).replace("\'", "").replace("\"", "").trim();
                return value.length()==0 ? null : value;
            }
        }
        return null;
    }

    /**
     * This method reads HTML from a string and converts it to xhtml, using JTidy. The string is read
     * through a {@link StringReader}; "UTF-8" is passed to JTidy as the input encoding setting.
     *
     * @param html The HTML text to convert
     * @return The converted HTML as XHTML (i.e. sanitized HTML with JTidy, readable as XML)
     * @throws IOException
     */
    static public String fromHtml(String html) throws IOException {
        Reader in=new StringReader(html);
        return fromHtml(in,"UTF-8");
    }

    /**
     * Runs JTidy over the characters supplied by <code>in</code> and returns the generated XHTML.
     * The DOCTYPE declaration is removed from the result and every literal <code>&amp;nbsp;</code>
     * is replaced by a space. The reader is not closed by this method.
     *
     * @param in The reader that supplies the HTML
     * @param inEncoding The encoding name that is handed to JTidy as its input encoding
     * @return The XHTML produced by JTidy, without DOCTYPE declaration
     * @throws IOException
     */
    static protected String fromHtml(Reader in,String inEncoding) throws IOException {
        Writer out=new StringWriter();

        Tidy tidy=new Tidy();
        tidy.setXHTML(true);
        tidy.setDocType("strict");
        tidy.setNumEntities(true);
        tidy.setDropFontTags(true);
        tidy.setDropProprietaryAttributes(true);
        tidy.setFixBackslash(true);
        tidy.setWord2000(true);
        // setHideComments is intentionally not enabled: comments are kept in the output.
        tidy.setEncloseText(true);
        tidy.setEncloseBlockText(true);
        tidy.setForceOutput(false); // don't force output if the HTML cannot be cleaned
        tidy.setOutputEncoding("UTF-8");
        tidy.setInputEncoding(inEncoding);
        tidy.setShowWarnings(false);
        tidy.setQuiet(true);

        tidy.parse(in, out);

        String s=out.toString();
        // Strip the DOCTYPE declaration and turn any remaining &nbsp; entity into a plain space.
        return s.replaceAll("[<][!]DOCTYPE[^>]*[>]", "").replaceAll("[&]nbsp[;]", " ");
    }
}