package de.l3s.boilerpipe.sax;
import org.xml.sax.InputSource;
import java.io.ByteArrayInputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* An {@link InputSourceable} for {@link HTMLFetcher}.
*
* @author Christian Kohlschütter
*/
public class HTMLDocument implements InputSourceable {

    /**
     * Matches an {@code <img ...>} tag; group 1 holds the attribute text. The match is
     * case-sensitive, requires a single space after {@code <img}, and cannot span line
     * terminators. An optional trailing {@code /} is left outside the group.
     */
    private static final Pattern PAT_IMAGE_TAG = Pattern.compile("<img (.*?)[/]?>");

    /** Matches a text-encoded image marker; group 1 holds the URL-encoded attributes. */
    private static final Pattern PAT_ENCODED_IMAGE_TAG = Pattern.compile("#img#(.*?)#/img#");

    /**
     * Matches a run of characters (not spanning line terminators) terminated by {@code ;}.
     * Group 1 holds the run without the semicolon.
     *
     * NOTE(review): as written this matches ANY text that precedes a semicolon, not just
     * character references. It looks like a prefix such as {@code &#} may have been lost
     * from this pattern (and from the matching restore step) -- confirm against the
     * intended behavior before relying on it.
     */
    private static final Pattern PAT_ESC_CHAR = Pattern.compile("(.*?);");

    /** Matches a text-encoded escape marker; group 1 holds the URL-encoded content. */
    private static final Pattern PAT_ENCODED_ESC_CHAR = Pattern.compile("#esc#(.*?)#/esc#");

    private final Charset charset;
    private byte[] data;

    /**
     * Creates a document from raw bytes.
     *
     * @param data the document bytes; the array is stored as-is, not copied
     * @param charset the charset in which {@code data} is encoded
     */
    public HTMLDocument(final byte[] data, final Charset charset) {
        this.data = data;
        this.charset = charset;
    }

    /**
     * Creates a document from a string; the content is stored as UTF-8 bytes.
     *
     * @param data the document text
     */
    public HTMLDocument(final String data) {
        Charset cs = Charset.forName("utf-8");
        this.data = data.getBytes(cs);
        this.charset = cs;
    }

    /**
     * @return the charset of the document bytes
     */
    public Charset getCharset() {
        return charset;
    }

    /**
     * @return the internal byte array (not a copy)
     */
    public byte[] getData() {
        return data;
    }

    /**
     * Wraps the document bytes in an {@link InputSource} whose encoding is set to the
     * name of this document's charset.
     *
     * @return a new input source reading from the current document bytes
     */
    public InputSource toInputSource() {
        final InputSource is = new InputSource(new ByteArrayInputStream(data));
        is.setEncoding(charset.name());
        return is;
    }

    /**
     * Encodes {@code <img >} tags of this document as {@code #img#<attributes>#/img#},
     * replacing the document bytes with the result.
     */
    public void encodeImageTagsAsText() {
        // Decode and re-encode with the document's own charset; relying on the platform
        // default would corrupt non-ASCII content whenever the two differ.
        final String html = new String(this.data, this.charset);
        this.data = encodeImageTagsAsText(html, this.charset.name()).getBytes(this.charset);
    }

    /**
     * Encodes {@code <img >} tags as {@code #img#<attributes>#/img#}, where the attribute
     * text is URL-encoded. The first occurrence of each distinct encoded tag is kept;
     * later duplicates are removed from the output.
     *
     * @param htmlDataString the HTML text to process
     * @param encoding charset name used for URL-encoding the attributes; if it is not
     *     supported, the platform default is used instead
     * @return the text with image tags replaced by their text-encoded form
     */
    public static String encodeImageTagsAsText(String htmlDataString, String encoding) {
        final Set<String> seenImages = new HashSet<String>();
        final Matcher matcher = PAT_IMAGE_TAG.matcher(htmlDataString);
        final StringBuffer encoded = new StringBuffer(htmlDataString.length());
        // Single left-to-right pass: already-replaced text is never rescanned.
        while (matcher.find()) {
            final String encodedImageTag =
                    "#img#" + urlEncode(matcher.group(1), encoding) + "#/img#";
            // Set.add returns false for a duplicate, which is then dropped.
            final String replacement = seenImages.add(encodedImageTag) ? encodedImageTag : "";
            matcher.appendReplacement(encoded, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(encoded);
        return encoded.toString();
    }

    /**
     * Decodes {@code #img#<attributes>#/img#} markers of this document back into
     * {@code <img >} tags, replacing the document bytes with the result.
     */
    public void restoreTextEncodedImageTags() {
        final String html = new String(this.data, this.charset);
        this.data = restoreTextEncodedImageTags(html, this.charset.name()).getBytes(this.charset);
    }

    /**
     * Decodes {@code #img#<attributes>#/img#} markers as {@code <img >} tags. The restored
     * tag always ends in a plain {@code >}; a self-closing slash dropped during encoding
     * is not re-added.
     *
     * @param htmlDataString the text containing encoded image markers
     * @param encoding charset name used for URL-decoding the attributes; if it is not
     *     supported, the platform default is used instead
     * @return the text with markers replaced by image tags
     */
    public static String restoreTextEncodedImageTags(String htmlDataString, String encoding) {
        final Matcher matcher = PAT_ENCODED_IMAGE_TAG.matcher(htmlDataString);
        final StringBuffer restored = new StringBuffer(htmlDataString.length());
        while (matcher.find()) {
            final String decodedImageTag =
                    "<img " + urlDecode(matcher.group(1), encoding) + ">";
            // Decoded attributes are arbitrary text: quote them so '$' and '\' are not
            // interpreted as group references / escapes by the regex engine.
            matcher.appendReplacement(restored, Matcher.quoteReplacement(decodedImageTag));
        }
        matcher.appendTail(restored);
        return restored.toString();
    }

    /**
     * Encodes {@code xxx;} sequences of this document as {@code #esc#xxx#/esc#},
     * replacing the document bytes with the result.
     */
    public void encodeEscapedCharsAsText() {
        final String html = new String(this.data, this.charset);
        this.data = encodeEscapedCharsAsText(html, this.charset.name()).getBytes(this.charset);
    }

    /**
     * Encodes {@code xxx;} sequences as {@code #esc#xxx#/esc#}, where {@code xxx} is
     * URL-encoded. Each run of characters ending in a semicolon (within one line) is
     * encoded exactly once, in a single left-to-right pass.
     *
     * @param htmlDataString the text to process
     * @param encoding charset name used for URL-encoding; if it is not supported, the
     *     platform default is used instead
     * @return the text with each matched sequence replaced by its text-encoded form
     */
    public static String encodeEscapedCharsAsText(String htmlDataString, String encoding) {
        final Matcher matcher = PAT_ESC_CHAR.matcher(htmlDataString);
        final StringBuffer encoded = new StringBuffer(htmlDataString.length());
        while (matcher.find()) {
            final String encodedEscChar =
                    "#esc#" + urlEncode(matcher.group(1), encoding) + "#/esc#";
            matcher.appendReplacement(encoded, Matcher.quoteReplacement(encodedEscChar));
        }
        matcher.appendTail(encoded);
        return encoded.toString();
    }

    /**
     * Decodes {@code #esc#xxx#/esc#} markers of this document back into {@code xxx;}
     * sequences, replacing the document bytes with the result.
     */
    public void restoreTextEncodedEscapedChars() {
        final String html = new String(this.data, this.charset);
        this.data =
                restoreTextEncodedEscapedChars(html, this.charset.name()).getBytes(this.charset);
    }

    /**
     * Decodes {@code #esc#xxx#/esc#} markers as {@code xxx;} sequences.
     *
     * @param htmlDataString the text containing encoded escape markers
     * @param encoding charset name used for URL-decoding; if it is not supported, the
     *     platform default is used instead
     * @return the text with markers replaced by the decoded content followed by {@code ;}
     */
    public static String restoreTextEncodedEscapedChars(String htmlDataString, String encoding) {
        final Matcher matcher = PAT_ENCODED_ESC_CHAR.matcher(htmlDataString);
        final StringBuffer restored = new StringBuffer(htmlDataString.length());
        while (matcher.find()) {
            final String decodedEscChar = urlDecode(matcher.group(1), encoding) + ";";
            // Decoded content may contain '$' or '\'; insert it literally.
            matcher.appendReplacement(restored, Matcher.quoteReplacement(decodedEscChar));
        }
        matcher.appendTail(restored);
        return restored.toString();
    }

    /**
     * URL-encodes {@code text} with the named charset, falling back to the platform
     * default encoding (after printing the stack trace) when the name is not supported.
     */
    @SuppressWarnings("deprecation") // the one-argument overload is the intended best-effort fallback
    private static String urlEncode(String text, String encoding) {
        try {
            return URLEncoder.encode(text, encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return URLEncoder.encode(text);
        }
    }

    /**
     * URL-decodes {@code text} with the named charset, falling back to the platform
     * default encoding (after printing the stack trace) when the name is not supported.
     */
    @SuppressWarnings("deprecation") // the one-argument overload is the intended best-effort fallback
    private static String urlDecode(String text, String encoding) {
        try {
            return URLDecoder.decode(text, encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return URLDecoder.decode(text);
        }
    }
}