/*
* Red Bee Browser
*
* Copyright (c) 2013 Tran Dinh Thoai <dthoai@yahoo.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.redbee;
public class PageSaver {
private final static int MAX_LEVEL = 5;
private String link;
private String folder;
private java.util.Map<String, String> map;
private int level;
public PageSaver(String link, String folder) {
this.link = link;
this.folder = folder;
this.map = new java.util.HashMap<String, String>();
this.level = 1;
}
public PageSaver(String link, String folder, int level) {
this.link = link;
this.folder = folder;
this.map = new java.util.HashMap<String, String>();
this.level = level;
}
public void run() throws Exception {
new java.io.File(folder).mkdirs();
org.jsoup.Connection conn = org.jsoup.Jsoup.connect(link);
conn.timeout(60000);
conn.userAgent("Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1");
org.jsoup.nodes.Document doc = conn.get();
link = conn.request().url().toString();
org.jsoup.select.Elements children = doc.select("a");
for (int i = 0; i < children.size(); i++) {
org.jsoup.nodes.Element child = children.get(i);
String url = child.attr("href");
try {
java.net.URL turl = new java.net.URL(new java.net.URL(link), url);
url = turl.toString();
} catch (Exception e) {
continue;
}
child.attr("href", url);
}
children = doc.select("img");
for (int i = 0; i < children.size(); i++) {
org.jsoup.nodes.Element child = children.get(i);
String url = child.attr("src");
String url2 = "";
try {
java.net.URL turl = new java.net.URL(new java.net.URL(link), url);
url = turl.toString();
url2 = url;
String q = turl.getQuery();
if (q != null) {
url2 = url.substring(0, url.length() - q.length());
if (url2.endsWith("?")) {
url2 = url2.substring(0, url2.length() - 1);
}
}
} catch (Exception e) {
continue;
}
String filename = suniqid();
int pos1 = url2.lastIndexOf(".");
int pos2 = url2.lastIndexOf("/");
if (pos1 > pos2) {
String ext = toLetterDigit(url2.substring(pos1 + 1));
if (ext.equalsIgnoreCase("com") || ext.equalsIgnoreCase("exe")) {
ext += "_";
}
if (ext.length() > 0) {
filename += "." + ext;
}
}
try {
if (map.containsKey(url)) {
filename = map.get(url);
} else {
save(new java.io.File(folder, filename).getAbsolutePath(), url);
map.put(url, filename);
}
child.attr("src", filename);
} catch (Exception e) {
child.attr("src", url);
}
}
parseStyleAttr(doc);
children = doc.select("style");
for (int i = 0; i < children.size(); i++) {
org.jsoup.nodes.Element child = children.get(i);
child.html(parseStyleUrl(child.html()));
}
children = doc.select("link");
for (int i = 0; i < children.size(); i++) {
org.jsoup.nodes.Element child = children.get(i);
if ("stylesheet".equalsIgnoreCase(child.attr("rel"))) {
String url = child.attr("href");
String url2 = "";
try {
java.net.URL turl = new java.net.URL(new java.net.URL(link), url);
url = turl.toString();
url2 = url;
String q = turl.getQuery();
if (q != null) {
url2 = url.substring(0, url.length() - q.length());
if (url2.endsWith("?")) {
url2 = url2.substring(0, url2.length() - 1);
}
}
} catch (Exception e) {
continue;
}
String filename = suniqid();
int pos1 = url2.lastIndexOf(".");
int pos2 = url2.lastIndexOf("/");
if (pos1 > pos2) {
String ext = toLetterDigit(url2.substring(pos1 + 1));
if (ext.equalsIgnoreCase("com") || ext.equalsIgnoreCase("exe")) {
ext += "_";
}
if (ext.length() > 0) {
filename += "." + ext;
}
}
try {
if (map.containsKey(url)) {
filename = map.get(url);
} else {
save(new java.io.File(folder, filename).getAbsolutePath(), url);
String style = new String(read(new java.io.File(folder, filename).getAbsolutePath()), "UTF-8");
style = parseStyleUrl(style);
save(new java.io.File(folder, filename).getAbsolutePath(), style.getBytes("UTF-8"));
map.put(url, filename);
}
child.attr("href", filename);
} catch (Exception e) {
child.attr("href", url);
}
} else {
String url = child.attr("href");
try {
java.net.URL turl = new java.net.URL(new java.net.URL(link), url);
url = turl.toString();
} catch (Exception e) {
continue;
}
child.attr("href", url);
}
}
children = doc.select("script");
for (int i = 0; i < children.size(); i++) {
org.jsoup.nodes.Element child = children.get(i);
String url = child.attr("src");
if (url.trim().length() == 0) continue;
String url2 = "";
try {
java.net.URL turl = new java.net.URL(new java.net.URL(link), url);
url = turl.toString();
url2 = url;
String q = turl.getQuery();
if (q != null) {
url2 = url.substring(0, url.length() - q.length());
if (url2.endsWith("?")) {
url2 = url2.substring(0, url2.length() - 1);
}
}
} catch (Exception e) {
continue;
}
String filename = suniqid();
int pos1 = url2.lastIndexOf(".");
int pos2 = url2.lastIndexOf("/");
if (pos1 > pos2) {
String ext = toLetterDigit(url2.substring(pos1 + 1));
if (ext.equalsIgnoreCase("com") || ext.equalsIgnoreCase("exe")) {
ext += "_";
}
if (ext.length() > 0) {
filename += "." + ext;
}
}
try {
if (map.containsKey(url)) {
filename = map.get(url);
} else {
save(new java.io.File(folder, filename).getAbsolutePath(), url);
map.put(url, filename);
}
child.attr("src", filename);
} catch (Exception e) {
child.attr("src", url);
}
}
children = doc.select("frame");
for (int i = 0; i < children.size(); i++) {
org.jsoup.nodes.Element child = children.get(i);
String url = child.attr("src");
try {
java.net.URL turl = new java.net.URL(new java.net.URL(link), url);
url = turl.toString();
} catch (Exception e) {
continue;
}
String filename = suniqid();
try {
if (level < MAX_LEVEL) {
new PageSaver(url, new java.io.File(folder, filename).getAbsolutePath(), level + 1).run();
child.attr("src", filename + "/index.html");
} else {
child.attr("src", url);
}
} catch (Exception e) {
child.attr("src", url);
}
}
children = doc.select("iframe");
for (int i = 0; i < children.size(); i++) {
org.jsoup.nodes.Element child = children.get(i);
String url = child.attr("src");
try {
java.net.URL turl = new java.net.URL(new java.net.URL(link), url);
url = turl.toString();
} catch (Exception e) {
continue;
}
String filename = suniqid();
try {
if (level < MAX_LEVEL) {
new PageSaver(url, new java.io.File(folder, filename).getAbsolutePath(), level + 1).run();
child.attr("src", filename + "/index.html");
} else {
child.attr("src", url);
}
} catch (Exception e) {
child.attr("src", url);
}
}
children = doc.select("base");
for (int i = children.size() - 1; i >= 0; i--) {
org.jsoup.nodes.Element child = children.get(i);
child.remove();
}
String filename = new java.io.File(folder, "index.html").getAbsolutePath();
save(filename, doc.html().getBytes("UTF-8"));
}
private String parseStyleUrl(String style) {
style = style.replaceAll("<!--", "");
style = style.replaceAll("-->", "");
String styleL = style.toLowerCase();
String styleT = "";
int oldpos = 0;
int pos1 = styleL.indexOf("url(");
while (pos1 >= 0) {
int pos2 = style.indexOf(")", pos1 + 4);
if (pos2 >= 0) {
String url = style.substring(pos1 + 4, pos2);
if (url.startsWith("\"")) {
url = url.substring(1);
} else if (url.startsWith("'")) {
url = url.substring(1);
}
if (url.endsWith("\"")) {
url = url.substring(0, url.length() - 1);
} else if (url.endsWith("'")) {
url = url.substring(0, url.length() - 1);
}
String url2 = "";
try {
java.net.URL turl = new java.net.URL(new java.net.URL(link), url);
url = turl.toString();
url2 = url;
String q = turl.getQuery();
if (q != null) {
url2 = url.substring(0, url.length() - q.length());
if (url2.endsWith("?")) {
url2 = url2.substring(0, url2.length() - 1);
}
}
} catch (Exception e) {
}
String filename = suniqid();
int posA = url2.lastIndexOf(".");
int posB = url2.lastIndexOf("/");
if (posA > posB) {
String ext = toLetterDigit(url2.substring(posA + 1));
if (ext.equalsIgnoreCase("com") || ext.equalsIgnoreCase("exe")) {
ext += "_";
}
if (ext.length() > 0) {
filename += "." + ext;
}
}
try {
if (map.containsKey(url)) {
filename = map.get(url);
} else {
save(new java.io.File(folder, filename).getAbsolutePath(), url);
map.put(url, filename);
}
styleT += style.substring(oldpos, pos1 + 4) + "'" + filename + "'";
} catch (Exception e) {
styleT += style.substring(oldpos, pos1 + 4) + url;
}
oldpos = pos2;
pos1 = styleL.indexOf("url(", oldpos);
} else {
pos1 = styleL.indexOf("url(", pos1 + 4);
}
}
styleT += style.substring(oldpos);
return styleT;
}
private void parseStyleAttr(org.jsoup.nodes.Element parent) {
String style = parent.attr("style");
if (style != null && style.length() > 0) {
parent.attr("style", parseStyleUrl(style));
}
for (int i = 0; i < parent.children().size(); i++) {
parseStyleAttr(parent.child(i));
}
}
private String toLetterDigit(String src) {
String tag = "";
for (int i = 0; i < src.length(); i++) {
if (Character.isLetterOrDigit(src.charAt(i))) {
tag += src.charAt(i) + "";
}
}
return tag;
}
private void save(String filename, String url) throws Exception {
org.jsoup.Connection conn = org.jsoup.Jsoup.connect(url);
conn.timeout(60000);
conn.userAgent("Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1");
conn.ignoreContentType(true);
byte[] data = conn.execute().bodyAsBytes();
save(filename, data);
}
private byte[] read(String filename) throws Exception {
java.io.InputStream is = new java.io.FileInputStream(filename);
byte[] data = new byte[is.available()];
is.read(data);
is.close();
return data;
}
private void save(String filename, byte[] data) throws Exception {
java.io.OutputStream os = new java.io.FileOutputStream(filename);
os.write(data);
os.close();
}
private String suniqid() {
java.util.Random random = new java.util.Random();
return Long.toString(Math.abs(random.nextLong()), 36);
}
}