package com.newsrob.download; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.net.MalformedURLException; import java.net.SocketException; import java.net.SocketTimeoutException; import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HeaderElement; import org.apache.http.HttpHost; import org.apache.http.HttpRequest; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.NameValuePair; import org.apache.http.client.methods.HttpGet; import org.apache.http.protocol.BasicHttpContext; import org.apache.http.protocol.ExecutionContext; import org.apache.http.protocol.HttpContext; import android.content.Context; import android.util.Log; import com.newsrob.AssetContentProvider; import com.newsrob.EntryManager; import com.newsrob.NewsRob; import com.newsrob.PL; import com.newsrob.jobs.Job; import com.newsrob.storage.IStorageAdapter; import com.newsrob.util.Timing; public class WebPageDownloadDirector { private Map<URL, Asset> assetUrls2download = new HashMap<URL, Asset>(15); private URL pageUrl; private String id; private int assetCounter = 1; private EntryManager entryManager; private boolean isDetailedLoggingEnabled; static final String TAG = WebPageDownloadDirector.class.getSimpleName(); private IStorageAdapter fileContext; private Context context; private long started = System.currentTimeMillis(); static final long PAGE_DOWNLOAD_TIMEOUT_MS = 180000; private static final Pattern PATTERN_LINK_HREF = Pattern.compile( "<\\s*?link.*?href.*?[\"']([^\"]*?\\.css).*?[\"'].*?>", Pattern.CASE_INSENSITIVE); // | // Pattern.MULTILINE); // "<\\s*?link.*?href.*?\"(.*?\\.css)\".*?>" private static final Pattern PATTERN_IMG_SRC = Pattern.compile( "[^>]*?<\\s*?img[^><]*?src[^><]*?[\"'](.*?)[\"'][^<]*?>.*?", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); // Removed // a // tailing // $ // "^.*<\\s*?img.*?src.*?\"(.*?)\".*?>.*$" // private static final Pattern PATTERN_IMG_SRC = // Pattern.compile("<\\s*?img.*?src.*?\"(.*?)\".*?>", // Pattern.CASE_INSENSITIVE); // | Pattern.MULTILINE private static final Pattern PATTERN_EXTENSION = Pattern.compile(".*(\\.\\w+).*?$"); private static final Pattern PATTERN_BACKGROUND_IMG = Pattern.compile( "background.*?:.*?url\\([\"']?(.*?)[\"']?\\)", Pattern.CASE_INSENSITIVE); // | // Pattern.MULTILINE static final Pattern PATTERN_CHARSET = Pattern.compile( "^.*?meta[^>]*?http-equiv[^>]*?Content-Type[^>]*?charset=([a-z0-9-]*).*$", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); public static WebPageDownloadDirector downloadWebPage(String id, URL pageUrl, IStorageAdapter fileContext, Job job, String summary, boolean downloadCompleteWebPage, EntryManager entryManager, boolean manualSync) throws DownloadException, DownloadTimedOutException, DownloadCancelledException { try { return new WebPageDownloadDirector(id, pageUrl, fileContext, job, summary, downloadCompleteWebPage, entryManager, manualSync); } catch (OutOfMemoryError oome) { throw new DownloadException("OutOfMemory when processing " + pageUrl + ".", oome); } } public static int removeAllAssets(IStorageAdapter fileContext) { int noOfDeletedAssets = 0; if (fileContext == null) throw new IllegalStateException("fileContext cannot be null."); fileContext.clear(); return noOfDeletedAssets; } // LATER move out to IStorage... public static int removeAssetsForId(String atomId, IStorageAdapter fileContext) { int noOfDeletedAssets = 0; if (fileContext == null) throw new IllegalStateException("fileContext cannot be null."); noOfDeletedAssets = fileContext.removeAllAssets(atomId); return noOfDeletedAssets; } private WebPageDownloadDirector(String id, URL pageUrl, IStorageAdapter fileContext, Job job, CharSequence summary, boolean downloadCompleteWebPage, EntryManager entryManager, boolean manualSync) throws DownloadException, DownloadTimedOutException, DownloadCancelledException { this.context = entryManager.getContext(); Timing t = null; if (isDetailedLoggingEnabled) t = new Timing("WPDD: Downloading " + pageUrl, context); this.entryManager = entryManager; try { pageUrl.toURI(); } catch (URISyntaxException e1) { throw new DownloadException("Problem with a URI: " + pageUrl, e1); } isDetailedLoggingEnabled = "1".equals(NewsRob.getDebugProperties(context).getProperty( "webpageDownloadDirector", "0")); if (isDetailedLoggingEnabled) PL.log("WPDD: Making offline: " + pageUrl, context); NewsRobHttpClient httpClient = NewsRobHttpClient.newInstance(true, context); try { this.id = id; this.pageUrl = pageUrl; this.fileContext = fileContext; if (downloadCompleteWebPage) { assertDownloadShouldContinue(manualSync); NewsRobHttpClient httpC = NewsRobHttpClient.newInstance(true, context); try { Map<String, String> results = Downloader.loadTextFromUrl(httpC, pageUrl, started, job, context); CharSequence pageContent = results.get("content"); pageUrl = new URL(results.get("url")); this.pageUrl = pageUrl; // LATER assertDownloadShouldContinue(manualSync); // pageContent = // pageContent.toString().replace("iso-8859-1", // "utf-8"); pageContent = convertImageTags(pageContent, job); pageContent = convertStyleSheetLinks(pageContent, job); pageContent = convertStyleSheetImageTags(pageUrl, pageContent, job); savePage(pageContent, "x"); } finally { httpC.close(); } } if (summary != null) { summary = convertImageTags(summary, job); summary = convertStyleSheetLinks(summary, job); summary = convertStyleSheetImageTags(pageUrl, summary, job); if (true) summary = summary; else summary = "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/></head><body>" + summary + "</body></html>"; savePage(summary, "s"); } downloadAndConvertAllImageTagsInStyleSheets(httpClient, job, manualSync); downloadAllImages(httpClient, started, job, pageUrl, manualSync); } catch (SocketTimeoutException ste) { throw new DownloadTimedOutException(); } catch (DownloadException de) { throw de; } catch (DownloadCancelledException dce) { throw dce; } catch (DownloadTimedOutException dce) { throw dce; } catch (Exception e) { throw new DownloadException("Problem while downloading " + pageUrl + ".", e); } finally { httpClient.close(); } if (t != null) { t.stop(); Log.d(TAG, "Assets downloaded for " + pageUrl + ": " + assetCounter); } } private void assertDownloadShouldContinue(boolean manualSync) throws DownloadCancelledException { if (!entryManager.downloadContentCurrentlyEnabled(manualSync)) throw new DownloadCancelledException("WiFi no longer available."); } private void savePage(CharSequence convertedPageContent, String postfix) throws DownloadException { final String aId = "a" + id; String fileName = aId + "/" + aId + "_" + postfix + ".html"; try { U2.saveTextFile(fileContext.openFileOutput(fileName + "nr"), convertedPageContent); } catch (IOException e) { throw new DownloadException("Problem during writing of " + fileName + " for page " + pageUrl + ".", e); } } private void downloadAllImages(NewsRobHttpClient httpClient, long started, Job job, URL pageUrl, boolean manualSync) throws DownloadCancelledException, DownloadTimedOutException { for (Asset asset : assetUrls2download.values()) { assertDownloadShouldContinue(manualSync); if (Asset.TYPE_IMAGE == asset.type) { try { if (job.isCancelled()) throw new DownloadCancelledException(); downloadBinaryAsset(httpClient, asset, started, job, pageUrl); } catch (URISyntaxException e) { asset.exception = e; } } } } private void downloadAndConvertAllImageTagsInStyleSheets(NewsRobHttpClient httpClient, Job job, boolean manualSync) throws DownloadException, DownloadCancelledException, IOException, DownloadTimedOutException { Collection<Asset> assets2downloadCopy = new ArrayList<Asset>(assetUrls2download.values()); for (Asset asset : assets2downloadCopy) { assertDownloadShouldContinue(manualSync); if (Asset.TYPE_STYLESHEET == asset.type) { CharSequence content; try { if (job.isCancelled()) throw new DownloadCancelledException(); Thread.yield(); Map<String, String> results = Downloader.loadTextFromUrl(httpClient, asset.remoteUrl, started, job, context); // pageUrl = new URL(results.get("url")); content = results.get("content"); } catch (URISyntaxException e) { // ignore continue; } catch (WrongStatusException wsr) { continue; } content = convertStyleSheetImageTags(asset.remoteUrl, content, job); U2.saveTextFile(fileContext.openFileOutput(asset.localName + "nr"), content); asset.downloaded = true; } } } private CharSequence convertStyleSheetImageTags(URL baseUrl, CharSequence input, Job job) throws DownloadCancelledException { return convertRemoteToLocalNameAndRegisterAssetForDownload(PATTERN_BACKGROUND_IMG, input, Asset.TYPE_IMAGE, baseUrl, job); } // LATER consolidate text and binary downloads // LATER Also a better understanding is needed if // I need to take care of the content type from the server?! // LATER GZIP für Stylesheets und HTML? private void downloadBinaryAsset(NewsRobHttpClient httpClient, Asset asset, long started, Job job, URL pageUrl) throws URISyntaxException, DownloadTimedOutException { final int BUFFER_SIZE = 8 * 1024; BufferedOutputStream bos = null; BufferedInputStream bis = null; try { Timing t = null; HttpGet getAssetRequest = new HttpGet(asset.remoteUrl.toURI()); getAssetRequest.setHeader("Referer", pageUrl.toExternalForm()); if (isDetailedLoggingEnabled) { PL.log("WPDD: Downloading as part of " + "(" + pageUrl + ") remote:" + asset.remoteUrl.toURI() + " local: " + asset.localName, context); t = new Timing("Downloading as part of " + "(" + pageUrl + ") remote:" + asset.remoteUrl.toURI() + " local: " + asset.localName, context); } HttpResponse response = httpClient.execute(getAssetRequest); int statusCode = response.getStatusLine().getStatusCode(); if (isDetailedLoggingEnabled) PL.log("WPDD: HTTP_STATUS_CODE=" + statusCode, context); if (statusCode == HttpStatus.SC_OK) { bis = new BufferedInputStream(response.getEntity().getContent(), BUFFER_SIZE); bos = new BufferedOutputStream(fileContext.openFileOutput(asset.localName + "nr"), BUFFER_SIZE); byte[] buffer = new byte[BUFFER_SIZE]; while (true) { if (job.isCancelled()) throw new DownloadCancelledException(); if (System.currentTimeMillis() - started > WebPageDownloadDirector.PAGE_DOWNLOAD_TIMEOUT_MS) throw new DownloadTimedOutException(pageUrl.toString(), WebPageDownloadDirector.PAGE_DOWNLOAD_TIMEOUT_MS); int noReadBytes = bis.read(buffer); if (noReadBytes == -1) break; bos.write(buffer, 0, noReadBytes); Thread.yield(); } Log.w(TAG, asset.remoteUrl + " did download ok."); } else Log.w(TAG, asset.remoteUrl + " did not download. Status code=" + statusCode); response.getEntity().consumeContent(); if (isDetailedLoggingEnabled && t != null) t.stop(); } catch (URISyntaxException e) { throw e; } catch (DownloadTimedOutException e) { throw e; } catch (Exception e) { if (isDetailedLoggingEnabled) { PL.log("WPDD: Downloading as part of " + "(" + pageUrl + "):" + asset.remoteUrl.toURI() + " Resulting exception=" + e.getClass().getName() + " " + e.getMessage(), context); e.printStackTrace(); } asset.exception = e; String path = fileContext.getAbsolutePathForAsset(asset.localName); File f = new File(path); if (f.exists()) { boolean success = f.delete(); Log.d("DEBUG", "Deleting file " + f + " was successful: " + success); } } finally { try { if (bis != null) bis.close(); if (bos != null) bos.close(); } catch (IOException e) { } } asset.downloaded = true; } private CharSequence convertStyleSheetLinks(CharSequence input, Job job) throws DownloadCancelledException { return convertRemoteToLocalNameAndRegisterAssetForDownload(PATTERN_LINK_HREF, input, Asset.TYPE_STYLESHEET, job); } private CharSequence convertRemoteToLocalNameAndRegisterAssetForDownload(final Pattern p, final CharSequence input, int assetType, Job job) throws DownloadCancelledException { return convertRemoteToLocalNameAndRegisterAssetForDownload(p, input, assetType, pageUrl, job); } private CharSequence convertRemoteToLocalNameAndRegisterAssetForDownload(final Pattern p, final CharSequence input, int assetType, URL baseUrl, Job job) throws DownloadCancelledException { StringBuffer result = new StringBuffer(); Matcher m = p.matcher(input); while (m.find()) { if (job.isCancelled()) throw new DownloadCancelledException(); String tag = decodeString(m.group()); String assetUrl = decodeString(m.group(1)); if (tag != null && assetUrl != null && assetUrl.length() > 0) { try { Thread.yield();// LATER String newName = AssetContentProvider.CONTENT_URI + "/" + translateAndRegisterAssetLocation(assetUrl, baseUrl, assetType); String replaced = tag.replace(assetUrl, newName); /* * if (isDetailedLoggingEnabled) PL.log("WPDD: assetUrl=" + * assetUrl + "\n tag=" + tag + "\n replaced=" + * replaced + "\n newName=" + newName); */ m.appendReplacement(result, Matcher.quoteReplacement(replaced)); } catch (MalformedURLException e) { // Ignoring malformed asset urls System.err.println("Ooops. Malformed URL " + e); } catch (ArrayIndexOutOfBoundsException aioobe) { aioobe.printStackTrace(); continue; // throw aioobe; } } } m.appendTail(result); return result; } private static final String decodeString(String input) { String tag = HtmlEntitiesDecoder.decodeString(input); // tag = tag.replaceAll("%3A", ":"); // tag = tag.replaceAll("%2F", "/"); return tag; } private CharSequence convertImageTags(CharSequence input, Job job) throws DownloadCancelledException { return convertRemoteToLocalNameAndRegisterAssetForDownload(PATTERN_IMG_SRC, input, Asset.TYPE_IMAGE, job); } private CharSequence translateAndRegisterAssetLocation(final String assetUrl, URL baseUrl, int assetType) throws MalformedURLException { // avoid duplicated by checking if this // asset is already known URL remoteUrl = new URL(baseUrl, assetUrl); // make rurl fully qualified Asset asset = assetUrls2download.get(remoteUrl); if (asset == null) { asset = new Asset(); asset.remoteUrl = remoteUrl; asset.type = assetType; String aId = "a" + id.replace('/', '_'); String ad = ""; String url = asset.remoteUrl.toString().toLowerCase(); if (url.indexOf("ad") > -1 && url.replace("gadget", "gatget").replace("load", "loat").replace("pad", "pat").replace("adobe", "atobe").replace("add", "att").replace("ead", "eat").indexOf("ad") > -1) ad = "_ad"; asset.localName = aId + "/" + aId + "_" + assetCounter++ + ad; Matcher extensionMatch = PATTERN_EXTENSION.matcher(assetUrl); if (extensionMatch != null && extensionMatch.matches()) { String extension = extensionMatch.group(1); asset.localName += extension; } assetUrls2download.put(remoteUrl, asset); } return asset.localName; } } class Downloader { static Map<String, String> loadTextFromUrl(NewsRobHttpClient httpClient, URL pageUrl, long started, Job job, Context context) throws DownloadException, DownloadCancelledException, URISyntaxException, SocketException, SocketTimeoutException, DownloadTimedOutException { Map<String, String> returnValues = new HashMap<String, String>(2); CharSequence result = null; HttpResponse response; HttpContext localContext = new BasicHttpContext(); try { HttpGet loadRequest = new HttpGet(pageUrl.toURI()); response = httpClient.executeZipped(loadRequest, localContext); } catch (IOException e) { throw new DownloadException("Problem during download of " + pageUrl + ".", e); } int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != HttpStatus.SC_OK) throw new WrongStatusException(pageUrl, statusCode); String newUri = extractUriFromHttpContext(localContext); if (!pageUrl.toString().equals(newUri)) { PL.log("WPDD Downloader: Changed page's url after redirect from " + pageUrl + " to " + newUri + ".", context); try { pageUrl = new URL(newUri); } catch (MalformedURLException e) { e.printStackTrace(); // keep the existing pageUrl } } try { String charsetName = null; for (HeaderElement he : response.getEntity().getContentType().getElements()) { NameValuePair nvp = he.getParameterByName("charset"); if (nvp != null) { charsetName = nvp.getValue(); break; } } result = U2.readInputStreamIntoString(NewsRobHttpClient.getUngzippedContent(response.getEntity(), context), charsetName, started, job); response.getEntity().consumeContent(); } catch (IOException e) { throw new DownloadException("Problem during reading of InputStream when loading " + pageUrl + ".", e); } returnValues.put("url", pageUrl.toExternalForm()); returnValues.put("content", result.toString()); return returnValues; } private static String extractUriFromHttpContext(HttpContext localContext) { String newHost = ((HttpHost) localContext.getAttribute(ExecutionContext.HTTP_TARGET_HOST)).toURI(); String path = ((HttpRequest) localContext.getAttribute(ExecutionContext.HTTP_REQUEST)).getRequestLine() .getUri(); String newUri = newHost + path; return newUri; } } class U2 { static CharSequence readInputStreamIntoString(InputStream is, String charsetName, long started, Job job) throws IOException, DownloadCancelledException, DownloadTimedOutException { // LATER throw an Exception when the buffer get's to big // Matcher m = WebPageDownloadDirector.PATTERN_CHARSET.matcher(result); // if (m.find()) { // String tag = m.group(); // String charsetValue = m.group(1); // m.appendReplacement(rv, tag.replace(charsetValue, "UTF-8")); // } // m.appendTail(rv); StringBuilder result = new StringBuilder(); Charset charset = Charset.forName("ISO-8859-1"); if (charsetName != null) try { charset = Charset.forName(charsetName); } catch (Exception e) { // stick with the default } InputStreamReader isr = charset != null ? new InputStreamReader(is, charset) : new InputStreamReader(is); BufferedReader br = new BufferedReader(isr, 8 * 1024); while (true) { if (System.currentTimeMillis() - started > WebPageDownloadDirector.PAGE_DOWNLOAD_TIMEOUT_MS) throw new DownloadTimedOutException(); if (job.isCancelled()) throw new DownloadCancelledException(); String line = br.readLine(); if (line != null) { Matcher m = WebPageDownloadDirector.PATTERN_CHARSET.matcher(line); if (m.find()) { StringBuffer sb = new StringBuffer(); String tag = m.group(); String charsetValue = m.group(1); try { if (!charsetValue.toLowerCase().equals("utf-8")) { m.appendReplacement(sb, Matcher.quoteReplacement(tag.replace(charsetValue, "UTF-8"))); m.appendTail(sb); line = sb.toString(); } } catch (ArrayIndexOutOfBoundsException aioobe) { Log.e(WebPageDownloadDirector.TAG, "Ooh. ArrayIndexOutOfBoundsException", aioobe); } } result.append(line + "\n"); } else break; } br.close(); // Timing t = new Timing("replacing charset"); // StringBuffer rv = new StringBuffer(); // Matcher m = WebPageDownloadDirector.PATTERN_CHARSET.matcher(result); // if (m.find()) { // String tag = m.group(); // String charsetValue = m.group(1); // m.appendReplacement(rv, tag.replace(charsetValue, "UTF-8")); // } // m.appendTail(rv); // t.stop(); return result; } static void saveTextFile(OutputStream os, CharSequence content) throws IOException { PrintWriter pw = null; try { pw = new PrintWriter(new OutputStreamWriter(os)); pw.print(content); } finally { if (pw != null) pw.close(); } } } class Asset { static final int TYPE_UNDEFINED = 0; static final int TYPE_IMAGE = 1; static final int TYPE_STYLESHEET = 2; String localName; URL remoteUrl; boolean downloaded; int type; Exception exception; @Override public String toString() { return remoteUrl + " -> " + localName; } }