// WebPageDownloadDirector.java example
//
// Explorer
// newsrob-master
package com.newsrob.download;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HeaderElement;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;

import android.content.Context;
import android.util.Log;

import com.newsrob.AssetContentProvider;
import com.newsrob.EntryManager;
import com.newsrob.NewsRob;
import com.newsrob.PL;
import com.newsrob.jobs.Job;
import com.newsrob.storage.IStorageAdapter;
import com.newsrob.util.Timing;

public class WebPageDownloadDirector {
    // Assets discovered while rewriting markup, keyed by their fully qualified
    // remote URL so that every distinct URL is registered only once.
    // NOTE(review): on some Java runtimes URL.equals()/hashCode() resolve host
    // names; confirm this is acceptable on the target platform.
    private Map<URL, Asset> assetUrls2download = new HashMap<URL, Asset>(15);

    // URL of the page being processed; replaced by the URL reported back by the
    // page download (e.g. after a redirect).
    private URL pageUrl;
    // Identifier of the entry being made available offline; used to build the
    // local file names.
    private String id;
    // Number used in the next asset's local file name. Starts at 1 and is
    // incremented for every newly registered asset.
    private int assetCounter = 1;
    private EntryManager entryManager;

    // True when the debug property "webpageDownloadDirector" is "1".
    private boolean isDetailedLoggingEnabled;

    // Log tag; package visible because the helper classes in this file use it too.
    static final String TAG = WebPageDownloadDirector.class.getSimpleName();
    // Storage that the rewritten pages and the downloaded assets are written to.
    private IStorageAdapter fileContext;

    private Context context;
    // Creation time of this instance; the download time budget is measured from here.
    private long started = System.currentTimeMillis();
    // Time budget for one page including its assets, in milliseconds (3 minutes).
    static final long PAGE_DOWNLOAD_TIMEOUT_MS = 180000;

    // Matches a <link ... href="....css" ...> tag; group 1 is the referenced
    // URL up to and including ".css". Pattern.MULTILINE is deliberately not set.
    // Earlier variant, kept for reference:
    // "<\\s*?link.*?href.*?\"(.*?\\.css)\".*?>"
    private static final Pattern PATTERN_LINK_HREF = Pattern.compile(
            "<\\s*?link.*?href.*?[\"']([^\"]*?\\.css).*?[\"'].*?>", Pattern.CASE_INSENSITIVE);

    // Matches an <img ... src="..."> tag; group 1 is the quoted value following
    // "src". Note that the match may also include text preceding the tag
    // (leading "[^>]*?"). A trailing "$" was removed from this pattern.
    // Earlier variants, kept for reference:
    // "^.*<\\s*?img.*?src.*?\"(.*?)\".*?>.*$"
    // "<\\s*?img.*?src.*?\"(.*?)\".*?>" (CASE_INSENSITIVE only, no MULTILINE)
    private static final Pattern PATTERN_IMG_SRC = Pattern.compile(
            "[^>]*?<\\s*?img[^><]*?src[^><]*?[\"'](.*?)[\"'][^<]*?>.*?", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    // Used with matches(): group 1 is the last ".word" sequence found in the
    // whole input, which is taken as the file extension (including the dot).
    private static final Pattern PATTERN_EXTENSION = Pattern.compile(".*(\\.\\w+).*?$");

    // Matches a CSS "background...: ... url(...)" declaration; group 1 is the
    // argument of url() without surrounding quotes. Pattern.MULTILINE is
    // deliberately not set.
    private static final Pattern PATTERN_BACKGROUND_IMG = Pattern.compile(
            "background.*?:.*?url\\([\"']?(.*?)[\"']?\\)", Pattern.CASE_INSENSITIVE);

    // Matches a line containing a <meta http-equiv="Content-Type" ... charset=...>
    // declaration; group 1 is the declared charset name (may be empty).
    // Package visible because U2 uses it while reading pages.
    static final Pattern PATTERN_CHARSET = Pattern.compile(
            "^.*?meta[^>]*?http-equiv[^>]*?Content-Type[^>]*?charset=([a-z0-9-]*).*$", Pattern.CASE_INSENSITIVE
                    | Pattern.MULTILINE);

    /**
     * Factory entry point: performs a complete download by constructing a
     * {@link WebPageDownloadDirector}; the constructor does all of the work.
     *
     * An {@link OutOfMemoryError} raised while processing is reported to the
     * caller as a {@link DownloadException} that names the page and carries the
     * error as its cause.
     *
     * @return the director instance that performed the download
     */
    public static WebPageDownloadDirector downloadWebPage(String id, URL pageUrl, IStorageAdapter fileContext, Job job,
            String summary, boolean downloadCompleteWebPage, EntryManager entryManager, boolean manualSync)
            throws DownloadException, DownloadTimedOutException, DownloadCancelledException {

        final WebPageDownloadDirector director;
        try {
            director = new WebPageDownloadDirector(id, pageUrl, fileContext, job, summary, downloadCompleteWebPage,
                    entryManager, manualSync);
        } catch (OutOfMemoryError outOfMemory) {
            throw new DownloadException("OutOfMemory when processing " + pageUrl + ".", outOfMemory);
        }
        return director;
    }

    /**
     * Removes everything from the given storage by calling
     * {@code fileContext.clear()}.
     *
     * @param fileContext the storage to clear; must not be null
     * @return always 0 - the number of removed assets is not tracked here
     * @throws IllegalStateException if {@code fileContext} is null
     */
    public static int removeAllAssets(IStorageAdapter fileContext) {
        if (null == fileContext) {
            throw new IllegalStateException("fileContext cannot be null.");
        }
        fileContext.clear();
        // Nothing counts the deletions, so the reported number is a constant zero.
        return 0;
    }

    // LATER move out to IStorage...
    /**
     * Removes all stored assets that belong to one entry by delegating to
     * {@code fileContext.removeAllAssets(atomId)}.
     *
     * @param atomId      identifier of the entry whose assets are removed
     * @param fileContext the storage to remove from; must not be null
     * @return the value reported by the storage adapter
     * @throws IllegalStateException if {@code fileContext} is null
     */
    public static int removeAssetsForId(String atomId, IStorageAdapter fileContext) {
        if (null == fileContext) {
            throw new IllegalStateException("fileContext cannot be null.");
        }
        return fileContext.removeAllAssets(atomId);
    }

    /**
     * Performs the whole download. Depending on the arguments this
     * <ol>
     * <li>loads the page itself, rewrites its image, stylesheet and
     * background-image references to local names and stores it with the
     * postfix "x" (only when {@code downloadCompleteWebPage} is set),</li>
     * <li>rewrites and stores the summary with the postfix "s" (only when
     * {@code summary} is not null),</li>
     * <li>downloads all registered stylesheets, then all registered images.</li>
     * </ol>
     *
     * @param id                      identifier used to build the local file names
     * @param pageUrl                 URL of the page; must be convertible to a URI
     * @param fileContext             storage the results are written to
     * @param job                     polled for cancellation
     * @param summary                 summary markup to store, or null for none
     * @param downloadCompleteWebPage whether the page itself is fetched
     * @param entryManager            supplies the Android context and decides
     *                                whether downloading is still allowed
     * @param manualSync              passed on to that decision
     * @throws DownloadTimedOutException  when a {@link SocketTimeoutException}
     *                                    surfaces or the time budget is exceeded
     * @throws DownloadCancelledException when the job is cancelled or
     *                                    downloading is no longer allowed
     * @throws DownloadException          for every other failure, including a
     *                                    page URL that is not a valid URI
     */
    private WebPageDownloadDirector(String id, URL pageUrl, IStorageAdapter fileContext, Job job, CharSequence summary,
            boolean downloadCompleteWebPage, EntryManager entryManager, boolean manualSync) throws DownloadException,
            DownloadTimedOutException, DownloadCancelledException {

        this.context = entryManager.getContext();
        this.entryManager = entryManager;

        // The debug switch must be read before anything consults it; reading it
        // only after the timing had been set up meant timing was never enabled.
        isDetailedLoggingEnabled = "1".equals(NewsRob.getDebugProperties(context).getProperty(
                "webpageDownloadDirector", "0"));

        // Reject URLs that cannot be expressed as a URI before any work starts.
        try {
            pageUrl.toURI();
        } catch (URISyntaxException e1) {
            throw new DownloadException("Problem with a URI: " + pageUrl, e1);
        }

        Timing t = null;
        if (isDetailedLoggingEnabled) {
            t = new Timing("WPDD: Downloading " + pageUrl, context);
            PL.log("WPDD: Making offline: " + pageUrl, context);
        }
        NewsRobHttpClient httpClient = NewsRobHttpClient.newInstance(true, context);

        try {
            this.id = id;
            this.pageUrl = pageUrl;
            this.fileContext = fileContext;

            if (downloadCompleteWebPage) {

                assertDownloadShouldContinue(manualSync);
                // The page itself is fetched with a client of its own.
                NewsRobHttpClient httpC = NewsRobHttpClient.newInstance(true, context);
                try {

                    Map<String, String> results = Downloader.loadTextFromUrl(httpC, pageUrl, started, job, context);

                    CharSequence pageContent = results.get("content");
                    // Continue with the URL the download ended up at, so that
                    // relative asset references resolve against it.
                    pageUrl = new URL(results.get("url"));
                    this.pageUrl = pageUrl; // LATER
                    assertDownloadShouldContinue(manualSync);

                    pageContent = convertImageTags(pageContent, job);

                    pageContent = convertStyleSheetLinks(pageContent, job);
                    pageContent = convertStyleSheetImageTags(pageUrl, pageContent, job);

                    savePage(pageContent, "x");

                } finally {
                    httpC.close();
                }
            }

            if (summary != null) {
                // The summary is stored as is, without an enclosing HTML document.
                summary = convertImageTags(summary, job);
                summary = convertStyleSheetLinks(summary, job);
                summary = convertStyleSheetImageTags(pageUrl, summary, job);
                savePage(summary, "s");
            }

            // Stylesheets first: processing them can register further images.
            downloadAndConvertAllImageTagsInStyleSheets(httpClient, job, manualSync);
            downloadAllImages(httpClient, started, job, pageUrl, manualSync);

        } catch (SocketTimeoutException ste) {
            throw new DownloadTimedOutException();
        } catch (DownloadException de) {
            throw de;
        } catch (DownloadCancelledException dce) {
            throw dce;
        } catch (DownloadTimedOutException dce) {
            throw dce;
        } catch (Exception e) {
            throw new DownloadException("Problem while downloading " + pageUrl + ".", e);
        } finally {
            httpClient.close();
        }
        if (t != null) {
            t.stop();
            // assetCounter is the NEXT number to hand out, so report the number
            // of registered assets instead.
            Log.d(TAG, "Assets downloaded for " + pageUrl + ": " + assetUrls2download.size());
        }

    }

    /**
     * Aborts the download when the entry manager reports that downloading
     * content is currently not enabled.
     *
     * @param manualSync passed through to the entry manager's decision
     * @throws DownloadCancelledException if downloading is not enabled
     */
    private void assertDownloadShouldContinue(boolean manualSync) throws DownloadCancelledException {
        final boolean stillAllowed = entryManager.downloadContentCurrentlyEnabled(manualSync);
        if (stillAllowed) {
            return;
        }
        throw new DownloadCancelledException("WiFi no longer available.");
    }

    /**
     * Stores rewritten markup as {@code a<id>/a<id>_<postfix>.html}, with "nr"
     * appended to the name that is actually opened in the storage.
     *
     * @param convertedPageContent the markup to write
     * @param postfix              distinguishes the stored variants of one entry
     * @throws DownloadException if writing fails with an {@link IOException}
     */
    private void savePage(CharSequence convertedPageContent, String postfix) throws DownloadException {
        final String prefix = "a" + id;
        final String fileName = prefix + "/" + prefix + "_" + postfix + ".html";
        final String storedName = fileName + "nr";
        try {
            U2.saveTextFile(fileContext.openFileOutput(storedName), convertedPageContent);
        } catch (IOException ioe) {
            throw new DownloadException("Problem during writing of " + fileName + " for page " + pageUrl + ".", ioe);
        }
    }

    /**
     * Downloads every registered asset of type image.
     *
     * Before each asset (of any type) it is checked that downloading is still
     * allowed; before each image the job is additionally checked for
     * cancellation. An image whose URL cannot be turned into a URI is skipped
     * and the exception is kept on the asset.
     *
     * @throws DownloadCancelledException if the job was cancelled or downloading
     *                                    is no longer allowed
     * @throws DownloadTimedOutException  if the time budget is exceeded
     */
    private void downloadAllImages(NewsRobHttpClient httpClient, long started, Job job, URL pageUrl, boolean manualSync)
            throws DownloadCancelledException, DownloadTimedOutException {

        for (Asset candidate : assetUrls2download.values()) {
            assertDownloadShouldContinue(manualSync);

            if (Asset.TYPE_IMAGE != candidate.type)
                continue;

            try {
                if (job.isCancelled())
                    throw new DownloadCancelledException();

                downloadBinaryAsset(httpClient, candidate, started, job, pageUrl);
            } catch (URISyntaxException badUri) {
                candidate.exception = badUri;
            }
        }

    }

    /**
     * Downloads every registered stylesheet, rewrites the background images it
     * references to local names (registering them for download) and stores the
     * rewritten stylesheet under its local name plus "nr".
     *
     * A stylesheet whose URL is not a valid URI, or whose request ends with an
     * unexpected status, is skipped; the reason is recorded in
     * {@code asset.exception} and {@code asset.downloaded} stays false.
     *
     * @throws DownloadCancelledException if the job was cancelled or downloading
     *                                    is no longer allowed
     * @throws DownloadTimedOutException  if the time budget is exceeded
     * @throws DownloadException          if loading a stylesheet fails otherwise
     * @throws IOException                if a stylesheet cannot be written
     */
    private void downloadAndConvertAllImageTagsInStyleSheets(NewsRobHttpClient httpClient, Job job, boolean manualSync)
            throws DownloadException, DownloadCancelledException, IOException, DownloadTimedOutException {
        // Iterate over a copy: rewriting a stylesheet registers new image
        // assets in assetUrls2download while this loop is running.
        Collection<Asset> assets2downloadCopy = new ArrayList<Asset>(assetUrls2download.values());

        for (Asset asset : assets2downloadCopy) {
            assertDownloadShouldContinue(manualSync);

            if (Asset.TYPE_STYLESHEET != asset.type)
                continue;

            CharSequence content;
            try {

                if (job.isCancelled())
                    throw new DownloadCancelledException();

                Thread.yield();
                Map<String, String> results = Downloader.loadTextFromUrl(httpClient, asset.remoteUrl, started, job,
                        context);
                content = results.get("content");
            } catch (URISyntaxException e) {
                // Skip this stylesheet, but keep the reason, as is done for images.
                asset.exception = e;
                continue;
            } catch (WrongStatusException wsr) {
                asset.exception = wsr;
                continue;
            }

            // Relative references inside a stylesheet resolve against the
            // stylesheet's own URL, not against the page.
            content = convertStyleSheetImageTags(asset.remoteUrl, content, job);

            U2.saveTextFile(fileContext.openFileOutput(asset.localName + "nr"), content);
            asset.downloaded = true;
        }

    }

    /**
     * Rewrites CSS background-image references in {@code input} to local names
     * and registers the referenced images for download.
     *
     * @param baseUrl URL that relative references are resolved against
     * @return the rewritten text
     * @throws DownloadCancelledException if the job is cancelled meanwhile
     */
    private CharSequence convertStyleSheetImageTags(URL baseUrl, CharSequence input, Job job)
            throws DownloadCancelledException {
        final CharSequence rewritten = convertRemoteToLocalNameAndRegisterAssetForDownload(PATTERN_BACKGROUND_IMG,
                input, Asset.TYPE_IMAGE, baseUrl, job);
        return rewritten;
    }

    // LATER consolidate text and binary downloads
    // LATER Also a better understanding is needed if
    // I need to take care of the content type from the server?!
    // LATER GZIP for stylesheets and HTML?
    /**
     * Downloads one asset with a GET request (sending the page URL as Referer)
     * and streams the response body to the storage under the asset's local name
     * plus "nr".
     *
     * {@code asset.downloaded} is set only when the server answered 200 and the
     * body was written and flushed completely. Any other failure, including a
     * cancellation of the job during the transfer, is recorded in
     * {@code asset.exception} and does not propagate.
     *
     * @param started start of the time budget, in milliseconds since the epoch
     * @param pageUrl the page this asset belongs to
     * @throws URISyntaxException        if the asset URL is not a valid URI
     * @throws DownloadTimedOutException if the time budget is exceeded while
     *                                   the body is being transferred
     */
    private void downloadBinaryAsset(NewsRobHttpClient httpClient, Asset asset, long started, Job job, URL pageUrl)
            throws URISyntaxException, DownloadTimedOutException {

        final int BUFFER_SIZE = 8 * 1024;
        BufferedOutputStream bos = null;
        BufferedInputStream bis = null;
        boolean completed = false;
        try {
            Timing t = null;

            HttpGet getAssetRequest = new HttpGet(asset.remoteUrl.toURI());
            getAssetRequest.setHeader("Referer", pageUrl.toExternalForm());
            if (isDetailedLoggingEnabled) {
                PL.log("WPDD: Downloading as part of " + "(" + pageUrl + ") remote:" + asset.remoteUrl.toURI()
                        + " local: " + asset.localName, context);
                t = new Timing("Downloading as part of " + "(" + pageUrl + ") remote:" + asset.remoteUrl.toURI()
                        + " local: " + asset.localName, context);
            }
            HttpResponse response = httpClient.execute(getAssetRequest);
            int statusCode = response.getStatusLine().getStatusCode();
            if (isDetailedLoggingEnabled)
                PL.log("WPDD: HTTP_STATUS_CODE=" + statusCode, context);
            if (statusCode == HttpStatus.SC_OK) {

                bis = new BufferedInputStream(response.getEntity().getContent(), BUFFER_SIZE);
                bos = new BufferedOutputStream(fileContext.openFileOutput(asset.localName + "nr"), BUFFER_SIZE);

                byte[] buffer = new byte[BUFFER_SIZE];
                while (true) {
                    if (job.isCancelled())
                        throw new DownloadCancelledException();

                    if (System.currentTimeMillis() - started > WebPageDownloadDirector.PAGE_DOWNLOAD_TIMEOUT_MS)
                        throw new DownloadTimedOutException(pageUrl.toString(),
                                WebPageDownloadDirector.PAGE_DOWNLOAD_TIMEOUT_MS);

                    int noReadBytes = bis.read(buffer);
                    if (noReadBytes == -1)
                        break;
                    bos.write(buffer, 0, noReadBytes);
                    Thread.yield();
                }
                // Flush here so that a failing write is handled below instead
                // of being lost when the stream is closed quietly.
                bos.flush();
                completed = true;
                Log.w(TAG, asset.remoteUrl + " did download ok.");
            } else
                Log.w(TAG, asset.remoteUrl + " did not download. Status code=" + statusCode);
            // A response is not required to carry an entity.
            if (response.getEntity() != null)
                response.getEntity().consumeContent();
            if (isDetailedLoggingEnabled && t != null)
                t.stop();

        } catch (URISyntaxException e) {
            throw e;
        } catch (DownloadTimedOutException e) {
            throw e;
        } catch (Exception e) {
            completed = false;
            if (isDetailedLoggingEnabled) {
                PL.log("WPDD: Downloading as part of " + "(" + pageUrl + "):" + asset.remoteUrl.toURI()
                        + " Resulting exception=" + e.getClass().getName() + " " + e.getMessage(), context);
                e.printStackTrace();
            }
            asset.exception = e;
            // NOTE(review): the file was opened as localName + "nr" but is
            // looked up here without that suffix - confirm that
            // getAbsolutePathForAsset() maps to the same file.
            String path = fileContext.getAbsolutePathForAsset(asset.localName);
            File f = new File(path);

            if (f.exists()) {
                boolean success = f.delete();
                Log.d("DEBUG", "Deleting file " + f + " was successful: " + success);
            }

        } finally {
            // Close each stream on its own so that a failure to close one of
            // them cannot leave the other one open.
            if (bis != null) {
                try {
                    bis.close();
                } catch (IOException ignored) {
                    // nothing sensible left to do
                }
            }
            if (bos != null) {
                try {
                    bos.close();
                } catch (IOException ignored) {
                    // content has already been flushed on the success path
                }
            }
        }
        if (completed)
            asset.downloaded = true;

    }

    /**
     * Rewrites stylesheet {@code <link>} references in {@code input} to local
     * names and registers the stylesheets for download. Relative references are
     * resolved against the current page URL.
     *
     * @return the rewritten text
     * @throws DownloadCancelledException if the job is cancelled meanwhile
     */
    private CharSequence convertStyleSheetLinks(CharSequence input, Job job) throws DownloadCancelledException {
        final CharSequence rewritten = convertRemoteToLocalNameAndRegisterAssetForDownload(PATTERN_LINK_HREF, input,
                Asset.TYPE_STYLESHEET, job);
        return rewritten;
    }

    /**
     * Convenience variant that resolves relative references against the current
     * page URL.
     *
     * @return the rewritten text
     * @throws DownloadCancelledException if the job is cancelled meanwhile
     */
    private CharSequence convertRemoteToLocalNameAndRegisterAssetForDownload(final Pattern p, final CharSequence input,
            int assetType, Job job) throws DownloadCancelledException {
        final URL base = pageUrl;
        return convertRemoteToLocalNameAndRegisterAssetForDownload(p, input, assetType, base, job);
    }

    /**
     * Finds every match of {@code p} in {@code input}, registers the URL
     * captured in group 1 as an asset and replaces the match by a copy in which
     * that URL points to the asset content provider.
     *
     * Both the matched text and the captured URL are HTML-entity decoded first.
     * A match is left untouched when the URL is empty or malformed.
     *
     * @param p         pattern whose group 1 captures the asset URL
     * @param assetType type recorded for newly registered assets
     * @param baseUrl   URL that relative references are resolved against
     * @return the rewritten text
     * @throws DownloadCancelledException if the job is cancelled meanwhile
     */
    private CharSequence convertRemoteToLocalNameAndRegisterAssetForDownload(final Pattern p, final CharSequence input,
            int assetType, URL baseUrl, Job job) throws DownloadCancelledException {

        // Matcher.appendReplacement/appendTail work on a StringBuffer.
        final StringBuffer rewritten = new StringBuffer();
        final Matcher matcher = p.matcher(input);

        while (matcher.find()) {
            if (job.isCancelled())
                throw new DownloadCancelledException();

            final String tag = decodeString(matcher.group());
            final String assetUrl = decodeString(matcher.group(1));

            final boolean usable = tag != null && assetUrl != null && assetUrl.length() > 0;
            if (!usable)
                continue;

            try {
                Thread.yield();// LATER
                final String newName = AssetContentProvider.CONTENT_URI + "/"
                        + translateAndRegisterAssetLocation(assetUrl, baseUrl, assetType);
                final String replaced = tag.replace(assetUrl, newName);
                // Quote the replacement: it may contain '$' or '\'.
                matcher.appendReplacement(rewritten, Matcher.quoteReplacement(replaced));
            } catch (MalformedURLException e) {
                // Ignoring malformed asset urls
                System.err.println("Ooops. Malformed URL " + e);
            } catch (ArrayIndexOutOfBoundsException aioobe) {
                aioobe.printStackTrace();
            }
        }
        // Everything after the last replaced match, including skipped matches.
        matcher.appendTail(rewritten);

        return rewritten;

    }

    /**
     * Decodes HTML entities in {@code input} by delegating to
     * {@code HtmlEntitiesDecoder.decodeString}. Percent-encoded sequences such
     * as "%3A" or "%2F" are not touched.
     */
    private static final String decodeString(String input) {
        return HtmlEntitiesDecoder.decodeString(input);
    }

    /**
     * Rewrites {@code <img>} sources in {@code input} to local names and
     * registers the images for download. Relative references are resolved
     * against the current page URL.
     *
     * @return the rewritten text
     * @throws DownloadCancelledException if the job is cancelled meanwhile
     */
    private CharSequence convertImageTags(CharSequence input, Job job) throws DownloadCancelledException {
        final CharSequence rewritten = convertRemoteToLocalNameAndRegisterAssetForDownload(PATTERN_IMG_SRC, input,
                Asset.TYPE_IMAGE, job);
        return rewritten;
    }

    /**
     * Returns the local name for an asset reference, registering the asset for
     * download the first time its fully qualified URL is seen.
     *
     * New local names have the form {@code a<id>/a<id>_<n>[_ad][.ext]}: "/" in
     * the id is replaced by "_", {@code n} is the running asset counter, "_ad"
     * is added when the URL looks like an advertisement, and the extension is
     * taken from the reference as written in the markup.
     *
     * @param assetUrl  the reference as found in the markup; may be relative
     * @param baseUrl   URL the reference is resolved against
     * @param assetType type recorded for a newly registered asset
     * @return the asset's local name
     * @throws MalformedURLException if the reference cannot be resolved to a URL
     */
    private CharSequence translateAndRegisterAssetLocation(final String assetUrl, URL baseUrl, int assetType)
            throws MalformedURLException {

        // Make the reference fully qualified; this is also the key under which
        // known assets are found again, which avoids duplicates.
        final URL remoteUrl = new URL(baseUrl, assetUrl);

        final Asset known = assetUrls2download.get(remoteUrl);
        if (known != null)
            return known.localName;

        final Asset fresh = new Asset();
        fresh.remoteUrl = remoteUrl;
        fresh.type = assetType;

        final String aId = "a" + id.replace('/', '_');

        // Advertisement heuristic: the URL contains "ad" even after a few
        // harmless words containing "ad" have been neutralised.
        final String url = fresh.remoteUrl.toString().toLowerCase();
        String ad = "";
        if (url.indexOf("ad") > -1) {
            final String neutralised = url.replace("gadget", "gatget").replace("load", "loat").replace("pad", "pat")
                    .replace("adobe", "atobe").replace("add", "att").replace("ead", "eat");
            if (neutralised.indexOf("ad") > -1)
                ad = "_ad";
        }

        String localName = aId + "/" + aId + "_" + assetCounter++ + ad;

        final Matcher extensionMatch = PATTERN_EXTENSION.matcher(assetUrl);
        if (extensionMatch.matches())
            localName += extensionMatch.group(1);

        fresh.localName = localName;
        assetUrls2download.put(remoteUrl, fresh);
        return fresh.localName;
    }
}

/**
 * Loads text resources (pages, stylesheets) over HTTP.
 */
class Downloader {

    /**
     * Fetches {@code pageUrl} and returns its text content together with the
     * URL the request finally ended up at.
     *
     * @param httpClient client used for the request
     * @param pageUrl    the URL to load
     * @param started    start of the time budget, in milliseconds since the epoch
     * @param job        polled for cancellation while the body is read
     * @param context    Android context used for logging and decompression
     * @return a map with the keys "url" (final URL in external form) and
     *         "content" (the decoded body)
     * @throws WrongStatusException      if the status code is not 200
     * @throws SocketTimeoutException    if the connection times out
     * @throws DownloadException         for any other I/O problem or a response
     *                                   without a body
     * @throws URISyntaxException        if {@code pageUrl} is not a valid URI
     * @throws DownloadCancelledException if the job is cancelled while reading
     * @throws DownloadTimedOutException if the time budget is exceeded
     */
    static Map<String, String> loadTextFromUrl(NewsRobHttpClient httpClient, URL pageUrl, long started, Job job,
            Context context) throws DownloadException, DownloadCancelledException, URISyntaxException, SocketException,
            SocketTimeoutException, DownloadTimedOutException {

        Map<String, String> returnValues = new HashMap<String, String>(2);

        CharSequence result = null;

        HttpResponse response;
        HttpContext localContext = new BasicHttpContext();

        try {
            HttpGet loadRequest = new HttpGet(pageUrl.toURI());
            response = httpClient.executeZipped(loadRequest, localContext);

        } catch (IOException e) {
            // Timeouts are part of this method's declared contract; let them
            // through instead of hiding them in a generic DownloadException.
            if (e instanceof SocketTimeoutException)
                throw (SocketTimeoutException) e;
            throw new DownloadException("Problem during download of " + pageUrl + ".", e);
        }

        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != HttpStatus.SC_OK) {
            // Give the connection back before reporting the failure.
            discardEntity(response);
            throw new WrongStatusException(pageUrl, statusCode);
        }

        if (response.getEntity() == null)
            throw new DownloadException("No content was returned for " + pageUrl + ".", (Throwable) null);

        String newUri = extractUriFromHttpContext(localContext);

        if (newUri != null && !pageUrl.toString().equals(newUri)) {
            PL.log("WPDD Downloader: Changed page's url after redirect from " + pageUrl + " to " + newUri + ".",
                    context);
            try {
                pageUrl = new URL(newUri);
            } catch (MalformedURLException e) {
                e.printStackTrace();
                // keep the existing pageUrl
            }

        }

        try {
            // The charset announced in the Content-Type header, if there is one.
            String charsetName = null;
            if (response.getEntity().getContentType() != null) {
                for (HeaderElement he : response.getEntity().getContentType().getElements()) {
                    NameValuePair nvp = he.getParameterByName("charset");
                    if (nvp != null) {
                        charsetName = nvp.getValue();
                        break;
                    }
                }
            }
            InputStream content = NewsRobHttpClient.getUngzippedContent(response.getEntity(), context);
            try {
                result = U2.readInputStreamIntoString(content, charsetName, started, job);
            } finally {
                // Also closes the stream when reading was cancelled or timed out.
                try {
                    content.close();
                } catch (IOException ignored) {
                    // nothing sensible left to do
                }
            }
            response.getEntity().consumeContent();
        } catch (IOException e) {
            if (e instanceof SocketTimeoutException)
                throw (SocketTimeoutException) e;
            throw new DownloadException("Problem during reading of InputStream when loading " + pageUrl + ".", e);
        }

        returnValues.put("url", pageUrl.toExternalForm());
        returnValues.put("content", result.toString());
        return returnValues;
    }

    /** Consumes the response body, if any, ignoring failures. */
    private static void discardEntity(HttpResponse response) {
        try {
            if (response.getEntity() != null)
                response.getEntity().consumeContent();
        } catch (IOException ignored) {
            // best effort only; the caller is about to report the real problem
        }
    }

    /**
     * Reconstructs the URL of the request that was executed last (the target of
     * the final redirect) from the execution context.
     *
     * @return that URL, or null if the context does not hold the target host
     *         and the request
     */
    private static String extractUriFromHttpContext(HttpContext localContext) {
        HttpHost host = (HttpHost) localContext.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
        HttpRequest request = (HttpRequest) localContext.getAttribute(ExecutionContext.HTTP_REQUEST);
        if (host == null || request == null)
            return null;

        String path = request.getRequestLine().getUri();
        // The request line can carry an absolute URI; do not prefix the host then.
        boolean absolute = path.regionMatches(true, 0, "http://", 0, 7)
                || path.regionMatches(true, 0, "https://", 0, 8);
        return absolute ? path : host.toURI() + path;
    }
}

/**
 * Text I/O helpers for downloaded pages and stylesheets.
 */
class U2 {

    /**
     * Reads the stream line by line into memory. Line ends are normalised to
     * "\n", and a charset declared in a
     * {@code <meta http-equiv="Content-Type" ...>} tag is rewritten to "UTF-8".
     *
     * The stream is closed when this method returns or throws.
     *
     * @param is          the stream to read
     * @param charsetName charset used for decoding; ISO-8859-1 is used when
     *                    this is null or not a usable charset name
     * @param started     start of the time budget, in milliseconds since the epoch
     * @param job         polled for cancellation before every line
     * @return the text that was read
     * @throws DownloadTimedOutException  if the time budget is exceeded
     * @throws DownloadCancelledException if the job is cancelled
     * @throws IOException                if reading or closing fails
     */
    static CharSequence readInputStreamIntoString(InputStream is, String charsetName, long started, Job job)
            throws IOException, DownloadCancelledException, DownloadTimedOutException {
        // LATER throw an Exception when the buffer gets too big

        Charset charset = Charset.forName("ISO-8859-1");
        if (charsetName != null) {
            try {
                charset = Charset.forName(charsetName);
            } catch (Exception e) {
                // stick with the default
            }
        }

        StringBuilder result = new StringBuilder();
        BufferedReader br = new BufferedReader(new InputStreamReader(is, charset), 8 * 1024);
        boolean finished = false;
        try {
            while (true) {

                if (System.currentTimeMillis() - started > WebPageDownloadDirector.PAGE_DOWNLOAD_TIMEOUT_MS)
                    throw new DownloadTimedOutException();

                if (job.isCancelled())
                    throw new DownloadCancelledException();

                String line = br.readLine();
                if (line == null)
                    break;

                result.append(forceUtf8MetaCharset(line)).append('\n');
            }
            finished = true;
        } finally {
            if (finished) {
                br.close();
            } else {
                // Already failing: release the stream without hiding the
                // original exception.
                try {
                    br.close();
                } catch (IOException ignored) {
                    // the exception in flight is the one that matters
                }
            }
        }
        return result;
    }

    /**
     * Replaces the charset named in a Content-Type meta declaration with
     * "UTF-8". Only the declared name itself is replaced; a line without such a
     * declaration, with an empty name or one already naming UTF-8 is returned
     * unchanged.
     */
    private static String forceUtf8MetaCharset(String line) {
        Matcher m = WebPageDownloadDirector.PATTERN_CHARSET.matcher(line);
        if (!m.find())
            return line;

        String declared = m.group(1);
        // An empty name must be skipped: replacing "" would insert the
        // replacement between all characters of the line.
        if (declared.length() == 0 || declared.equalsIgnoreCase("utf-8"))
            return line;

        return line.substring(0, m.start(1)) + "UTF-8" + line.substring(m.end(1));
    }

    /**
     * Writes {@code content} to the stream as UTF-8 and closes the stream.
     *
     * UTF-8 is used explicitly because pages are stored with their declared
     * charset rewritten to UTF-8. The writer is used directly, so a failing
     * write surfaces as an exception.
     *
     * @throws IOException if writing or closing fails
     */
    static void saveTextFile(OutputStream os, CharSequence content) throws IOException {
        OutputStreamWriter writer = new OutputStreamWriter(os, Charset.forName("UTF-8"));
        try {
            writer.append(content);
        } finally {
            writer.close();
        }
    }
}

class Asset {
    static final int TYPE_UNDEFINED = 0;
    static final int TYPE_IMAGE = 1;
    static final int TYPE_STYLESHEET = 2;

    String localName;
    URL remoteUrl;
    boolean downloaded;
    int type;
    Exception exception;

    @Override
    public String toString() {
        return remoteUrl + " -> " + localName;
    }
}