WebClient.java example

Explorer

Xponents-master
- Basics
  - src
    - main
      - java
        org
        opensextant
        ConfigException.java
        data
        Country.java
        DocInput.java
        GeoBase.java
        Geocoding.java
        Language.java
        LatLon.java
        Place.java
        Taxon.java
        TextInput.java
        extraction
        ExtractionException.java
        ExtractionMetrics.java
        ExtractionResult.java
        Extractor.java
        MatchFilter.java
        NormalizationException.java
        TextEntity.java
        TextMatch.java
        processing
        Parameters.java
        ProcessingException.java
        util
        AnyFilenameFilter.java
        FileUtility.java
        GeodeticUtility.java
        GeonamesUtility.java
        TextUtils.java
    - test
      - java
        MetricsTest.java
        TestGeoUtils.java
        TestGeonamesLanguages.java
        TestGeonamesMeta.java
        TestTextUtils.java
- Examples
  - src
    - main
      - java
        org
        opensextant
        examples
        BasicGeoTemporalProcessing.java
        TaxonomicTagger.java
        WebCrawl.java
        twitter
        MicroMessage.java
        Tweet.java
        TweetGeocoder.java
- Extraction
  - src
    - main
      - java
        org
        opensextant
        extraction
        SolrMatcherSupport.java
        SolrTaggerRequest.java
        extractors
        geo
        BoundaryObserver.java
        CountryCount.java
        CountryObserver.java
        GazetteerMatcher.java
        GazetteerUpdateProcessorFactory.java
        LocationObserver.java
        PlaceCandidate.java
        PlaceCount.java
        PlaceEvidence.java
        PlaceGeocoder.java
        ScoredPlace.java
        SolrGazetteer.java
        TagFilter.java
        rules
        ContextualOrganizationRule.java
        CoordinateAssociationRule.java
        CountryRule.java
        GeocodeRule.java
        LocationChooserRule.java
        MajorPlaceRule.java
        NameCodeRule.java
        NameRule.java
        NonsenseFilter.java
        PersonNameFilter.java
        ProvinceAssociationRule.java
        xtax
        TaxonMatch.java
        TaxonMatcher.java
        output
        AbstractFormatter.java
        CSVFormatter.java
        FormatterFactory.java
        GDBFormatter.java
        GISDataFormatter.java
        GISDataModel.java
        GeoCSVFormatter.java
        KMLFormatter.java
        OpenSextantSchema.java
        ResultsFormatter.java
        ShapefileFormatter.java
        WKTFormatter.java
        processing
        ResultsUtility.java
        XtractorGroup.java
        progress
        ProgressListener.java
        ProgressMonitor.java
        ProgressMonitorBase.java
        util
        SolrProxy.java
        SolrUtil.java
    - test
      - java
        org
        opensextant
        extractors
        test
        TestExtraction.java
        TestGazFactory.java
        TestGazMatcher.java
        TestGazetteer.java
        TestGazetteerConflationKey.java
        TestPersonFilter.java
        TestPlaceGeocoder.java
        TestPlaceGeocoderLanguages.java
        TestStopFilters.java
        TestUtils.java
        TestXTax.java
- MapReduce
  - src
    - main
      - java
        org
        opensextant
        mapreduce
        AbstractMapper.java
        GeoTaggerMapper.java
        KeywordTaggerMapper.java
        Log4JUtils.java
        LoggingUtilities.java
        XponentsTaggerDemo.java
    - test
      - java
        org
        apache
        solr
        core
        CoreContainer.java
- Patterns
  - src
    - main
      - java
        org
        opensextant
        extractors
        flexpat
        AbstractFlexPat.java
        PatternTestCase.java
        RegexPattern.java
        RegexPatternManager.java
        TextMatchResult.java
        poli
        PatternsOfLife.java
        PoliMatch.java
        PoliPatternManager.java
        TestCase.java
        data
        MACAddress.java
        Money.java
        TelephoneNumber.java
        xcoord
        DMSFilter.java
        DMSOrdinate.java
        GeocoordMatch.java
        GeocoordMatchFilter.java
        GeocoordNormalization.java
        GeocoordPattern.java
        GeocoordPrecision.java
        GeocoordTestCase.java
        Hemisphere.java
        MGRSFilter.java
        MGRSParser.java
        PatternManager.java
        PrecisionScales.java
        UTMParser.java
        XConstants.java
        XCoord.java
        xtemporal
        DateMatch.java
        DateNormalization.java
        DateTimePattern.java
        PatternManager.java
        TestCase.java
        XTConstants.java
        XTemporal.java
    - test
      - java
        org
        opensextant
        extractors
        test
        DateNormalizationTest.java
        PrecisionScalesTest.java
        TestPoLi.java
        TestPoLiReporter.java
        TestXCoord.java
        TestXCoordReporter.java
        TestXTemporal.java
        TestXTemporalReporter.java
- XText
  - examples
  - src
    - main
      - java
        org
        opensextant
        xtext
        Content.java
        ConversionListener.java
        ConvertedDocument.java
        Converter.java
        ExclusionFilter.java
        PathManager.java
        XText.java
        collectors
        ArchiveNavigator.java
        CollectionListener.java
        Collector.java
        mailbox
        DefaultMailCrawl.java
        MailClient.java
        MailConfig.java
        NTLMAuth.java
        OutlookPSTCrawler.java
        sharepoint
        DefaultSharepointCrawl.java
        SPLink.java
        SharepointClient.java
        web
        CrawlFilter.java
        DefaultWebCrawl.java
        HyperLink.java
        WebClient.java
        converters
        ConverterAdapter.java
        DefaultConverter.java
        EmbeddedContentConverter.java
        ImageMetadataConverter.java
        MessageConverter.java
        TextTranscodingConverter.java
        TikaHTMLConverter.java
        WebArchiveConverter.java
    - test
      - java
        org
        opensextant
        xtext
        converters
        test
        MessageConverterTest.java
        test
        Decomposer.java
        ImageGroper.java
        MailClientTest.java
        SharepointClientTest.java
        SharepointCrawlTest.java
        TestPST.java
        TestSPLinks.java
        TestTikaPST.java
        Tests.java
        TextTranscodingTest.java
        WebLinkTest.java
- Xlayer
  - src
    - main
      - java
        org
        opensextant
        xlayer
        Transforms.java
        XlayerClient.java
        server
        RequestParameters.java
        TaggerResource.java
        XlayerApp.java
        xgeo
        XlayerRestlet.java
        XlayerServer.java
        XponentsGeotagger.java
    - test
      - java
        XlayerClientTest.java

/**
 *
 * Copyright 2013-2014 OpenSextant.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.opensextant.xtext.collectors.web;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.opensextant.ConfigException;
import org.opensextant.util.FileUtility;
import org.opensextant.xtext.XText;
import org.opensextant.xtext.collectors.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// TODO: Auto-generated Javadoc
/**
 * Simple client that pulls down HTML from a web site, acquires files and crawls sub-folders.
 * This is not a generalized web crawler. It specifically looks for meaningful content, such as
 * HTML pages, document downloads, etc.
 *
 * <p>
 * An instance carries mutable crawl state ({@link #found}, {@link #saved}, {@link #depth});
 * this class performs no synchronization on that state.
 */
public class WebClient {

    /** Instance logger; bound to the runtime class so subclasses log under their own name. */
    private final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * Prep url. This ensures that found URLs that may contain whitespace
     * are properly converted to proper URL format/escaping.
     * Only the space character is escaped (as {@code %20}); nothing else is encoded.
     *
     * @param u
     *            URL string
     * @return URL object
     * @throws MalformedURLException
     *             if the escaped string is not a valid URL
     */
    public static URL prepURL(String u) throws MalformedURLException {
        // TODO: require outside caller encode URL properly.
        // For now, whitespace is only main issue.
        return new URL(u.replace(" ", "%20"));
    }

    /**
     * Prep url path. Escapes each space character in the given path as {@code %20}.
     *
     * @param u
     *            URL or URL path as a string
     * @return the same string with spaces escaped
     * @throws MalformedURLException
     *             declared for interface compatibility; this implementation does not throw it
     */
    public static String prepURLPath(String u) throws MalformedURLException {
        // TODO: require outside caller encode URL properly.
        // For now, whitespace is only main issue.
        return u.replace(" ", "%20");
    }

    /**
     * Instantiates a new web client.
     *
     * @param siteUrl
     *            the url to collect.
     * @param archive
     *            the destination archive. Keep in mind, this is the location of downloaded originals.
     *            Use Xtext instance to manage where/how you convert those originals.
     * @throws MalformedURLException
     *             if URL given is bad
     * @throws ConfigException
     *             the config exception
     */
    public WebClient(String siteUrl, String archive) throws MalformedURLException, ConfigException {
        setSite(siteUrl);
        archiveRoot = archive;
    }

    /** The archive root: folder where downloaded originals are stored. May be null. */
    protected String archiveRoot = null;

    /** The proxy setting as last given to a {@code setProxy} method; informational only. */
    private String proxy = null;

    /** The server: host name taken from the site URL. */
    protected String server = null;

    /** The site: starting URL for this client. */
    protected URL site = null;

    /** The proxy host, or null when no explicit proxy is configured. */
    protected HttpHost proxyHost = null;

    /** The interval. */
    protected int interval = 100; // milliseconds wait between web requests.

    /** The converter; optional, see {@link #setConverter(XText)}. */
    protected XText converter = null;

    /**
     * Configure. Verifies the site can be fetched and, when an archive root was given,
     * that it is an existing directory.
     *
     * @throws ConfigException
     *             if the site is unavailable or the archive folder does not exist
     */
    public void configure() throws ConfigException {
        // Test if the site exists and is reachable
        testAvailability();

        // Test if your destination archive exists.
        // isDirectory() is false for a missing path, so no separate exists() check is needed.
        if (archiveRoot != null && !new File(archiveRoot).isDirectory()) {
            throw new ConfigException(
                    "Destination archive does not exist. Caller must create prior to creation.");
        }
    }

    /**
     * Caller should construct their own conversionManager and pass that in.
     * NOTE: since the web client can operate without an instance of XText, e.g., just run a crawl with no conversion
     * the WebClient constructor takes an archive path. As you pass in a conversion manager here, make sure that the
     * archive root there matches what is used here in the WebClient. If you are using Xtext in embedded mode, then do
     * not worry; the archive is ignored.
     *
     * @param conversionManager
     *            converter, an XText instance
     */
    public void setConverter(XText conversionManager) {
        converter = conversionManager;
    }

    /**
     * Creates the archive file. The location is the archive root joined with the relative
     * path; runs of repeated "/" in the joined path are collapsed to a single "/".
     * For a directory item the directory itself is created; for a file item only its
     * parent folders are created and the file is left for the caller to write.
     *
     * @param relpath
     *            relative path for this object
     * @param isDir
     *            true if the item is a directory
     * @return full path
     * @throws IOException
     *             on I/O error, including failure to create the parent folder of a file item
     */
    protected File createArchiveFile(String relpath, boolean isDir) throws IOException {
        String itemArchivedPath = archiveRoot + Collector.PATH_SEP + relpath;
        File itemSaved = new File(itemArchivedPath.replaceAll("/{2,}", "/"));
        if (isDir) {
            FileUtility.makeDirectory(itemSaved);
            return itemSaved;
        }

        File parent = itemSaved.getParentFile();
        // mkdirs() returns false when the folder already exists; only a folder that is
        // still missing afterwards is an error.
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Unable to create archive folder " + parent);
        }
        return itemSaved;
    }

    /** Distinct links found in this session, keyed by string; cleared by {@link #reset()}. */
    protected Map<String, HyperLink> found = new HashMap<String, HyperLink>();

    /** Items tracked as saved in this session; cleared by {@link #reset()}. */
    protected Set<String> saved = new HashSet<String>();

    /**
     * current depth of the crawl at any time.
     */
    protected int depth = 0;

    /**
     * Maximum number of levels that will be crawled.
     */
    public final static int MAX_DEPTH = 5;

    /**
     * Allow a proxy host to be set given the URL.
     * Accepted forms are {@code host}, {@code host:port}, and a URL such as
     * {@code http://host:port/}; the scheme and any path are ignored.
     * Assumes port 80 when no port is given, no user/password.
     * A null or blank value clears the proxy.
     *
     * @param hosturl
     *            proxy URL
     * @throws NumberFormatException
     *             if a port is given but is not a number
     */
    public void setProxy(String hosturl) {
        proxy = hosturl;
        if (hosturl == null || hosturl.trim().isEmpty()) {
            proxyHost = null;
            return;
        }

        String hostPort = hosturl.trim();

        // Drop "scheme://" if present; otherwise the scheme would be mistaken for the host.
        int schemeEnd = hostPort.indexOf("://");
        if (schemeEnd >= 0) {
            hostPort = hostPort.substring(schemeEnd + 3);
        }
        // Drop any trailing path, e.g., "proxy:8080/".
        int pathStart = hostPort.indexOf('/');
        if (pathStart >= 0) {
            hostPort = hostPort.substring(0, pathStart);
        }

        String host = hostPort;
        int port = 80;
        int colon = hostPort.lastIndexOf(':');
        if (colon >= 0) {
            host = hostPort.substring(0, colon);
            String portText = hostPort.substring(colon + 1);
            // "host:" with nothing after the colon falls back to the default port.
            if (!portText.isEmpty()) {
                port = Integer.parseInt(portText);
            }
        }
        proxyHost = new HttpHost(host, port);
    }

    /**
     * Set the proxy from an explicit host and port.
     *
     * @param h
     *            proxy host name
     * @param port
     *            proxy port
     */
    public void setProxy(String h, int port) {
        proxy = h + ":" + port;
        proxyHost = new HttpHost(h, port);
    }

    /** When true, {@link #getClient()} takes proxy and related settings from System Properties. */
    boolean useSystemProperties = false;

    /**
     * @param b flag to enable use of System Properties to get proxy settings, etc.
     */
    public void enableSystemProperties(boolean b) {
        this.useSystemProperties = b;
    }

    /**
     * Sets the site, and derives the server host name from it.
     *
     * @param url
     *            the new site
     * @throws MalformedURLException
     *             the malformed url exception
     */
    public void setSite(String url) throws MalformedURLException {
        URL parsed = new URL(url);
        site = parsed;
        server = parsed.getHost();
    }

    /**
     * Gets the site.
     *
     * @return the URL object
     */
    public URL getSite() {
        return site;
    }

    /**
     * Gets the server.
     *
     * @return server hostname
     */
    public String getServer() {
        return server;
    }

    /**
     * Builds a new HTTP client on every call. With system properties enabled, the builder
     * reads its settings from there and the explicit proxy host is not applied; otherwise
     * the explicit proxy host is applied when one is set. In both cases the cookie spec is
     * set to browser compatibility.
     *
     * TODO: Update to use HTTP client "HttpClients....build()" method of creating and tailoring HttpClient
     * using the proxy and cookie settings, as well as any other tuning.
     *
     * Override if your context requires a different style of HTTP client.
     *
     * @return HttpClient 4.x object
     */
    public HttpClient getClient() {
        HttpClientBuilder clientHelper = HttpClientBuilder.create();

        if (this.useSystemProperties) {
            clientHelper.useSystemProperties();
        } else if (proxyHost != null) {
            clientHelper.setProxy(proxyHost);
        }

        RequestConfig globalConfig = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY).build();

        return clientHelper.setDefaultRequestConfig(globalConfig).build();
    }

    /**
     * Tests the availability of the currently configured source by fetching the site URL.
     * The response body is discarded.
     *
     * @throws ConfigException
     *             error which means resource is unavailable.
     */
    public void testAvailability() throws ConfigException {

        if (site == null) {
            throw new ConfigException("Engineering Error: site was not set.");
        }

        HttpResponse response;
        try {
            response = getPage(site);
        } catch (Exception err) {
            throw new ConfigException(
                    String.format("%s failed to collect URL %s", getName(), site), err);
        }
        // Only reachability matters here; release the unread body so the connection is not held.
        releaseQuietly(response);
    }

    /**
     * Best-effort release of a response whose content will not be read.
     * Closes the entity content stream, if any; failures are logged at debug level and ignored.
     *
     * @param response
     *            response to discard, may be null
     */
    private void releaseQuietly(HttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            HttpEntity entity = response.getEntity();
            if (entity != null && entity.isStreaming()) {
                InputStream content = entity.getContent();
                if (content != null) {
                    content.close();
                }
            }
        } catch (Exception err) {
            log.debug("Could not release HTTP response", err);
        }
    }

    /**
     * clears state of crawl.
     */
    public void reset() {
        // Clear list of distinct items found
        this.found.clear();
        // Clear list of items tracked/saved in this session.
        this.saved.clear();
    }

    /**
     * Sets the interval.
     *
     * @param i
     *            interval, in milliseconds, to wait between web requests; zero or less disables the wait
     */
    public void setInterval(int i) {
        interval = i;
    }

    /**
     * Pause. Sleeps for the configured interval, if positive.
     * If the thread is interrupted while sleeping, the pause ends early and the
     * interrupt status is restored so callers can observe it.
     */
    protected void pause() {
        if (interval <= 0) {
            return;
        }
        try {
            Thread.sleep(interval);
        } catch (InterruptedException err) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Get a web page with a plain HTTP GET, using the client from {@link #getClient()}.
     * No authentication is configured here.
     * Only status 404 is treated as an error; any other status is returned to the caller as is.
     * The caller is responsible for consuming or closing the returned response content.
     *
     * @param siteURL
     *            URL
     * @return response for the URL
     * @throws IOException
     *             on error, if the page is not found (404), or if the URL is not a valid URI
     */
    public HttpResponse getPage(URL siteURL) throws IOException {
        HttpClient httpClient = getClient();

        URI address;
        try {
            address = siteURL.toURI();
        } catch (URISyntaxException badUri) {
            throw new IOException(badUri);
        }

        HttpResponse response = httpClient.execute(new HttpGet(address));
        if (response.getStatusLine().getStatusCode() == 404) {
            // The error page body is not needed; release it before reporting the failure.
            releaseQuietly(response);
            throw new IOException("HTTP Page " + siteURL + " not found");
        }
        return response;
    }

    /** Matches a quoted HREF attribute value, case-insensitively; group 1 is the link text. */
    private static final Pattern HREF_MATCH = Pattern.compile("href=[\"']([^\"']+)[\"']", Pattern.CASE_INSENSITIVE);

    /**
     * Parse a site page for links.
     * This finds only obvious HREF anchors and filters out problematic ones:
     *
     * <pre>
     *  "/"
     *  "#xxxxxx"
     *  "javascript:xxxxxx"
     *  "mailto:xxxxxx"
     * </pre>
     *
     * A single trailing "/" is removed from each link before it is resolved. Links that
     * {@link HyperLink#isResource()} reports as resources are skipped, and links that cannot be
     * parsed are logged and skipped. Results are de-duplicated by the link's string form.
     *
     * TODO: pass in or set an allow filter. sometimes caller knows which content is worth
     * following, e.g., ../abc_folder/morecontent.htm and such URLs should be resolved absolutely to avoid
     * recapture repeatedly.
     *
     * @param html
     *            HTML text buffer; null yields an empty result
     * @param pageUrl
     *            the page url
     * @param siteUrl
     *            the site url
     * @return a list of found links
     */
    public Collection<HyperLink> parseContentPage(String html, URL pageUrl, URL siteUrl) {
        Map<String, HyperLink> contentLinks = new HashMap<String, HyperLink>();
        if (html == null) {
            return contentLinks.values();
        }

        Matcher matches = HREF_MATCH.matcher(html);
        while (matches.find()) {
            String link = matches.group(1).trim();
            // Locale.ROOT keeps prefix tests stable regardless of the default locale
            // (e.g., Turkish dotless i would otherwise defeat the "javascript" test).
            String linkLower = link.toLowerCase(java.util.Locale.ROOT);

            if ("/".equals(link) || linkLower.startsWith("#") || linkLower.startsWith("javascript")) {
                continue;
            }
            if (linkLower.startsWith("mailto:")) {
                log.info("Ignore Mailto {}", linkLower);
                continue;
            }

            if (link.endsWith("/")) {
                link = link.substring(0, link.length() - 1);
            }

            try {
                HyperLink l = new HyperLink(link, pageUrl, siteUrl);
                if (l.isResource()) {
                    continue;
                }
                String key = l.toString();
                if (!contentLinks.containsKey(key)) {
                    log.debug("Found link {}", link);
                    contentLinks.put(key, l);
                }
            } catch (Exception err) {
                log.error("Failed to parse URL {}", link, err);
            }
        }

        return contentLinks.values();
    }

    /**
     * Reads a data stream as text as the default encoding.
     * The stream is read to its end and is always closed, including when reading fails.
     * TODO: test reading website content with different charset encodings to see if the resulting String
     * is properly decoded.
     *
     * @param io
     *            IO stream
     * @return content of the stream
     * @throws IOException
     *             I/O error
     */
    public static String readTextStream(InputStream io) throws IOException {
        Reader reader = new InputStreamReader(io);
        try {
            StringWriter buf = new StringWriter();
            char[] chunk = new char[4096];
            int count;
            while ((count = reader.read(chunk)) >= 0) {
                buf.write(chunk, 0, count);
            }
            return buf.toString();
        } finally {
            // Closing the reader also closes the underlying stream.
            reader.close();
        }
    }

    /**
     * Reads an HttpEntity object, saving it to the path.
     * Both the entity content stream and the output file are closed when done,
     * whether or not the copy succeeds.
     *
     * REF: http://stackoverflow.com/questions/10960409/how-do-i-save-a-file-
     * downloaded-with-httpclient-into-a-specific-folder
     *
     * @param entity
     *            http entity obj
     * @param destPath
     *            output path
     * @throws IOException
     *             Signals that an I/O exception has occurred.
     */
    public static void downloadFile(HttpEntity entity, String destPath) throws IOException {
        InputStream content = entity.getContent();
        try {
            FileOutputStream out = new FileOutputStream(destPath);
            try {
                // IOUtils.copy does not close either stream; that is done here.
                org.apache.commons.io.IOUtils.copy(content, out);
            } finally {
                out.close();
            }
        } finally {
            if (content != null) {
                content.close();
            }
        }
    }

    /** Display name of this client, used in error messages. */
    private String name = "Unamed Web crawler";

    /**
     * Set a name of this client for tracking purposes, e.g., in multiple threads
     *
     * @param n
     *            the new name
     */
    public void setName(String n) {
        name = n;
    }

    /**
     * Get name of client
     *
     * @return the name
     */
    public String getName() {
        return name;
    }
}