DefaultWebCrawl.java example

Explorer

Xponents-master
- Basics
  - src
    - main
      - java
        org
        opensextant
        ConfigException.java
        data
        Country.java
        DocInput.java
        GeoBase.java
        Geocoding.java
        Language.java
        LatLon.java
        Place.java
        Taxon.java
        TextInput.java
        extraction
        ExtractionException.java
        ExtractionMetrics.java
        ExtractionResult.java
        Extractor.java
        MatchFilter.java
        NormalizationException.java
        TextEntity.java
        TextMatch.java
        processing
        Parameters.java
        ProcessingException.java
        util
        AnyFilenameFilter.java
        FileUtility.java
        GeodeticUtility.java
        GeonamesUtility.java
        TextUtils.java
    - test
      - java
        MetricsTest.java
        TestGeoUtils.java
        TestGeonamesLanguages.java
        TestGeonamesMeta.java
        TestTextUtils.java
- Examples
  - src
    - main
      - java
        org
        opensextant
        examples
        BasicGeoTemporalProcessing.java
        TaxonomicTagger.java
        WebCrawl.java
        twitter
        MicroMessage.java
        Tweet.java
        TweetGeocoder.java
- Extraction
  - src
    - main
      - java
        org
        opensextant
        extraction
        SolrMatcherSupport.java
        SolrTaggerRequest.java
        extractors
        geo
        BoundaryObserver.java
        CountryCount.java
        CountryObserver.java
        GazetteerMatcher.java
        GazetteerUpdateProcessorFactory.java
        LocationObserver.java
        PlaceCandidate.java
        PlaceCount.java
        PlaceEvidence.java
        PlaceGeocoder.java
        ScoredPlace.java
        SolrGazetteer.java
        TagFilter.java
        rules
        ContextualOrganizationRule.java
        CoordinateAssociationRule.java
        CountryRule.java
        GeocodeRule.java
        LocationChooserRule.java
        MajorPlaceRule.java
        NameCodeRule.java
        NameRule.java
        NonsenseFilter.java
        PersonNameFilter.java
        ProvinceAssociationRule.java
        xtax
        TaxonMatch.java
        TaxonMatcher.java
        output
        AbstractFormatter.java
        CSVFormatter.java
        FormatterFactory.java
        GDBFormatter.java
        GISDataFormatter.java
        GISDataModel.java
        GeoCSVFormatter.java
        KMLFormatter.java
        OpenSextantSchema.java
        ResultsFormatter.java
        ShapefileFormatter.java
        WKTFormatter.java
        processing
        ResultsUtility.java
        XtractorGroup.java
        progress
        ProgressListener.java
        ProgressMonitor.java
        ProgressMonitorBase.java
        util
        SolrProxy.java
        SolrUtil.java
    - test
      - java
        org
        opensextant
        extractors
        test
        TestExtraction.java
        TestGazFactory.java
        TestGazMatcher.java
        TestGazetteer.java
        TestGazetteerConflationKey.java
        TestPersonFilter.java
        TestPlaceGeocoder.java
        TestPlaceGeocoderLanguages.java
        TestStopFilters.java
        TestUtils.java
        TestXTax.java
- MapReduce
  - src
    - main
      - java
        org
        opensextant
        mapreduce
        AbstractMapper.java
        GeoTaggerMapper.java
        KeywordTaggerMapper.java
        Log4JUtils.java
        LoggingUtilities.java
        XponentsTaggerDemo.java
    - test
      - java
        org
        apache
        solr
        core
        CoreContainer.java
- Patterns
  - src
    - main
      - java
        org
        opensextant
        extractors
        flexpat
        AbstractFlexPat.java
        PatternTestCase.java
        RegexPattern.java
        RegexPatternManager.java
        TextMatchResult.java
        poli
        PatternsOfLife.java
        PoliMatch.java
        PoliPatternManager.java
        TestCase.java
        data
        MACAddress.java
        Money.java
        TelephoneNumber.java
        xcoord
        DMSFilter.java
        DMSOrdinate.java
        GeocoordMatch.java
        GeocoordMatchFilter.java
        GeocoordNormalization.java
        GeocoordPattern.java
        GeocoordPrecision.java
        GeocoordTestCase.java
        Hemisphere.java
        MGRSFilter.java
        MGRSParser.java
        PatternManager.java
        PrecisionScales.java
        UTMParser.java
        XConstants.java
        XCoord.java
        xtemporal
        DateMatch.java
        DateNormalization.java
        DateTimePattern.java
        PatternManager.java
        TestCase.java
        XTConstants.java
        XTemporal.java
    - test
      - java
        org
        opensextant
        extractors
        test
        DateNormalizationTest.java
        PrecisionScalesTest.java
        TestPoLi.java
        TestPoLiReporter.java
        TestXCoord.java
        TestXCoordReporter.java
        TestXTemporal.java
        TestXTemporalReporter.java
- XText
  - examples
  - src
    - main
      - java
        org
        opensextant
        xtext
        Content.java
        ConversionListener.java
        ConvertedDocument.java
        Converter.java
        ExclusionFilter.java
        PathManager.java
        XText.java
        collectors
        ArchiveNavigator.java
        CollectionListener.java
        Collector.java
        mailbox
        DefaultMailCrawl.java
        MailClient.java
        MailConfig.java
        NTLMAuth.java
        OutlookPSTCrawler.java
        sharepoint
        DefaultSharepointCrawl.java
        SPLink.java
        SharepointClient.java
        web
        CrawlFilter.java
        DefaultWebCrawl.java
        HyperLink.java
        WebClient.java
        converters
        ConverterAdapter.java
        DefaultConverter.java
        EmbeddedContentConverter.java
        ImageMetadataConverter.java
        MessageConverter.java
        TextTranscodingConverter.java
        TikaHTMLConverter.java
        WebArchiveConverter.java
    - test
      - java
        org
        opensextant
        xtext
        converters
        test
        MessageConverterTest.java
        test
        Decomposer.java
        ImageGroper.java
        MailClientTest.java
        SharepointClientTest.java
        SharepointCrawlTest.java
        TestPST.java
        TestSPLinks.java
        TestTikaPST.java
        Tests.java
        TextTranscodingTest.java
        WebLinkTest.java
- Xlayer
  - src
    - main
      - java
        org
        opensextant
        xlayer
        Transforms.java
        XlayerClient.java
        server
        RequestParameters.java
        TaggerResource.java
        XlayerApp.java
        xgeo
        XlayerRestlet.java
        XlayerServer.java
        XponentsGeotagger.java
    - test
      - java
        XlayerClientTest.java

/**
 *
 * Copyright 2013-2014 OpenSextant.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.opensextant.xtext.collectors.web;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.client.utils.DateUtils;
import org.opensextant.ConfigException;
import org.opensextant.util.FileUtility;
import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.ExclusionFilter;
import org.opensextant.xtext.collectors.CollectionListener;
import org.opensextant.xtext.collectors.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A demonstration of how to use the WebClient to crawl a site and convert it as you go.
 * An optional collection listener is settable to let you do something with each item
 * collected/converted.
 *
 * <p>
 * The crawl is recursive: {@link #collectItems(String, URL)} fetches and archives one page,
 * then {@link #collectItemsOnPage(String, URL, URL)} walks the links on that page, downloads
 * and converts each acceptable item, and calls back into {@code collectItems} for web pages.
 * The {@code depth} counter (declared outside this class) is incremented by whoever calls
 * {@code collectItemsOnPage} and decremented by {@code collectItemsOnPage} when it finishes.
 *
 * @author ubaldino
 *
 */
public class DefaultWebCrawl extends WebClient implements ExclusionFilter, Collector, CrawlFilter {

    /**
     * A collection listener to consult as far as how to record the found &amp; converted content
     * as well as to determine what is worth saving.
     *
     */
    protected CollectionListener listener = null;
    private final Logger log = LoggerFactory.getLogger(getClass());
    /** When true, links that are neither on the current site nor the current host are skipped. */
    private boolean allowCurrentSiteOnly = true;
    /** When true, links that are not on the current page's directory are skipped. */
    private boolean allowCurrentDirOnly = false;
    /** Absolute URLs that answered with HTTP status &gt;= 400 during this session. */
    private final Set<String> errorPages = new HashSet<>();
    /** Caller-supplied "filter in" prefixes; when non-empty a path must start with one of them. */
    private final List<String> prefixFilters = new ArrayList<>();
    /** Caller-supplied "filter out" prefixes, applied after {@link #prefixFilters}. */
    private final List<String> prefixIgnore = new ArrayList<>();

    /**
     *
     * @param srcSite
     *            top level site
     * @param destFolder
     *            output folder
     * @throws MalformedURLException
     *             if srcSite is invalid format
     * @throws ConfigException
     *             if other setup error
     */
    public DefaultWebCrawl(String srcSite, String destFolder) throws MalformedURLException,
            ConfigException {
        super(srcSite, destFolder);
    }

    /**
     * Important that you set a listener if you want to see what was captured.
     * As well as optimize future harvests. Listener tells the collector if the item in question was harvested or not.
     * 
     * @param l
     *            listener to use
     */
    public void setListener(CollectionListener l) {
        listener = l;
    }

    /**
     * Add one "filter in" prefix. Blank or null values are ignored.
     *
     * @param u
     *            path prefix that is allowed
     */
    public void addPrefixFilter(String u) {
        if (StringUtils.isNotBlank(u)) {
            this.prefixFilters.add(u);
        }
    }

    /**
     * Add several "filter in" prefixes. Each entry goes through {@link #addPrefixFilter(String)},
     * so blank and null entries are dropped instead of being stored (a stored null would make
     * {@link #userFilteredOut(String)} fail on {@code startsWith}).
     *
     * @param arr
     *            prefixes to allow; null is tolerated and ignored
     */
    public void addPrefixFilters(Collection<String> arr) {
        if (arr == null) {
            return;
        }
        for (String prefix : arr) {
            addPrefixFilter(prefix);
        }
    }

    /**
     * Add one "filter out" prefix. Blank or null values are ignored.
     *
     * @param u
     *            path prefix to ignore
     */
    public void addIgnoreFilter(String u) {
        if (StringUtils.isNotBlank(u)) {
            this.prefixIgnore.add(u);
        }
    }

    /**
     * Add several "filter out" prefixes. Each entry goes through {@link #addIgnoreFilter(String)},
     * so blank and null entries are dropped instead of being stored.
     *
     * @param arr
     *            prefixes to ignore; null is tolerated and ignored
     */
    public void addIgnoreFilters(Collection<String> arr) {
        if (arr == null) {
            return;
        }
        for (String prefix : arr) {
            addIgnoreFilter(prefix);
        }
    }

    /**
     * For web crawl, this default crawler considers flash, video media, etc. to be out of scope.
     * Other HREF links, like mailto:xyz@me.com are also items to avoid.
     * Method is left open so you may override.
     * 
     * @param path
     *            a url; a null value is treated as filtered out
     * @return true if the url should be skipped
     */
    @Override
    public boolean filterOutFile(String path) {
        if (path == null) {
            // Nothing usable to fetch.
            return true;
        }
        // Locale.ROOT keeps the match independent of the JVM default locale
        // (e.g. Turkish lower-casing of 'I' would otherwise miss "MAILTO:").
        String url = path.toLowerCase(Locale.ROOT);
        return url.startsWith("mailto:")
                || url.endsWith(".atom")
                || url.endsWith(".rss")
                || url.endsWith(".flv")
                || url.endsWith(".mp4")
                || url.contains("xmlrpc");
    }

    /**
     * Run the collection.
     * Make sure you have set your converter and collection listener
     * If you have a converter that also has a conversion listener, whoa!! good luck.
     * This web crawl example is meant to provide the mechanics of the conversion listener
     * as implemented by the collection listener.
     * The details on how actions at collection time differ from conversion time are TBD.
     *
     * @throws IOException
     *             on collection err
     */
    @Override
    public void collect() throws IOException {
        try {
            collectItems(null, this.getSite());
        } catch (NoSuchAlgorithmException err) {
            log.error("Hashing error", err);
        }
    }

    /**
     * Override this if you have different ideas about what URL patterns are of interest.
     * DEFAULT FILTER OUT: video files, page anchors, mailto links
     * 
     * @param link
     *            a URL
     * @return true if link should be ignored.
     */
    public boolean filterOut(HyperLink link) {
        if (filterOutFile(link.getAbsoluteURL())) {
            return true;
        }
        if (link.isPageAnchor()) {
            log.debug("Filter out anchor link {}", link);
            return true;
        }

        return false;
    }

    /**
     * recursive folder crawl through a site. This is where docs are
     * converted and recorded.
     * As hashing algorithms are used in defining concise output paths, NoSuchAlgorithmException is thrown.
     *
     * <p>
     * The page is fetched, archived if no archived copy exists yet, and then its links are
     * processed by {@link #collectItemsOnPage(String, URL, URL)}. Pages already known to fail,
     * or that answer with HTTP status &gt;= 400, are skipped.
     *
     * @param _link
     *            a URL; when null the starting site is used
     * @param startingSite
     *            the top level site
     * @throws IOException
     *             on err
     * @throws NoSuchAlgorithmException
     *             error that never happens
     */
    public void collectItems(String _link, URL startingSite) throws IOException,
            NoSuchAlgorithmException {
        // Only fall back to the starting site when no explicit link was given.
        String link = (_link != null) ? _link : startingSite.toString();

        HyperLink thisLink = new HyperLink(link, new URL(link), getSite());

        if (errorPages.contains(thisLink.getAbsoluteURL())) {
            log.debug("Do not visit error pages tracked in this session; link: {}", link);
            return;
        }

        HttpResponse page = getPage(prepURL(link));

        // Same policy as for items found on a page: do not archive or parse error responses.
        if (page.getStatusLine().getStatusCode() >= 400) {
            errorPages.add(thisLink.getAbsoluteURL());
            log.error("Failing on this request, HTTP status>=400, LINK={}", link);
            releaseQuietly(page);
            return;
        }

        /*
         * 1.  Capture the page content represented by the requested link.
         *     It is saved to  FILE.html
         */
        String rawData = WebClient.readTextStream(page.getEntity().getContent());

        File thisPage = createArchiveFile(archivePathFor(thisLink), thisLink.isFolder());
        // An existing archived copy is kept as is; the page is only written when absent.
        if (!thisPage.exists()) {
            FileUtility.writeFile(rawData, thisPage.getAbsolutePath());
        }
        log.info("Starting in on {} from {} @ depth={}", link, site, depth);
        pause();

        // collectItemsOnPage() undoes this increment when it finishes.
        ++depth;

        collectItemsOnPage(rawData, thisLink.getURL(), getSite());
    }

    /**
     * Crawl starting from a page that is already on disk: the file is read as UTF-8 and its
     * links are harvested relative to the configured site.
     * 
     * @param f file object
     * @throws IOException on err
     */
    public void collect(File f) throws IOException {
        String pageContent = FileUtility.readFile(f, "UTF-8");
        // Balance the decrement done by collectItemsOnPage(), exactly as collectItems() does;
        // without it every call would leave depth one lower than before.
        ++depth;
        collectItemsOnPage(pageContent, getSite(), getSite());
    }

    /**
     * User filters are likely to be more general ALLOW, but specific DENIES within what is allowed.
     * 
     * @param path filepath
     * @return if user options filter out the given path
     */
    protected boolean userFilteredOut(final String path) {
        /*
         * Caller URL filters. Filter In: with no filters configured everything is allowed.
         */
        if (!prefixFilters.isEmpty() && !startsWithAny(path, prefixFilters)) {
            return true;
        }

        /*
         * Okay, url was allowed, but does it fit a pattern that is to be filtered out?
         */
        return startsWithAny(path, prefixIgnore);
    }

    /**
     * Internal method for parsing and harvesting from a single page and then crawling deeper, if instructed to do so.
     * The caller is expected to have incremented {@code depth}; it is decremented here when the
     * page is done, even if parsing fails.
     * 
     * @param pageContent raw HTML
     * @param url  url for HTML
     * @param site  top level url for site
     */
    protected void collectItemsOnPage(String pageContent, URL url, URL site) {
        try {
            Collection<HyperLink> items = parseContentPage(pageContent, url, site);

            /* 2. Collect items on this page.
             *
             */
            for (HyperLink l : items) {
                if (filterOut(l)) {
                    continue;
                }

                if (this.isAllowCurrentSiteOnly() && !(l.isCurrentSite() || l.isCurrentHost())) {
                    // Page represented by link, l, is on another website.
                    log.debug("Not on current site: {}", l);
                    continue;
                }

                if (this.isAllowCurrentDirOnly() && !l.isCurrentPage()) {
                    // Page represented by link, l, is on another directory on same or site.
                    log.debug("Not on current directory: {}", l);
                    continue;
                }

                /* TODO: fix "key", as it represents not just path, but unique URLs
                 * different URLs with same path would collide.
                 * TODO: in general fix the ability to crawl off requested site.
                 *  If that is really needed, this is not the crawling capability you want.
                 *
                 */
                String key = l.getNormalPath();
                if (key == null) {
                    key = l.getAbsoluteURL();
                }

                if (found.containsKey(key)) {
                    // We already did this.
                    continue;
                }

                if (userFilteredOut(key)) {
                    // We don't want to do this.
                    log.debug("Filtered Out by User: {}", key);
                    continue;
                }

                found.put(key, l);

                // B. Drop files in archive mirroring the original
                //
                if (saved.contains(l.getId())) {
                    // in theory this item resolved to an item that was already saved.
                    // ignore.
                    continue;
                }

                // Download artifacts
                if (l.isFile() || l.isWebPage()) {
                    harvestItem(l, site);
                }
            }
        } finally {
            --depth;
        }
    }

    /**
     * convert and record a downloaded item, given the item and its source URL.
     * Without a converter the raw file is handed to the listener, if there is one, and
     * nothing else happens.
     * 
     * @param item
     *            item to convert
     * @param link
     *            link representing the original/source
     * @throws IOException
     *             on err
     * @throws ConfigException
     *             on err
     * @throws NoSuchAlgorithmException
     *             an error that never happens
     */
    protected void convertContent(File item, HyperLink link)
            throws IOException, ConfigException, NoSuchAlgorithmException {

        if (item == null || link == null) {
            throw new IOException("Bad data - null values for file and link...");
        }

        if (converter == null) {
            // No conversion possible; previously a missing listener led to a
            // NullPointerException on converter.convert() below.
            if (listener != null) {
                log.debug("Link {} was saved to {}", link.getAbsoluteURL(), item.getAbsolutePath());
                listener.collected(item);
            }
            return;
        }

        if (!item.exists()) {
            log.warn("Item to convert does not exist, FILE={}", item);
            return;
        }

        // Convert the item.
        ConvertedDocument doc = converter.convert(item);
        if (doc == null) {
            log.error("Document was not converted, FILE={}", item);
            return;
        }
        if (doc.textpath == null) {
            log.error("Expecting the content to be non-null for {}", doc.getFilepath());
            return;
        }

        // record with a success state.
        doc.setId(link.getId());
        doc.addSourceURL(link.getAbsoluteURL(), link.getReferrer());
        // This path must already exist
        doc.saveBuffer(new File(doc.textpath));

        if (listener != null) {
            listener.collected(doc, item.getAbsolutePath());
        }
    }

    /**
     * @see org.opensextant.xtext.collectors.web.CrawlFilter#isAllowCurrentDirOnly()
     */
    @Override
    public boolean isAllowCurrentDirOnly() {
        return allowCurrentDirOnly;
    }

    /**
     * @see org.opensextant.xtext.collectors.web.CrawlFilter#setAllowCurrentDirOnly(boolean)
     */
    @Override
    public void setAllowCurrentDirOnly(boolean allowCurrentDirOnly) {
        this.allowCurrentDirOnly = allowCurrentDirOnly;
    }

    /**
     * @see org.opensextant.xtext.collectors.web.CrawlFilter#isAllowCurrentSiteOnly()
     */
    @Override
    public boolean isAllowCurrentSiteOnly() {
        return allowCurrentSiteOnly;
    }

    /**
     * @see org.opensextant.xtext.collectors.web.CrawlFilter#setAllowCurrentSiteOnly(boolean)
     */
    @Override
    public void setAllowCurrentSiteOnly(boolean allowCurrentSiteOnly) {
        this.allowCurrentSiteOnly = allowCurrentSiteOnly;
    }

    /**
     * Download, archive and convert a single linked item, then crawl deeper if it is a web page
     * and the depth limit allows it. Failures are logged, never propagated, so one bad item
     * does not stop the rest of the page.
     */
    private void harvestItem(HyperLink l, URL site) {
        pause();

        log.info("Pulling page {}", l);

        // The default document ID will be an MD5 hash ID of the URL.
        // This may differ for other collectors/harvesters/listeners
        //
        try {
            if (listener != null && listener.exists(l.getId())) {
                // You already collected this. So it will be ignored.
                return;
            }
        } catch (Exception err1) {
            log.error("Collection Listener error", err1);
            return;
        }

        try {
            // create URL for link and download artifact.
            HttpResponse itemPage = getPage(l.getURL());
            if (itemPage.getStatusLine().getStatusCode() >= 400) {
                this.errorPages.add(l.getAbsoluteURL());
                log.error("Failing on this request, HTTP status>=400, LINK={}", l.getURL());
                releaseQuietly(itemPage);
                return;
            }

            /*
             * Identify the correct type of file this item is, from HTTP headers & MIME, not just the link
             */
            Header contentType = itemPage.getEntity().getContentType();
            if (contentType != null) {
                l.setMIMEType(contentType.getValue());
            }

            /*
             * Create a non-trivial path for the item.
             */
            File itemSaved = createArchiveFile(archivePathFor(l), false);
            File dir = itemSaved.getAbsoluteFile().getParentFile();
            if (dir != null) {
                FileUtility.makeDirectory(dir);
            }
            l.setFilepath(itemSaved);

            WebClient.downloadFile(itemPage.getEntity(), itemSaved.getAbsolutePath());

            // CACHE the identity of this URL, but only once the download really succeeded.
            saved.add(l.getId());

            convertContent(itemSaved, l);

            // Continue to crawl deeper...
            //
            if (l.isWebPage() && depth <= MAX_DEPTH) {
                collectItems(l.getAbsoluteURL(), site);
            }
        } catch (Exception fileErr) {
            log.error("Item for URL {} was not saved due to a net or IO issue.",
                    l.getAbsoluteURL(), fileErr);
        }
    }

    /**
     * Archive path for a link: its normal path, with ".html" appended for dynamic links that do
     * not already end with it. Shared by page and item archiving so both use the same file.
     */
    private static String archivePathFor(HyperLink link) {
        String path = link.getNormalPath();
        if (link.isDynamic() && !path.endsWith(".html")) {
            path = path + ".html";
        }
        return path;
    }

    /** True if {@code path} starts with at least one of the given prefixes. */
    private static boolean startsWithAny(String path, List<String> prefixes) {
        for (String prefix : prefixes) {
            if (path.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Best-effort close of a response body that will not be read.
     * NOTE(review): assumes closing the entity stream is how the underlying HTTP client gets its
     * connection back; getPage() is not visible here, confirm against WebClient.
     */
    private void releaseQuietly(HttpResponse response) {
        try {
            if (response != null && response.getEntity() != null) {
                java.io.InputStream body = response.getEntity().getContent();
                if (body != null) {
                    body.close();
                }
            }
        } catch (IOException | RuntimeException err) {
            log.debug("Could not release HTTP response", err);
        }
    }
}