// ExtractorPDFContentTest.java example
//
// Explorer
// heritrix3-master
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.extractor;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.Recorder;

/**
 * Tests for {@link ExtractorPDFContent}: each test loads a PDF fixture from the
 * classpath, runs the extractor over it and checks that a known set of URLs is
 * among the resulting out-links.
 *
 * <p>The checks are "contains all" checks: extra out-links beyond the expected
 * ones do not cause a failure.
 */
public class ExtractorPDFContentTest extends ContentExtractorTestBase {

    protected static final String TEST_RESOURCE_FILE_1 = "ExtractorPDFContentTest1.pdf";
    protected static final String TEST_RESOURCE_FILE_2 = "ExtractorPDFContentTest2.pdf";
    protected static final String TEST_RESOURCE_FILE_3 = "ExtractorPDFContentTest3.pdf";
    protected static final String TEST_RESOURCE_FILE_4 = "ExtractorPDFContentTest4.pdf";

    /** URL assigned to the fake PDF document in every test. */
    private static final String TEST_PDF_URL = "http://www.example.com/fake.pdf";

    /** Checks the links expected from the first fixture. */
    public void testA() throws URIException, UnsupportedEncodingException, IOException, InterruptedException{
        assertExtractsLinks(TEST_RESOURCE_FILE_1, new String[]{
                "http://www.businessdictionary.com/definition/supervisor.html",
                "http://management.about.com/od/policiesandprocedures/g/supervisor1.html"});
    }

    /**
     * Checks the links expected from the second fixture.
     * NOTE(review): judging by the name, the fixture presumably contains URLs
     * followed by a sentence-ending dot -- confirm against the PDF.
     */
    public void testEndingInDot() throws URIException, UnsupportedEncodingException, IOException, InterruptedException{
        assertExtractsLinks(TEST_RESOURCE_FILE_2, new String[]{
                "http://www.fec.gov/data/CommitteeSummary.do",
                "http://www.opensecrets.org/bigpicture/elec_stats.php",
                "http://www.opensecrets.org/pacs"});
    }

    /** Checks that a URL containing underscores is extracted from the third fixture. */
    public void testUnderscoreInURL() throws URIException, UnsupportedEncodingException, IOException, InterruptedException{
        assertExtractsLinks(TEST_RESOURCE_FILE_3, new String[]{
                "http://www.dot.gov/sites/dot.dev/files/docs/2014_February_ATCR.pdf"});
    }

    /**
     * Checks the links expected from the fourth fixture, including
     * {@code http://www.example.com/testOpeningParen}.
     */
    public void testParenthesis() throws URIException, UnsupportedEncodingException, IOException, InterruptedException{
        assertExtractsLinks(TEST_RESOURCE_FILE_4, new String[]{
                "http://www.unisys.com",
                "http://www.myserver.mycorp.com/images/exttest.jpg",
                "http://www.adobe.com/intro?100,200",
                "http://www.w3.org/1999/xhtml",
                "http://www.xfa.org/schema/xfa-data/1.0",
                "http://www.adobe.com",
                "http://www.adobe.com/getacro.gif",
                "http://www.example.com/testOpeningParen"});
    }

    /**
     * Checks the links expected from the fourth fixture, including
     * {@code http://www.example.com/test}.
     */
    public void testNewlineSeparatedURIs() throws URIException, UnsupportedEncodingException, IOException, InterruptedException{
        assertExtractsLinks(TEST_RESOURCE_FILE_4, new String[]{
                "http://www.unisys.com",
                "http://www.myserver.mycorp.com/images/exttest.jpg",
                "http://www.example.com/test",
                "http://www.adobe.com/intro?100,200",
                "http://www.w3.org/1999/xhtml",
                "http://www.xfa.org/schema/xfa-data/1.0",
                "http://www.adobe.com",
                "http://www.adobe.com/getacro.gif"});
    }

    /**
     * Creates the extractor under test, wired to a unit-test logger module.
     *
     * @return a new {@link ExtractorPDFContent}
     */
    @Override
    protected Extractor makeExtractor() {
        ExtractorPDFContent result = new ExtractorPDFContent();
        UriErrorLoggerModule ulm = new UnitTestUriLoggerModule();
        result.setLoggerModule(ulm);
        return result;
    }

    /**
     * Runs the extractor over the given PDF fixture and asserts that every
     * expected URL is among the out-links of the processed URI.
     *
     * @param resourceFileName classpath name of the PDF fixture
     * @param expectedUrls URLs that must all appear among the out-links
     */
    private void assertExtractsLinks(String resourceFileName, String[] expectedUrls)
            throws URIException, UnsupportedEncodingException, IOException, InterruptedException {
        CrawlURI testUri = createTestUri(TEST_PDF_URL, resourceFileName);
        extractor.process(testUri);

        Set<CrawlURI> expected = makeLinkSet(testUri, expectedUrls);
        // The message makes a failure show which links were expected and
        // which were actually extracted, instead of a bare assertion error.
        assertTrue("Expected out-links " + expected + " were not all found in "
                + testUri.getOutLinks() + " (fixture " + resourceFileName + ")",
                testUri.getOutLinks().containsAll(expected));
    }

    /**
     * Builds the set of link URIs that {@code sourceUri} would produce for the
     * given URL strings, using the NAVLINK_MISC context and the NAVLINK hop.
     *
     * @param sourceUri URI the links are created from
     * @param urlStrs URL strings to turn into links
     * @return one {@link CrawlURI} per URL string
     * @throws URIException if a link cannot be created from a URL string
     */
    private Set<CrawlURI> makeLinkSet(CrawlURI sourceUri, String[] urlStrs) throws URIException {
        HashSet<CrawlURI> linkSet = new HashSet<CrawlURI>();
        for (String urlStr : urlStrs) {
            CrawlURI link = sourceUri.createCrawlURI(urlStr, HTMLLinkContext.NAVLINK_MISC, Hop.NAVLINK);
            linkSet.add(link);
        }
        return linkSet;
    }

    /**
     * Creates a {@link CrawlURI} for {@code urlStr} whose recorded content is
     * the named classpath resource, marked as a successfully fetched
     * {@code application/pdf} document.
     *
     * @param urlStr URL to assign to the test URI
     * @param resourceFileName classpath name of the PDF fixture to record
     * @return the prepared test URI
     * @throws IOException if the fixture is missing from the classpath or
     *         cannot be read
     */
    private CrawlURI createTestUri(String urlStr, String resourceFileName) throws URIException,
            UnsupportedEncodingException, IOException {
        UURI testUuri = UURIFactory.getInstance(urlStr);
        CrawlURI testUri = new CrawlURI(testUuri, null, null, LinkContext.NAVLINK_MISC);

        File temp = File.createTempFile("test", ".tmp");
        // Do not let temp files accumulate across test runs.
        temp.deleteOnExit();
        Recorder recorder = new Recorder(temp, 1024, 1024);

        InputStream resource = ExtractorPDFContentTest.class.getClassLoader()
                .getResourceAsStream(resourceFileName);
        if (resource == null) {
            // getResourceAsStream signals a missing resource with null; fail
            // with the file name rather than an unexplained error later on.
            throw new IOException("Test resource not found on classpath: " + resourceFileName);
        }

        InputStream is = recorder.inputWrap(resource);
        try {
            recorder.markContentBegin();
            // Read the wrapped stream to its end; the bytes themselves are
            // discarded here, the read is done for the recorder's benefit.
            while (is.read() >= 0) {
                // intentionally empty
            }
        } finally {
            is.close();
        }

        testUri.setContentType("application/pdf");
        testUri.setFetchStatus(200);
        testUri.setRecorder(recorder);
        testUri.setContentSize(recorder.getResponseContentLength());
        return testUri;
    }

}