FetchDNS.java example

Explorer
heritrix3-master
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.fetcher;

import static org.archive.modules.fetcher.FetchStatusCodes.S_DNS_SUCCESS;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DOMAIN_UNRESOLVABLE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_GETBYNAME_SUCCESS;
import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.util.ArchiveUtils;
import org.archive.util.InetAddressUtil;
import org.archive.util.Recorder;
import org.springframework.beans.factory.annotation.Autowired;
import org.xbill.DNS.ARecord;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import org.xbill.DNS.Record;
import org.xbill.DNS.ResolverConfig;
import org.xbill.DNS.TextParseException;
import org.xbill.DNS.Type;


/**
 * Processor to resolve 'dns:' URIs.
 * 
 * TODO: Refactor to use org.archive.util.DNSJavaUtils.
 *
 * @author multiple
 */
public class FetchDNS extends Processor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    private static Logger logger = Logger.getLogger(FetchDNS.class.getName());

    // Defaults.
    private short ClassType = DClass.IN;
    private short TypeType = Type.A;
    protected InetAddress serverInetAddr = null;

    /**
     * If a DNS lookup fails, whether or not to fallback to InetAddress
     * resolution, which may use local 'hosts' files or other mechanisms.
     */
    protected boolean acceptNonDnsResolves = false; 
    public boolean getAcceptNonDnsResolves() {
        return acceptNonDnsResolves;
    }
    public void setAcceptNonDnsResolves(boolean acceptNonDnsResolves) {
        this.acceptNonDnsResolves = acceptNonDnsResolves;
    }
    
    /**
     * Used to do DNS lookups.
     */
    protected ServerCache serverCache;
    public ServerCache getServerCache() {
        return this.serverCache;
    }
    @Autowired
    public void setServerCache(ServerCache serverCache) {
        this.serverCache = serverCache;
    }
    
    /**
     * Whether or not to perform an on-the-fly digest hash of retrieved
     * content-bodies.
     */
    {
        setDigestContent(true);
    }
    public boolean getDigestContent() {
        return (Boolean) kp.get("digestContent");
    }
    public void setDigestContent(boolean digest) {
        kp.put("digestContent",digest);
    }

    /**
     * Which algorithm (for example MD5 or SHA-1) to use to perform an 
     * on-the-fly digest hash of retrieved content-bodies.
     */
    protected String digestAlgorithm = "sha1"; 
    public String getDigestAlgorithm() {
        return digestAlgorithm;
    }
    public void setDigestAlgorithm(String digestAlgorithm) {
        this.digestAlgorithm = digestAlgorithm;
    }

    private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES
        = 6 * 60 * 60; // 6 hrs

    public FetchDNS() {
    }

    protected boolean shouldProcess(CrawlURI curi) {
        return curi.getUURI().getScheme().equals("dns");
    }
    
    
    protected void innerProcess(CrawlURI curi) {
        Record[] rrecordSet = null; // Retrieved dns records
        String dnsName = null;
        try {
            dnsName = curi.getUURI().getReferencedHost();
        } catch (URIException e) {
            logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
        }
        
        if(dnsName == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            return;
        }

        CrawlHost targetHost = getServerCache().getHostFor(dnsName);
        if (isQuadAddress(curi, dnsName, targetHost)) {
        	// We're done processing.
        	return;
        }
        
        // Do actual DNS lookup.
        curi.setFetchBeginTime(System.currentTimeMillis());

        // Try to get the records for this host (assume domain name)
        // TODO: Bug #935119 concerns potential hang here
        String lookupName = dnsName.endsWith(".") ? dnsName : dnsName + ".";
        try {
            rrecordSet = (new Lookup(lookupName, TypeType, ClassType)).run();
        } catch (TextParseException e) {
            rrecordSet = null;
        }
        curi.setContentType("text/dns");
        if (rrecordSet != null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found recordset for " + lookupName);
            }
        	storeDNSRecord(curi, dnsName, targetHost, rrecordSet);
        } else {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Failed find of recordset for " + lookupName);
            }
            if (getAcceptNonDnsResolves()||"localhost".equals(dnsName)) {
                // Do lookup that bypasses javadns.
                InetAddress address = null;
                try {
                    address = InetAddress.getByName(dnsName);
                } catch (UnknownHostException e1) {
                    address = null;
                }
                if (address != null) {
                    targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Found address for " + dnsName +
                            " using native dns.");
                    }
                } else {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Failed find of address for " + dnsName +
                            " using native dns.");
                    }
                    setUnresolvable(curi, targetHost);
                }
            } else {
                setUnresolvable(curi, targetHost);
            }
        }
        curi.setFetchCompletedTime(System.currentTimeMillis());
    }
    
    protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
    		final CrawlHost targetHost, final Record[] rrecordSet) {
        // Get TTL and IP info from the first A record (there may be
        // multiple, e.g. www.washington.edu) then update the CrawlServer
        ARecord arecord = getFirstARecord(rrecordSet);
        if (arecord == null) {
            throw new NullPointerException("Got null arecord for " +
                dnsName);
        }
        targetHost.setIP(arecord.getAddress(), arecord.getTTL());
        try {
        	recordDNS(curi, rrecordSet);
            curi.setFetchStatus(S_DNS_SUCCESS);
            curi.setDNSServerIPLabel(ResolverConfig.getCurrentConfig().server());
        } catch (IOException e) {
        	logger.log(Level.SEVERE, "Failed store of DNS Record for " +
        		curi.toString(), e);
        	setUnresolvable(curi, targetHost);
        }
    }
    
    protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
			final CrawlHost targetHost) {
		boolean result = false;
		Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
		// If it's an ip no need to do a lookup
		if (matcher == null || !matcher.matches()) {
			return result;
		}
		
		result = true;
		// Ideally this branch would never be reached: no CrawlURI
		// would be created for numerical IPs
		if (logger.isLoggable(Level.WARNING)) {
			logger.warning("Unnecessary DNS CrawlURI created: " + curi);
		}
		try {
			targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] {
					(byte) (new Integer(matcher.group(1)).intValue()),
					(byte) (new Integer(matcher.group(2)).intValue()),
					(byte) (new Integer(matcher.group(3)).intValue()),
					(byte) (new Integer(matcher.group(4)).intValue()) }),
					CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
			curi.setFetchStatus(S_DNS_SUCCESS);
		} catch (UnknownHostException e) {
			logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
			setUnresolvable(curi, targetHost);
		}
		return result;
	}
    
    protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
            throws IOException {
        final byte[] dnsRecord = getDNSRecord(curi.getFetchBeginTime(),
                rrecordSet);

        Recorder rec = curi.getRecorder();
        // Shall we get a digest on the content downloaded?
        boolean digestContent = getDigestContent();
        String algorithm = null;
        if (digestContent) {
            algorithm = getDigestAlgorithm();
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            rec.getRecordedInput().setDigest((MessageDigest)null);
        }
        InputStream is = curi.getRecorder().inputWrap(
                new ByteArrayInputStream(dnsRecord));

        if (digestContent) {
            rec.getRecordedInput().startDigest();
        }

        // Reading from the wrapped stream, behind the scenes, will write
        // files into scratch space
        try {
            byte[] buf = new byte[256];
            while (is.read(buf) != -1) {
                continue;
            }
        } finally {
            is.close();
            rec.closeRecorders();
        }
        curi.setContentSize(dnsRecord.length);

        if (digestContent) {
            curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
        }
    }
    
    protected byte [] getDNSRecord(final long fetchStart,
    		final Record[] rrecordSet)
    throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Start the record with a 14-digit date per RFC 2540
        byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes();
        baos.write(fetchDate);
        // Don't forget the newline
        baos.write("\n".getBytes());
        if (rrecordSet != null) {
            for (int i = 0; i < rrecordSet.length; i++) {
                byte[] record = rrecordSet[i].toString().getBytes();
                baos.write(record);
                // Add the newline between records back in
                baos.write("\n".getBytes());
            }
        }
        return baos.toByteArray();
    }
    
    protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
        host.setIP(null, 0);
        curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE); 
    }
    
    protected ARecord getFirstARecord(Record[] rrecordSet) {
        ARecord arecord = null;
        if (rrecordSet == null || rrecordSet.length == 0) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("rrecordSet is null or zero length: " +
                    rrecordSet);
            }
            return arecord;
        }
        for (int i = 0; i < rrecordSet.length; i++) {
            if (rrecordSet[i].getType() != Type.A) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest("Record " + Integer.toString(i) +
                        " is not A type but " + rrecordSet[i].getType());
                }
                continue;
            }
            arecord = (ARecord) rrecordSet[i];
            break;
        }
        return arecord;
    }
}