/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.fetcher;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DNS_SUCCESS;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DOMAIN_UNRESOLVABLE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_GETBYNAME_SUCCESS;
import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.util.ArchiveUtils;
import org.archive.util.InetAddressUtil;
import org.archive.util.Recorder;
import org.springframework.beans.factory.annotation.Autowired;
import org.xbill.DNS.ARecord;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import org.xbill.DNS.Record;
import org.xbill.DNS.ResolverConfig;
import org.xbill.DNS.TextParseException;
import org.xbill.DNS.Type;
/**
* Processor to resolve 'dns:' URIs.
*
* TODO: Refactor to use org.archive.util.DNSJavaUtils.
*
* @author multiple
*/
public class FetchDNS extends Processor {
@SuppressWarnings("unused")
private static final long serialVersionUID = 3L;
private static Logger logger = Logger.getLogger(FetchDNS.class.getName());
// Defaults.
private short ClassType = DClass.IN;
private short TypeType = Type.A;
protected InetAddress serverInetAddr = null;
/**
* If a DNS lookup fails, whether or not to fallback to InetAddress
* resolution, which may use local 'hosts' files or other mechanisms.
*/
protected boolean acceptNonDnsResolves = false;
public boolean getAcceptNonDnsResolves() {
return acceptNonDnsResolves;
}
public void setAcceptNonDnsResolves(boolean acceptNonDnsResolves) {
this.acceptNonDnsResolves = acceptNonDnsResolves;
}
/**
* Used to do DNS lookups.
*/
protected ServerCache serverCache;
public ServerCache getServerCache() {
return this.serverCache;
}
@Autowired
public void setServerCache(ServerCache serverCache) {
this.serverCache = serverCache;
}
/**
* Whether or not to perform an on-the-fly digest hash of retrieved
* content-bodies.
*/
{
setDigestContent(true);
}
public boolean getDigestContent() {
return (Boolean) kp.get("digestContent");
}
public void setDigestContent(boolean digest) {
kp.put("digestContent",digest);
}
/**
* Which algorithm (for example MD5 or SHA-1) to use to perform an
* on-the-fly digest hash of retrieved content-bodies.
*/
protected String digestAlgorithm = "sha1";
public String getDigestAlgorithm() {
return digestAlgorithm;
}
public void setDigestAlgorithm(String digestAlgorithm) {
this.digestAlgorithm = digestAlgorithm;
}
private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES
= 6 * 60 * 60; // 6 hrs
public FetchDNS() {
}
protected boolean shouldProcess(CrawlURI curi) {
return curi.getUURI().getScheme().equals("dns");
}
protected void innerProcess(CrawlURI curi) {
Record[] rrecordSet = null; // Retrieved dns records
String dnsName = null;
try {
dnsName = curi.getUURI().getReferencedHost();
} catch (URIException e) {
logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
}
if(dnsName == null) {
curi.setFetchStatus(S_UNFETCHABLE_URI);
return;
}
CrawlHost targetHost = getServerCache().getHostFor(dnsName);
if (isQuadAddress(curi, dnsName, targetHost)) {
// We're done processing.
return;
}
// Do actual DNS lookup.
curi.setFetchBeginTime(System.currentTimeMillis());
// Try to get the records for this host (assume domain name)
// TODO: Bug #935119 concerns potential hang here
String lookupName = dnsName.endsWith(".") ? dnsName : dnsName + ".";
try {
rrecordSet = (new Lookup(lookupName, TypeType, ClassType)).run();
} catch (TextParseException e) {
rrecordSet = null;
}
curi.setContentType("text/dns");
if (rrecordSet != null) {
if (logger.isLoggable(Level.FINE)) {
logger.fine("Found recordset for " + lookupName);
}
storeDNSRecord(curi, dnsName, targetHost, rrecordSet);
} else {
if (logger.isLoggable(Level.FINE)) {
logger.fine("Failed find of recordset for " + lookupName);
}
if (getAcceptNonDnsResolves()||"localhost".equals(dnsName)) {
// Do lookup that bypasses javadns.
InetAddress address = null;
try {
address = InetAddress.getByName(dnsName);
} catch (UnknownHostException e1) {
address = null;
}
if (address != null) {
targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
curi.setFetchStatus(S_GETBYNAME_SUCCESS);
if (logger.isLoggable(Level.FINE)) {
logger.fine("Found address for " + dnsName +
" using native dns.");
}
} else {
if (logger.isLoggable(Level.FINE)) {
logger.fine("Failed find of address for " + dnsName +
" using native dns.");
}
setUnresolvable(curi, targetHost);
}
} else {
setUnresolvable(curi, targetHost);
}
}
curi.setFetchCompletedTime(System.currentTimeMillis());
}
protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
final CrawlHost targetHost, final Record[] rrecordSet) {
// Get TTL and IP info from the first A record (there may be
// multiple, e.g. www.washington.edu) then update the CrawlServer
ARecord arecord = getFirstARecord(rrecordSet);
if (arecord == null) {
throw new NullPointerException("Got null arecord for " +
dnsName);
}
targetHost.setIP(arecord.getAddress(), arecord.getTTL());
try {
recordDNS(curi, rrecordSet);
curi.setFetchStatus(S_DNS_SUCCESS);
curi.setDNSServerIPLabel(ResolverConfig.getCurrentConfig().server());
} catch (IOException e) {
logger.log(Level.SEVERE, "Failed store of DNS Record for " +
curi.toString(), e);
setUnresolvable(curi, targetHost);
}
}
protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
final CrawlHost targetHost) {
boolean result = false;
Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
// If it's an ip no need to do a lookup
if (matcher == null || !matcher.matches()) {
return result;
}
result = true;
// Ideally this branch would never be reached: no CrawlURI
// would be created for numerical IPs
if (logger.isLoggable(Level.WARNING)) {
logger.warning("Unnecessary DNS CrawlURI created: " + curi);
}
try {
targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] {
(byte) (new Integer(matcher.group(1)).intValue()),
(byte) (new Integer(matcher.group(2)).intValue()),
(byte) (new Integer(matcher.group(3)).intValue()),
(byte) (new Integer(matcher.group(4)).intValue()) }),
CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
curi.setFetchStatus(S_DNS_SUCCESS);
} catch (UnknownHostException e) {
logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
setUnresolvable(curi, targetHost);
}
return result;
}
protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
throws IOException {
final byte[] dnsRecord = getDNSRecord(curi.getFetchBeginTime(),
rrecordSet);
Recorder rec = curi.getRecorder();
// Shall we get a digest on the content downloaded?
boolean digestContent = getDigestContent();
String algorithm = null;
if (digestContent) {
algorithm = getDigestAlgorithm();
rec.getRecordedInput().setDigest(algorithm);
} else {
rec.getRecordedInput().setDigest((MessageDigest)null);
}
InputStream is = curi.getRecorder().inputWrap(
new ByteArrayInputStream(dnsRecord));
if (digestContent) {
rec.getRecordedInput().startDigest();
}
// Reading from the wrapped stream, behind the scenes, will write
// files into scratch space
try {
byte[] buf = new byte[256];
while (is.read(buf) != -1) {
continue;
}
} finally {
is.close();
rec.closeRecorders();
}
curi.setContentSize(dnsRecord.length);
if (digestContent) {
curi.setContentDigest(algorithm,
rec.getRecordedInput().getDigestValue());
}
}
protected byte [] getDNSRecord(final long fetchStart,
final Record[] rrecordSet)
throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
// Start the record with a 14-digit date per RFC 2540
byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes();
baos.write(fetchDate);
// Don't forget the newline
baos.write("\n".getBytes());
if (rrecordSet != null) {
for (int i = 0; i < rrecordSet.length; i++) {
byte[] record = rrecordSet[i].toString().getBytes();
baos.write(record);
// Add the newline between records back in
baos.write("\n".getBytes());
}
}
return baos.toByteArray();
}
protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
host.setIP(null, 0);
curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
}
protected ARecord getFirstARecord(Record[] rrecordSet) {
ARecord arecord = null;
if (rrecordSet == null || rrecordSet.length == 0) {
if (logger.isLoggable(Level.FINEST)) {
logger.finest("rrecordSet is null or zero length: " +
rrecordSet);
}
return arecord;
}
for (int i = 0; i < rrecordSet.length; i++) {
if (rrecordSet[i].getType() != Type.A) {
if (logger.isLoggable(Level.FINEST)) {
logger.finest("Record " + Integer.toString(i) +
" is not A type but " + rrecordSet[i].getType());
}
continue;
}
arecord = (ARecord) rrecordSet[i];
break;
}
return arecord;
}
}