package org.archive.server;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.List;
import java.util.UUID;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.format.gzip.GZIPFormatException;
import org.archive.format.gzip.GZIPMemberSeries;
import org.archive.format.gzip.GZIPMemberWriter;
import org.archive.format.gzip.GZIPSeriesMember;
import org.archive.streamcontext.SimpleStream;
import org.archive.util.IAUtils;
import org.archive.util.DateUtils;
import org.archive.util.FileNameSpec;

import com.google.common.io.ByteStreams;
import com.google.common.io.LimitInputStream;

/**
 * Builds new {@code .arc.gz} / {@code .warc.gz} files out of individual gzip
 * members fetched over HTTP with a {@code Range} request.
 *
 * <p>Each call to {@link #append(long, List)} requests the bytes starting at
 * the given offset from one of the candidate URLs, measures the first gzip
 * member of the response, and copies exactly that many bytes into the
 * currently open output file. Output files are written under a temporary
 * {@code .OPEN} name and renamed to their final name when closed; a file is
 * closed once its size exceeds the configured maximum. Call {@link #finish()}
 * when done so the last files are closed and renamed.
 *
 * <p>Instances keep mutable state about the currently open files and contain
 * no synchronization.
 */
public class GZRangeClient {
	private static final Logger LOGGER =
		Logger.getLogger(GZRangeClient.class.getName());
	private static final Charset UTF8 = Charset.forName("UTF-8");

	private static final int CR = 13;
	private static final int LF = 10;

	private static final long DEFAULT_MAX_ARC_SIZE = 1024 * 1024 * 100;
	private static final long DEFAULT_MAX_WARC_SIZE = 1024 * 1024 * 1024;

	/** printf pattern for the ARC filedesc record: file name, 14-digit date. */
	private static final String ARC_PATTERN =
		"filedesc://%s 0.0.0.0 %s text/plain 76\n" +
		"1 0 InternetArchive\n" +
		"URL IP-address Archive-date Content-type Archive-length\n\n";

	/**
	 * printf pattern for the warcinfo record header: date, file name,
	 * record id (a complete URN, see {@link #getWARCRecordID()}), content
	 * length.
	 */
	private static final String WARC_PATTERN =
		"WARC/1.0\r\n" +
		"WARC-Type: warcinfo\r\n" +
		"WARC-Date: %s\r\n" +
		"WARC-Filename: %s\r\n" +
		// getWARCRecordID() already carries the "urn:uuid:" scheme; repeating
		// it here produced "<urn:uuid:urn:uuid:...>".
		"WARC-Record-ID: <%s>\r\n" +
		"Content-Type: application/warc-fields\r\n" +
		"Content-Length: %d\r\n\r\n";

	/*
	 * Example headers, for reference.
	 *
	 * ARC:
	 *   filedesc://IQ-125-20061126082604-03075-crawling08.us.archive.org.arc 0.0.0.0 20061126082604 text/plain 1447
	 *   1 1 InternetArchive
	 *   URL IP-address Archive-date Content-type Archive-length
	 *
	 * WARC:
	 *   WARC-Date: 2009-10-10T21:33:10Z
	 *   WARC-Filename: LOC-WEEKLY-008-20091010213310-06162-crawling110.us.archive.org.warc.gz
	 *   WARC-Record-ID: <urn:uuid:776a760b-f456-48f1-97e3-1d29967c75d2>
	 *   Content-Type: application/warc-fields
	 *   Content-Length: 599
	 *
	 *   software: Heritrix/1.15.4 http://crawler.archive.org
	 *   ip: 207.241.235.29
	 *   hostname: crawling110.us.archive.org
	 *   format: WARC File Format 0.17
	 *   conformsTo: http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc
	 *   operator: Vinay Goel
	 *   publisher: Internet Archive
	 *   audience: Library of Congress
	 *   isPartOf: LOC-WEEKLY-008-RECOVER
	 *   created: 2009-09-30T12:21:49Z
	 *   description: Library of Congress Monthly Harvest
	 *   robots: ignore
	 *   http-header-user-agent: Mozilla/5.0 (compatible; archive.org_bot/1.5.0 +http://www.loc.gov/minerva/crawl.html)
	 *   http-header-from: archive-crawler-agent@lists.sourceforge.net
	 */

	/** printf pattern for the default warcinfo body: software version, date. */
	private static final String DEFAULT_WARC_PATTERN =
		"software: %s Extractor\r\n" +
		"format: WARC File Format 1.0\r\n" +
		"conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n" +
		"publisher: Internet Archive\r\n" +
		"created: %s\r\n\r\n";

	// Formatted once at class-load time, so "created" reflects when this
	// class was initialized, not when a given WARC is opened.
	private static final String DEFAULT_WARC_HEADER_STRING = String.format(
			DEFAULT_WARC_PATTERN,
			IAUtils.COMMONS_VERSION,
			DateUtils.getLog17Date(System.currentTimeMillis()));

	private static final byte[] DEFAULT_WARC_HEADER_BYTES =
		DEFAULT_WARC_HEADER_STRING.getBytes(UTF8);

	private final File targetDir;
	private final String timestamp14;
	private final String timestampZ;
	private final FileNameSpec warcNamer;
	private final FileNameSpec arcNamer;

	private long maxArcSize;
	private long maxWarcSize;

	private File currentArc;
	private File currentArcTmp;
	private File currentWarc;
	private File currentWarcTmp;

	/** Open ARC output, or {@code null} when no ARC is currently open. */
	private FileOutputStream currentArcOS;
	private long currentArcSize = 0;

	/** Open WARC output, or {@code null} when no WARC is currently open. */
	private FileOutputStream currentWarcOS;
	private long currentWarcSize = 0;

	private byte[] warcHeaderContents;
	private boolean exitOnError = false;

	/**
	 * Creates a client that writes into {@code targetDir}.
	 *
	 * @param targetDir directory in which output files are created
	 * @param prefix name prefix handed to the ARC and WARC file namers
	 * @param timestamp14 14-digit timestamp; used verbatim in ARC file
	 *        headers and, converted via {@code DateUtils.getLog17Date},
	 *        as the {@code WARC-Date} of warcinfo records
	 * @throws ParseException if {@code timestamp14} cannot be parsed
	 */
	public GZRangeClient(File targetDir, String prefix, String timestamp14)
			throws ParseException {
		this.targetDir = targetDir;
		this.arcNamer = new FileNameSpec(prefix, ".arc.gz");
		this.warcNamer = new FileNameSpec(prefix, ".warc.gz");
		this.timestamp14 = timestamp14;
		long msse = DateUtils.parse14DigitDate(timestamp14).getTime();
		this.timestampZ = DateUtils.getLog17Date(msse);
		this.maxArcSize = DEFAULT_MAX_ARC_SIZE;
		this.maxWarcSize = DEFAULT_MAX_WARC_SIZE;
		this.warcHeaderContents = DEFAULT_WARC_HEADER_BYTES;
	}

	/**
	 * Closes and renames whichever output files are currently open.
	 *
	 * @throws IOException if closing or renaming either file fails; the WARC
	 *         is still closed when closing the ARC fails
	 */
	public void finish() throws IOException {
		try {
			closeArc();
		} finally {
			// Must run even if the ARC could not be closed/renamed, otherwise
			// the WARC stream stays open and its ".OPEN" file is never renamed.
			closeWarc();
		}
	}

	/**
	 * Consumes the first gzip member of {@code is} and reports how many
	 * compressed bytes it occupied.
	 *
	 * @param is stream positioned at the start of a gzip member
	 * @return the value of {@code getCompressedBytesRead()} after the member
	 *         has been skipped
	 */
	private long getGZLength(InputStream is)
			throws IOException, GZIPFormatException {
		SimpleStream s = new SimpleStream(is);
		GZIPMemberSeries gzs = new GZIPMemberSeries(s, "range", 0, true);
		GZIPSeriesMember m = gzs.getNextMember();
		m.skipMember();
		return m.getCompressedBytesRead();
	}

	/**
	 * Fetches the gzip record starting at {@code offset} and appends it to the
	 * current ARC or WARC output file.
	 *
	 * <p>The URLs are tried in order until one succeeds. Whether the record
	 * goes to the ARC or the WARC output is decided by the suffix of the
	 * <em>first</em> URL only. If every URL fails, the failure is logged at
	 * SEVERE level, or thrown when {@link #isExitOnError()} is set.
	 *
	 * @param offset byte offset sent in the {@code Range: bytes=<offset>-}
	 *        request header
	 * @param urls candidate locations of the same source file; must not be
	 *        empty
	 * @throws IllegalArgumentException if {@code urls} is null or empty
	 * @throws IOException if the first URL ends in neither {@code .arc.gz}
	 *         nor {@code .warc.gz}, or if all URLs fail and exitOnError is set
	 */
	public void append(long offset, List<String> urls) throws IOException {
		if (urls == null || urls.isEmpty()) {
			throw new IllegalArgumentException(
					"urls must contain at least one URL");
		}

		String first = urls.get(0);
		boolean isArc;
		if (first.endsWith(".arc.gz")) {
			isArc = true;
		} else if (first.endsWith(".warc.gz")) {
			isArc = false;
		} else {
			throw new IOException("URL (" + first
					+ ") must end with '.arc.gz' or '.warc.gz'");
		}

		for (String url : urls) {
			FileBackedInputStream fbis = null;
			InputStream is = null;
			InputStream orig = null;
			try {
				URL u = new URL(url);
				URLConnection conn = u.openConnection();
				conn.setRequestProperty("Range",
						String.format("bytes=%d-", offset));
				LOGGER.info(String.format("Attempting(%d) from(%s)",
						offset, url));
				conn.connect();
				is = conn.getInputStream();

				// First pass measures the gzip member; the bytes are then
				// re-read from the start via fbis.getInputStream().
				fbis = new FileBackedInputStream(is);
				long length = getGZLength(fbis);
				orig = fbis.getInputStream();

				if (isArc) {
					writeARCRecord(orig, length);
				} else {
					writeWARCRecord(orig, length);
				}
				LOGGER.info(String.format("Wrote record(%d) from(%s)",
						offset, url));
				return;
			} catch (IOException e) {
				// Keep the cause: without it a failed fetch is undiagnosable.
				LOGGER.log(Level.WARNING,
						"FAILED URL-OFFSET(" + url + ")(" + offset + ")", e);
			} finally {
				// Close quietly: an exception from close() must neither skip
				// resetBacker() nor abort the fall-through to the next URL.
				closeQuietly(orig);
				closeQuietly(is);
				if (fbis != null) {
					fbis.resetBacker();
				}
			}
		}

		StringBuilder sb = new StringBuilder();
		for (String u : urls) {
			if (sb.length() != 0) {
				sb.append(",");
			}
			sb.append(u);
		}
		String errMsg = String.format("Unable to get offset(%d) from (%s)",
				offset, sb.toString());
		if (exitOnError) {
			throw new IOException(errMsg);
		}
		LOGGER.severe(errMsg);
	}

	/** Closes {@code c} if non-null, ignoring any failure to close. */
	private static void closeQuietly(Closeable c) {
		if (c == null) {
			return;
		}
		try {
			c.close();
		} catch (IOException ignored) {
			// best effort: the stream is being discarded anyway
		}
	}

	/** @return a fresh random record id, including the "urn:uuid:" scheme */
	private String getWARCRecordID() {
		return "urn:uuid:" + UUID.randomUUID().toString();
	}

	/** @return the UTF-8 bytes of the ARC filedesc record for {@code name} */
	private byte[] getARCHeader(String name) {
		return String.format(ARC_PATTERN, name, timestamp14).getBytes(UTF8);
	}

	/**
	 * Builds the uncompressed warcinfo record for a new WARC file: the
	 * formatted header, then {@code warcHeaderContents}, then CR LF CR LF.
	 *
	 * @param name file name placed in the {@code WARC-Filename} field
	 */
	private byte[] getWARCHeader(String name) {
		// NOTE(review): the declared Content-Length counts the trailing
		// CR LF CR LF (+4) in addition to the body. If those four bytes are
		// meant as the record terminator rather than content, this should be
		// warcHeaderContents.length - confirm against the WARC spec before
		// changing, since it alters the bytes written.
		String t = String.format(WARC_PATTERN,
				timestampZ, name, getWARCRecordID(),
				warcHeaderContents.length + 4);
		byte[] b = t.getBytes(UTF8);

		ByteArrayOutputStream baos = new ByteArrayOutputStream(
				b.length + warcHeaderContents.length + 4);
		// write(byte[], int, int) on ByteArrayOutputStream declares no
		// IOException, so no dead catch block is needed.
		baos.write(b, 0, b.length);
		baos.write(warcHeaderContents, 0, warcHeaderContents.length);
		baos.write(CR);
		baos.write(LF);
		baos.write(CR);
		baos.write(LF);
		return baos.toByteArray();
	}

	/**
	 * Copies up to {@code length} bytes of {@code is} into the current WARC,
	 * opening a new file first if none is open and closing the file
	 * afterwards if it has grown past {@code maxWarcSize}.
	 */
	private void writeWARCRecord(InputStream is, long length)
			throws IOException {
		if (currentWarcOS == null) {
			nextWarc();
		}
		LimitInputStream lis = new LimitInputStream(is, length);
		ByteStreams.copy(lis, currentWarcOS);
		currentWarcSize += length;
		if (currentWarcSize > maxWarcSize) {
			closeWarc();
		}
	}

	/**
	 * Copies up to {@code length} bytes of {@code is} into the current ARC,
	 * opening a new file first if none is open and closing the file
	 * afterwards if it has grown past {@code maxArcSize}.
	 */
	private void writeARCRecord(InputStream is, long length)
			throws IOException {
		if (currentArcOS == null) {
			nextArc();
		}
		LimitInputStream lis = new LimitInputStream(is, length);
		ByteStreams.copy(lis, currentArcOS);
		currentArcSize += length;
		if (currentArcSize > maxArcSize) {
			closeArc();
		}
	}

	/**
	 * Closes the open ARC, if any, and renames it from its ".OPEN" name to
	 * its final name.
	 *
	 * @throws IOException if the stream cannot be closed or the rename fails
	 */
	private void closeArc() throws IOException {
		if (currentArcOS == null) {
			return;
		}
		// Forget the stream before anything can throw, so a failed close or
		// rename never leaves a closed stream to be written to later.
		FileOutputStream os = currentArcOS;
		currentArcOS = null;
		currentArcSize = 0;

		os.close();
		if (!currentArcTmp.renameTo(currentArc)) {
			throw new IOException(String.format("Failed rename(%s) to (%s)",
					currentArcTmp.getAbsolutePath(),
					currentArc.getAbsolutePath()));
		}
		LOGGER.info(String.format("Closed(%s)", currentArc.getAbsolutePath()));
	}

	/**
	 * Closes the open WARC, if any, and renames it from its ".OPEN" name to
	 * its final name.
	 *
	 * @throws IOException if the stream cannot be closed or the rename fails
	 */
	private void closeWarc() throws IOException {
		if (currentWarcOS == null) {
			return;
		}
		// Forget the stream before anything can throw, so a failed close or
		// rename never leaves a closed stream to be written to later.
		FileOutputStream os = currentWarcOS;
		currentWarcOS = null;
		currentWarcSize = 0;

		os.close();
		if (!currentWarcTmp.renameTo(currentWarc)) {
			throw new IOException(String.format("Failed rename(%s) to (%s)",
					currentWarcTmp.getAbsolutePath(),
					currentWarc.getAbsolutePath()));
		}
		LOGGER.info(String.format("Closed(%s)",
				currentWarc.getAbsolutePath()));
	}

	/**
	 * Opens the next ARC under a temporary ".OPEN" name and writes the
	 * filedesc header as its first gzip member.
	 */
	private void nextArc() throws IOException {
		String newArcName = arcNamer.getNextName();
		currentArc = new File(targetDir, newArcName);
		currentArcTmp = new File(targetDir, newArcName + ".OPEN");

		FileOutputStream os = new FileOutputStream(currentArcTmp);
		boolean opened = false;
		try {
			GZIPMemberWriter w = new GZIPMemberWriter(os);
			w.write(new ByteArrayInputStream(getARCHeader(newArcName)));
			currentArcSize = w.getBytesWritten();
			currentArcOS = os;
			opened = true;
		} finally {
			if (!opened) {
				// Header write failed: do not leak the file handle.
				closeQuietly(os);
			}
		}
		LOGGER.info(String.format("Openned(%s)",
				currentArc.getAbsolutePath()));
	}

	/**
	 * Opens the next WARC under a temporary ".OPEN" name and writes the
	 * warcinfo record as its first gzip member.
	 */
	private void nextWarc() throws IOException {
		String newWarcName = warcNamer.getNextName();
		currentWarc = new File(targetDir, newWarcName);
		currentWarcTmp = new File(targetDir, newWarcName + ".OPEN");

		FileOutputStream os = new FileOutputStream(currentWarcTmp);
		boolean opened = false;
		try {
			GZIPMemberWriter w = new GZIPMemberWriter(os);
			w.write(new ByteArrayInputStream(getWARCHeader(newWarcName)));
			currentWarcSize = w.getBytesWritten();
			currentWarcOS = os;
			opened = true;
		} finally {
			if (!opened) {
				// Header write failed: do not leak the file handle.
				closeQuietly(os);
			}
		}
		LOGGER.info(String.format("Openned(%s)",
				currentWarc.getAbsolutePath()));
	}

	/**
	 * @return the warcHeaderContents (the internal array, not a copy)
	 */
	public byte[] getWarcHeaderContents() {
		return warcHeaderContents;
	}

	/**
	 * @param warcHeaderContents the warcinfo body bytes used for WARC files
	 *        opened after this call; the array is stored without copying
	 */
	public void setWarcHeaderContents(byte[] warcHeaderContents) {
		this.warcHeaderContents = warcHeaderContents;
	}

	/**
	 * @return the maxArcSize
	 */
	public long getMaxArcSize() {
		return maxArcSize;
	}

	/**
	 * @param maxArcSize size in bytes beyond which the current ARC is closed
	 */
	public void setMaxArcSize(long maxArcSize) {
		this.maxArcSize = maxArcSize;
	}

	/**
	 * @return the maxWarcSize
	 */
	public long getMaxWarcSize() {
		return maxWarcSize;
	}

	/**
	 * @param maxWarcSize size in bytes beyond which the current WARC is closed
	 */
	public void setMaxWarcSize(long maxWarcSize) {
		this.maxWarcSize = maxWarcSize;
	}

	/**
	 * @return the exitOnError
	 */
	public boolean isExitOnError() {
		return exitOnError;
	}

	/**
	 * @param exitOnError when true, {@link #append(long, List)} throws if no
	 *        URL yields the record instead of only logging the failure
	 */
	public void setExitOnError(boolean exitOnError) {
		this.exitOnError = exitOnError;
	}
}