/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.io.arc; import it.unimi.dsi.fastutil.io.RepositionableStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; import org.archive.url.UsableURI; import org.archive.util.zip.GzipHeader; import org.archive.util.zip.NoGzipMagicException; public class ARCUtils implements ARCConstants { /** * @param pathOrUri Path or URI to extract arc filename from. * @return Extracted arc file name. * @throws URISyntaxException */ public static String parseArcFilename(final String pathOrUri) throws URISyntaxException { String path = pathOrUri; if (UsableURI.hasScheme(pathOrUri)) { URI url = new URI(pathOrUri); path = url.getPath(); } return (new File(path)).getName(); } /** * @param arcFile File to test. * @return True if <code>arcFile</code> is compressed ARC. * @throws IOException */ public static boolean isCompressed(File arcFile) throws IOException { return testCompressedARCFile(arcFile); } /** * Check file is compressed and in ARC GZIP format. * * @param arcFile File to test if its Internet Archive ARC file * GZIP compressed. * * @return True if this is an Internet Archive GZIP'd ARC file (It begins * w/ the Internet Archive GZIP header and has the * COMPRESSED_ARC_FILE_EXTENSION suffix). * * @exception IOException If file does not exist or is not unreadable. */ public static boolean testCompressedARCFile(File arcFile) throws IOException { return testCompressedARCFile(arcFile, false); } /** * Check file is compressed and in ARC GZIP format. * * @param arcFile File to test if its Internet Archive ARC file * GZIP compressed. * @param skipSuffixCheck Set to true if we're not to test on the * '.arc.gz' suffix. * * @return True if this is an Internet Archive GZIP'd ARC file (It begins * w/ the Internet Archive GZIP header). * * @exception IOException If file does not exist or is not unreadable. */ public static boolean testCompressedARCFile(File arcFile, boolean skipSuffixCheck) throws IOException { boolean compressedARCFile = false; isReadable(arcFile); if(!skipSuffixCheck && !arcFile.getName().toLowerCase() .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) { return compressedARCFile; } final InputStream is = new FileInputStream(arcFile); try { compressedARCFile = testCompressedARCStream(is); } finally { is.close(); } return compressedARCFile; } /** * Tests passed stream is gzip stream by reading in the HEAD. * Does not reposition the stream. That is left up to the caller. * @param is An InputStream. * @return True if compressed stream. * @throws IOException */ public static boolean testCompressedARCStream(final InputStream is) throws IOException { boolean compressedARCFile = false; GzipHeader gh = null; try { gh = new GzipHeader(is); } catch (NoGzipMagicException e ) { return compressedARCFile; } byte[] fextra = gh.getFextra(); // Now make sure following bytes are IA GZIP comment. // First check length. ARC_GZIP_EXTRA_FIELD includes length // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD // at +2. if (fextra != null && ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) { compressedARCFile = true; for (int i = 0; i < fextra.length; i++) { if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) { compressedARCFile = false; break; } } } return compressedARCFile; } /** * Tests passed stream is gzip stream by reading in the HEAD. * Does reposition of stream when done. * @param rs An InputStream that is Repositionable. * @return True if compressed stream. * @throws IOException */ public static boolean testCompressedRepositionalStream( final RepositionableStream rs) throws IOException { boolean compressedARCFile = false; long p = rs.position(); try { compressedARCFile = testCompressedStream((InputStream)rs); } finally { rs.position(p); } return compressedARCFile; } /** * Tests passed stream is gzip stream by reading in the HEAD. * Does reposition of stream when done. * @param is An InputStream. * @return True if compressed stream. * @throws IOException */ public static boolean testCompressedStream(final InputStream is) throws IOException { boolean compressedARCFile = false; try { new GzipHeader(is); compressedARCFile = true; } catch (NoGzipMagicException e) { return compressedARCFile; } return compressedARCFile; } /** * Check file is uncompressed ARC file. * * @param arcFile * File to test if its Internet Archive ARC file uncompressed. * * @return True if this is an Internet Archive ARC file. * * @exception IOException * If file does not exist or is not unreadable. */ public static boolean testUncompressedARCFile(File arcFile) throws IOException { boolean uncompressedARCFile = false; isReadable(arcFile); if(arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) { FileInputStream fis = new FileInputStream(arcFile); try { byte [] b = new byte[ARC_MAGIC_NUMBER.length()]; int read = fis.read(b, 0, ARC_MAGIC_NUMBER.length()); fis.close(); if (read == ARC_MAGIC_NUMBER.length()) { StringBuffer beginStr = new StringBuffer(ARC_MAGIC_NUMBER.length()); for (int i = 0; i < ARC_MAGIC_NUMBER.length(); i++) { beginStr.append((char)b[i]); } if (beginStr.toString(). equalsIgnoreCase(ARC_MAGIC_NUMBER)) { uncompressedARCFile = true; } } } finally { fis.close(); } } return uncompressedARCFile; } /** * @param arcFile File to test. * @exception IOException If file does not exist or is not unreadable. */ private static void isReadable(File arcFile) throws IOException { if (!arcFile.exists()) { throw new FileNotFoundException(arcFile.getAbsolutePath() + " does not exist."); } if (!arcFile.canRead()) { throw new FileNotFoundException(arcFile.getAbsolutePath() + " is not readable."); } } }