package org.archive.wayback.resourcestore; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import org.archive.format.gzip.zipnum.ZipNumBlockLoader; import org.archive.io.ArchiveReader; import org.archive.io.ArchiveReaderFactory; import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCRecord; import org.archive.io.warc.WARCReader; import org.archive.io.warc.WARCRecord; import org.archive.util.binsearch.SeekableLineReader; import org.archive.util.binsearch.SortedTextFile; import org.archive.util.iterator.CloseableIterator; import org.archive.wayback.ResourceStore; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.Resource; import org.archive.wayback.exception.ResourceNotAvailableException; import org.archive.wayback.resourcestore.resourcefile.ArcResource; import org.archive.wayback.resourcestore.resourcefile.WarcResource; public class FlexResourceStore implements ResourceStore { final static String[] EMPTY_STRINGS = new String[0]; private final static Logger LOGGER = Logger.getLogger(FlexResourceStore.class.getName()); protected ZipNumBlockLoader blockLoader; protected String customHeader; protected List<SourceResolver> sources; protected boolean failOnFirstUnavailable = false; public ZipNumBlockLoader getBlockLoader() { return blockLoader; } public void setBlockLoader(ZipNumBlockLoader blockLoader) { this.blockLoader = blockLoader; } public String getCustomHeader() { return customHeader; } public void setCustomHeader(String customHeader) { this.customHeader = customHeader; } public List<SourceResolver> getSources() { return sources; } public void setSources(List<SourceResolver> sources) { this.sources = sources; } public boolean isFailOnFirstUnavailable() { return failOnFirstUnavailable; } public void setFailOnFirstUnavailable(boolean failOnFirstUnavailable) { this.failOnFirstUnavailable = failOnFirstUnavailable; } public interface SourceResolver { String[] lookupPath(String filename) throws IOException; } public static class PathIndex implements SourceResolver { final static String DELIMITER = "\t"; protected SortedTextFile pathIndex; protected String path; protected String prefixPath; public void setPathIndex(String path) throws IOException { this.path = path; this.pathIndex = new SortedTextFile(path, false); } public String getPathIndex() { return path; } public String getPrefixPath() { return prefixPath; } public void setPrefixPath(String prefixPath) { this.prefixPath = prefixPath; } @Override public String[] lookupPath(String filename) throws IOException { CloseableIterator<String> iter = null; List<String> paths = new ArrayList<String>(); try { String prefix = filename + DELIMITER; iter = pathIndex.getRecordIterator(prefix); while (iter.hasNext()) { String line = iter.next(); if (line.startsWith(prefix)) { String path = line.substring(prefix.length()); if (prefixPath != null) { paths.add(prefixPath + path); } else { paths.add(path); } } else { break; } } } finally { if (iter != null) { try { iter.close(); } catch (IOException e) { LOGGER.warning(e.toString()); } } } if (paths.isEmpty()) { return EMPTY_STRINGS; } String[] pathsArray = new String[paths.size()]; return paths.toArray(pathsArray); } } public static class PrefixLookup implements SourceResolver { String prefix; String skipPrefix = "http://"; String includeFilter; public String getPrefix() { return prefix; } public void setPrefix(String prefix) { this.prefix = prefix; } public String getSkipPrefix() { return skipPrefix; } public void setSkipPrefix(String skipPrefix) { this.skipPrefix = skipPrefix; } public String getIncludeFilter() { return includeFilter; } public void setIncludeFilter(String includeFilter) { this.includeFilter = includeFilter; } @Override public String[] lookupPath(String filename) { if (includeFilter != null) { if (!filename.contains(includeFilter)) { return EMPTY_STRINGS; } } if ((skipPrefix != null) && filename.startsWith(skipPrefix)) { return new String[]{filename}; } else { return new String[]{prefix + filename}; } } } @Override public Resource retrieveResource(CaptureSearchResult result) throws ResourceNotAvailableException { String filename = result.getFile(); if (filename == null || filename.isEmpty()) { throw new ResourceNotAvailableException("No ARC/WARC name in search result...", filename); } Resource resource = null; boolean breakOnErr = false; StringBuilder excMsg = new StringBuilder(); IOException lastExc = null; for (SourceResolver resolver : sources) { String[] paths = null; try { paths = resolver.lookupPath(filename); } catch (IOException io) { if (excMsg.length() > 0) { excMsg.append(" "); } excMsg.append(io.getMessage()); lastExc = io; if (failOnFirstUnavailable) { breakOnErr = true; break; } } if (paths.length == 0) { continue; } for (String path : paths) { try { resource = getResource(path, result); if (resource != null) { return resource; } } catch (IOException io) { if (excMsg.length() > 0) { excMsg.append(" "); } excMsg.append(io.getMessage()); lastExc = io; if (failOnFirstUnavailable) { breakOnErr = true; break; } } } if (breakOnErr) { break; } } if (lastExc == null) { lastExc = new FileNotFoundException(filename); excMsg.append("File not Found: " + filename); } ResourceNotAvailableException rnae = new ResourceNotAvailableException(excMsg.toString(), filename, lastExc); throw rnae; } public Resource getResource(String path, CaptureSearchResult result) throws IOException, ResourceNotAvailableException { Resource r = null; long offset = result.getOffset(); int length = (int)result.getCompressedLength(); if (LOGGER.isLoggable(Level.INFO)) { LOGGER.info("Loading " + path + " - " + offset + ":" + length); } boolean success = false; SeekableLineReader slr = blockLoader.attemptLoadBlock(path, offset, length, false, false); if (slr == null) { return null; } try { InputStream is = slr.getInputStream(); r = loadResource(path, is); r.parseHeaders(); success = true; } finally { if (!success) { if (slr != null) { slr.close(); } } } return r; } protected Resource loadResource(String path, InputStream is) throws IOException, ResourceNotAvailableException { ArchiveReader archiveReader = ArchiveReaderFactory.get(path, is, false); if (archiveReader instanceof ARCReader) { return new ArcResource((ARCRecord)archiveReader.get(), archiveReader); } else if (archiveReader instanceof WARCReader) { return new WarcResource((WARCRecord)archiveReader.get(), archiveReader); } else { throw new IOException("Unknown ArchiveReader"); } } @Override public void shutdown() throws IOException { blockLoader.close(); } }