package org.archive.wayback.resourceindex; import java.io.IOException; import java.util.List; import org.archive.format.gzip.zipnum.TimestampBestPickDedupIterator; import org.archive.format.gzip.zipnum.ZipNumCluster; import org.archive.format.gzip.zipnum.ZipNumParams; import org.archive.util.iterator.CloseableIterator; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.resourceindex.cdx.format.CDXFlexFormat; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; /** * SearchResultSource wrapper for new implementation of CDX input source, including ZipNumCluster and CDX File * from archive-commons * @author ilya * */ public class ZipNumClusterSearchResultSource implements SearchResultSource, Adapter<String,CaptureSearchResult> { protected ZipNumCluster cluster; protected ZipNumParams params = null; protected ZipNumParams oneBlockParams; protected int timestampDedupLength = 0; protected List<String> ignoreRobotPaths; public void init() throws IOException { //this.cluster = new ZipNumCluster(clusterUri, summaryFile, blockLoader); oneBlockParams = new ZipNumParams(); oneBlockParams.setMaxBlocks(1); } @Override public CloseableIterator<CaptureSearchResult> getPrefixIterator( String urlkey) throws ResourceIndexNotAvailableException { try { CloseableIterator<String> cdxIter = null; String prefix = urlkey; int space = prefix.indexOf(' '); // One-block query if (space >= 0) { prefix = prefix.substring(0, space); cdxIter = cluster.getCDXIterator(urlkey, prefix, true, oneBlockParams); // Exact Match } else if (!prefix.endsWith("*\t")) { cdxIter = cluster.getCDXIterator(urlkey, prefix, true, params); // Prefix Match } else { cdxIter = cluster.getCDXIterator(urlkey, prefix.substring(0, prefix.length() - 2), false, params); } if (timestampDedupLength > 0) { cdxIter = new TimestampBestPickDedupIterator(cdxIter, timestampDedupLength); } return new AdaptedIterator<String,CaptureSearchResult>(cdxIter, this); } catch (IOException e) { throw new ResourceIndexNotAvailableException(e.toString()); } } @Override public CloseableIterator<CaptureSearchResult> getPrefixReverseIterator( String prefix) throws ResourceIndexNotAvailableException { throw new ResourceIndexNotAvailableException("Unsupported"); } @Override public void cleanup(CloseableIterator<CaptureSearchResult> c) throws IOException { c.close(); } @Override public void shutdown() throws IOException { // TODO Auto-generated method stub } public ZipNumCluster getCluster() { return cluster; } public void setCluster(ZipNumCluster cluster) { this.cluster = cluster; } public ZipNumParams getParams() { return params; } public void setParams(ZipNumParams params) { this.params = params; } @Override public CaptureSearchResult adapt(String line) { CaptureSearchResult result = CDXFlexFormat.parseCDXLineFlexFast(line); if (ignoreRobotPaths != null) { for (String path : ignoreRobotPaths) { if (result.getUrlKey().startsWith(path)) { result.setRobotIgnore(); break; } } } return result; } public List<String> getIgnoreRobotPaths() { return ignoreRobotPaths; } public void setIgnoreRobotPaths(List<String> ignoreRobotPaths) { this.ignoreRobotPaths = ignoreRobotPaths; } public int getTimestampDedupLength() { return timestampDedupLength; } public void setTimestampDedupLength(int timestampDedupLength) { this.timestampDedupLength = timestampDedupLength; } }