/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.resourcestore.indexer;

import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.util.Adapter;
import org.archive.wayback.util.url.IdentityUrlCanonicalizer;

/**
 * Adapter that converts an {@link ARCRecord} into a
 * {@link CaptureSearchResult} describing that record (file name, offset,
 * digest, capture timestamp, URL, URL key, MIME type and HTTP status).
 *
 * <p>By default URLs are keyed with an {@link IdentityUrlCanonicalizer} and
 * HTTP records are annotated with a fresh {@link HTTPRecordAnnotater}; both
 * collaborators can be replaced through their setters.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class ARCRecordToSearchResultAdapter
        implements Adapter<ARCRecord,CaptureSearchResult>{

    private static final Logger LOGGER = Logger.getLogger(
            ARCRecordToSearchResultAdapter.class.getName());

    private static final String VERSION = "0.1.0";
    /** MIME type assigned to the ARC file-description record. */
    private static final String ARC_FILEDESC_VERSION = "arc/filedesc" + VERSION;

    private HTTPRecordAnnotater annotater = null;
    private UrlCanonicalizer canonicalizer = null;

    /**
     * Creates an adapter using an {@link IdentityUrlCanonicalizer} and a
     * default {@link HTTPRecordAnnotater}.
     */
    public ARCRecordToSearchResultAdapter() {
        canonicalizer = new IdentityUrlCanonicalizer();
        annotater = new HTTPRecordAnnotater();
    }

    /**
     * Converts the given record into a search result.
     *
     * @param rec the ARC record to convert; it is closed as part of the
     *        conversion
     * @return the populated search result, or {@code null} if an
     *         {@link IOException} occurred while processing the record (the
     *         failure is logged at WARNING level)
     * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
     */
    public CaptureSearchResult adapt(ARCRecord rec) {
        try {
            return adaptInner(rec);
        } catch (IOException e) {
            // Route the failure through the logging framework instead of
            // stderr; callers still receive null for an unusable record.
            LOGGER.log(Level.WARNING,
                    "Unable to adapt ARC record to a search result", e);
            return null;
        }
    }

    /**
     * Builds the search result for a record, propagating I/O failures to
     * {@link #adapt(ARCRecord)}.
     *
     * @param rec the ARC record to convert
     * @return the populated search result
     * @throws IOException if closing or reading the record fails
     */
    private CaptureSearchResult adaptInner(ARCRecord rec) throws IOException {
        // The record is closed before anything is read from it.
        // NOTE(review): presumably close() drains the record so that the
        // digest and compressedBytes are final - confirm against ARCRecord.
        rec.close();
        ARCRecordMetaData meta = rec.getMetaData();

        CaptureSearchResult result = new CaptureSearchResult();

        result.setFile(stripDirectory(meta.getArc()));
        result.setOffset(meta.getOffset());

        // initialize with default HTTP code and redirect; the HTTP branch
        // below overrides the code when the record has one.
        result.setHttpCode("-");
        result.setRedirectUrl("-");

        // The digest is stored exactly as the record reports it; no
        // "sha1:" prefix is added.
        result.setDigest(rec.getDigestStr());
        result.setCaptureTimestamp(meta.getDate());

        String uriStr = meta.getUrl();
        result.setOriginalUrl(uriStr);

        if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
            // ARC file-description record: only the MIME type is set.
            result.setMimeType(ARC_FILEDESC_VERSION);
        } else if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
            // skip URL + HTTP header processing for dns records...
            result.setUrlKey(uriStr);
            result.setMimeType("text/dns");
            result.setCompressedLength(rec.compressedBytes);
        } else {
            result.setUrlKey(canonicalizer.urlStringToKey(uriStr));

            String statusCode = meta.getStatusCode();
            if (statusCode == null) {
                statusCode = "-";
            }
            result.setHttpCode(statusCode);

            Header[] headers = rec.getHttpHeaders();
            annotater.annotateHTTPContent(result, rec, headers,
                    meta.getMimetype());
        }
        return result;
    }

    /**
     * Returns the part of {@code path} after the last
     * {@link File#separator}. The path is returned unchanged when it
     * contains no separator or when the separator is its final character.
     *
     * @param path the archive path reported by the record metadata
     * @return the path without its leading directory components
     */
    private static String stripDirectory(String path) {
        int index = path.lastIndexOf(File.separator);
        // ">= 0" so that a separator in first position ("/name.arc") is
        // stripped too; the second test leaves "dir/" style values intact.
        if (index >= 0 && (index + 1) < path.length()) {
            return path.substring(index + 1);
        }
        return path;
    }

    /**
     * @return the canonicalizer used to derive URL keys
     */
    public UrlCanonicalizer getCanonicalizer() {
        return canonicalizer;
    }

    /**
     * @param canonicalizer the canonicalizer used to derive URL keys
     */
    public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
        this.canonicalizer = canonicalizer;
    }

    /**
     * @return the annotater
     */
    public HTTPRecordAnnotater getAnnotater() {
        return annotater;
    }

    /**
     * @param annotater the annotater to set
     */
    public void setAnnotater(HTTPRecordAnnotater annotater) {
        this.annotater = annotater;
    }
}