/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.resourceindex.cdx; import java.util.logging.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.url.UrlOperations; /** * Adapter that converts a CDX record String into a CaptureSearchResult * * @author brad * @version $Date$, $Revision$ */ public class CDXLineToSearchResultAdapter implements Adapter<String,CaptureSearchResult> { private static final Logger LOGGER = Logger.getLogger( CDXLineToSearchResultAdapter.class.getName()); private final static String SCHEME_STRING = "://"; private final static String DEFAULT_SCHEME = "http://"; private static int getEndOfHostIndex(String url) { int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); int pathIdx = url.indexOf(UrlOperations.PATH_START); if(portIdx == -1 && pathIdx == -1) { return url.length(); } if(portIdx == -1) { return pathIdx; } if(pathIdx == -1) { return portIdx; } if(pathIdx > portIdx) { return portIdx; } else { return pathIdx; } } public CaptureSearchResult adapt(String line) { return doAdapt(line); } /** * @param line * @return SearchResult representation of input line */ public static CaptureSearchResult doAdapt(String line) { CaptureSearchResult result = new CaptureSearchResult(); String[] tokens = line.split(" "); boolean hasRobotFlags = false; boolean hasLengthFlag = false; if (tokens.length != 9) { hasRobotFlags = true; if(tokens.length == 10) { } else if(tokens.length == 11) { hasLengthFlag = true; } else { return null; } //throw new IllegalArgumentException("Need 9 columns("+line+")"); } String urlKey = tokens[0]; String captureTS = tokens[1]; String originalUrl = tokens[2]; // convert from ORIG_HOST to ORIG_URL here: if(!originalUrl.contains(SCHEME_STRING)) { StringBuilder sb = new StringBuilder(urlKey.length()); sb.append(DEFAULT_SCHEME); sb.append(originalUrl); sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); originalUrl = sb.toString(); } String mimeType = tokens[3]; String httpCode = tokens[4]; String digest = tokens[5]; String redirectUrl = tokens[6]; long compressedOffset = -1; int nextToken = 7; if(hasRobotFlags) { result.setRobotFlags(tokens[nextToken]); nextToken++; } String length = "-"; if(hasLengthFlag) { length = tokens[nextToken]; nextToken++; } if(!tokens[nextToken].equals("-")) { try { compressedOffset = Long.parseLong(tokens[nextToken]); if(!length.equals("-")) { // try to set the endOffset: result.setCompressedLength(Long.parseLong(length)); } } catch (NumberFormatException e) { LOGGER.warning("Bad compressed Offset field("+nextToken+") in (" + line +")"); return null; } } nextToken++; String fileName = tokens[nextToken]; result.setUrlKey(urlKey); result.setCaptureTimestamp(captureTS); result.setOriginalUrl(originalUrl); result.setMimeType(mimeType); result.setHttpCode(httpCode); result.setDigest(digest); result.setRedirectUrl(redirectUrl); result.setOffset(compressedOffset); result.setFile(fileName); return result; } }