package org.archive.wayback.resourceindex.cdx.format;
import java.util.logging.Logger;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.FastCaptureSearchResult;
import org.archive.wayback.util.url.UrlOperations;
public class CDXFlexFormat extends CDXFormat {
private final static String SCHEME_STRING = "://";
private final static String DEFAULT_SCHEME = "http://";
private final static String EMPTY_VALUE = "-";
private static final Logger LOGGER =
Logger.getLogger(CDXFlexFormat.class.getName());
public CDXFlexFormat(String cdxSpec) throws CDXFormatException {
super(cdxSpec);
}
private static int getEndOfHostIndex(String url) {
int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR);
int pathIdx = url.indexOf(UrlOperations.PATH_START);
if(portIdx == -1 && pathIdx == -1) {
return url.length();
}
if(portIdx == -1) {
return pathIdx;
}
if(pathIdx == -1) {
return portIdx;
}
if(pathIdx > portIdx) {
return portIdx;
} else {
return pathIdx;
}
}
// Single place to do the flex cdx-line parsing logic
public static CaptureSearchResult parseCDXLineFlex(String line) {
CaptureSearchResult result = new CaptureSearchResult();
return parseCDXLineFlex(line, result);
}
// Use FastCaptureSearchResult to
public static CaptureSearchResult parseCDXLineFlexFast(String line) {
CaptureSearchResult result = new FastCaptureSearchResult();
return parseCDXLineFlex(line, result);
}
public static CaptureSearchResult parseCDXLineFlex(String line, CaptureSearchResult result) {
String[] tokens = line.split(" ");
boolean hasRobotFlags = false;
boolean hasLengthFlag = false;
if (tokens.length != 9) {
hasRobotFlags = true;
if(tokens.length == 10) {
} else if(tokens.length == 11) {
hasLengthFlag = true;
} else {
return null;
}
//throw new IllegalArgumentException("Need 9 columns("+line+")");
}
String urlKey = tokens[0];
String captureTS = tokens[1];
String originalUrl = tokens[2];
// convert from ORIG_HOST to ORIG_URL here:
if(!originalUrl.contains(SCHEME_STRING)) {
StringBuilder sb = new StringBuilder(urlKey.length());
sb.append(DEFAULT_SCHEME);
sb.append(originalUrl);
sb.append(urlKey.substring(getEndOfHostIndex(urlKey)));
originalUrl = sb.toString();
}
String mimeType = tokens[3];
String httpCode = tokens[4];
String digest = tokens[5];
String redirectUrl = tokens[6];
long compressedOffset = -1;
int nextToken = 7;
if(hasRobotFlags) {
result.setRobotFlags(tokens[nextToken]);
nextToken++;
}
String length = EMPTY_VALUE;
if(hasLengthFlag) {
length = tokens[nextToken];
nextToken++;
}
if(!tokens[nextToken].equals(EMPTY_VALUE)) {
try {
compressedOffset = Long.parseLong(tokens[nextToken]);
if(!length.equals(EMPTY_VALUE)) {
// try to set the endOffset:
result.setCompressedLength(Long.parseLong(length));
}
} catch (NumberFormatException e) {
LOGGER.warning("Bad compressed Offset field("+nextToken+") in (" +
line +")");
return null;
}
}
nextToken++;
String fileName = tokens[nextToken];
result.setUrlKey(urlKey);
result.setCaptureTimestamp(captureTS);
result.setOriginalUrl(originalUrl);
result.setMimeType(mimeType);
result.setHttpCode(httpCode);
result.setDigest(digest);
result.setRedirectUrl(redirectUrl);
result.setOffset(compressedOffset);
result.setFile(fileName.trim());
return result;
}
/* (non-Javadoc)
* @see org.archive.wayback.resourceindex.cdx.format.CDXFormat#parseResult(java.lang.String)
*/
@Override
public CaptureSearchResult parseResult(String line)
throws CDXFormatException {
return CDXFlexFormat.parseCDXLineFlex(line);
}
}