/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.resourceindex.cdx.format;
import java.util.regex.Pattern;
import org.archive.wayback.core.CaptureSearchResult;
/**
* Class which allows serialization/deserialization of CaptureSearchResult
* objects into/out of a single line String representation.
*
*
* @author brad
*
*/
public class CDXFormat {
    /*
     * Legend of CDX field letters:
     *
     * A canonized url
     * B news group
     * C rulespace category ***
     * D compressed dat file offset
     * F canonized frame
     * G multi-column language description (* soon)
     * H canonized host
     * I canonized image
     * J canonized jump point
     * K Some weird FBIS what's changed kinda thing
     * L canonized link
     * M meta tags (AIF) *
     * N massaged url
     * P canonized path
     * Q language string
     * R canonized redirect
     * U uniqueness ***
     * V compressed arc file offset *
     * X canonized url in other href tags
     * Y canonized url in other src tags
     * Z canonized url found in script
     * a original url **
     * b date **
     * c old style checksum *
     * d uncompressed dat file offset
     * e IP **
     * f frame *
     * g file name
     * h original host
     * i image *
     * j original jump point
     * k new style checksum *
     * l link *
     * m mime type of original document *
     * n arc document length *
     * o port
     * p original path
     * r redirect *
     * s response code *
     * t title *
     * v uncompressed arc file offset *
     * x url in other href tags *
     * y url in other src tags *
     * z url found in script *
     * # comment
     *
     * * in alexa-made dat file
     * ** in alexa-made dat file meta-data line
     * *** future data
     */

    /** Field handlers, in the order the fields appear in the spec. */
    protected CDXField[] fields = null;
    /** Delimiter character taken from the spec (the char following the magic prefix). */
    protected char delimiter = ' ';
    /** String form of {@link #delimiter}. */
    protected String delimiterS = null;
    /**
     * Literal (quoted) pattern for the spec's delimiter, compiled once at
     * construction so that regex metacharacters such as '|' or '.' are
     * matched verbatim when splitting lines.
     */
    private final Pattern delimiterPattern;

    /** Prefix every CDX spec line must start with. */
    public static String CDX_MAGIC = " CDX";
    public static char URL_KEY = 'A';
    public static char TIMESTAMP = 'b';
    public static char ORIGINAL_URL = 'a';
    public static char MIME_TYPE = 'm';
    public static char HTTP_CODE = 's';
    public static char DIGEST = 'k';
    public static char REDIRECT = 'r';
    public static char ROBOT_FLAGS = 'M';
    public static char COMPRESSED_OFFSET = 'V';
    // NOTE(review): getField() maps CompressedLengthCDXField to 'S', not 'n'
    // -- confirm which letter is intended before relying on this constant.
    public static char COMPRESSED_LENGTH = 'n';
    public static char FILE = 'g';

    /**
     * Construct a CDXFormat reader/writer based on the specification argument.
     * The spec must start with {@link #CDX_MAGIC}, followed by the delimiter
     * character, followed by single-character field codes separated by that
     * same delimiter.
     *
     * @param cdxSpec the CDX specification line
     * @throws CDXFormatException if the spec is null, does not start with the
     * magic prefix, has nothing after the prefix, contains an unknown field
     * code, or is otherwise malformed
     */
    public CDXFormat(String cdxSpec) throws CDXFormatException {
        if ((cdxSpec == null) || !cdxSpec.startsWith(CDX_MAGIC)) {
            throw new CDXFormatException("Spec '" + cdxSpec
                    + "' does not start with '" + CDX_MAGIC + "'");
        }
        // Without this guard, charAt() below would throw an unchecked
        // StringIndexOutOfBoundsException for a spec that is only the prefix.
        if (cdxSpec.length() <= CDX_MAGIC.length()) {
            throw new CDXFormatException("Spec '" + cdxSpec
                    + "' has no delimiter after '" + CDX_MAGIC + "'");
        }
        delimiter = cdxSpec.charAt(CDX_MAGIC.length());
        String fieldsString = cdxSpec.substring(CDX_MAGIC.length() + 1);

        // A well-formed field list is "f d f d ... f": an odd number of chars.
        int fieldCount = (fieldsString.length() + 1) / 2;
        if (fieldsString.length() != (fieldCount * 2) - 1) {
            throw new CDXFormatException("Extra char after spec '"
                    + cdxSpec + "'");
        }

        fields = new CDXField[fieldCount];
        for (int i = 0; i < fieldCount; i++) {
            char f = fieldsString.charAt(i * 2);
            // Every field except the last must be followed by the delimiter.
            if (i < fieldCount - 1) {
                char d = fieldsString.charAt((i * 2) + 1);
                if (d != delimiter) {
                    throw new CDXFormatException("Non-delimiter char in '"
                            + fieldsString + "'");
                }
            }
            fields[i] = getField(f);
        }
        delimiterS = String.valueOf(delimiter);
        delimiterPattern = Pattern.compile(Pattern.quote(delimiterS));
    }

    /**
     * Map a single field code from the spec to its CDXField handler.
     *
     * @param fieldChar field code character from the spec
     * @return a new CDXField instance for the code
     * @throws CDXFormatException if the code is not one of the supported ones
     */
    protected CDXField getField(char fieldChar) throws CDXFormatException {
        CDXField field = null;
        switch (fieldChar) {
            case 'A': field = new URLKeyCDXField(); break;
            // backwards compat with Alexa tools:
            case 'N': field = new URLKeyCDXField(); break;
            case 'b': field = new TimestampCDXField(); break;
            case 'a': field = new OriginalURLCDXField(); break;
            case 'm': field = new MIMETypeCDXField(); break;
            case 's': field = new HTTPCodeCDXField(); break;
            case 'k': field = new DigestCDXField(); break;
            case 'r': field = new RedirectURLCDXField(); break;
            case 'M': field = new RobotFlagsCDXField(); break;
            case 'V': field = new StartOffsetCDXField(); break;
            // Experimental..
            case 'S': field = new CompressedLengthCDXField(); break;
            case 'g': field = new FilenameCDXField(); break;
        }
        if (field == null) {
            throw new CDXFormatException("Unknown field '" + fieldChar + "'");
        }
        return field;
    }

    /**
     * Parse one line into a CaptureSearchResult by splitting it on the
     * spec's delimiter and applying each part to the corresponding field.
     *
     * @param line the line to parse
     * @return CaptureSearchResult containing data from the 'line' argument
     * parsed according to the specification for this CDXFormat
     * @throws CDXFormatException if the number of delimited parts does not
     * match the number of fields in the spec
     */
    public CaptureSearchResult parseResult(String line)
            throws CDXFormatException {
        CaptureSearchResult result = new CaptureSearchResult();
        // Split on the literal delimiter; String.split(delimiterS) would
        // interpret the delimiter as a regular expression.
        String[] parts = delimiterPattern.split(line);
        if (parts.length != fields.length) {
            throw new CDXFormatException("Wrong number of fields");
        }
        for (int i = 0; i < fields.length; i++) {
            fields[i].apply(parts[i], result);
        }
        return result;
    }

    /**
     * Serialize a result as one line: each field's value in spec order,
     * joined by the delimiter. A null or empty field value is written as
     * CDXField.DEFAULT_VALUE.
     *
     * @param result the result to serialize
     * @return String representation of the data in 'result' formatted according
     * to the specification for this CDXFormat
     */
    public String serializeResult(CaptureSearchResult result) {
        StringBuilder sb = new StringBuilder(100);
        for (int i = 0; i < fields.length; i++) {
            String value = fields[i].serialize(result);
            if ((value == null) || (value.length() == 0)) {
                sb.append(CDXField.DEFAULT_VALUE);
            } else {
                sb.append(value);
            }
            if (i < fields.length - 1) {
                sb.append(delimiter);
            }
        }
        return sb.toString();
    }
}