/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io.arc;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.archive.io.ArchiveRecordHeader;
/**
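* Holds the meta data of an ARC record.
*
* <p>Not strictly immutable: the digest, status code and content-begin
* offset are assigned through setters after construction, and the header
* field map is handed out without copying.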
*
* @author stack
*/
public class ARCRecordMetaData implements ArchiveRecordHeader, ARCConstants {
    /**
     * Record header fields, keyed by lowercase field name.
     *
     * <p>Everything is kept in one map so that either version 1 or
     * version 2 record meta data can be held.
     */
    protected Map<String,Object> headerFields;

    /**
     * Digest for the record.
     *
     * <p>Only available once the record has been read in its entirety.
     */
    private String digest;

    /**
     * Status for this request; there may be none, in which case this
     * stays null.
     */
    private String statusCode;

    /**
     * Descriptive string (a path or a URL) naming the arc this metadata
     * came out of.
     */
    private String arc;

    /**
     * Value handed back by {@link #getContentBegin()}; zero until
     * {@link #setContentBegin(int)} is called.
     */
    private int contentBegin;

    /**
     * No-argument constructor, not publicly accessible.
     *
     * <p>Leaves every field at its default value (no header map, no arc).
     */
    protected ARCRecordMetaData() {
    }

    /**
     * Builds the meta data for a record, first checking that every field
     * named in REQUIRED_VERSION_1_HEADER_FIELDS is present.
     *
     * <p>The passed map is stored as-is; it is not copied.
     *
     * @param arc The arc file this metadata came out of.
     * @param headerFields Hash of meta fields.
     *
     * @throws IOException If a required field is missing.
     */
    public ARCRecordMetaData(final String arc, Map<String,Object> headerFields)
            throws IOException {
        // Validate before touching any instance state.
        final Iterator<String> required =
            REQUIRED_VERSION_1_HEADER_FIELDS.iterator();
        while (required.hasNext()) {
            testRequiredField(headerFields, required.next());
        }
        this.arc = arc;
        this.headerFields = headerFields;
    }

    /**
     * Checks that a required field is present as a key in the passed map.
     *
     * @param fields Map of fields.
     * @param requiredField Field to test for.
     *
     * @exception IOException If required field is not present.
     */
    protected void testRequiredField(Map<String,Object> fields, String requiredField)
            throws IOException {
        if (fields.containsKey(requiredField)) {
            return;
        }
        throw new IOException("Required field " + requiredField +
            " not in meta data.");
    }

    /**
     * Looks up a header field and casts its value to String.
     *
     * @param key Header field key.
     * @return The stored value, or null if there is no entry for the key.
     */
    private String stringField(final String key) {
        return (String) headerFields.get(key);
    }

    /**
     * Get the time when the record was harvested.
     * <p>
     * The date is in Heritrix 14 digit time format (UTC). See the
     * {@link org.archive.util.ArchiveUtils} class for converting to Java
     * dates.
     *
     * @return Header date in Heritrix 14 digit format.
     * @see org.archive.util.ArchiveUtils#parse14DigitDate(String)
     */
    public String getDate() {
        return stringField(DATE_FIELD_KEY);
    }

    /**
     * @return Length of the record, parsed from the string stored under
     * the length header field.
     */
    public long getLength() {
        final String length = stringField(LENGTH_FIELD_KEY);
        return Long.parseLong(length);
    }

    /**
     * @return Content-Length of the contents of the record. Currently
     * just delegates to {@link #getLength()}.
     * TODO: is this always the same as record length for arcs?
     */
    public long getContentLength() {
        return getLength();
    }

    /**
     * @return Header url.
     */
    public String getUrl() {
        return stringField(URL_FIELD_KEY);
    }

    /**
     * @return IP.
     */
    public String getIp() {
        return stringField(IP_HEADER_FIELD_KEY);
    }

    /**
     * @return The mimetype that is in the ARC metaline -- NOT the http
     * content-type content.
     */
    public String getMimetype() {
        return stringField(MIMETYPE_FIELD_KEY);
    }

    /**
     * @return Arcfile version.
     */
    public String getVersion() {
        return stringField(VERSION_FIELD_KEY);
    }

    /**
     * @return Arcfile origin code.
     */
    public String getOrigin() {
        return stringField(ORIGIN_FIELD_KEY);
    }

    /**
     * @return Offset into arcfile at which this record begins. The value
     * is read from the header map, where it is stored as a Long.
     */
    public long getOffset() {
        final Long offset = (Long) headerFields.get(ABSOLUTE_OFFSET_KEY);
        return offset.longValue();
    }

    /**
     * @param key Key to use looking up field value.
     * @return Value for the passed key, or null if no such entry.
     */
    public Object getHeaderValue(String key) {
        return headerFields.get(key);
    }

    /**
     * @return Header field name keys (the key set of the header map).
     */
    public Set<String> getHeaderFieldKeys() {
        return headerFields.keySet();
    }

    /**
     * @return Map of header fields. This is the map held by this object,
     * not a copy.
     */
    public Map<String,Object> getHeaderFields() {
        return headerFields;
    }

    /**
     * @return Returns identifier for ARC.
     */
    public String getArc() {
        return arc;
    }

    /**
     * Convenience method that wraps the arc identifier in a File.
     *
     * <p>Be aware the identifier is not always a full path to an ARC
     * file -- it may be an URL -- so test the returned file for existence.
     *
     * @return A new File built from the arc identifier.
     */
    public File getArcFile() {
        return new File(arc);
    }

    /**
     * @return Returns the digest.
     */
    public String getDigest() {
        return digest;
    }

    /**
     * @param d The digest to set.
     */
    public void setDigest(String d) {
        digest = d;
    }

    /**
     * @return Returns the statusCode. May be null.
     */
    public String getStatusCode() {
        return statusCode;
    }

    /**
     * @param statusCode The statusCode to set.
     */
    public void setStatusCode(String statusCode) {
        this.statusCode = statusCode;
    }

    /**
     * @return The arc identifier, then ": ", then the header map's string
     * form; either part is left empty when it is null.
     */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder();
        if (arc != null) {
            text.append(arc);
        }
        text.append(": ");
        if (headerFields != null) {
            text.append(headerFields.toString());
        }
        return text.toString();
    }

    /**
     * @return Identifier of the reader this record came from; same as
     * {@link #getArc()}.
     */
    public String getReaderIdentifier() {
        return this.getArc();
    }

    /**
     * @return Identifier for this record: the record date and url joined
     * by a slash.
     */
    public String getRecordIdentifier() {
        final String date = getDate();
        final String url = getUrl();
        return date + "/" + url;
    }

    /**
     * @return The content-begin offset last passed to
     * {@link #setContentBegin(int)}, or zero if it was never set.
     */
    public int getContentBegin() {
        return contentBegin;
    }

    /**
     * @param offset The content-begin offset to record.
     */
    protected void setContentBegin(final int offset) {
        contentBegin = offset;
    }
}