/*
* Cloud9: A Hadoop toolkit for working with big data
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
/**
* Container for a generic Warc Record
*
* (C) 2009 - Carnegie Mellon University
*
* 1. Redistributions of this source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The names "Lemur", "Indri", "University of Massachusetts",
* "Carnegie Mellon", and "lemurproject" must not be used to
* endorse or promote products derived from this software without
* prior written permission. To obtain permission, contact
* license@lemurproject.org.
*
* 4. Products derived from this software may not be called "Lemur" or "Indri"
* nor may "Lemur" or "Indri" appear in their names without prior written
* permission of The Lemur Project. To obtain permission,
* contact license@lemurproject.org.
*
* THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
* PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
* NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* @author mhoy@cs.cmu.edu (Mark J. Hoy)
*/
package edu.umd.cloud9.collection.clue;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.EOFException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.Map.Entry;
import edu.umd.cloud9.collection.Indexable;
public class ClueWarcRecord extends Indexable {
public static String WARC_VERSION = "WARC/0.18";
public static String WARC_VERSION_LINE = "WARC/0.18\n";
private static String NEWLINE = "\n";
private static byte MASK_THREE_BYTE_CHAR = (byte) (0xE0);
private static byte MASK_TWO_BYTE_CHAR = (byte) (0xC0);
private static byte MASK_TOPMOST_BIT = (byte) (0x80);
private static byte MASK_BOTTOM_SIX_BITS = (byte) (0x1F);
private static byte MASK_BOTTOM_FIVE_BITS = (byte) (0x3F);
private static byte MASK_BOTTOM_FOUR_BITS = (byte) (0x0F);
/**
* Our read line implementation. We cannot allow buffering here (for gzip
* streams) so, we need to use DataInputStream. Also - we need to account
* for java's UTF8 implementation
*
* @param in
* the input data stream
* @return the read line (or null if eof)
* @throws java.io.IOException
*/
private static String readLineFromInputStream(DataInputStream in) throws IOException {
StringBuilder retString = new StringBuilder();
boolean keepReading = true;
try {
do {
char thisChar = 0;
byte readByte = in.readByte();
// check to see if it's a multibyte character
if ((readByte & MASK_THREE_BYTE_CHAR) == MASK_THREE_BYTE_CHAR) {
// need to read the next 2 bytes
if (in.available() < 2) {
// treat these all as individual characters
retString.append((char) readByte);
int numAvailable = in.available();
for (int i = 0; i < numAvailable; i++) {
retString.append((char) (in.readByte()));
}
continue;
}
byte secondByte = in.readByte();
byte thirdByte = in.readByte();
// ensure the topmost bit is set
if (((secondByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT)
|| ((thirdByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT)) {
// treat these as individual characters
retString.append((char) readByte);
retString.append((char) secondByte);
retString.append((char) thirdByte);
continue;
}
int finalVal = (thirdByte & MASK_BOTTOM_FIVE_BITS) + 64
* (secondByte & MASK_BOTTOM_FIVE_BITS) + 4096
* (readByte & MASK_BOTTOM_FOUR_BITS);
thisChar = (char) finalVal;
} else if ((readByte & MASK_TWO_BYTE_CHAR) == MASK_TWO_BYTE_CHAR) {
// need to read next byte
if (in.available() < 1) {
// treat this as individual characters
retString.append((char) readByte);
continue;
}
byte secondByte = in.readByte();
if ((secondByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT) {
retString.append((char) readByte);
retString.append((char) secondByte);
continue;
}
int finalVal = (secondByte & MASK_BOTTOM_FIVE_BITS) + 64
* (readByte & MASK_BOTTOM_SIX_BITS);
thisChar = (char) finalVal;
} else {
// interpret it as a single byte
thisChar = (char) readByte;
}
if (thisChar == '\n') {
keepReading = false;
} else {
retString.append(thisChar);
}
} while (keepReading);
} catch (EOFException eofEx) {
return null;
}
if (retString.length() == 0) {
return "";
}
return retString.toString();
}
/**
* The actual heavy lifting of reading in the next WARC record
*
* @param in
* the data input stream
* @param headerBuffer
* a blank string buffer to contain the WARC header
* @return the content byts (w/ the headerBuffer populated)
* @throws java.io.IOException
*/
private static byte[] readNextRecord(DataInputStream in, StringBuffer headerBuffer)
throws IOException {
if (in == null) {
return null;
}
if (headerBuffer == null) {
return null;
}
String line = null;
boolean foundMark = false;
boolean inHeader = true;
byte[] retContent = null;
// cannot be using a buffered reader here!!!!
// just read the header
// first - find our WARC header
while ((!foundMark) && ((line = readLineFromInputStream(in)) != null)) {
if (line.startsWith(WARC_VERSION)) {
foundMark = true;
}
}
// no WARC mark?
if (!foundMark) {
return null;
}
// then read to the first newline
// make sure we get the content length here
int contentLength = -1;
boolean foundContentLength = false;
while (!foundContentLength && inHeader && ((line = readLineFromInputStream(in)) != null)) {
if ((line.trim().length() == 0) && foundContentLength) {
inHeader = false;
} else {
headerBuffer.append(line);
headerBuffer.append(NEWLINE);
String[] thisHeaderPieceParts = line.split(":", 2);
if (thisHeaderPieceParts.length == 2) {
if (thisHeaderPieceParts[0].toLowerCase().startsWith("content-length")) {
foundContentLength = true;
try {
contentLength = Integer.parseInt(thisHeaderPieceParts[1].trim());
} catch (NumberFormatException nfEx) {
contentLength = -1;
}
}
}
}
}
if (contentLength < 0) {
return null;
}
// now read the bytes of the content
retContent = new byte[contentLength];
int totalWant = contentLength;
int totalRead = 0;
while (totalRead < contentLength) {
try {
int numRead = in.read(retContent, totalRead, totalWant);
if (numRead < 0) {
return null;
} else {
totalRead += numRead;
totalWant = contentLength - totalRead;
} // end if (numRead < 0) / else
} catch (EOFException eofEx) {
// resize to what we have
if (totalRead > 0) {
byte[] newReturn = new byte[totalRead];
System.arraycopy(retContent, 0, newReturn, 0, totalRead);
return newReturn;
} else {
return null;
}
} // end try/catch (EOFException)
} // end while (totalRead < contentLength)
return retContent;
}
/**
* Reads in a WARC record from a data input stream
*
* @param in
* the input stream
* @return a WARC record (or null if eof)
* @throws java.io.IOException
*/
public static ClueWarcRecord readNextWarcRecord(DataInputStream in) throws IOException {
StringBuffer recordHeader = new StringBuffer();
byte[] recordContent = readNextRecord(in, recordHeader);
if (recordContent == null) {
return null;
}
// extract out our header information
String thisHeaderString = recordHeader.toString();
String[] headerLines = thisHeaderString.split(NEWLINE);
ClueWarcRecord retRecord = new ClueWarcRecord();
for (int i = 0; i < headerLines.length; i++) {
String[] pieces = headerLines[i].split(":", 2);
if (pieces.length != 2) {
retRecord.addHeaderMetadata(pieces[0], "");
continue;
}
String thisKey = pieces[0].trim();
String thisValue = pieces[1].trim();
// check for known keys
if (thisKey.equals("WARC-Type")) {
retRecord.setWarcRecordType(thisValue);
} else if (thisKey.equals("WARC-Date")) {
retRecord.setWarcDate(thisValue);
} else if (thisKey.equals("WARC-Record-ID")) {
retRecord.setWarcUUID(thisValue);
} else if (thisKey.equals("Content-Type")) {
retRecord.setWarcContentType(thisValue);
} else {
retRecord.addHeaderMetadata(thisKey, thisValue);
}
}
// set the content
retRecord.setContent(recordContent);
return retRecord;
}
/**
* Warc header class
*/
public class WarcHeader {
public String contentType = "";
public String UUID = "";
public String dateString = "";
public String recordType = "";
public HashMap<String, String> metadata = new HashMap<String, String>();
public int contentLength = 0;
/**
* Default constructor
*/
public WarcHeader() {
}
/**
* Copy Constructor
*
* @param o
* other WARC header
*/
public WarcHeader(WarcHeader o) {
this.contentType = o.contentType;
this.UUID = o.UUID;
this.dateString = o.dateString;
this.recordType = o.recordType;
this.metadata.putAll(o.metadata);
this.contentLength = o.contentLength;
}
/**
* Serialization output
*
* @param out
* the data output stream
* @throws java.io.IOException
*/
public void write(DataOutput out) throws IOException {
out.writeUTF(contentType);
out.writeUTF(UUID);
out.writeUTF(dateString);
out.writeUTF(recordType);
out.writeInt(metadata.size());
Iterator<Entry<String, String>> metadataIterator = metadata.entrySet().iterator();
while (metadataIterator.hasNext()) {
Entry<String, String> thisEntry = metadataIterator.next();
out.writeUTF(thisEntry.getKey());
out.writeUTF(thisEntry.getValue());
}
out.writeInt(contentLength);
}
/**
* Serialization input
*
* @param in
* the data input stream
* @throws java.io.IOException
*/
public void readFields(DataInput in) throws IOException {
contentType = in.readUTF();
UUID = in.readUTF();
dateString = in.readUTF();
recordType = in.readUTF();
metadata.clear();
int numMetaItems = in.readInt();
for (int i = 0; i < numMetaItems; i++) {
String thisKey = in.readUTF();
String thisValue = in.readUTF();
metadata.put(thisKey, thisValue);
}
contentLength = in.readInt();
}
@Override
public String toString() {
StringBuffer retBuffer = new StringBuffer();
retBuffer.append(WARC_VERSION);
retBuffer.append(NEWLINE);
retBuffer.append("WARC-Type: " + recordType + NEWLINE);
retBuffer.append("WARC-Date: " + dateString + NEWLINE);
retBuffer.append("WARC-Record-ID: " + UUID + NEWLINE);
Iterator<Entry<String, String>> metadataIterator = metadata.entrySet().iterator();
while (metadataIterator.hasNext()) {
Entry<String, String> thisEntry = metadataIterator.next();
retBuffer.append(thisEntry.getKey());
retBuffer.append(": ");
retBuffer.append(thisEntry.getValue());
retBuffer.append(NEWLINE);
}
retBuffer.append("Content-Type: " + contentType + NEWLINE);
retBuffer.append("Content-Length: " + contentLength + NEWLINE);
return retBuffer.toString();
}
}
private WarcHeader warcHeader = new WarcHeader();
private byte[] warcContent = null;
private String warcFilePath = "";
/**
* Default Constructor
*/
public ClueWarcRecord() {
}
/**
* Copy Constructor
*
* @param o
*/
public ClueWarcRecord(ClueWarcRecord o) {
this.warcHeader = new WarcHeader(o.warcHeader);
this.warcContent = o.warcContent;
}
/**
* Retrieves the total record length (header and content)
*
* @return total record length
*/
public int getTotalRecordLength() {
int headerLength = warcHeader.toString().length();
return (headerLength + warcContent.length);
}
/**
* Sets the record content (copy)
*
* @param o
* record to copy from
*/
public void set(ClueWarcRecord o) {
this.warcHeader = new WarcHeader(o.warcHeader);
this.warcContent = o.warcContent;
}
/**
* Gets the file path from this WARC file (if set)
*/
public String getWarcFilePath() {
return warcFilePath;
}
/**
* Sets the warc file path (optional - for use with getWarcFilePath)
*
* @param path
*/
public void setWarcFilePath(String path) {
warcFilePath = path;
}
/**
* Sets the record type string
*
* @param recordType
*/
public void setWarcRecordType(String recordType) {
warcHeader.recordType = recordType;
}
/**
* Sets the content type string
*
* @param contentType
*/
public void setWarcContentType(String contentType) {
warcHeader.contentType = contentType;
}
/**
* Sets the WARC header date string
*
* @param dateString
*/
public void setWarcDate(String dateString) {
warcHeader.dateString = dateString;
}
/**
* Sets the WARC uuid string
*
* @param UUID
*/
public void setWarcUUID(String UUID) {
warcHeader.UUID = UUID;
}
/**
* Adds a key/value pair to a WARC header. This is needed to filter out
* known keys
*
* @param key
* @param value
*/
public void addHeaderMetadata(String key, String value) {
// don't allow addition of known keys
if (key.equals("WARC-Type")) {
return;
}
if (key.equals("WARC-Date")) {
return;
}
if (key.equals("WARC-Record-ID")) {
return;
}
if (key.equals("Content-Type")) {
return;
}
if (key.equals("Content-Length")) {
return;
}
warcHeader.metadata.put(key, value);
}
/**
* Clears all metadata items from a header
*/
public void clearHeaderMetadata() {
warcHeader.metadata.clear();
}
/**
* Gets the set of metadata items from the header
*/
public Set<Entry<String, String>> getHeaderMetadata() {
return warcHeader.metadata.entrySet();
}
/**
* Gets a value for a specific header metadata key
*
* @param key
*/
public String getHeaderMetadataItem(String key) {
return warcHeader.metadata.get(key);
}
/**
* Sets the byte content for this record
*
* @param content
*/
public void setContent(byte[] content) {
warcContent = content;
warcHeader.contentLength = content.length;
}
/**
* Sets the byte content for this record
*
* @param content
*/
public void setContent(String content) {
setContent(content.getBytes());
}
/**
* Retrieves the byte content for this record
*/
public byte[] getByteContent() {
return warcContent;
}
/**
* Retrieves the bytes content as a UTF-8 string
*/
public String getContentUTF8() {
String retString = null;
try {
retString = new String(warcContent, "UTF-8");
} catch (UnsupportedEncodingException ex) {
retString = new String(warcContent);
}
return retString;
}
/**
* Gets the header record type string
*/
public String getHeaderRecordType() {
return warcHeader.recordType;
}
@Override
public String toString() {
StringBuffer retBuffer = new StringBuffer();
retBuffer.append(warcHeader.toString());
retBuffer.append(NEWLINE);
retBuffer.append(warcContent);
return retBuffer.toString();
}
/**
* Gets the WARC header as a string
*/
public String getHeaderString() {
return warcHeader.toString();
}
/**
* Serialization output
*
* @param out
* @throws java.io.IOException
*/
public void write(DataOutput out) throws IOException {
warcHeader.write(out);
out.write(warcContent);
}
/**
* Serialization input
*
* @param in
* @throws java.io.IOException
*/
public void readFields(DataInput in) throws IOException {
warcHeader.readFields(in);
int contentLengthBytes = warcHeader.contentLength;
warcContent = new byte[contentLengthBytes];
in.readFully(warcContent);
}
public String getDocid() {
return getHeaderMetadataItem("WARC-TREC-ID");
}
public String getContent() {
String str = getContentUTF8();
int i = str.indexOf("Content-Length:");
int j = str.indexOf("\n", i);
return str.substring(j+1);
}
public String getDisplayContentType() {
return "text/html";
}
}