ClueWeb09WarcRecord.java example

Explorer

TemporalSearch-master
- common
  - src
    - main
      - java
        de
        l3s
        common
        TextConsole.java
        WikiEventsHBaseImport.java
        features
        TimeSeriesFeatures.java
        hadoop
        TimeSeriesJob.java
        TimeSeriesMapper.java
        TimeSeriesReducer.java
        autocorrelation
        CorrelationJob.java
        CorrelationMapper.java
        CorrelationReducer.java
        movingaverage
        MovingAverageJob.java
        MovingAverageMapper.java
        MovingAverageReducer.java
        NoShuffleSort_MovingAverageJob.java
        NoShuffleSort_MovingAverageMapper.java
        NoShuffleSort_MovingAverageReducer.java
        SlidingWindow.java
        hadoop
        WholeFileInputFormat.java
        WholeFileRecordReader.java
        models
        App.java
        timeseries
        CompositeKeyComparator.java
        KeyData.java
        NaturalKeyGroupingComparator.java
        NaturalKeyPartitioner.java
        Timeseries.java
        TimeseriesDataPoint.java
        TimeseriesKey.java
    - test
      - java
        de
        l3s
        common
        AppTest.java
        hbase
        test
        HBaseTest.java
- content
  - src
    - main
      - java
        de
        l3s
        content
        mapred
        ClueWeb09InputFormat.java
        WikipediaPageInputFormat.java
        WikipediaPagesBz2InputStream.java
        timex
        extracting
        ClueWeb09Timex.java
        ClueWeb09TimexWriteToHDFS.java
        WikiTimex.java
        utils
        DateUtil.java
        edu
        umd
        cloud9
        collection
        Indexable.java
        IndexableFileInputFormat.java
        XMLInputFormat.java
        XMLInputFormatOld.java
        wikipedia
        WikipediaPage.java
        language
        ArabicWikipediaPage.java
        ChineseWikipediaPage.java
        CzechWikipediaPage.java
        EnglishWikipediaPage.java
        GermanWikipediaPage.java
        SpanishWikipediaPage.java
        SwedishWikipediaPage.java
        TurkishWikipediaPage.java
        WikipediaPageFactory.java
        org
        clueweb
        clueweb09
        ClueWeb09WarcRecord.java
        mapreduce
        ClueWeb09InputFormat.java
        data
        DocVector.java
        Indexable.java
        PForDocVector.java
        TermStatistics.java
        VByteDocVector.java
        WarcTrecIdMapping.java
        wikimedia
        wikihadoop
        ByteMatcher.java
        SeekableInputStream.java
        StreamWikiDumpInputFormat.java
- heideltime
  - src
    - de
      - tudarmstadt
        ukp
        dkpro
        core
        type
        Lemma.java
        Lemma_Type.java
        POS.java
        POS_Type.java
        Sentence.java
        Sentence_Type.java
        Stem.java
        Stem_Type.java
        Token.java
        Token_Type.java
        pos
        ADJ.java
        ADJ_Type.java
        ADV.java
        ADV_Type.java
        ART.java
        ART_Type.java
        CARD.java
        CARD_Type.java
        CONJ.java
        CONJ_Type.java
        N.java
        NN.java
        NN_Type.java
        NP.java
        NP_Type.java
        N_Type.java
        O.java
        O_Type.java
        PP.java
        PP_Type.java
        PR.java
        PR_Type.java
        PUNC.java
        PUNC_Type.java
        V.java
        V_Type.java
      - unihd
        dbs
        heideltime
        standalone
        CLISwitch.java
        Config.java
        Constants.java
        DocumentType.java
        HeidelTimeAnnotator.java
        HeidelTimeStandalone.java
        OutputType.java
        POSTagger.java
        components
        JCasFactory.java
        PartOfSpeechTagger.java
        ResultFormatter.java
        UIMAAnnotator.java
        impl
        IntervalTaggerWrapper.java
        JCasFactoryImpl.java
        JVnTextProWrapper.java
        StandaloneConfigContext.java
        StanfordPOSTaggerWrapper.java
        TimeMLResultFormatter.java
        TreeTaggerWrapper.java
        UimaContextImpl.java
        XMIResultFormatter.java
        exceptions
        DocumentCreationTimeMissingException.java
        uima
        annotator
        annotationtranslator
        AnnotationTranslator.java
        heideltime
        HeidelTime.java
        HeidelTimeException.java
        ProcessorManager.java
        processors
        GenericProcessor.java
        HolidayProcessor.java
        ProcessorInitializationException.java
        ProcessorProcessingException.java
        resources
        GenericResourceManager.java
        Language.java
        NormalizationManager.java
        RePatternManager.java
        RegexHashMap.java
        RuleManager.java
        utilities
        ContextAnalyzer.java
        DateCalculator.java
        LocaleException.java
        Logger.java
        Toolbox.java
        intervaltagger
        IntervalTagger.java
        jvntextprowrapper
        JVnTextProWrapper.java
        stanfordtagger
        StanfordPOSTaggerWrapper.java
        treetagger
        TreeTaggerWrapper.java
        consumer
        aceternwriter
        ACETernWriter.java
        tempeval2writer
        Tempeval2Writer.java
        tempeval3writer
        TempEval3Writer.java
        reader
        aceternreader
        ACETernReader.java
        tempeval2reader
        Tempeval2Reader.java
        tempeval3reader
        Tempeval3Reader.java
        types
        heideltime
        Dct.java
        Dct_Type.java
        Event.java
        Event_Type.java
        GoldEvent.java
        GoldEvent_Type.java
        IntervalCandidateSentence.java
        IntervalCandidateSentence_Type.java
        Sentence.java
        Sentence_Type.java
        SourceDocInfo.java
        SourceDocInfo_Type.java
        Timex3.java
        Timex3Interval.java
        Timex3Interval_Type.java
        Timex3_Type.java
        Token.java
        Token_Type.java

/*
 * ClueWeb Tools: Hadoop tools for manipulating ClueWeb collections
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

/*
 * Container for a generic Warc Record 
 * 
 * (C) 2009 - Carnegie Mellon University
 * 
 * 1. Redistributions of this source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 * 2. The names "Lemur", "Indri", "University of Massachusetts",  
 *    "Carnegie Mellon", and "lemurproject" must not be used to 
 *    endorse or promote products derived from this software without
 *    prior written permission. To obtain permission, contact 
 *    license@lemurproject.org.
 *
 * 4. Products derived from this software may not be called "Lemur" or "Indri"
 *    nor may "Lemur" or "Indri" appear in their names without prior written
 *    permission of The Lemur Project. To obtain permission,
 *    contact license@lemurproject.org.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
 * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 
 * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY 
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
 * POSSIBILITY OF SUCH DAMAGE. 
 * 
 * @author mhoy@cs.cmu.edu (Mark J. Hoy)
 */

package org.clueweb.clueweb09;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.EOFException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.log4j.Logger;
import org.clueweb.data.Indexable;

import de.l3s.content.timex.extracting.ClueWeb09Timex;

public class ClueWeb09WarcRecord extends Indexable {
	/** log4j logger for this class. */
	private static final Logger LOG = Logger.getLogger(ClueWeb09WarcRecord.class);
	/** Version token of the WARC format handled by this class. */
	public static String WARC_VERSION = "WARC/0.18";
	/** The version token followed by a line feed. */
	public static String WARC_VERSION_LINE = "WARC/0.18\n";
	/** Line terminator used when assembling and splitting header text. */
	private static String NEWLINE = "\n";

	// Bit masks for decoding multi-byte UTF-8 sequences by hand.
	/** 1110 0000 - the top three bits. */
	private static byte MASK_THREE_BYTE_CHAR = (byte) (0xE0);
	/** 1100 0000 - the top two bits. */
	private static byte MASK_TWO_BYTE_CHAR = (byte) (0xC0);
	/** 1000 0000 - the top bit only. */
	private static byte MASK_TOPMOST_BIT = (byte) (0x80);
	// NOTE(review): the next two names are swapped relative to their values:
	// 0x1F selects the low FIVE bits and 0x3F selects the low SIX bits.
	/** 0001 1111 - the low five bits (despite the name). */
	private static byte MASK_BOTTOM_SIX_BITS = (byte) (0x1F);
	/** 0011 1111 - the low six bits (despite the name). */
	private static byte MASK_BOTTOM_FIVE_BITS = (byte) (0x3F);
	/** 0000 1111 - the low four bits. */
	private static byte MASK_BOTTOM_FOUR_BITS = (byte) (0x0F);

	/**
	 * Reads one line, byte by byte, from an unbuffered stream and decodes it
	 * as UTF-8. Nothing beyond the terminating line feed is consumed, so the
	 * caller can continue reading raw bytes from the same stream afterwards
	 * (a buffering reader would swallow part of the record content).
	 * <p>
	 * Decoding accepts one- to four-byte sequences, including the three-byte
	 * surrogate encodings produced by Java's modified UTF-8. A lead byte that
	 * is not followed by the required continuation bytes (10xxxxxx) is kept
	 * as a single character in the range 0-255, and the byte that broke the
	 * sequence is then examined again as the start of a new character, so a
	 * line feed following a stray byte still ends the line.
	 * <p>
	 * {@code available()} is deliberately not consulted: it only estimates
	 * how many bytes can be read without blocking and says nothing reliable
	 * about how much data is left in a compressed stream.
	 *
	 * @param in
	 *            the input data stream
	 * @return the line without its trailing line feed (a carriage return, if
	 *         present, is kept), or null if the end of the stream is reached
	 *         before a line feed; a partial last line is discarded
	 * @throws java.io.IOException
	 *             if the underlying stream fails for a reason other than
	 *             end-of-stream
	 */
	private static String readLineFromInputStream(DataInputStream in) throws IOException {
		StringBuilder line = new StringBuilder();
		// A byte that was fetched while looking for a continuation byte but
		// turned out to start something else; -1 when there is none.
		int pending = -1;

		try {
			while (true) {
				int lead;
				if (pending >= 0) {
					lead = pending;
					pending = -1;
				} else {
					lead = in.readUnsignedByte();
				}

				// Classify the lead byte and keep its payload bits.
				int continuationCount;
				int codePoint;
				if ((lead & 0xE0) == 0xC0) { // 110xxxxx
					continuationCount = 1;
					codePoint = lead & 0x1F;
				} else if ((lead & 0xF0) == 0xE0) { // 1110xxxx
					continuationCount = 2;
					codePoint = lead & 0x0F;
				} else if ((lead & 0xF8) == 0xF0) { // 11110xxx
					continuationCount = 3;
					codePoint = lead & 0x07;
				} else {
					continuationCount = 0;
					codePoint = lead;
				}

				if (continuationCount == 0) {
					if (lead == '\n') {
						return line.toString();
					}
					line.append((char) lead);
					continue;
				}

				// Append the raw bytes as we go; if the sequence turns out to
				// be well formed they are replaced by the decoded character.
				int sequenceStart = line.length();
				line.append((char) lead);
				boolean complete = true;
				for (int i = 0; i < continuationCount; i++) {
					int next = in.readUnsignedByte();
					if ((next & 0xC0) != 0x80) {
						// not a continuation byte: re-examine it next round
						pending = next;
						complete = false;
						break;
					}
					line.append((char) next);
					codePoint = (codePoint << 6) | (next & 0x3F);
				}
				if (!complete || !Character.isValidCodePoint(codePoint)) {
					// malformed: keep the raw bytes as individual characters
					continue;
				}

				line.setLength(sequenceStart);
				if (codePoint == '\n') {
					return line.toString();
				}
				line.appendCodePoint(codePoint);
			}
		} catch (EOFException eofEx) {
			return null;
		}
	}

	/**
	 * Locates and reads the next WARC record on the stream.
	 * <p>
	 * Lines are skipped until one starts with {@link #WARC_VERSION}. Every
	 * following line is appended to {@code headerBuffer} (each followed by a
	 * line feed) up to and including the second line whose key starts with
	 * "content-length" (case-insensitive). The value of that second header
	 * is taken as the number of content bytes, which are then read directly
	 * from the stream.
	 * 
	 * @param in
	 *            the data input stream; must not be wrapped in a buffering
	 *            reader
	 * @param headerBuffer
	 *            an empty string buffer that receives the header lines
	 * @return the content bytes (with headerBuffer populated); null if either
	 *         argument is null, no version line is found, no usable second
	 *         content length is found, or the stream reports end-of-data
	 *         while the content is being read. If the stream throws an
	 *         EOFException after some content was read, the bytes read so
	 *         far are returned.
	 * @throws java.io.IOException
	 */
	private static byte[] readNextRecord(DataInputStream in, StringBuffer headerBuffer)
			throws IOException {
		if (in == null || headerBuffer == null) {
			return null;
		}

		String current;

		// Phase 1: skip ahead to the line carrying the WARC version token.
		boolean sawVersionLine = false;
		while (!sawVersionLine) {
			current = readLineFromInputStream(in);
			if (current == null) {
				break;
			}
			sawVersionLine = current.startsWith(WARC_VERSION);
		}
		if (!sawVersionLine) {
			return null;
		}

		// Phase 2: collect header lines until the second content-length
		// header (the first one is recorded in the buffer but not used).
		int declaredLength = -1;
		int lengthHeadersSeen = 0;
		boolean lengthResolved = false;
		while (!lengthResolved) {
			current = readLineFromInputStream(in);
			if (current == null) {
				break;
			}
			headerBuffer.append(current);
			headerBuffer.append(NEWLINE);

			String[] keyAndValue = current.split(":", 2);
			if (keyAndValue.length != 2) {
				continue;
			}
			if (!keyAndValue[0].toLowerCase().startsWith("content-length")) {
				continue;
			}
			lengthHeadersSeen++;
			if (lengthHeadersSeen < 2) {
				continue;
			}
			lengthResolved = true;
			try {
				declaredLength = Integer.parseInt(keyAndValue[1].trim());
			} catch (NumberFormatException nfEx) {
				declaredLength = -1;
			}
		}
		if (declaredLength < 0) {
			return null;
		}

		// Phase 3: read exactly declaredLength bytes of content.
		byte[] payload = new byte[declaredLength];
		int filled = 0;
		while (filled < declaredLength) {
			int got;
			try {
				got = in.read(payload, filled, declaredLength - filled);
			} catch (EOFException eofEx) {
				// stream ended abruptly: hand back whatever was read
				if (filled == 0) {
					return null;
				}
				byte[] partial = new byte[filled];
				System.arraycopy(payload, 0, partial, 0, filled);
				return partial;
			}
			if (got < 0) {
				return null;
			}
			filled += got;
		}

		return payload;
	}

	/**
	 * Reads the next WARC record from a data input stream and turns its
	 * header lines into record fields.
	 * <p>
	 * Each collected header line is split at the first colon. The keys
	 * WARC-Type, WARC-Date, WARC-Record-ID and Content-Type populate the
	 * corresponding header fields (a later occurrence overwrites an earlier
	 * one); all other pairs are passed to
	 * {@link #addHeaderMetadata(String, String)}. A line without a colon is
	 * stored as a metadata key with an empty value.
	 * 
	 * @param in
	 *            the input stream
	 * @return a WARC record (or null if no further record could be read)
	 * @throws java.io.IOException
	 */
	public static ClueWeb09WarcRecord readNextWarcRecord(DataInputStream in) throws IOException {
		StringBuffer rawHeader = new StringBuffer();
		byte[] payload = readNextRecord(in, rawHeader);
		if (payload == null) {
			return null;
		}

		ClueWeb09WarcRecord record = new ClueWeb09WarcRecord();
		for (String headerLine : rawHeader.toString().split(NEWLINE)) {
			String[] keyAndValue = headerLine.split(":", 2);
			if (keyAndValue.length != 2) {
				// no colon on this line: keep the whole line as the key
				record.addHeaderMetadata(keyAndValue[0], "");
				continue;
			}

			String key = keyAndValue[0].trim();
			String value = keyAndValue[1].trim();

			// dedicated fields first, generic metadata otherwise
			if ("WARC-Type".equals(key)) {
				record.setWarcRecordType(value);
			} else if ("WARC-Date".equals(key)) {
				record.setWarcDate(value);
			} else if ("WARC-Record-ID".equals(key)) {
				record.setWarcUUID(value);
			} else if ("Content-Type".equals(key)) {
				record.setWarcContentType(value);
			} else {
				record.addHeaderMetadata(key, value);
			}
		}

		record.setContent(payload);
		return record;
	}

	/**
	 * Header of a WARC record: the four dedicated fields, the content length
	 * and a map holding every other key/value pair.
	 */
	public class WarcHeader {
		public String contentType = "";
		public String UUID = "";
		public String dateString = "";
		public String recordType = "";
		public HashMap<String, String> metadata = new HashMap<String, String>();
		public int contentLength = 0;

		/**
		 * Creates a header with empty fields, no metadata and length 0.
		 */
		public WarcHeader() {
		}

		/**
		 * Creates a header holding the same values as another one. The
		 * metadata entries are copied into this header's own map.
		 * 
		 * @param o
		 *            the WARC header to copy from
		 */
		public WarcHeader(WarcHeader o) {
			contentType = o.contentType;
			UUID = o.UUID;
			dateString = o.dateString;
			recordType = o.recordType;
			metadata.putAll(o.metadata);
			contentLength = o.contentLength;
		}

		/**
		 * Serializes this header: content type, UUID, date and record type
		 * as UTF strings, then the metadata count and pairs, then the
		 * content length.
		 * 
		 * @param out
		 *            the data output stream
		 * @throws java.io.IOException
		 */
		public void write(DataOutput out) throws IOException {
			out.writeUTF(contentType);
			out.writeUTF(UUID);
			out.writeUTF(dateString);
			out.writeUTF(recordType);

			out.writeInt(metadata.size());
			for (Entry<String, String> pair : metadata.entrySet()) {
				out.writeUTF(pair.getKey());
				out.writeUTF(pair.getValue());
			}

			out.writeInt(contentLength);
		}

		/**
		 * Restores this header from the layout produced by
		 * {@link #write(DataOutput)}, replacing any existing metadata.
		 * 
		 * @param in
		 *            the data input stream
		 * @throws java.io.IOException
		 */
		public void readFields(DataInput in) throws IOException {
			contentType = in.readUTF();
			UUID = in.readUTF();
			dateString = in.readUTF();
			recordType = in.readUTF();

			metadata.clear();
			int remaining = in.readInt();
			while (remaining > 0) {
				String key = in.readUTF();
				String value = in.readUTF();
				metadata.put(key, value);
				remaining--;
			}

			contentLength = in.readInt();
		}

		/**
		 * Renders the header as text: the version line, type, date and
		 * record id, every metadata pair, then content type and length,
		 * one entry per line.
		 */
		@Override
		public String toString() {
			StringBuilder text = new StringBuilder();

			text.append(WARC_VERSION).append(NEWLINE);
			text.append("WARC-Type: ").append(recordType).append(NEWLINE);
			text.append("WARC-Date: ").append(dateString).append(NEWLINE);
			text.append("WARC-Record-ID: ").append(UUID).append(NEWLINE);

			for (Entry<String, String> pair : metadata.entrySet()) {
				text.append(pair.getKey()).append(": ").append(pair.getValue()).append(NEWLINE);
			}

			text.append("Content-Type: ").append(contentType).append(NEWLINE);
			text.append("Content-Length: ").append(contentLength).append(NEWLINE);

			return text.toString();
		}
	}

	/** Header fields and metadata of this record. */
	private WarcHeader warcHeader = new WarcHeader();
	/** Raw content bytes; null until content is set or deserialized. */
	private byte[] warcContent = null;
	/** Optional path of the WARC file this record came from; empty if unset. */
	private String warcFilePath = "";

	/**
	 * Creates an empty record: blank header, no content, empty file path.
	 */
	public ClueWeb09WarcRecord() {
	}

	/**
	 * Copy Constructor. The header is duplicated and the WARC file path is
	 * carried over; the content byte array is shared with the source record
	 * rather than cloned.
	 * 
	 * @param o
	 *            the record to copy from
	 */
	public ClueWeb09WarcRecord(ClueWeb09WarcRecord o) {
		this.warcHeader = new WarcHeader(o.warcHeader);
		this.warcContent = o.warcContent;
		// previously dropped, which left copies without their source path
		this.warcFilePath = o.warcFilePath;
	}

	/**
	 * Retrieves the total record length (header and content). The header
	 * part is the character count of the header's text form; the content
	 * part is the number of content bytes, or 0 when no content is set.
	 * 
	 * @return total record length
	 */
	public int getTotalRecordLength() {
		int headerLength = warcHeader.toString().length();
		// a freshly constructed record has no content yet
		int contentLength = (warcContent == null) ? 0 : warcContent.length;
		return headerLength + contentLength;
	}

	/**
	 * Makes this record take over the header and content of another one.
	 * The header is duplicated; the content array is shared, not cloned.
	 * The WARC file path of this record is left unchanged.
	 * 
	 * @param o
	 *            record to copy from
	 */
	public void set(ClueWeb09WarcRecord o) {
		WarcHeader headerCopy = new WarcHeader(o.warcHeader);
		warcHeader = headerCopy;
		warcContent = o.warcContent;
	}

	/**
	 * Returns the path of the WARC file this record was read from.
	 * 
	 * @return the stored path; an empty string if none was set
	 */
	public String getWarcFilePath() {
		return this.warcFilePath;
	}

	/**
	 * Stores the path of the WARC file this record belongs to. Optional;
	 * only used to answer {@link #getWarcFilePath()}.
	 * 
	 * @param path
	 *            the file path to remember
	 */
	public void setWarcFilePath(String path) {
		this.warcFilePath = path;
	}

	/**
	 * Stores the record type in the header.
	 * 
	 * @param recordType
	 *            the record type string
	 */
	public void setWarcRecordType(String recordType) {
		this.warcHeader.recordType = recordType;
	}

	/**
	 * Stores the content type in the header.
	 * 
	 * @param contentType
	 *            the content type string
	 */
	public void setWarcContentType(String contentType) {
		this.warcHeader.contentType = contentType;
	}

	/**
	 * Stores the date string in the header.
	 * 
	 * @param dateString
	 *            the date, kept exactly as given
	 */
	public void setWarcDate(String dateString) {
		this.warcHeader.dateString = dateString;
	}

	/**
	 * Stores the record id (UUID string) in the header.
	 * 
	 * @param UUID
	 *            the record id string
	 */
	public void setWarcUUID(String UUID) {
		this.warcHeader.UUID = UUID;
	}

	/**
	 * Adds a key/value pair to the header metadata. Keys that have a
	 * dedicated header field (WARC-Type, WARC-Date, WARC-Record-ID,
	 * Content-Type, Content-Length) are silently ignored; the match is
	 * exact and case-sensitive.
	 * 
	 * @param key
	 *            the metadata key
	 * @param value
	 *            the metadata value
	 */
	public void addHeaderMetadata(String key, String value) {
		boolean reserved = key.equals("WARC-Type")
				|| key.equals("WARC-Date")
				|| key.equals("WARC-Record-ID")
				|| key.equals("Content-Type")
				|| key.equals("Content-Length");

		if (!reserved) {
			warcHeader.metadata.put(key, value);
		}
	}

	/**
	 * Removes every metadata entry from the header. The dedicated header
	 * fields are not affected.
	 */
	public void clearHeaderMetadata() {
		this.warcHeader.metadata.clear();
	}

	/**
	 * Returns the header's metadata entries. The result is the entry-set
	 * view of the underlying map, not a copy.
	 * 
	 * @return the set of metadata key/value entries
	 */
	public Set<Entry<String, String>> getHeaderMetadata() {
		return this.warcHeader.metadata.entrySet();
	}

	/**
	 * Looks up a single metadata value.
	 * 
	 * @param key
	 *            the metadata key
	 * @return the value stored for the key, or null if there is none
	 */
	public String getHeaderMetadataItem(String key) {
		return this.warcHeader.metadata.get(key);
	}

	/**
	 * Installs the given bytes as this record's content and updates the
	 * header's content length to match. The array is stored as is, not
	 * copied.
	 * 
	 * @param content
	 *            the content bytes
	 */
	public void setContent(byte[] content) {
		this.warcContent = content;
		this.warcHeader.contentLength = content.length;
	}

	/**
	 * Sets the content for this record from a string. The text is encoded
	 * as UTF-8 so that it round-trips through {@link #getContentUTF8()}
	 * regardless of the platform's default charset.
	 * 
	 * @param content
	 *            the content text
	 */
	public void setContent(String content) {
		byte[] encoded;
		try {
			encoded = content.getBytes("UTF-8");
		} catch (UnsupportedEncodingException ex) {
			// same fallback as getContentUTF8(): use the platform default
			encoded = content.getBytes();
		}
		setContent(encoded);
	}

	/**
	 * Returns the raw content bytes of this record.
	 * 
	 * @return the stored array itself (not a copy); null if no content is set
	 */
	public byte[] getByteContent() {
		return this.warcContent;
	}

	/**
	 * Decodes the content bytes as a UTF-8 string. If the UTF-8 charset is
	 * reported as unsupported, the platform default charset is used.
	 * 
	 * @return the content as text
	 */
	public String getContentUTF8() {
		try {
			return new String(warcContent, "UTF-8");
		} catch (UnsupportedEncodingException ex) {
			return new String(warcContent);
		}
	}

	/**
	 * Returns the record type stored in the header.
	 * 
	 * @return the record type string
	 */
	public String getHeaderRecordType() {
		return this.warcHeader.recordType;
	}

	/**
	 * Renders the record as text: the header, a blank line, then the content
	 * decoded as UTF-8. Nothing is appended after the blank line when the
	 * record has no content.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder();
		text.append(warcHeader.toString());
		text.append(NEWLINE);
		// Appending the byte[] directly would print the array's identity
		// ("[B@..."), not its data, so decode it first.
		if (warcContent != null) {
			text.append(getContentUTF8());
		}
		return text.toString();
	}

	/**
	 * Returns the text form of the WARC header.
	 * 
	 * @return the header rendered as a string
	 */
	public String getHeaderString() {
		return this.warcHeader.toString();
	}

	/**
	 * Serializes this record: the header first, followed by the raw content
	 * bytes.
	 * 
	 * @param out
	 *            the data output to write to
	 * @throws java.io.IOException
	 */
	public void write(DataOutput out) throws IOException {
		this.warcHeader.write(out);
		out.write(this.warcContent);
	}

	/**
	 * Restores this record from serialized form: the header is read first,
	 * then as many content bytes as the header's content length states.
	 * 
	 * @param in
	 *            the data input to read from
	 * @throws java.io.IOException
	 */
	public void readFields(DataInput in) throws IOException {
		this.warcHeader.readFields(in);
		this.warcContent = new byte[this.warcHeader.contentLength];
		in.readFully(this.warcContent);
	}

	/**
	 * Returns the document id, taken from the "WARC-TREC-ID" metadata entry.
	 * 
	 * @return the id, or null if the header has no such entry
	 */
	public String getDocid() {
		String trecId = getHeaderMetadataItem("WARC-TREC-ID");
		return trecId;
	}

	/**
	 * Returns the UTF-8 content with its leading part removed: everything
	 * up to and including the end of the line that holds the first
	 * "Content-Length:" occurrence is dropped.
	 * <p>
	 * If "Content-Length:" does not occur, the search for the line end
	 * starts at the beginning, so the first line is dropped instead. If no
	 * line feed is found at all, the whole content is returned.
	 * NOTE(review): presumably meant to strip leading HTTP headers - confirm
	 * the intended behavior when the marker is absent.
	 * 
	 * @return the content text after the located line feed
	 */
	public String getContent() {
		String text = getContentUTF8();
		int markerAt = text.indexOf("Content-Length:");
		int lineEnd = text.indexOf("\n", markerAt);
		int bodyStart = lineEnd + 1;
		return text.substring(bodyStart);
	}
	
	/**
	 * Returns the content type used for display, which is always
	 * "text/html" regardless of the header's content type.
	 * 
	 * @return the fixed display content type
	 */
	public String getDisplayContentType() {
		final String displayType = "text/html";
		return displayType;
	}
}