/**
*
*/
package uk.bl.wa.tika.parser.warc;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.UncompressedARCReader;
import org.archive.io.warc.UncompressedWARCReader;
import org.archive.io.warc.WARCRecord;
/**
 * Iterates over the records of an uncompressed ARC or WARC stream and hands
 * each record to an {@link EmbeddedDocumentExtractor} as an embedded document.
 *
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 *
 */
public class WebARCExtractor {

    /** Character set used when reading the HTTP status line and headers of a record. */
    private static final String HTTP_HEADER_CHARSET = "UTF-8";

    private final ContentHandler handler;
    private final Metadata metadata;
    private final EmbeddedDocumentExtractor extractor;
    private final boolean isWARC;

    /**
     * Creates an extractor bound to the given output handler and metadata.
     *
     * @param handler  content handler that receives the XHTML events
     * @param metadata metadata of the container document; the archive format
     *                 version is stored in it under the key {@code "version"}
     * @param context  parse context; if it supplies an
     *                 {@link EmbeddedDocumentExtractor} that one is used,
     *                 otherwise a {@link ParsingEmbeddedDocumentExtractor} is
     *                 created from the context
     * @param isWARC   {@code true} to read the stream as WARC, {@code false}
     *                 to read it as ARC
     */
    public WebARCExtractor(
            ContentHandler handler, Metadata metadata, ParseContext context, boolean isWARC) {
        this.handler = handler;
        this.metadata = metadata;
        this.isWARC = isWARC;
        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
        this.extractor = (ex != null) ? ex : new ParsingEmbeddedDocumentExtractor(context);
    }

    /**
     * Reads the archive from {@code stream} and passes every record to the
     * embedded-document extractor, wrapped in a single XHTML document.
     *
     * @param stream uncompressed ARC or WARC data, matching the {@code isWARC}
     *               flag given to the constructor
     * @throws IOException   if reading the archive or a record fails
     * @throws SAXException  if the content handler rejects an event
     * @throws TikaException declared for callers; not thrown directly here
     */
    public void parse(InputStream stream) throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // ArchiveReaderFactory.get(...) is not used because it assumes
        // compressed input; the uncompressed readers are opened directly.
        // NOTE(review): the reader is not closed here; closing it would
        // presumably also close the caller-owned stream - confirm.
        ArchiveReader ar;
        if (isWARC) {
            ar = UncompressedWARCReader.get("name.warc", stream, true);
        } else {
            ar = UncompressedARCReader.get("name.arc", stream, true);
        }

        if (ar != null) {
            // Also record the archive format version.
            metadata.set("version", ar.getVersion());
            Iterator<ArchiveRecord> it = ar.iterator();
            while (it.hasNext()) {
                parseRecord(it.next(), xhtml);
            }
        }
        xhtml.endDocument();
    }

    /** Hands one archive record to the embedded-document extractor. */
    private void parseRecord(ArchiveRecord entry, ContentHandler xhtml)
            throws IOException, SAXException {
        // An ArchiveRecord is itself an InputStream, so no format-specific
        // cast is needed (casting to WARCRecord fails for ARC records).
        InputStream is = entry;
        if (isWARC) {
            skipHttpHeaders(is);
        }

        Metadata entrydata = new Metadata();
        String name = recordName(entry);
        if (name != null) {
            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
        }

        // Use the delegate extractor to parse the record content.
        if (extractor.shouldParseEmbedded(entrydata)) {
            extractor.parseEmbedded(is, xhtml, entrydata, true);
        }
    }

    /**
     * Consumes the first line and the following header block from the record
     * so the extractor starts reading after them. Does nothing further when
     * the record is already at end of stream.
     */
    private static void skipHttpHeaders(InputStream is) throws IOException {
        String statusLine = HttpParser.readLine(is, HTTP_HEADER_CHARSET);
        if (statusLine != null) {
            HttpParser.parseHeaders(is, HTTP_HEADER_CHARSET);
        }
    }

    /**
     * Builds the resource name for a record: {@code "<type>:<url>"} when the
     * record header carries a WARC type value, otherwise just the URL.
     */
    private static String recordName(ArchiveRecord entry) {
        String url = entry.getHeader().getUrl();
        Object type = entry.getHeader().getHeaderValue(WARCRecord.HEADER_KEY_TYPE);
        return (type == null) ? url : type + ":" + url;
    }
}