package org.archive.resource.html;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import org.archive.format.text.html.CDATALexer;
import org.archive.format.text.html.LexParser;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.resource.ResourceContainer;
import org.archive.resource.ResourceFactory;
import org.archive.resource.ResourceParseException;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
/**
 * {@link ResourceFactory} that runs an HTML lexer/parser over an input stream
 * and returns an {@link HTMLResource} carrying the associated
 * {@link HTMLMetaData}.
 */
public class HTMLResourceFactory implements ResourceFactory {

    /**
     * Parses the given stream as HTML and wraps the outcome in an
     * {@link HTMLResource}.
     *
     * <p>The stream is always decoded as UTF-8; charset detection is not
     * implemented yet.
     *
     * @param is stream holding the HTML document; it is not closed here
     * @param parentMetaData metadata handed to the new {@link HTMLMetaData}
     * @param container container passed through to the returned resource
     * @return a new {@link HTMLResource} built from the metadata object and
     *         {@code container}
     * @throws ResourceParseException if the charset is unsupported, the
     *         parser reports a failure, or the JVM runs out of memory while
     *         parsing; the triggering problem is available through the cause
     *         chain
     * @throws IOException declared by the {@link ResourceFactory} contract;
     *         may propagate from the calls made while parsing
     */
    public Resource getResource(InputStream is, MetaData parentMetaData,
            ResourceContainer container) throws ResourceParseException, IOException {
        HTMLMetaData hmd = new HTMLMetaData(parentMetaData);
        // The observer is constructed around hmd; presumably it records what
        // it extracts there while the parser runs (confirm in
        // ExtractingParseObserver).
        ExtractingParseObserver epo = new ExtractingParseObserver(hmd);
        LexParser parser = new LexParser(epo);
        CDATALexer lex = new CDATALexer();
        // TODO: detect the document's real charset instead of assuming UTF-8.
        String charset = "UTF-8";
        try {
            Page page = new Page(is, charset);
            lex.setPage(page);
            parser.doParse(lex);
        } catch (UnsupportedEncodingException e) {
            // No printStackTrace(): the exception travels on as the cause, so
            // the caller decides how (and whether) to report it.
            throw new ResourceParseException(e);
        } catch (ParserException e) {
            throw new ResourceParseException(e);
        } catch (OutOfMemoryError e) {
            // Deliberate best-effort: one oversized or pathological document
            // is reported as a parse failure rather than escaping as an Error.
            // The Error is wrapped in a RuntimeException so it stays on the
            // cause chain instead of being dropped.
            throw new ResourceParseException(
                    new RuntimeException("Out of memory while parsing HTML", e));
        }
        return new HTMLResource(hmd, container);
    }
}