package glaze.soup.async; import glaze.soup.Mode; import glaze.util.ResponseUtil; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.charset.Charset; import org.apache.http.HttpException; import org.apache.http.HttpResponse; import org.apache.http.entity.ContentType; import org.apache.http.nio.IOControl; import org.apache.http.nio.client.methods.AsyncByteConsumer; import org.apache.http.protocol.HttpContext; import org.jsoup.helper.Validate; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.parser.Parser; public abstract class SoupConsumer<T> extends AsyncByteConsumer<T> { public static SoupConsumer<Document> instance() { return instance(Mode.HTML); } public static SoupConsumer<Document> instance(Mode mode) { return new SoupConsumer<Document>(mode) { @Override protected Document onDocumentReceived(Document document) { return document; } }; } private volatile StringBuilder stringBuffer; private Mode mode; private String charsetName; private String baseUri; public SoupConsumer() { this(Mode.HTML); } public SoupConsumer(Mode mode) { this("", ResponseUtil.DEFAULT_ENCODING, mode); } public SoupConsumer(String baseUri, String charsetName, Mode mode) { this.mode = mode; this.charsetName = charsetName; this.baseUri = baseUri; } void appendByteData(ByteBuffer byteData, StringBuilder docData, String charsetName) { if (charsetName == null) { docData.append(Charset.forName(ResponseUtil.DEFAULT_ENCODING).decode(byteData).toString()); } else { Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); docData.append(Charset.forName(charsetName).decode(byteData).toString()); } } String getCharsetFromContentType(String contentType) { Charset charset = ContentType.parse(contentType).getCharset(); return charset == null ? ResponseUtil.DEFAULT_ENCODING : charset.toString(); } Document parseDocData(String docData, String charsetName, String baseUri, Parser parser) { Document doc = null; if (charsetName == null) { doc = parser.parseInput(docData, baseUri); Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first(); if (meta != null) { String foundCharset = (meta.hasAttr("http-equiv")) ? getCharsetFromContentType(meta.attr("content")) : meta.attr("charset"); if ((foundCharset != null) && (foundCharset.length() != 0) && (!(foundCharset.equals(ResponseUtil.DEFAULT_ENCODING)))) { // XXX we cannot rewind :_ // charsetName = foundCharset; // byteData.rewind(); // docData = // Charset.forName(foundCharset).decode(byteData).toString(); // doc = null; throw new IllegalStateException(String.format("Invalid enconding %s found %s", charsetName, foundCharset)); } } } if (doc == null) { if (docData.charAt(0) == 65279) { docData = docData.substring(1); } doc = parser.parseInput(docData, baseUri); doc.outputSettings().charset(charsetName); } return doc; } @Override protected T buildResult(HttpContext ctx) throws Exception { Document document = parseDocData(stringBuffer.toString(), charsetName, baseUri, mode.getParser()); return onDocumentReceived(document); } @Override protected void onByteReceived(ByteBuffer byteBuffer, IOControl paramIOControl) throws IOException { appendByteData(byteBuffer, stringBuffer, charsetName); } abstract protected T onDocumentReceived(Document document); @Override protected void onResponseReceived(HttpResponse response) throws HttpException, IOException { this.stringBuffer = new StringBuilder(); this.charsetName = charsetName == null ? ResponseUtil.resolveEncoding(response) : charsetName; } }