package de.l3s.boilerpipe.sax;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
 * A very simple HTTP/HTML fetcher, really just for demo purposes.
 *
 * @author Christian Kohlschütter
 */
public class HTMLFetcher {
    private HTMLFetcher() {
    }

    /**
     * Extracts the value of a trailing <code>charset=</code> parameter from a
     * Content-Type header. The parameter name is matched case-insensitively.
     */
    private static final Pattern PAT_CHARSET =
            Pattern.compile("charset=([^; ]+)$", Pattern.CASE_INSENSITIVE);

    /** Name of the charset used when the response does not declare a usable one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Fetches the document at the given URL, using {@link URLConnection}.
     * <p>
     * The charset is taken from the <code>charset</code> parameter of the
     * response's Content-Type header and falls back to UTF-8 when the parameter
     * is missing or does not name a charset this JVM supports. A
     * <code>gzip</code> Content-Encoding is decompressed; for any other
     * Content-Encoding a warning is printed to <code>System.err</code> and the
     * body is read as-is. Byte sequences that are malformed or unmappable in
     * the chosen charset are dropped from the returned data.
     *
     * @param url the location of the document to fetch
     * @return the fetched bytes together with the charset they are encoded in
     * @throws IOException if connecting, reading, or re-encoding the document fails
     */
    public static HTMLDocument fetch(final URL url) throws IOException {
        final URLConnection conn = url.openConnection();
        final Charset cs = resolveCharset(conn.getContentType());

        final byte[] raw;
        InputStream in = conn.getInputStream();
        try {
            final String encoding = conn.getContentEncoding();
            if (encoding != null) {
                if ("gzip".equalsIgnoreCase(encoding)) {
                    // If this constructor throws, 'in' still refers to the raw
                    // stream, so the finally block below closes it.
                    in = new GZIPInputStream(in);
                } else {
                    System.err.println("WARN: unsupported Content-Encoding: " + encoding);
                }
            }
            raw = readFully(in);
        } finally {
            in.close();
        }

        return new HTMLDocument(dropInvalidBytes(raw, cs), cs);
    }

    /**
     * Determines the charset announced by a Content-Type header value.
     *
     * @param contentType the header value, may be <code>null</code>
     * @return the announced charset, or UTF-8 if none is announced or the
     *         announced name is illegal or unsupported
     */
    private static Charset resolveCharset(final String contentType) {
        final Charset fallback = Charset.forName(DEFAULT_CHARSET);
        if (contentType == null) {
            return fallback;
        }
        final Matcher m = PAT_CHARSET.matcher(contentType);
        if (!m.find()) {
            return fallback;
        }

        String name = m.group(1);
        // The parameter value may be a quoted string, e.g. charset="utf-8".
        final int last = name.length() - 1;
        if (last >= 1 && name.charAt(0) == '"' && name.charAt(last) == '"') {
            name = name.substring(1, last);
        }

        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and
            // UnsupportedCharsetException: keep the default.
            return fallback;
        }
    }

    /**
     * Reads the given stream until end-of-stream. Does not close the stream.
     *
     * @param in the stream to drain
     * @return all bytes read from the stream
     * @throws IOException if reading fails
     */
    private static byte[] readFully(final InputStream in) throws IOException {
        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
        final byte[] buf = new byte[4096];
        int r;
        while ((r = in.read(buf)) != -1) {
            bos.write(buf, 0, r);
        }
        return bos.toByteArray();
    }

    /**
     * Removes byte sequences that are invalid in the given charset by decoding
     * the data (ignoring malformed and unmappable input) and encoding the
     * result again.
     *
     * @param data the raw bytes
     * @param cs the charset the bytes are supposed to be encoded in
     * @return the re-encoded bytes, exactly as long as the encoded content
     * @throws IOException if the decoded text cannot be encoded again
     */
    private static byte[] dropInvalidBytes(final byte[] data, final Charset cs)
            throws IOException {
        final CharsetDecoder decoder = cs.newDecoder();
        decoder.onMalformedInput(CodingErrorAction.IGNORE);
        decoder.onUnmappableCharacter(CodingErrorAction.IGNORE);
        final CharBuffer chars = decoder.decode(ByteBuffer.wrap(data));

        final ByteBuffer encoded = cs.newEncoder().encode(chars);
        // The buffer's backing array may be larger than the encoded content,
        // so copy only the bytes between position and limit instead of
        // handing out array() directly.
        final byte[] clean = new byte[encoded.remaining()];
        encoded.get(clean);
        return clean;
    }
}