package com.geccocrawler.gecco.downloader;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Base class for {@link Downloader} implementations. It provides two helpers
 * to subclasses: resolving the character set of a response and buffering a
 * response stream into memory so that it can be read more than once.
 */
public abstract class AbstractDownloader implements Downloader {

	private static final Log log = LogFactory.getLog(AbstractDownloader.class);

	/**
	 * Matches the {@code charset=...} parameter of a Content-Type value,
	 * case-insensitively, with an optional opening double quote. Group 1 is
	 * the charset name.
	 */
	private static final Pattern CHARSET_PATTERN = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");

	/**
	 * Extracts the charset name from a Content-Type value.
	 *
	 * @param contentType the Content-Type value, may be {@code null}
	 * @return the upper-cased charset name, or {@code null} when
	 *         {@code contentType} is {@code null}, carries no
	 *         {@code charset=} parameter, or the parameter value is empty
	 */
	private String getCharsetFromContentType(String contentType) {
		if (contentType == null) {
			return null;
		}
		Matcher m = CHARSET_PATTERN.matcher(contentType);
		if (!m.find()) {
			return null;
		}
		String charset = m.group(1).trim();
		if (charset.length() == 0) {
			// "charset=" with no value: report "not specified" so that the
			// caller falls back instead of using an empty charset name.
			return null;
		}
		// Locale.ROOT keeps the result independent of the default locale
		// (e.g. Turkish would map 'i' to a dotted capital 'İ').
		return charset.toUpperCase(Locale.ROOT);
	}

	/**
	 * Resolves the charset to use for a response.
	 * <p>
	 * Order of precedence: the charset declared in {@code contentType}, then
	 * {@code requestCharset}, then {@code "UTF-8"}.
	 *
	 * @param requestCharset charset specified by the request, may be {@code null}
	 * @param contentType    the Content-Type value of the response, may be {@code null}
	 * @return the resolved charset name, never {@code null}
	 */
	protected String getCharset(String requestCharset, String contentType) {
		// First try the charset declared in the content type.
		String charset = getCharsetFromContentType(contentType);
		if (charset == null) {
			// Then fall back to the charset specified by the request.
			charset = requestCharset;
		}
		if (charset == null) {
			// Default to UTF-8.
			charset = "UTF-8";
		}
		return charset;
	}

	/**
	 * Reads the given stream fully into memory and returns it as a
	 * {@link ByteArrayInputStream} so that the raw content can be read
	 * repeatedly. The given stream is always closed before returning.
	 *
	 * @param in the original input stream, may be {@code null}
	 * @return a re-readable stream over the bytes read; the bytes read so far
	 *         if an {@link EOFException} interrupts the copy; {@code null} if
	 *         {@code in} is {@code null} or any other {@link IOException}
	 *         occurs while reading
	 */
	protected ByteArrayInputStream toByteInputStream(InputStream in) {
		if (in == null) {
			// Same result as a failed conversion instead of a NullPointerException.
			log.warn("inputstream is null, nothing to convert!");
			return null;
		}
		ByteArrayOutputStream bos = new ByteArrayOutputStream();
		try {
			byte[] buffer = new byte[1024];
			int read;
			while ((read = in.read(buffer)) != -1) {
				bos.write(buffer, 0, read);
			}
			return new ByteArrayInputStream(bos.toByteArray());
		} catch (EOFException eof) {
			// The stream ended prematurely: keep whatever was read before that.
			log.warn("inputstream " + in.getClass().getName() + " eof!");
			return new ByteArrayInputStream(bos.toByteArray());
		} catch (IOException e) {
			log.warn("inputstream " + in.getClass().getName() + " don't to byte inputstream!", e);
			return null;
		} finally {
			// bos is a ByteArrayOutputStream, whose close() has no effect, so
			// only the source stream needs closing.
			try {
				in.close();
			} catch (IOException e) {
				// Best effort: a close failure must not change the result.
				log.debug("inputstream " + in.getClass().getName() + " close failed", e);
			}
		}
	}
}