package org.arong.egdownloader.spider; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.security.KeyManagementException; import java.security.NoSuchAlgorithmException; /** * 分析指定的url或者html源码页面,抓取网页特定内容 * @author 阿荣 * @since 2013-8-18 * */ public final class Spider { /** * 分析指定链接结果,并返回字符串数值 * @param searchURL * @param anchor 所要抓取字符串的前面的特定字符串 * @param trail 所要抓取字符串的后面紧跟的特定字符串 * @return * @throws SpiderException * @throws IOException * @throws NoSuchAlgorithmException * @throws KeyManagementException * @throws UnsupportedEncodingException * @throws Exception */ public static String getText(final String url, final String encoding, final String prefix, final String suffix) throws SpiderException, WebClientException, KeyManagementException, NoSuchAlgorithmException, IOException { String text = null; String serverResponse = WebClient.getRequestUseJava(url, encoding);//WebClient.postRequest(url, encoding); if(serverResponse != null){ int pos = serverResponse.indexOf(prefix); if (pos != -1) { serverResponse = serverResponse.substring(pos + prefix.length()); pos = serverResponse.indexOf(suffix); if(pos != -1) { text = serverResponse.substring(0, pos).trim(); } else{ throw new SpiderException(url, "---" + prefix + "之后找不到标识符:", suffix); } } else { throw new SpiderException(url, "-找不到标识符:", prefix); } } return text; } public static String getTextUseJava(final String url, final String encoding, final String prefix, final String suffix) throws SpiderException, WebClientException, KeyManagementException, NoSuchAlgorithmException { String text = null; try { String serverResponse = WebClient.getRequestUseJava(url, encoding); if(serverResponse != null){ int pos = serverResponse.indexOf(prefix); if (pos != -1) { serverResponse = serverResponse.substring(pos + prefix.length()); pos = serverResponse.indexOf(suffix); if(pos != -1) { text = serverResponse.substring(0, pos).trim(); } else{ throw new SpiderException(url, "---" + prefix + "之后找不到标识符:", suffix); } } else { throw new SpiderException(url, "-找不到标识符:", prefix); } } } catch (IOException e) { throw new WebClientException(url + ":解析错误"); } return text; } /** * 分析指定html源码,并返回字符串数值 * @param source * @param prefix 所要抓取字符串的前面的特定字符串 * @param suffix 所要抓取字符串的后面紧跟的特定字符串 * @return * @throws SpiderException * @throws Exception */ public static String getTextFromSource(final String source, final String prefix, final String suffix) throws SpiderException{ String text = null; String serverResponse = source; if(serverResponse != null){ int pos = serverResponse.indexOf(prefix); if (pos != -1) { serverResponse = serverResponse.substring(pos + prefix.length()); pos = serverResponse.indexOf(suffix); if(pos != -1){ text = serverResponse.substring(0, pos).trim(); } else { throw new SpiderException("---" + prefix + "之后找不到标识符:", suffix); } } else{ throw new SpiderException("找不到标识符:", prefix); } } return text; } public static String substring(String htmlSource, String prefix) throws SpiderException{ String text = null; if(htmlSource != null){ int pos = htmlSource.indexOf(prefix); if (pos != -1) { text = htmlSource.substring(pos + prefix.length()); } else{ throw new SpiderException("找不到标识符:", prefix); } } return text; } public static String substring(String htmlSource, String prefix, int step) throws SpiderException{ String text = null; if(htmlSource != null){ int pos = htmlSource.indexOf(prefix); if (pos != -1) { text = htmlSource.substring(pos + prefix.length() + step); } else{ throw new SpiderException("找不到标识符:", prefix); } } return text; } public static Boolean containText(String url, String encoding, String text) throws WebClientException, SpiderException, KeyManagementException, NoSuchAlgorithmException, IOException{ String serverResponse = WebClient.getRequestUseJava(url, encoding);//WebClient.postRequest(url, encoding); containTextFromSource(serverResponse, text); return true; } /** * 判断源码中是否包含指定的字符串,如果不包含则抛出一个异常 * @param htmlSource * @param text * @return * @throws SpiderException */ public static Boolean containTextFromSource(String htmlSource, String text) throws SpiderException{ if(htmlSource != null){ int pos = htmlSource.indexOf(text); if (pos != -1) { return true; } else{ throw new SpiderException("找不到标识符:", text); } } return false; } }