/**
*
* APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
* yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.seo.rank.tools;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
*
* 自动更改IP地址反爬虫封锁,支持多线程
* 使用代理服务器的方式
*
* @author 杨尚川
*/
public class ProxyIp {
private ProxyIp(){}
private static final Logger LOGGER = LoggerFactory.getLogger(ProxyIp.class);
private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
private static final String ENCODING = "gzip, deflate";
private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
private static final String CONNECTION = "keep-alive";
private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
private static volatile boolean isSwitching = false;
private static volatile long lastSwitchTime = 0l;
private static final WebClient WEB_CLIENT = new WebClient(BrowserVersion.INTERNET_EXPLORER_11);
private static final Pattern IP_PATTERN = Pattern.compile("((?:(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d)))\\.){3}(?:25[0-5]|2[0-4]\\d|((1\\d{2})|([1-9]?\\d))))");
//可用代理IP列表
private static final List<String> IPS = new Vector<>();
private static volatile int currentIpIndex = 0;
private static volatile boolean detect = true;
//五分钟
private static volatile int detectInterval = 300000;
private static final Path PROXY_IPS_FILE = Paths.get("src/main/resources/proxy_ips.txt");
//自身IP地址
private static String previousIp = getCurrentIp();
//能隐藏自己IP的代理
private static final Set<String> EXCELLENT_IPS = new ConcurrentSkipListSet<>();
private static final Set<String> EXCELLENT_USA_IPS = new ConcurrentSkipListSet<>();
//不能隐藏自己IP的代理
private static final Set<String> NORMAL_IPS = new ConcurrentSkipListSet<>();
private static final Path EXCELLENT_PROXY_IPS_FILE = Paths.get("src/main/resources/proxy_ips_excellent.txt");;
private static final Path EXCELLENT_USA_PROXY_IPS_FILE = Paths.get("src/main/resources/proxy_ips_excellent_usa.txt");
private static final Path NORMAL_PROXY_IPS_FILE = Paths.get("src/main/resources/proxy_ips_normal.txt");
static {
Set<String> ipSet = new HashSet<>();
//如果本地有则读取
try {
if(Files.notExists(PROXY_IPS_FILE.getParent())){
PROXY_IPS_FILE.getParent().toFile().mkdirs();
}
if(Files.notExists(PROXY_IPS_FILE)){
PROXY_IPS_FILE.toFile().createNewFile();
}
if(Files.notExists(EXCELLENT_PROXY_IPS_FILE)){
EXCELLENT_PROXY_IPS_FILE.toFile().createNewFile();
}
if(Files.notExists(EXCELLENT_USA_PROXY_IPS_FILE)){
EXCELLENT_USA_PROXY_IPS_FILE.toFile().createNewFile();
}
if(Files.notExists(NORMAL_PROXY_IPS_FILE)){
NORMAL_PROXY_IPS_FILE.toFile().createNewFile();
}
LOGGER.info("代理IP存放路径:"+PROXY_IPS_FILE.toAbsolutePath().toString());
ipSet.addAll(Files.readAllLines(PROXY_IPS_FILE));
ipSet.addAll(Files.readAllLines(EXCELLENT_PROXY_IPS_FILE));
}catch (Exception e){
LOGGER.error("读取本地代理IP失败", e);
}
if(ipSet.isEmpty()){
//从已知的网站获取代理IP和端口
ipSet.addAll(getProxyIps());
}
IPS.addAll(ipSet);
LOGGER.info("所有IP列表("+IPS.size()+"):");
AtomicInteger i = new AtomicInteger();
IPS.forEach(ip->LOGGER.info(i.incrementAndGet()+"、"+ip));
new Thread(()->{
//检查次数
int count=0;
while(detect) {
try {
save();
if(count%10==9){
//也要防止被更新IP站点封锁
toNewIp();
}
Thread.sleep(detectInterval);
//检查网站是否有新IP
getProxyIps().forEach(ip -> {
if (!IPS.contains(ip)) {
IPS.add(ip);
LOGGER.info("发现新代理IP:" + ip);
}
});
count++;
} catch (Exception e) {
LOGGER.error("更新代理IP出错", e);
}
}
}).start();
}
public static void stopDetect(){
detect = false;
}
public static void startDetect(){
detect = true;
}
private static void save(){
try {
//将本地的和新发现的代理IP进行合并保存到本地
Set<String> ips = new ConcurrentSkipListSet<>();
ips.addAll(Files.readAllLines(PROXY_IPS_FILE));
ips.addAll(IPS);
//移除不能隐藏自己的IP
ips.removeAll(NORMAL_IPS);
Files.write(PROXY_IPS_FILE, toVerify(ips));
LOGGER.info("将" + ips.size() + "条代理IP地址写入本地");
Set<String> excellentIps = new HashSet<>();
excellentIps.addAll(Files.readAllLines(EXCELLENT_PROXY_IPS_FILE));
excellentIps.addAll(EXCELLENT_IPS);
Files.write(EXCELLENT_PROXY_IPS_FILE, toVerify(excellentIps));
LOGGER.info("将" + excellentIps.size() + "条能隐藏自己的代理IP地址写入本地");
Set<String> excellentUsaIps = new HashSet<>();
excellentUsaIps.addAll(Files.readAllLines(EXCELLENT_USA_PROXY_IPS_FILE));
excellentUsaIps.addAll(EXCELLENT_USA_IPS);
Files.write(EXCELLENT_USA_PROXY_IPS_FILE, toVerify(excellentUsaIps));
LOGGER.info("将" + excellentUsaIps.size() + "条能隐藏自己的美国代理IP地址写入本地");
Set<String> normalIps = new HashSet<>();
normalIps.addAll(Files.readAllLines(NORMAL_PROXY_IPS_FILE));
normalIps.addAll(NORMAL_IPS);
Files.write(NORMAL_PROXY_IPS_FILE, toVerify(normalIps));
LOGGER.info("将" + normalIps.size() + "条不能隐藏自己的代理IP地址写入本地");
}catch (Exception e){
LOGGER.error("保存失败", e);
}
}
private static List<String> toVerify(Set<String> ips){
AtomicInteger i = new AtomicInteger();
AtomicInteger f = new AtomicInteger();
List<String> list = ips.parallelStream().filter(ip->{
LOGGER.info("验证进度:"+ips.size()+"/"+i.incrementAndGet());
String[] attr = ip.split(":");
if(verify(attr[0], Integer.parseInt(attr[1]))){
return true;
}
IPS.remove(ip);
f.incrementAndGet();
return false;
}).sorted().collect(Collectors.toList());
LOGGER.info("验证成功的IP数:"+(ips.size()-f.get()));
LOGGER.info("验证失败的IP数:"+f.get());
return list;
}
private static String getNextProxyIp(){
int index = currentIpIndex%IPS.size();
currentIpIndex++;
return IPS.get(index);
}
public static boolean toNewIp() {
long requestSwitchTime = System.currentTimeMillis();
LOGGER.info(Thread.currentThread()+"请求重新更换代理");
synchronized (ProxyIp.class) {
if (isSwitching) {
LOGGER.info(Thread.currentThread()+"已经有其他线程在进行更换代理了,我睡觉等待吧,其他线程更换代理完毕会叫醒我的");
try {
ProxyIp.class.wait();
} catch (InterruptedException e) {
LOGGER.error(e.getMessage(), e);
}
LOGGER.info(Thread.currentThread()+"其他线程已经更换完代理了,我可以返回了");
return true;
}
isSwitching = true;
}
//保险起见,这里再判断一下
//如果请求更换代理的时间小于上次成功更换代理的时间,则说明这个请求来的【太迟了】,则返回。
if(requestSwitchTime <= lastSwitchTime){
LOGGER.info("请求来的太迟了");
isSwitching = false;
return true;
}
LOGGER.info(Thread.currentThread()+"开始重新更换代理");
long start = System.currentTimeMillis();
String proxyIp = useNewProxyIp();
String currentIp = null;
int times=0;
//如果当前IP没有变化,还是等于之前的IP,则继续设置下一个代理IP
//为了防止无休止重试,设置限制次数
while((currentIp=getCurrentIp()).equals(previousIp)
&& (times++)<Integer.MAX_VALUE){
NORMAL_IPS.add(proxyIp);
IPS.remove(proxyIp);
proxyIp = useNewProxyIp();
}
if(!currentIp.equals(previousIp)) {
previousIp =currentIp;
EXCELLENT_IPS.add(proxyIp);
LOGGER.info(Thread.currentThread()+"自动更换代理成功!");
LOGGER.info(Thread.currentThread()+"更换代理耗时:"+(System.currentTimeMillis()-start)+"毫秒");
//通知其他线程结束等待
synchronized (ProxyIp.class) {
ProxyIp.class.notifyAll();
}
isSwitching = false;
lastSwitchTime = System.currentTimeMillis();
return true;
}
NORMAL_IPS.add(proxyIp);
IPS.remove(proxyIp);
LOGGER.info(Thread.currentThread()+"自动更换代理失败!");
LOGGER.info(Thread.currentThread()+"更换代理耗时:"+(System.currentTimeMillis()-start)+"毫秒");
//通知其他线程结束等待
synchronized (ProxyIp.class) {
ProxyIp.class.notifyAll();
}
isSwitching = false;
return false;
}
private static String useNewProxyIp(){
String newProxy = getNextProxyIp();
String[] attr = newProxy.split(":");
System.setProperty("proxySet", "true");
System.setProperty("http.proxyHost", attr[0]);
System.setProperty("http.proxyPort", attr[1]);
LOGGER.info("尝试使用新的代理:"+newProxy);
return newProxy;
}
/**
* 验证代理IP是否能工作,能工作不代表能向目标网站隐藏自己的IP
* @param host
* @param port
* @return
*/
public static boolean verify(String host, int port){
try {
String url = "http://apdplat.org";
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(host, port));
HttpURLConnection connection = (HttpURLConnection)new URL(url).openConnection(proxy);
connection.setConnectTimeout(10000);
connection.setReadTimeout(10000);
connection.setUseCaches(false);
BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
StringBuilder html = new StringBuilder();
String line = null;
while ((line=reader.readLine()) != null){
html.append(line);
}
LOGGER.info("HTML:"+html);
if(html.toString().contains("APDPlat应用级产品开发平台")){
LOGGER.info("代理IP验证成功:"+host+":"+port);
return true;
}
}catch (Exception e){
LOGGER.error(e.getMessage());
}
LOGGER.info("代理IP验证失败:"+host+":"+port);
return false;
}
/**
* 看看在ip138的眼中,自己的IP是多少
* @return
*/
public static String getCurrentIp(){
try {
String url = "http://1212.ip138.com/ic.asp?timestamp="+System.nanoTime();
String text = Jsoup.connect(url)
.header("Accept", ACCEPT)
.header("Accept-Encoding", ENCODING)
.header("Accept-Language", LANGUAGE)
.header("Connection", CONNECTION)
.header("Host", "1111.ip138.com")
.header("Referer", "http://ip138.com/")
.header("User-Agent", USER_AGENT)
.ignoreContentType(true)
.timeout(5000)
.get()
.text();
LOGGER.info("检查自身IP地址:"+text);
Matcher matcher = IP_PATTERN.matcher(text);
if(matcher.find()){
String ip = matcher.group();
LOGGER.info("自身IP地址:"+ip);
if(text.contains("美国")){
EXCELLENT_USA_IPS.add(System.getProperty("http.proxyHost") + ":" + System.getProperty("http.proxyPort"));
}
return ip;
}
}catch (Exception e){
LOGGER.error(e.getMessage());
}
LOGGER.info("检查自身IP地址失败,返回之前的IP地址:"+ previousIp);
return previousIp;
}
private static Set<String> getProxyIps(){
Set<String> ips = new HashSet<>();
ips.addAll(getProxyIpOne());
ips.addAll(getProxyIpTwo());
ips.addAll(getProxyIpThree());
ips.addAll(getProxyIpFour());
return ips;
}
private static List<String> getProxyIpOne(){
String url = "http://proxy.goubanjia.com/?timestamp="+System.nanoTime();
String cssPath = "html body div.wrap.fullwidth div#content div#post-2.post-2.page.type-page.status-publish.hentry div.entry.entry-content div#list table.table tbody tr";
return getProxyIp(url, cssPath);
}
private static List<String> getProxyIpTwo(){
String url = "http://ip.qiaodm.com/?timestamp="+System.nanoTime();
String cssPath = "html body div#main_container div.inner table.iplist tbody tr";
return getProxyIp(url, cssPath);
}
private static List<String> getProxyIp(String url, String cssPath){
List<String> ips = new ArrayList<>();
try {
String html = ((HtmlPage)WEB_CLIENT.getPage(url)).getBody().asXml();
//LOGGER.info("html:"+html);
Document doc = Jsoup.parse(html);
Elements elements = doc.select(cssPath);
elements
.forEach(element -> {
try {
Elements tds = element.children();
String ip = null;
int port = 0;
if (tds.size() > 1) {
Element ele = tds.get(0);
ip = getIps(ele);
String text = tds.get(1).text();
LOGGER.info("端口:"+text+" -> "+tds.get(1).outerHtml());
port = Integer.parseInt(text);
}
if(ip != null && port > 0){
LOGGER.info("解析出IP:"+ip+",端口:"+port);
if(verify(ip, port)){
LOGGER.info("IP:"+ip+",端口:"+port+"可以使用");
ips.add(ip + ":" + port);
}else {
LOGGER.info("IP:"+ip+",端口:"+port+"不能使用");
}
}
}catch (Exception e){
LOGGER.error("解析IP出错", e);
}
});
}catch (Exception e){
LOGGER.error("解析IP出错", e);
}
return ips;
}
private static List<String> getProxyIpThree(){
List<String> ips = new ArrayList<>();
for(int i=1; i<=10; i++){
ips.addAll(getProxyIpThree(i));
}
return ips;
}
private static List<String> getProxyIpThree(int page){
List<String> ips = new ArrayList<>();
try {
String url = "http://www.kuaidaili.com/proxylist/"+page;
String html = ((HtmlPage)WEB_CLIENT.getPage(url)).getBody().asXml();
//LOGGER.info("html:"+html);
Document doc = Jsoup.parse(html);
Elements elements = doc.select("html body div#container div#list table.table.table-bordered.table-striped tbody tr");
elements
.forEach(element -> {
try {
Elements tds = element.children();
String ip = null;
int port = 0;
if (tds.size() > 1) {
ip = tds.get(0).text();
String text = tds.get(1).text();
LOGGER.info("IP:"+ip);
LOGGER.info("端口:"+text);
Matcher matcher = IP_PATTERN.matcher(ip.toString());
if(matcher.find()){
ip = matcher.group();
LOGGER.info("ip地址验证通过:"+ip);
}else{
LOGGER.info("ip地址验证失败:"+ip);
ip = null;
}
try{
port = Integer.parseInt(text);
LOGGER.info("端口验证通过:"+port);
}catch (Exception e){
LOGGER.info("端口验证失败:"+port);
}
}
if(ip != null && port > 0){
LOGGER.info("解析出IP:"+ip+",端口:"+port);
if(verify(ip, port)){
LOGGER.info("IP:"+ip+",端口:"+port+"可以使用");
ips.add(ip + ":" + port);
}else {
LOGGER.info("IP:"+ip+",端口:"+port+"不能使用");
}
}
}catch (Exception e){
LOGGER.error("解析IP出错", e);
}
});
}catch (Exception e){
LOGGER.error("解析IP出错", e);
}
return ips;
}
private static List<String> getProxyIpFour(){
List<String> ips = new ArrayList<>();
for(int i=1; i<=10; i++){
ips.addAll(getProxyIpFour(i));
}
return ips;
}
private static List<String> getProxyIpFour(int page){
List<String> ips = new ArrayList<>();
try {
String url = "http://www.kxdaili.com/ipList/"+page+".html";
String html = ((HtmlPage)WEB_CLIENT.getPage(url)).getBody().asXml();
//LOGGER.info("html:"+html);
Document doc = Jsoup.parse(html);
Elements elements = doc.select("html body#nav_btn01 div.tab_c_box.buy_tab_box table.ui.table.segment tbody tr");
elements
.forEach(element -> {
try {
Elements tds = element.children();
String ip = null;
int port = 0;
if (tds.size() > 1) {
ip = tds.get(0).text();
String text = tds.get(1).text();
LOGGER.info("IP:"+ip);
LOGGER.info("端口:"+text);
Matcher matcher = IP_PATTERN.matcher(ip.toString());
if(matcher.find()){
ip = matcher.group();
LOGGER.info("ip地址验证通过:"+ip);
}else{
LOGGER.info("ip地址验证失败:"+ip);
ip = null;
}
try{
port = Integer.parseInt(text);
LOGGER.info("端口验证通过:"+port);
}catch (Exception e){
LOGGER.info("端口验证失败:"+port);
}
}
if(ip != null && port > 0){
LOGGER.info("解析出IP:"+ip+",端口:"+port);
if(verify(ip, port)){
LOGGER.info("IP:"+ip+",端口:"+port+"可以使用");
ips.add(ip + ":" + port);
}else {
LOGGER.info("IP:"+ip+",端口:"+port+"不能使用");
}
}
}catch (Exception e){
LOGGER.error("解析IP出错", e);
}
});
}catch (Exception e){
LOGGER.error("解析IP出错", e);
}
return ips;
}
private static String getIps(Element element){
StringBuilder ip = new StringBuilder();
Elements all = element.children();
LOGGER.info("");
LOGGER.info("开始解析IP地址,机器读到的文本:"+element.text());
AtomicInteger count = new AtomicInteger();
all.forEach(ele -> {
String html = ele.outerHtml();
LOGGER.info(count.incrementAndGet() + "、" + "原始HTML:"+html.replaceAll("[\n\r]", ""));
String text = ele.text();
if(ele.hasAttr("style")
&& (ele.attr("style").equals("display: none;")
|| ele.attr("style").equals("display:none;"))) {
LOGGER.info("忽略不显示的文本:"+text);
}else{
if(StringUtils.isNotBlank(text)){
LOGGER.info("需要的文本:"+text);
ip.append(text);
}else{
LOGGER.info("忽略空文本");
}
}
});
LOGGER.info("----------------------------------------------------------------");
LOGGER.info("解析到的ip: "+ip);
LOGGER.info("----------------------------------------------------------------");
Matcher matcher = IP_PATTERN.matcher(ip.toString());
if(matcher.find()){
String _ip = matcher.group();
LOGGER.info("ip地址验证通过:"+_ip);
return _ip;
}else{
LOGGER.info("ip地址验证失败:"+ip);
}
return null;
}
public static void main(String[] args) {
//如果只是想收集IP,则一直运行此程序即可,更新时间改为1秒钟。
detectInterval=1000;
while(true){
toNewIp();
}
}
}