爬虫脚本代理池调度
使用爬虫或脚本频繁访问同一个网站时,IP 很容易被服务器封禁(ban)。这种情况下可以使用代理池:每次访问前从代理池中调度一个新的 IP,再通过它发起请求。
使用的是开源免费的 Python 项目 proxy_pool,地址如下:
https://github.com/jhao104/proxy_pool
除了python还需要安装Redis
启动
启动redis
redis-server.exe redis.windows.conf
启动proxy_pool
启动调度程序
python proxyPool.py schedule
启动webApi服务
python proxyPool.py server
爬虫使用代理池
启动web服务后, 默认配置下会开启 http://127.0.0.1:5010 的api接口服务:
api | method | Description | params |
---|---|---|---|
/ | GET | api介绍 | None |
/get | GET | 随机获取一个代理 | 可选参数: ?type=https 过滤支持https的代理 |
/pop | GET | 获取并删除一个代理 | 可选参数: ?type=https 过滤支持https的代理 |
/all | GET | 获取所有代理 | 可选参数: ?type=https 过滤支持https的代理 |
/count | GET | 查看代理数量 | None |
/delete | GET | 删除代理 | ?proxy=host:port |
示例demo:
import requests
def get_proxy():
    """Fetch one proxy record from the local proxy_pool API.

    Returns:
        The decoded JSON body of ``GET http://127.0.0.1:5010/get/``.
    """
    response = requests.get("http://127.0.0.1:5010/get/")
    return response.json()
def delete_proxy(proxy):
    """Ask the local proxy_pool API to remove ``proxy`` from the pool.

    Args:
        proxy: Proxy address; it is substituted into the ``?proxy=`` query
            parameter of the ``/delete/`` endpoint.
    """
    delete_url = "http://127.0.0.1:5010/delete/?proxy={}".format(proxy)
    requests.get(delete_url)
# your spider code
def getHtml():
    """Fetch https://www.baidu.com through proxies taken from the pool.

    Up to five proxies are tried. A proxy that fails is deleted from the pool
    and a fresh one is requested before the next attempt, so a dead proxy is
    never retried.

    Returns:
        The ``requests`` response on success, or ``None`` when every attempt
        failed or the pool had no proxy to offer.
    """
    # ....
    retry_count = 5
    while retry_count > 0:
        proxy = get_proxy().get("proxy")
        if not proxy:
            # The pool returned no proxy; there is nothing left to try.
            return None
        print(proxy)
        proxy_url = "http://{}".format(proxy)
        try:
            # Route the request through the proxy. The target URL is https, so
            # the "https" key is required as well: requests selects the proxy
            # by the scheme of the requested URL. The timeout keeps a dead
            # proxy from blocking the call indefinitely.
            html = requests.get(
                'https://www.baidu.com',
                proxies={"http": proxy_url, "https": proxy_url},
                timeout=10,
            )
            return html
        except Exception:
            retry_count -= 1
            # Remove the failed proxy from the pool
            delete_proxy(proxy)
    return None
if __name__ == '__main__':
    # Keep fetching pages until no working proxy can be obtained any more.
    while True:
        response = getHtml()
        if response is None:
            # getHtml() returns None once its retries are exhausted; reading
            # .text from it would raise AttributeError.
            print("[-]No usable proxy, stop.")
            break
        print(response.text)
sqlmap使用代理池
获取所有的代理ip存入文件ips.txt(其他脚本同理)
import requests
def get_proxy():
    """Fetch all proxy records from the local proxy_pool API.

    Returns:
        The decoded JSON body of ``GET http://127.0.0.1:5010/all/``.
    """
    all_response = requests.get("http://127.0.0.1:5010/all/")
    return all_response.json()
def delete_proxy(proxy):
    """Ask the local proxy_pool API to remove ``proxy`` from the pool.

    Args:
        proxy: Proxy address; it is substituted into the ``?proxy=`` query
            parameter of the ``/delete/`` endpoint.
    """
    delete_url = "http://127.0.0.1:5010/delete/?proxy={}".format(proxy)
    requests.get(delete_url)
def get_proxyCount():
    """Query the local proxy_pool API for its proxy count.

    Returns:
        The decoded JSON body of ``GET http://127.0.0.1:5010/count``.
    """
    count_response = requests.get("http://127.0.0.1:5010/count")
    return count_response.json()
# Dump every proxy currently in the pool to ips.txt, one "host:port" per line.
# The pool is fetched exactly once: asking /all/ again on every iteration costs
# one HTTP request per proxy, and indexing by a count obtained from a separate
# request can run past the end of the list if the pool changed in between.
proxies = get_proxy()
count = len(proxies)
print("代理池中共计:%s个代理." % count)
# The with-block closes the file even if writing fails part-way through.
with open("ips.txt", "w") as f:
    for entry in proxies:
        b = entry.get('proxy')
        if not b:
            # Skip records that carry no proxy address.
            continue
        print(b)
        f.write(b + "\n")
print("over!")
本地代理转发
借用前人的成果,实现的效果是:在本地 192.168.3.17:9999 启动一个转发服务,把本地客户端发来的请求随机转发给 ips.txt 内的某个代理,再把代理返回的数据回传给客户端。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import socket
from socket import error
import threading
import random
import time
# Human-readable local timestamp, captured once when this module is loaded.
localtime = time.asctime()
class ProxyServerTest:
    """Sequential TCP relay that forwards each client request to a random upstream proxy.

    For every accepted client the server reads one chunk of request data,
    connects to a randomly chosen ``(host, port)`` entry of ``proxyip``,
    forwards the request, and streams the proxy's answer back to the client.
    """

    # Number of consecutive 3-second receive timeouts tolerated before the
    # upstream proxy is considered finished with its response.
    MAX_IDLE_TIMEOUTS = 3

    def __init__(self, proxyip, host='192.168.3.17', port=9999):
        """Create the listening socket (it is bound later, in ``run``).

        Args:
            proxyip: Sequence of ``(host, port)`` tuples of upstream proxies.
            host: Local address to listen on.
            port: Local TCP port to listen on.
        """
        # Local listening socket
        self.ser = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.proxyip = proxyip
        self.host = host
        self.port = port

    def run(self):
        """Bind the listener and serve clients one at a time.

        A failure that concerns a single connection is logged and the server
        moves on to the next client.

        Returns:
            An error message string when the proxy list is empty, when the
            listener cannot be bound, or when ``accept`` fails. Otherwise the
            method loops forever.
        """
        if not self.proxyip:
            # Without this guard the first client would trigger a ValueError
            # while picking a random proxy from an empty list.
            message = "[-]The local service : proxy ip list is empty"
            print(message)
            return message
        try:
            # Local service address and port
            self.ser.bind((self.host, self.port))
            # Backlog of pending connections
            self.ser.listen(5)
        except error as e:
            print("[-]The local service : " + str(e))
            return "[-]The local service : " + str(e)
        while True:
            try:
                client, addr = self.ser.accept()
            except error as e:
                print("[-]Local receiving client : " + str(e))
                return "[-]Local receiving client : " + str(e)
            print('[*]accept %s connect' % (addr,))
            try:
                self._serve_client(client)
            finally:
                # Always release the client socket, whatever happened above.
                client.close()

    def _serve_client(self, client):
        """Relay one request/response exchange for an accepted client socket."""
        try:
            data = client.recv(1024)
        except error as e:
            print("[-]Local receiving client : " + str(e))
            return
        if not data:
            # The client closed without sending anything; this must not stop
            # the whole server, so just go back to accepting.
            return
        # The timestamp is generated at log time, not taken from a value
        # frozen at import.
        print('[*' + time.asctime() + ']: Accept data...')
        mbsocket, proxyip = self._connect_upstream()
        try:
            try:
                # sendall: a plain send() may transmit only part of the buffer.
                mbsocket.sendall(data)
            except error as e:
                print("[-]Sent to the proxy server : " + str(e))
                return
            self._relay_response(mbsocket, client, proxyip)
        finally:
            mbsocket.close()

    def _connect_upstream(self):
        """Pick random proxies until one accepts a connection; return ``(socket, proxy)``."""
        while True:
            proxyip = random.choice(self.proxyip)
            print("[!]Now proxy ip:" + str(proxyip))
            mbsocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                mbsocket.settimeout(3)
                mbsocket.connect((proxyip[0], proxyip[1]))
            except (error, OverflowError, TypeError, ValueError):
                # Unreachable proxy or malformed entry: release the socket
                # that failed and try another proxy.
                mbsocket.close()
                print("[-]RE_Connect...")
                continue
            return mbsocket, proxyip

    def _relay_response(self, mbsocket, client, proxyip):
        """Copy the proxy's response to the client until EOF, an error, or prolonged silence."""
        idle_timeouts = 0
        while True:
            try:
                data_1 = mbsocket.recv(1024)
            except socket.timeout as e:
                print(proxyip)
                print("[-]Back to the client : " + str(e))
                idle_timeouts += 1
                if idle_timeouts >= self.MAX_IDLE_TIMEOUTS:
                    # A proxy that keeps the connection open but stays silent
                    # must not block the next client forever.
                    break
                continue
            except error as e:
                print("[-]Back to the client : " + str(e))
                break
            if not data_1:
                break
            idle_timeouts = 0
            print('[*' + time.asctime() + ']: Send data...')
            try:
                client.sendall(data_1)
            except error as e:
                # The client went away; stop relaying for this connection.
                print("[-]Back to the client : " + str(e))
                break
def Loadips():
    """Load upstream proxy addresses from ``ips.txt`` in the current directory.

    Each useful line has the form ``host:port``. Blank lines are ignored, and
    lines that cannot be parsed are reported and skipped.

    Returns:
        A list of ``(host, port)`` tuples, with ``port`` converted to ``int``.
    """
    print("[*]Loading proxy ips..")
    ip_list = []
    with open("ips.txt") as ips:
        for line in ips:
            entry = line.strip()
            if not entry:
                continue
            # Split on the last colon so that only the trailing part is
            # treated as the port.
            host, sep, port_text = entry.rpartition(":")
            try:
                # int() instead of eval(): the file is external data and must
                # never be executed as code.
                port = int(port_text)
            except ValueError:
                port = None
            if not sep or not host or port is None:
                print("[-]Skip invalid proxy : " + entry)
                continue
            ip_list.append((host, port))
    return ip_list
def main():
    """Print the banner, load the proxy list and run the forwarding server.

    Returns:
        ``None`` once the server thread has finished, or an error message
        string when no proxy could be loaded or starting the server raised.
    """
    # Raw string: the ASCII art is full of backslashes. In a normal literal
    # they are invalid escape sequences, and a backslash at the end of a line
    # silently joins that line with the next one.
    print(r'''*Atuhor : V@1n3R.
*Blog :http://www.Lz1y.cn
*date: 2017.7.17
*http://www.Lz1y.cn/wordpress/?p=643
__ __ _ _____ ____
\ \ / /_ _/ |_ __ |___ /| _ \
\ \ / / _` | | '_ \ |_ \| |_) |
\ V / (_| | | | | |___) | _ < _
\_/ \__,_|_|_| |_|____/|_| \_(_)
''')
    ip_list = Loadips()
    if not ip_list:
        # Nothing to forward to; report it instead of starting a server that
        # cannot serve any request.
        print("[-]main : no proxy loaded from ips.txt")
        return "[-]main : no proxy loaded from ips.txt"
    try:
        pst = ProxyServerTest(ip_list)
        # Run the forwarding loop in a worker thread
        t = threading.Thread(target=pst.run, name='LoopThread')
        print('[*]Waiting for connection...')
        t.start()
        # Block until the forwarding loop ends
        t.join()
    except Exception as e:
        print("[-]main : " + str(e))
        return "[-]main : " + str(e)
# Start the forwarder only when this file is executed as a script.
if __name__ == "__main__":
    main()
sqlmap 通过 --proxy 参数指向上面的本地转发服务即可,例如: sqlmap -u <目标URL> --proxy=http://192.168.3.17:9999
注意:免费代理池中有些 IP 质量较差,可能会连接不上。