我们以这个网站为例: http://120.86.191.138/hbgs/zwgk/dirData.do?dirId=402881204e959150014e959f42f30014&subjectId=93e889f2501d3fe8015024305bdf0efc
往后点到第四页后会出现验证码
一.获取到背景图片和缺口图片
我们发现背景图片和缺口图片都是以base64编码的形式,由接口在响应中直接返回的
二.识别缺口位置输出x距离
# Decode the base64 payloads into raw image bytes (both names are rebound in place).
img_data = base64.b64decode(img_data) # background image bytes (noted as png)
img_btn_data = base64.b64decode(img_btn_data) # gap/slider image bytes (noted as png)
# Load both byte strings as PIL images.
img = Image.open(io.BytesIO(img_data))
img_btn = Image.open(io.BytesIO(img_btn_data))
# Locate the gap; only element 0 of identify_gap's result is kept as `left`.
left = identify_gap(img, img_btn, bg_params, gp_params)[0]
print(left)
identify_gap.py
import cv2
import numpy as np
def identify_gap(background, gap, bg_edge_list: list[int], gp_edge_list: list[int]):
    """Find where the gap image best matches inside the background image.

    Both inputs are converted with ``np.array``, reduced to edge maps with
    ``cv2.Canny``, and the gap's edge map is then searched for inside the
    background's edge map using ``cv2.matchTemplate`` with
    ``cv2.TM_CCOEFF_NORMED``.

    :param background: background image object, e.g. the result of
        ``cv2.imread('opencv_logo.jpg', 0)`` or
        ``PIL.Image.open('opencv_logo.jpg')``
    :param gap: gap (slider) image object, same kinds as ``background``
    :param bg_edge_list: two values, e.g. ``[50, 100]``, passed as the first
        and second threshold of ``cv2.Canny`` for the background
    :param gp_edge_list: two values, e.g. ``[50, 100]``, passed as the first
        and second threshold of ``cv2.Canny`` for the gap image
    :return: the ``max_loc`` reported by ``cv2.minMaxLoc`` for the match
        result; callers read the x coordinate from index 0
    """
    # Turn whatever image objects were passed in into plain arrays.
    bg_pixels = np.array(background)
    gap_pixels = np.array(gap)

    # Edge detection, each image with its own pair of thresholds.
    bg_edges = cv2.Canny(bg_pixels, bg_edge_list[0], bg_edge_list[1])
    gap_edges = cv2.Canny(gap_pixels, gp_edge_list[0], gp_edge_list[1])

    # Expand the single-channel edge maps to three channels before matching.
    bg_edges_rgb = cv2.cvtColor(bg_edges, cv2.COLOR_GRAY2RGB)
    gap_edges_rgb = cv2.cvtColor(gap_edges, cv2.COLOR_GRAY2RGB)

    scores = cv2.matchTemplate(bg_edges_rgb, gap_edges_rgb, cv2.TM_CCOEFF_NORMED)
    # minMaxLoc yields (min_val, max_val, min_loc, max_loc); only max_loc is needed.
    return cv2.minMaxLoc(scores)[3]
三.完整代码
import base64
import io
import re
import requests
from PIL import Image
from packages.scrapy_spiders.utils.captcha.compare_image import identify_gap
def getlist_cookie(yzm_res) -> dict[str, str]:
    """Extract cookie name/value pairs from a response's ``Set-Cookie`` header.

    The header text is split on ``,`` into individual cookies; for each piece
    only the part before the first ``;`` (the ``name=value`` pair) is used, so
    attributes such as ``Path`` or ``Expires`` are ignored.

    :param yzm_res: response object exposing a dict-like ``headers`` attribute
        (here: the response of the captcha image request)
    :return: mapping of cookie name to cookie value; empty when the response
        carries no ``Set-Cookie`` header
    """
    extracted_cookies = {}
    # A response without Set-Cookie simply yields no cookies instead of a KeyError.
    raw_header = yzm_res.headers.get("Set-Cookie", "")
    for cookie in raw_header.split(","):
        name_value = cookie.split(";", 1)[0]
        # Split on the FIRST "=" only: values may themselves contain "="
        # (e.g. base64 padding) and must not be dropped.
        key, sep, value = name_value.partition("=")
        if sep:
            # Pieces without "=" (e.g. the tail of an Expires date that was
            # cut at its comma) are not cookies and are skipped.
            extracted_cookies[key.strip()] = value.strip()
    return extracted_cookies
def generate_random_string(timeout: float = 10):
    """Fetch the captcha index page and return the ``verify`` value found in it.

    Despite the name, nothing is generated locally: the page at
    ``/hbgs/verify/index.do`` is downloaded and the text between single quotes
    following ``verify :`` is extracted with a regular expression.

    :param timeout: seconds passed to ``requests.get`` so that an unresponsive
        server cannot block the call indefinitely
    :return: the extracted ``verify`` string, or ``None`` when the page
        contains no match
    """
    htmlurl = 'http://120.86.191.138/hbgs/verify/index.do'
    response = requests.get(htmlurl, timeout=timeout).text
    # Matches  verify : '<value>'  with optional whitespace around the colon;
    # group 1 is the text between the single quotes.
    pattern = r"verify\s*:\s*'([^']*)'"
    match = re.search(pattern, response)
    if match is None:
        print("未找到匹配的verify值")
        return None
    value = match.group(1)
    print("提取的verify值为:", value)
    return value
# Step 1: obtain the verify token from the captcha page.
# generate_random_string() returns None when no token is found; in that case
# the f-string below would produce "verify=None".
ssst=generate_random_string()
# Step 2: request the captcha images for that token.
yzm_url=f'http://120.86.191.138/hbgs/verify/get_img_verify.do?verify={ssst}'
res=requests.get(yzm_url)
# Keep the cookies set by the image request; they are sent with the check request.
cookies = getlist_cookie(res)
# From here on `res` is the parsed JSON body, not the Response object.
res=res.json()
# Parameter pairs handed to identify_gap for the background and the gap image.
bg_params = [100, 200]
gp_params = [100, 200]
# Both images arrive base64-encoded under these JSON keys.
img_data=res["oriCopyImage"]
img_btn_data=res["newImage"]
img_data = base64.b64decode(img_data) # jpg
img_btn_data = base64.b64decode(img_btn_data) # png
# Load the images
img = Image.open(io.BytesIO(img_data))
img_btn = Image.open(io.BytesIO(img_btn_data))
# Locate the gap; element 0 of the result is used as the X value
left = identify_gap(img, img_btn, bg_params, gp_params)[0]
print(left)
# Step 3: submit the detected X position to the check endpoint.
dataurl='http://120.86.191.138/hbgs/verify/check.do'
header={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
}
# NOTE(review): both IY and Y are filled from res["Y"] - confirm this is intended.
formdata=f'verify={ssst}&IY={res["Y"]}&X={left}&Y={res["Y"]}'
res2=requests.post(dataurl, formdata,cookies=cookies,headers=header).json()
print(res2)
# Step 4: request the list data, passing res2["data"] as the vcode field.
# The page number (page=6), dirId and subjectId are hard-coded in the form body.
itemurl='http://120.86.191.138/hbgs/zwgk/item.do'
formdata=f'page=6&rows=20&HBTB_XH=&HBTB_XH_END=&HBTB_XMMC=&HBTB_SPWH=&HBTB_TXDZ=&HBTB_GSSJ=&HBTB_GSSJ_END=&dirId=402881204e959150014e95bb85b5010f&subjectId=93e889f2501d3fe8015024305bdf0efc&backPage=&vcode={res2["data"]}'
# NOTE(review): unlike the check request, this POST passes neither cookies nor
# the Content-Type header - confirm the server accepts it this way.
res=requests.post(itemurl, formdata)
print(res.text)