selenium模块和爬虫之间的关联
- 便捷的获取网站中动态加载的数据
- 便捷实现模拟登录
什么是selenium模块
基于浏览器自动化的一个模块
selenium使用流程:
- 环境安装:pip install selenium
- 下载一个浏览器的驱动程序(谷歌浏览器)
- 下载路径:http://chromedriver.storage.googleapis.com/index.html
资源绑定了Chrome116对应的chromedriver.exe
selenium的操作代码:
- 发起请求:
get(url)
- 标签定位:
find_element(By.属性, value='')
value是定位所用的属性值
- 标签交互:
send_keys('xxx')
- 执行js程序:
execute_script('jsCode')
- 前进、后退:
back(), forward()
- 关闭浏览器:
quit()
标签定位中By的属性有:
ID = "id"
XPATH = "xpath"
LINK_TEXT = "link text"
PARTIAL_LINK_TEXT = "partial link text"
NAME = "name"
TAG_NAME = "tag name"
CLASS_NAME = "class name"
CSS_SELECTOR = "css selector"
- selenium处理iframe:
- 如果要定位的标签位于iframe中(即嵌套在当前页面里的子页面),必须先使用
switch_to.frame(id)切换定位作用域,然后再进行标签定位
- 动作链(拖动):
from selenium.webdriver import ActionChains
- 实例化一个动作链对象:action = ActionChains(bro)
- click_and_hold(div):点击并长按指定的标签(按住鼠标左键不松开)
- move_by_offset(x,y)
- perform()让动作链立即执行
- action.release():松开按住的鼠标按键(同样需要调用perform()才会执行)
selenium的常用操作自动化
找到输入框
找到搜索按钮
import time
from selenium import webdriver
from time import sleep
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# Demo script: drive Chrome through a Taobao search, scroll the page with
# JavaScript, then exercise back/forward navigation between two sites.
service = Service(executable_path='./chromedriver.exe')
browser = webdriver.Chrome(service=service)
try:
    browser.get('https://www.taobao.com/')

    # Element lookup: the search box is the element whose id is "q".
    search_input = browser.find_element(By.ID, value='q')
    # Element interaction: type the search keyword into the box.
    search_input.send_keys('iphone')

    # Run a JavaScript snippet that scrolls to the bottom of the page.
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)

    # Click the search button. A ".classname" CSS selector matches the
    # elements that carry that class name.
    btn = browser.find_element(By.CSS_SELECTOR, '.btn-search')
    btn.click()
    sleep(5)

    browser.get('https://www.baidu.com')
    sleep(2)
    # Go back one step in the browser history.
    browser.back()
    sleep(2)
    # Go forward again.
    browser.forward()
    sleep(2)
finally:
    # Always shut the browser down, even when one of the lookups above
    # raises; otherwise the browser and driver processes are left running.
    browser.quit()
"""
By有的属性:
ID = "id"
XPATH = "xpath"
LINK_TEXT = "link text"
PARTIAL_LINK_TEXT = "partial link text"
NAME = "name"
TAG_NAME = "tag name"
CLASS_NAME = "class name"
CSS_SELECTOR = "css selector"
"""
04.动作链和iframe的处理.py
from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# Demo script: drag an element that lives inside an iframe by using an
# ActionChains sequence (press and hold, move in small steps, release).
service = Service(executable_path='./chromedriver.exe')
browser = webdriver.Chrome(service=service)
try:
    browser.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # An element nested inside an iframe can only be located after the
    # lookup scope is switched to that iframe; the argument is the iframe's id.
    browser.switch_to.frame('iframeResult')
    div = browser.find_element(By.ID, 'draggable')

    # Build the action chain and queue a press-and-hold on the element.
    action = ActionChains(browser)
    action.click_and_hold(div)

    for _ in range(5):
        # move_by_offset(x, y): x is the horizontal offset, y the vertical one.
        # perform() executes the queued actions immediately.
        action.move_by_offset(18, 0).perform()
        sleep(.5)

    # Release the held mouse button.
    action.release().perform()
    sleep(5)
finally:
    # Close the browser even if a step above fails.
    browser.quit()
05.模拟登录QQ空间
import time
from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# Demo script: open the QQ Zone login page, switch to password login and
# submit an account/password pair.
# Browser driver
service = Service(executable_path='./chromedriver.exe')
browser = webdriver.Chrome(service=service)
browser.get('https://qzone.qq.com/')

# The login form sits inside an iframe, so switch the lookup scope to it
# first (the argument is the iframe's id).
browser.switch_to.frame('login_frame')

# Switch from the default login mode to account/password login.
password_login = browser.find_element(By.ID, 'switcher_plogin')
password_login.click()

# Named *_input so the builtin `id` is not shadowed.
account_input = browser.find_element(By.ID, 'u')
password_input = browser.find_element(By.ID, 'p')
login_btn = browser.find_element(By.ID, 'login_button')

# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a local, untracked config file.
account_input.send_keys("1457154996")
password_input.send_keys("12346")
login_btn.click()
# NOTE(review): the session is never quit here; presumably it is left open
# so the logged-in page can be inspected -- confirm.
谷歌无头浏览器+反检测
from selenium import webdriver
from time import sleep
from selenium.webdriver.chrome.service import Service
#实现无可视化界面导入的包
from selenium.webdriver.chrome.options import Options
#实现规避检测
from selenium.webdriver import ChromeOptions
# Demo script: run Chrome without a visible window and with the
# anti-detection switch applied, then print the page source of Baidu.

# Headless (no visible window) settings.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Detection-evasion setting. It is added to the same options object that is
# handed to the driver; a separate, never-passed options object would have
# no effect on the launched browser.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

service = Service(executable_path='./chromedriver')
bro = webdriver.Chrome(service=service, options=chrome_options)
try:
    # No window is shown (headless browser, the role PhantomJS used to play).
    bro.get('https://www.baidu.com')
    print(bro.page_source)
    sleep(2)
finally:
    # Close the browser even if loading or printing the page fails.
    bro.quit()
07.基于selenium实现12306模拟登录.py
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
#实现无可视化界面导入的包
from selenium.webdriver.chrome.options import Options
from time import sleep
# Demo script: log in to 12306 with Selenium, then fill in the follow-up
# verification form (ID-card digits plus a verification code typed by the user).

# Headless settings plus the detection-evasion switch.
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# Browser options must be given to webdriver.Chrome; Service only describes
# how the chromedriver executable is started.
service = Service(executable_path='./chromedriver.exe')
browser = webdriver.Chrome(service=service, options=options)
browser.get('https://kyfw.12306.cn/otn/resources/login.html')
sleep(1)

# Named username_input so the builtin `id` is not shadowed.
username_input = browser.find_element(By.ID, 'J-userName')
password = browser.find_element(By.ID, 'J-password')
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a local, untracked config file.
username_input.send_keys("1457154996")
password.send_keys("liu1457154996")
login_in = browser.find_element(By.ID, 'J-login')
login_in.click()

# The verification form is looked up through the same driver session. An
# earlier attempt opened
# https://kyfw.12306.cn/passport/web/checkLoginVerify in a second driver.
browser_verify = browser
# ID-card input box.
id_card_input = browser_verify.find_element(By.ID, 'id_card')
id_card_input.send_keys("2410")
get_verify_code = browser_verify.find_element(By.ID, 'verification_code')
sleep(1)
get_verify_code.click()

# The verification code is typed in by the user at the console.
verify_code = browser_verify.find_element(By.ID, 'code')
verify_code.send_keys(input("请输入验证码"))
sleep(10)
verify = browser_verify.find_element(By.ID, 'sureClick')
verify.click()
# Known issue noted by the original author: requesting the verification
# code fails.
07.基于selenium实现12306模拟登录.py
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
#实现无可视化界面导入的包
from selenium.webdriver.chrome.options import Options
from time import sleep
# Demo script: log in to 12306 with Selenium, then fill in the follow-up
# verification form (ID-card digits plus a verification code typed by the user).

# Headless settings plus the detection-evasion switch.
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# Browser options must be given to webdriver.Chrome; Service only describes
# how the chromedriver executable is started.
service = Service(executable_path='./chromedriver.exe')
browser = webdriver.Chrome(service=service, options=options)
browser.get('https://kyfw.12306.cn/otn/resources/login.html')
sleep(1)

# Named username_input so the builtin `id` is not shadowed.
username_input = browser.find_element(By.ID, 'J-userName')
password = browser.find_element(By.ID, 'J-password')
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a local, untracked config file.
username_input.send_keys("1457154996")
password.send_keys("liu1457154996")
login_in = browser.find_element(By.ID, 'J-login')
login_in.click()

# The verification form is looked up through the same driver session. An
# earlier attempt opened
# https://kyfw.12306.cn/passport/web/checkLoginVerify in a second driver.
browser_verify = browser
# ID-card input box.
id_card_input = browser_verify.find_element(By.ID, 'id_card')
id_card_input.send_keys("2410")
get_verify_code = browser_verify.find_element(By.ID, 'verification_code')
sleep(1)
get_verify_code.click()

# The verification code is typed in by the user at the console.
verify_code = browser_verify.find_element(By.ID, 'code')
verify_code.send_keys(input("请输入验证码"))
sleep(10)
verify = browser_verify.find_element(By.ID, 'sureClick')
verify.click()
# Known issue noted by the original author: requesting the verification
# code fails.