第十六天-爬虫selenium库

1.介绍

2.使用 selenium

1.安装

2.使用

1.测试打开网页，抓取雷速体育日职乙信息

2.通过xpath查找

3.输入文本框内容 send_keys

4.点击事件 click

5.获取网页源码：

6.获取cookies

7.selenium提供的元素定位方式：8种

8.控制浏览器前进、后退、刷新

9.控制鼠标

10. 设置等待

11.设置后台运行

12.后台终止

3.实战

1.介绍

1. selenium是一个用于web应用程序自动化测试工具，Selenium测试直接运行在浏览器中；

2.像真正的用户在操作一样，驱动浏览器执行特定的动作，如点击、下拉等操作；

3.selenium支持浏览器

4.支持的语言

5.selenium在爬虫的应用

2.使用 selenium

1.安装

pip3 install selenium

2.使用

1.测试打开网页，抓取雷速体育日职乙信息

# coding:utf-8

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# Launch a Chrome browser session.
webdriver_chrome = webdriver.Chrome()
# Maximize the browser window.
webdriver_chrome.maximize_window()
# Open the J2 League data page.
webdriver_chrome.get("https://www.leisu.com/data/zuqiu/comp-568/season-11286")
# The page opens on the promotion play-off stage: print its home teams.
for home in webdriver_chrome.find_elements(By.XPATH, "//td[@class='home']/a"):
    print("升级附加赛：主队信息:", home.text)
# Click the stage selector to switch to the league stage.
# The locator string is an XPath expression, so the strategy is By.XPATH.
webdriver_chrome.find_element(By.XPATH, "//div[@class='stage_name']").click()
# Print the home teams of the league stage.
for home in webdriver_chrome.find_elements(By.XPATH, "//td[@class='home']/a"):
    print("联赛：主队信息:", home.text)

# Print the page title and the cookies of the current session.
print("标题：", webdriver_chrome.title)
print("获取cookie", webdriver_chrome.get_cookies())
# page_source is a property (a str), not a method: read it without parentheses.
print("获取页面源码", webdriver_chrome.page_source)

# Keep the browser open for 5 seconds.
time.sleep(5)

# Close the browser and end the driver session.
webdriver_chrome.quit()

2.通过xpath查找

webdriver_chrome.find_elements(By.XPATH, "//td[@class='home']/a")

3.输入文本框内容 send_keys

webdriver_chrome.find_element(By.XPATH,"//input[@id='']").send_keys("内容")

4.点击事件 click

webdriver_chrome.find_element(By.XPATH,"//input[@id='']").click()

5.获取网页源码：

# page_source is a property (a str), not a method: read it without parentheses.
webdriver_chrome.page_source

6.获取cookies

webdriver_chrome.get_cookies()

7.selenium提供的元素定位方式：8种

老版本使用（Selenium 4.3 起已移除）：find_element_by_xpath("原始值")、find_element_by_id("原始值") 等 find_element_by_* 方法

新版使用

find_elements(By.XPATH, "原始值")

find_elements(By.ID, "原始值")

find_elements(By.CLASS_NAME, "原始值")

等等

8.控制浏览器前进、后退、刷新：分别调用 webdriver_chrome.forward()、webdriver_chrome.back()、webdriver_chrome.refresh()

9.控制鼠标

import time
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By


# Start Chrome and maximize its window.
webdriver_chrome = webdriver.Chrome()
webdriver_chrome.maximize_window()

# Load the Baidu home page.
webdriver_chrome.get("https://www.baidu.com/")

# Locate the settings entry by its id and print its text.
settings_entry = webdriver_chrome.find_element(By.ID, "s-usersetting-top")
print(settings_entry.text)

# Queue a "move the mouse onto the element" action, then execute it.
hover_action = ActionChains(webdriver_chrome).move_to_element(settings_entry)
hover_action.perform()

# Keep the browser open for 5 seconds, then close it.
time.sleep(5)
webdriver_chrome.quit()

10. 设置等待

1. 使用场景：有时候需要等某些元素加载后进行操作，或者网络原因需要加载；

2.等待分为2种方式：显式等待和隐式等待

3.显式等待代码：打开百度，输入内容

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Start Chrome and maximize its window.
webdriver_chrome = webdriver.Chrome()
webdriver_chrome.maximize_window()

# Load the Baidu home page.
webdriver_chrome.get("https://www.baidu.com/")

# Explicit wait.
# WebDriverWait arguments: the driver, the timeout (5) and the polling
# interval (0.5). until() is given an expected condition; here the condition
# is that the element with id "kw" is present in the page.
search_box_locator = (By.ID, "kw")
waiter = WebDriverWait(webdriver_chrome, 5, 0.5)
search_box = waiter.until(EC.presence_of_element_located(search_box_locator))

# Type the search text into the element that was found.
search_box.send_keys("Python")

# Keep the browser open for 5 seconds, then close it.
time.sleep(5)
webdriver_chrome.quit()

4.隐式等待代码：打开百度，输入内容

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import  NoSuchElementException

# Start Chrome and maximize its window.
webdriver_chrome = webdriver.Chrome()
webdriver_chrome.maximize_window()

# Implicit wait: set once on the driver, before any element lookup.
webdriver_chrome.implicitly_wait(5)

# Load the Baidu home page.
webdriver_chrome.get("https://www.baidu.com/")

# NOTE(review): "kw1" is presumably a non-existent id chosen to trigger the
# failure path; the explicit-wait example uses "kw".
try:
    search_box = webdriver_chrome.find_element(By.ID, "kw1")
    search_box.send_keys("python")
except NoSuchElementException as exc:
    print("超时没有找到元素:", exc)

# Keep the browser open for 5 seconds, then close it.
time.sleep(5)
webdriver_chrome.quit()

11.设置后台运行

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Chrome command-line switches, keyed by a descriptive label.
options = {
    "headless": "--headless",
    "no_sandbox": "--no-sandbox",
    "gpu": "--disable-gpu"
}
chrome_options = Options()
# Register every switch on the Options object; a switch that is never added
# has no effect, and the browser would start with a visible window.
for argument in options.values():
    chrome_options.add_argument(argument)

driver = webdriver.Chrome(options=chrome_options)

12.后台终止

1.如运行异常可使用任务管理器，找到进程“chromedriver.exe”结束进程

3.实战

1. 自动爬取比赛信息：彩票500

2.自动翻页

3.导出到excel中

# coding:utf-8
import xlsxwriter
import time
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class caipao500_project():
    """Scrape match results round by round and export them to an Excel file.

    The scraper opens a league page in Chrome, parses the result table of the
    displayed round, then walks the round selector down to round 1, appending
    the rows of every round to one worksheet.
    """

    # XPath template for the selector link of a single round.
    _ROUND_LINK_XPATH = "//div[@class='lsaiguo_round_list_wrap_in']/ul/li/a[@data-group={}]"

    def __init__(self, chrome_options):
        """Start Chrome with the given options and maximize the window.

        :param chrome_options: options object passed to ``webdriver.Chrome``
        """
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.maximize_window()

    def open_page(self, url):
        """Open the page and export all rounds from the displayed one down to 1.

        :param url: page address
        :return: True when scraping finished, False when the page did not
            show the expected link within the timeout
        """
        # Imported locally so this class does not rely on a file-level import.
        from selenium.common.exceptions import TimeoutException

        print("打开页面：{}".format(url))
        self.driver.get(url)
        try:
            # until() raises TimeoutException on failure instead of returning
            # a falsy value, so the failure path has to be an except branch.
            WebDriverWait(self.driver, 5, 0.5).until(
                EC.presence_of_element_located((By.LINK_TEXT, "联赛赛程")))
        except TimeoutException:
            print("打开页面失败")
            return False

        print("打开页面成功")
        self.create_excel()
        try:
            # parse_html returns the round number shown on the page.
            round_num = self.parse_html(self.driver.page_source)
            while True:
                round_num = round_num - 1
                # <= 0 also stops the loop when parse_html found no rows (0).
                if round_num <= 0:
                    break
                self._open_round(round_num)
                # NOTE(review): the page source is read right after the click;
                # confirm the table has been refreshed by then.
                round_num = self.parse_html(self.driver.page_source)
        finally:
            # The workbook is only written to disk on close(); closing here
            # keeps the rows collected so far even if scraping fails.
            self.book.close()
        return True

    def _open_round(self, round_num):
        """Click the selector link of ``round_num``, paging the selector if needed.

        :param round_num: round number used as the link's ``data-group`` value
        """
        locator = (By.XPATH, self._ROUND_LINK_XPATH.format(round_num))
        wait = WebDriverWait(self.driver, 5, 0.5)
        round_link = wait.until(EC.presence_of_element_located(locator))
        if not round_link.text:
            # An empty text presumably means the link sits on a selector page
            # that is not displayed yet: click the paging arrow, then give
            # the selector 5 seconds to move.
            self.driver.find_element(By.XPATH, "//a[@class='itm_arrow itm_arrow_up']").click()
            time.sleep(5)
            round_link = wait.until(EC.presence_of_element_located(locator))
        round_link.click()

    def create_excel(self):
        """Create the workbook, its worksheet and the header row.

        The file name is a UTC timestamp followed by "文件.xlsx".
        """
        self.book = xlsxwriter.Workbook(time.strftime("%Y%m%d%H%M%S", time.gmtime()) + "文件.xlsx")
        self.sheet = self.book.add_worksheet("sheet1")
        # Index of the next worksheet row to write; row 0 holds the header.
        self.curr_row = 1
        title_data = ("轮次", "时间", "主队", "全场比分", "全场总分", "半场比分", "半场总分", "客队")
        for index, title_datum in enumerate(title_data):
            self.sheet.write(0, index, title_datum)

    @staticmethod
    def _score_total(parts):
        """Return the sum of the first two score parts, or "" if unavailable.

        :param parts: sequence of score strings, e.g. ``["2", "1"]``
        :return: int total, or an empty string when fewer than two parts are
            given or a part is not an integer
        """
        if len(parts) < 2:
            return ""
        try:
            return int(parts[0]) + int(parts[1])
        except ValueError:
            return ""

    @staticmethod
    def _extract_half_score(text):
        """Return the text between the first "(" and the next ")".

        :param text: cell text expected to contain something like "(1:0)"
        :return: the text inside the parentheses, or "" when either
            parenthesis is missing
        """
        start = text.find("(")
        end = text.find(")", start + 1)
        if start == -1 or end == -1:
            return ""
        return text[start + 1:end]

    def parse_html(self, content):
        """Parse the result table and append one worksheet row per match.

        Rows lacking the round, home-team or away-team cell are skipped.
        Score totals are left empty when the scores cannot be read.

        :param content: page source
        :return: round number of the last row written, or 0 when no row
            was written
        """
        html = etree.HTML(content)
        table_trs = html.xpath("//table[@class='lsaiguo_list ltable jTrHover']/tbody/tr")
        last_round = None
        for tr in table_trs:
            round_cells = tr.xpath("./td[1]/text()")
            home_cells = tr.xpath("./td[3]/a/text()")
            away_cells = tr.xpath("./td[5]/a/text()")
            if not (round_cells and home_cells and away_cells):
                continue

            # Named match_time so the local does not shadow the time module.
            match_time = "".join(tr.xpath("./td[2]/text()"))
            # Full-time score: the two <span> texts of the fourth cell.
            whole_score_array = tr.xpath("./td[4]/span/text()")
            whole_score = ":".join(whole_score_array)
            # Half-time score: the parenthesized text of the same cell.
            half_score = self._extract_half_score("".join(tr.xpath("./td[4]/text()")))

            row_content = {
                "round_num": round_cells[0],
                "time": match_time,
                "home": home_cells[0],
                "whole_score": whole_score,
                "whole_score_total": self._score_total(whole_score_array),
                "half_score": half_score,
                "half_score_total": self._score_total(half_score.split(":")),
                "away": away_cells[0]
            }
            print("row:", row_content)
            # Dict order matches the header columns written by create_excel.
            for index, value in enumerate(row_content.values()):
                self.sheet.write(self.curr_row, index, value)
            self.curr_row += 1
            last_round = round_cells[0]
        return int(last_round) if last_round is not None else 0


if __name__ == '__main__':
    # Chrome command-line switches, keyed by a descriptive label.
    # To run without a visible window, also add "--headless",
    # "--no-sandbox" and "--disable-gpu" here.
    options = {
        "proxy-server": "--proxy-server=https://121.37.201.60:8118"
    }
    chrome_options = Options()
    for k, v in options.items():
        print("设置浏览器参数:{}:{}".format(k, v))
        chrome_options.add_argument(v)
    scraper = caipao500_project(chrome_options=chrome_options)
    try:
        scraper.open_page("https://liansai.500.com/zuqiu-6779/jifen-19426/")
    finally:
        # Always end the session so no chrome/chromedriver process is left
        # running, even when scraping raises.
        scraper.driver.quit()