Crawling multiple websites with Selenium, driven from a click-to-crawl GUI


Selenium crawler code

Each target site gets its own method on the AgriInfoSpider class below, and every method takes a threading.Event (stop_event) so the GUI can interrupt a running crawl.

webcrawl.py

import re
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class AgriInfoSpider:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # 无界面模式
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--no-sandbox")
        service = Service(executable_path='C:\\Users\\Administrator\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe')
        self.driver = webdriver.Chrome(service=service, options=chrome_options)  # Selenium 4 passes the driver path via Service

    def save_to_json(self, item, filename):
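        # Appends one JSON object per line (JSON Lines), so result files can be re-read line by line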
        with open(filename, 'a', encoding='utf-8') as f:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')

    def close(self):
        self.driver.quit()

    # TODO
    # Agronet (agronet.com.cn) - Agricultural Tech
    def agronet(self, stop_event):
        self.driver.get('http://www.agronet.com.cn/Tech/List.html')
        self.driver.implicitly_wait(60)
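        # implicitly_wait is driver-wide: every find_element/find_elements call retries for up to 60 s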

        # Get the industry categories
        industries = self.driver.find_elements(By.XPATH, '//dl[@class="product_classification_nav"]/dd/ul/li/a')
        item = {}
        order = 0
        # Click through each industry
        for m, industry in enumerate(industries):
            if stop_event.is_set():
                break
            item["industry"] = industry.text
            industry.click()
            # Make sure the right page has loaded
            # WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//dl[@class="arrow_700"]/dt/span/em[2]'), item["industry"]))

            while True:
                if stop_event.is_set():
                    break
                # Fetch the article list for the current page (re-fetched per page to avoid stale elements)
                articles = self.driver.find_elements(By.XPATH, '//dl[@class="arrow_700"]/dd/ul/li')
                for i, article in enumerate(articles):
                    if stop_event.is_set():
                        break
                    item["order"] = order

                    try:
                        WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//dl[@class="arrow_700"]/dt/em'), "农业技术文章列表"))
                    except TimeoutException:
                        continue
                    # Title, date and source of the i-th list entry
                    article = self.driver.find_elements(By.XPATH, '//dl[@class="arrow_700"]/dd/ul/li/span/a')[i]
                    item["title"] = article.text
                    item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2}', self.driver.find_elements(By.XPATH, '//dl[@class="arrow_700"]/dd/ul/li/div')[i].text).group()
                    item["source"] = self.driver.find_elements(By.XPATH, '//dl[@class="arrow_700"]/dd/ul/li/em')[i].text

                    # Click the article
                    article.click()

                    # Collect all open window handles
                    window_handles = self.driver.window_handles
                    # Switch to the new tab
                    self.driver.switch_to.window(window_handles[-1])
                    try:
                        # Extract the body text
                        content = self.driver.find_elements(By.XPATH, '//div[@class="font_bottom"]/p')
                        content_lists = [c.text.strip() for c in content]
                        item["content"] = [''.join(content_lists)]
                    except Exception:
                        item["content"] = []

                    # Write to file
                    self.save_to_json(item, './results/agronet.json')

                    # Close the new tab
                    self.driver.close()

                    # Switch back to the original tab
                    self.driver.switch_to.window(self.driver.window_handles[0])

                    order += 1

                # Go to the next page
                try:
                    if stop_event.is_set():
                        break
                    next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                    next_page.click()
                    if self.driver.current_url == 'http://www.agronet.com.cn/Message/Error?aspxerrorpath=/Tech/List':
                        break
                except Exception:
                    break

    # China Farmers' Daily (farmer.com.cn) - Sannong Headlines
    def farmer(self, stop_event):
        self.driver.get('https://www.farmer.com.cn/farmer/xw/sntt/list.shtml')

        item = {}
        order = 0
        # Click through the articles page by page
        while True:
            if stop_event.is_set():
                break
            # Re-fetch the article list on every page so elements are not stale after pagination
            articles = self.driver.find_elements(By.XPATH, '//div[contains(@class, "index-font")]')
            for article in articles:
                if stop_event.is_set():
                    break
                item["order"] = order
                item["title"] = article.text
                article.click()
                # Collect all open window handles
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])
                # Make sure the article detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="index-title"]/span[3]'), "详情"))
                except TimeoutException:
                    # Close the stray tab and return to the list before skipping this article
                    if len(self.driver.window_handles) > 1:
                        self.driver.close()
                        self.driver.switch_to.window(self.driver.window_handles[0])
                    continue

                item["author"] = self.driver.find_element(By.XPATH, '//div[@class="index-introduce"]/ul/li[2]/span').text
                item["date"] = self.driver.find_element(By.XPATH, '//div[@class="index-introduce"]/ul/div/span').text
                item["source"] = self.driver.find_element(By.XPATH, '//div[@class="index-introduce"]/ul/li[1]/span').text
                content = self.driver.find_elements(By.XPATH, '//div[@class="textList"]/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write to file
                self.save_to_json(item, './results/farmer.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Go to the next page
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            except Exception:
                break

    # agri.cn (China Agriculture and Rural Affairs Information Network) - Data - Market Trends
    def agri(self, stop_event):
        self.driver.get('http://www.agri.cn/sj/scdt/')

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Get all articles on the current page
            articles = self.driver.find_elements(By.XPATH, '//ul[@class="nxw_list_ul"]/li/div/div/p[1]/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[contains(@class, "title_common") and contains(@class, "title_common_w")]'), "市场动态"))
                except TimeoutException:
                    continue

                # Click the article
                article = self.driver.find_elements(By.XPATH, '//ul[@class="nxw_list_ul"]/li/div/div/p[1]/a')[i]
                article.click()

                # Collect all open window handles
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                # Make sure the article detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="bread_nav"]/a[3]'), "市场动态"))
                except TimeoutException:
                    # Close the stray tab and return to the list before skipping this article
                    if len(self.driver.window_handles) > 1:
                        self.driver.close()
                        self.driver.switch_to.window(self.driver.window_handles[0])
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="detailCon_info_tit"]').text
                item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2}', self.driver.find_element(By.XPATH, '//div[@class="updateInfo_mess"]/span[1]').text).group()
                item["source"] = re.search(r'来源:(.+)', self.driver.find_element(By.XPATH, '//div[@class="updateInfo_mess"]/span[3]').text).group()
                try:
                    content = self.driver.find_elements(By.XPATH, '//div[contains(@class, "content_body_box") and contains(@class, "ArticleDetails")]/p')
                    content_lists = [c.text.strip() for c in content]
                    item["content"] = [''.join(content_lists)]
                except Exception:
                    item["content"] = []

                # Write to file
                self.save_to_json(item, './results/agri.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Go to the next page, unless we are already on the last one
            pages_sum = re.search(r'\d+', self.driver.find_element(By.XPATH, '//font[contains(@class, "clear") and contains(@class, "jump_con")]/span[2]').text).group()
            pages_cur = re.search(r'\d+', self.driver.find_element(By.XPATH, '//a[@class=" cur"]').text).group()

            if pages_cur != pages_sum:
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            else:
                break

    # Xinnong (xinnong.net) - Agricultural News - Industry News
    def xinnong(self, stop_event):
        self.driver.get('http://www.xinnong.net/news/hangye/list_14_1.html')
        self.driver.implicitly_wait(60)

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Get all articles on the current page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="newslist"]/ul/li/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="lsttit"]/h1'), "行业资讯"))
                except TimeoutException:
                    continue

                # Click the article
                article = self.driver.find_elements(By.XPATH, '//div[@class="newslist"]/ul/li/a')[i]
                article.click()

                # Collect all open window handles
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                # Make sure the article detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="spos"]/a[3]'), "行业资讯"))
                except TimeoutException:
                    # Close the stray tab and return to the list before skipping this article
                    if len(self.driver.window_handles) > 1:
                        self.driver.close()
                        self.driver.switch_to.window(self.driver.window_handles[0])
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="arctit"]/h1').text
                item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2}', self.driver.find_element(By.XPATH, '//div[@class="arcinfo"]').text).group()
                item["source"] = re.search(r'来源:(.+)', self.driver.find_element(By.XPATH, '//div[@class="arcinfo"]').text).group()
                content = self.driver.find_elements(By.XPATH, '//div[@class="arcont"]/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write to file
                self.save_to_json(item, './results/xinnong.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Go to the next page
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            except Exception:
                break

    # Richagri (richagri.com) - Industry News
    def richagri(self, stop_event):
        self.driver.get('http://www.richagri.com/news')

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Get all articles on the current page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="head"]/ul/li/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="head"]/div[2]/a[2]'), "行业资讯"))
                except TimeoutException:
                    continue

                # Click the article
                article = self.driver.find_elements(By.XPATH, '//div[@class="head"]/ul/li/a')[i]
                article.click()

                # Make sure the article detail page has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="head"]/div[2]/a[2]'), "行业资讯"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="head"]/b').text
                item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2}', self.driver.find_element(By.XPATH, '//div[@class="head"]/font').text).group()
                content = self.driver.find_elements(By.XPATH, '//div[@class="head"]')
                content_lists = [c.text.strip('\n') for c in content]
                content_lists = re.search(r'时间:(\d{4}-\d{1,2}-\d{1,2})\n(.*?)(?=\n回顶部)', content_lists[0], re.S)
                item["content"] = [''.join(content_lists.group(2))]

                # Write to file
                self.save_to_json(item, './results/richagri.json')

                # Navigate back to the list page
                self.driver.back()

                order += 1

            # Go to the next page
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下页")]')
                next_page.click()
            except Exception:
                break

    # Jinnong (jinnong.cn) - Analysis
    def jinnong(self, stop_event):
        self.driver.get('https://www.jinnong.cn/1002/')

        # Track article links that have already been crawled
        crawled_links = set()
        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Get all articles currently loaded
            articles = self.driver.find_elements(By.XPATH, '//ul[@class="left-side-items"]/li/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                article_link = article.get_attribute('href')
                if article_link not in crawled_links:
                    item["order"] = order
                    try:
                        WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//span[@class="current-path-2"]'), "农业市场分析"))
                    except TimeoutException:
                        continue

                    # Click the article
                    article = self.driver.find_elements(By.XPATH, '//ul[@class="left-side-items"]/li/a')[i]
                    article.click()

                    # Collect all open window handles
                    window_handles = self.driver.window_handles
                    # Switch to the new tab
                    self.driver.switch_to.window(window_handles[-1])

                    # Make sure the article detail tab has loaded
                    try:
                        WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="current-path"]/span[3]'), "正文"))
                    except TimeoutException:
                        # Close the stray tab and return to the list before skipping this article
                        if len(self.driver.window_handles) > 1:
                            self.driver.close()
                            self.driver.switch_to.window(self.driver.window_handles[0])
                        continue

                    item["title"] = self.driver.find_element(By.XPATH, '//div[@class="article-title"]/h1').text
                    item["author"] = self.driver.find_element(By.XPATH, '//div[@class="article-title"]/div/div[1]/span[3]').text
                    item["date"] = self.driver.find_element(By.XPATH, '//div[@class="article-title"]/div/div[1]/span[1]').text
                    item["source"] = re.search(r'来源:(.+)', self.driver.find_element(By.XPATH, '//div[@class="article-title"]/div/div[1]/span[2]').text).group()
                    content = self.driver.find_elements(By.XPATH, '//div[@class="article-conte-infor"]')
                    content_lists = [c.text.strip() for c in content]
                    item["content"] = [''.join(content_lists)]

                    # Write to file
                    self.save_to_json(item, './results/jinnong.json')

                    # Record the link as crawled
                    crawled_links.add(article_link)

                    # Close the new tab
                    self.driver.close()

                    # Switch back to the original tab
                    self.driver.switch_to.window(self.driver.window_handles[0])

                    order += 1

            # Click "load more"
            button = self.driver.find_element(By.CSS_SELECTOR, '.click_more a span').text
            if button == "点击加载更多":
                next_page = self.driver.find_element(By.CSS_SELECTOR, '.click_more a')
                self.driver.execute_script("arguments[0].click();", next_page)
            # Everything loaded: stop
            elif button == "已加载全部":
                break
                break

    # China Rural Revitalization Service Network (xczxfw.org.cn) - News - Industry News
    def xczxfw(self, stop_event):
        self.driver.get('http://www.xczxfw.org.cn/news/12/list')

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Get all articles on the current page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="zdhd"]/dl/dd/p/a')
            # articles = self.driver.find_elements(By.XPATH, '//dl[@class="lb_dt"]/dd/p/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order

                # Click the article
                article = self.driver.find_elements(By.XPATH, '//div[@class="zdhd"]/dl/dd/p/a')[i]
                # article = self.driver.find_elements(By.XPATH, '//dl[@class="lb_dt"]/dd/p/a')[i]
                article.click()

                # Collect all open window handles
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                # Make sure the article detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="gy_1"]/span/a[2]'), "行业资讯"))
                except TimeoutException:
                    # Close the stray tab and return to the list before skipping this article
                    if len(self.driver.window_handles) > 1:
                        self.driver.close()
                        self.driver.switch_to.window(self.driver.window_handles[0])
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="zdhd"]/h1').text
                item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2} \d{2}:\d{2}', self.driver.find_element(By.XPATH, '//div[@class="zdhd"]/h2').text).group()
                item["source"] = re.search(r'来源:(.+)', self.driver.find_element(By.XPATH, '//div[@class="zdhd"]/h2').text).group()
                content = self.driver.find_elements(By.XPATH, '//div[@class="com_de"]/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write to file
                self.save_to_json(item, './results/xczxfw.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Go to the next page
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            except Exception:
                break

    # Aweb Data Center (shuju.aweb.com.cn) - Practical Techniques
    def shujuaweb(self, stop_event):
        self.driver.get('http://shuju.aweb.com.cn/technology/technology-0-1.shtml')

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Get all articles on the current page
            articles = self.driver.find_elements(By.XPATH, '//ul[@class="newList2"]/li/a[2]')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//h2[@class="h2s"]'), "实用技术:"))
                except TimeoutException:
                    continue

                # Click the article
                article = self.driver.find_elements(By.XPATH, '//ul[@class="newList2"]/li/a[2]')[i]
                article.click()

                # Collect all open window handles
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                try:
                    # Make sure the article detail tab has loaded
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//ul[@class="sub"]/li[7]/a/span'), "实用技术"))
                except TimeoutException:
                    # Close the stray tab and return to the list before skipping this article
                    if len(self.driver.window_handles) > 1:
                        self.driver.close()
                        self.driver.switch_to.window(self.driver.window_handles[0])
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@id="content"]/div[1]/div[1]/div/p[1]').text
                item["date"] = re.search(r'\d{4}年\d{1,2}月\d{1,2}日 \d{2}:\d{2}', self.driver.find_element(By.XPATH, '//div[@id="content"]/div[1]/div[1]/div/p[2]/span').text).group()
                # Guard against a missing source field
                source_element = self.driver.find_element(By.XPATH, '//div[@id="content"]/div[1]/div[1]/div/p[2]/span')
                source_match = re.search(r'来源:(.+)', source_element.text)
                item["source"] = source_match.group(1) if source_match else ""
                content = self.driver.find_elements(By.XPATH, '//ul[@class="name"]/following-sibling::p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write to file
                self.save_to_json(item, './results/shujuaweb.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Go to the next page
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            except Exception:
                break

    # Sannong Comprehensive Information Service Platform (12316.agri.cn) - 12316 Headlines - Updates
    def agri_12316(self, stop_event):
        self.driver.get('http://12316.agri.cn/news/A12316dt/index.html')
        self.driver.implicitly_wait(60)

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Get all articles on the current page
            articles = self.driver.find_elements(By.XPATH, '//ul[@class="dongtai_list"]/table[last()]/tbody/tr/td/li/a/p')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order

                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//p[@class="weizhi"]'), "12316头条-动态"))
                except TimeoutException:
                    continue

                # Click the article
                article = self.driver.find_elements(By.XPATH, '//ul[@class="dongtai_list"]/table[last()]/tbody/tr/td/li/a/p')[i]
                article.click()

                # Make sure the article detail page has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="content"]/p[1]'), "正文"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="detail_box"]/h3').text
                item["date"] = self.driver.find_element(By.XPATH, '//p[@class="zuozhe"]/span[1]/i').text
                item["source"] = self.driver.find_element(By.XPATH, '//p[@class="zuozhe"]/span[2]/i').text
                content = self.driver.find_elements(By.XPATH, '//div[@class="news_box"]')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write to file
                self.save_to_json(item, './results/agri_12316.json')

                # Navigate back to the list page
                self.driver.back()

                order += 1

            # Go to the next page
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
                next_page.click()
            except Exception:
                break

    # Wugu (wugu.com.cn) - Nongjitong (farm-tech) section
    def wugu(self, stop_event):
        self.driver.get('http://www.wugu.com.cn/?cat=6')
        self.driver.implicitly_wait(60)

        item = {}
        order = 0

        # Get the total page count
        try:
            pages = self.driver.find_elements(By.XPATH, '//div[@class="nav-links"]/a')
            total_pages = int(pages[1].text)
        except (ValueError, IndexError):
            total_pages = 1

        for current_page in range(1, total_pages + 1):
            if stop_event.is_set():
                break
            # Open each page directly by URL
            page_url = f'http://www.wugu.com.cn/?paged={current_page}&cat=6'
            self.driver.get(page_url)

            # Get all articles on the current page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="mg-posts-sec-inner"]/article/div/h4/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="mg-breadcrumb-title"]/h1'), "分类:农技通"))
                    # Click the article
                    article = self.driver.find_elements(By.XPATH, '//div[@class="mg-posts-sec-inner"]/article/div/h4/a')[i]
                    self.driver.execute_script("arguments[0].click();", article)
                except TimeoutException:
                    continue

                # Make sure the article detail page has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="mg-header"]/div[1]/a'), "农技通"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="mg-header"]/h1/a').text
                item["date"] = self.driver.find_element(By.XPATH, '//div[@class="mg-header"]/div[2]/div/span[1]').text
                content = self.driver.find_elements(By.XPATH, '//div[@class="mg-blog-post-box"]/article/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write to file
                self.save_to_json(item, './results/wugu.json')

                # Navigate back to the list page
                self.driver.back()

                order += 1

    # New Countryside Business Network (nc.mofcom.gov.cn) - Agricultural News - Policies & Regulations
    def mofcom(self, stop_event):
        self.driver.get('http://nc.mofcom.gov.cn/nyzx/zcfg')
        self.driver.implicitly_wait(60)

        item = {}
        order = 0

        while True:
            if stop_event.is_set():
                break
            # Get all articles on the current page
            articles = self.driver.find_elements(By.XPATH, '//div[@id="showList"]/div//ul/li/h5/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[contains(@class, "w") and contains(@class, "u-nav-wrap")]/span'), "政策法规"))
                except TimeoutException:
                    continue

                # Re-fetch the article list and take the i-th entry to avoid stale references
                articles = self.driver.find_elements(By.XPATH, '//div[@id="showList"]/div//ul/li/h5/a')
                article = articles[i]

                try:
                    # Click the i-th article link via JavaScript
                    self.driver.execute_script("arguments[0].click();", article)
                except StaleElementReferenceException:
                    continue

                # Collect all open window handles
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                # Make sure the article detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[contains(@class, "w") and contains(@class, "u-nav-wrap")]'), "正文"))
                except TimeoutException:
                    # Close the stray tab and return to the list before skipping this article
                    if len(self.driver.window_handles) > 1:
                        self.driver.close()
                        self.driver.switch_to.window(self.driver.window_handles[0])
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//h3[@class="u-tt"]').text
                item["date"] = self.driver.find_element(By.XPATH, '//span[@class="u-time"]').text
                # Guard against a missing source field
                source_element = self.driver.find_element(By.XPATH, '//span[@class="u-source"]')
                source_match = re.search(r'信息来源:(.+)', source_element.text)
                item["source"] = source_match.group(1) if source_match else ""
                content = self.driver.find_elements(By.XPATH, '//div[@class="u-txt"]/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write to file
                self.save_to_json(item, './results/mofcom.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1

            # Go to the next page
            try:
                if stop_event.is_set():
                    break
                next_page = WebDriverWait(self.driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//a[contains(text(), "下一页")]')))
                self.driver.execute_script("arguments[0].click();", next_page)
            except Exception:
                break

    # CNHNB / Huinong (news.cnhnb.com) - Industry News - Market News
    def cnhnb(self, stop_event):
        self.driver.get('https://news.cnhnb.com/hqjd/?pi=1')

        item = {}
        order = 0
        while True:
            if stop_event.is_set():
                break
            # Get all articles on the current page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="latest-list"]/div/div[2]/div[1]/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="ci_crumbs"]/dl/dd[2]'), "行情解读"))
                except TimeoutException:
                    continue

                # Category label
                item["classify"] = self.driver.find_element(By.XPATH, '//span[@class="ct-s"]').text

                # Click the article
                article = self.driver.find_elements(By.XPATH, '//div[@class="latest-list"]/div/div[2]/div[1]/a')[i]
                article.click()

                # Make sure the article detail page has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="ci_crumbs"]/dl/dd[2]/a'), "行情解读"))
                except TimeoutException:
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//div[@class="title"]/h1').text
                item["date"] = re.search(r'\d{4}-\d{1,2}-\d{1,2} \d{2}:\d{2}', self.driver.find_element(By.XPATH, '//div[@class="d-tips"]').text).group()
                # Guard against a missing source field
                source_element = self.driver.find_element(By.XPATH, '//div[@class="d-tips"]')
                source_match = re.search(r'来源:([^采编]+)', source_element.text)
                item["source"] = source_match.group(1) if source_match else ""
                content = self.driver.find_elements(By.XPATH, '//div[@class="content"]/p')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write to file
                self.save_to_json(item, './results/cnhnb.json')

                # Navigate back to the list page
                self.driver.back()

                order += 1

            # Go to the next page
            try:
                if stop_event.is_set():
                    break
                next_page = self.driver.find_element(By.XPATH, '//button[@class="btn-next"]')
                WebDriverWait(self.driver, 20).until(EC.element_to_be_clickable(next_page))
                next_page.click()
            except Exception:
                break

    # # Nongyiwang mall (16899.com) - abandoned
    # def agri_16899(self):
    #     self.driver.get('https://www.16899.com/Product/ProductList.html')
    #     self.driver.implicitly_wait(60)
    #
    #     item = {}
    #     order = 0
    #     while True:
    #         # Get all products on the current page
    #         articles = self.driver.find_elements(By.XPATH, '//div[@id="postData"]/div[1]/dl/dd[1]/a')
    #
    #         for i, article in enumerate(articles):
    #             item["order"] = order
    #             try:
    #                 WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//li[@class="selected"]/a'), "农资商城"))
    #             except TimeoutException:
    #                 continue
    #
    #             # Re-fetch the product list
    #             article = self.driver.find_elements(By.XPATH, '//div[@id="postData"]/div[1]/dl/dd[1]/a')[i]
    #             # Click the product
    #             article.click()
    #
    #             # Collect all open window handles
    #             window_handles = self.driver.window_handles
    #             # Switch to the new tab
    #             self.driver.switch_to.window(window_handles[-1])
    #
    #             # Make sure the product detail tab has loaded
    #             try:
    #                 WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//li[@class="active"]/a'), "产品介绍"))
    #             except TimeoutException:
    #                 continue
    #
    #             # Category
    #             item["classify"] = self.driver.find_element(By.XPATH, '//h1[@class="pro_name"]/span[1]').text
    #
    #             item["title"] = self.driver.find_element(By.XPATH, '//h1[@class="pro_name"]/span[2]').text
    #             item["price"] = self.driver.find_element(By.XPATH, '//span[@id="strongprice"]').text
    #
    #             # Write to file
    #             self.save_to_json(item, 'agri_16899.json')
    #
    #             # Close the new tab
    #             self.driver.close()
    #
    #             # Switch back to the original tab
    #             self.driver.switch_to.window(self.driver.window_handles[0])
    #
    #             order += 1
    #
    #         # Go to the next page
    #         try:
    #             next_page = self.driver.find_element(By.XPATH, '//a[contains(text(), "下页")]')
    #             if next_page.get_attribute("class") == "disabled":
    #                 break
    #             else:
    #                 next_page.click()
    #         except Exception:
    #             break

    # 191 Agro Forum (191.cn) - Featured posts - Plant Protection
    def agri_191(self, stop_event):
        self.driver.get('https://www.191.cn/searcher.php?digest=1&starttime=&endtime=&fid=3')
        self.driver.implicitly_wait(60)

        item = {}
        order = 0

        # Get the total page count
        try:
            pages = self.driver.find_elements(By.XPATH, '//div[@class="pages"]/a')
            total_pages = int(pages[-1].text)
        except (ValueError, IndexError):
            total_pages = 1

        for current_page in range(1, total_pages + 1):
            if stop_event.is_set():
                break
            # Open each page directly by URL
            page_url = f'https://www.191.cn/searcher.php?type=special&condition=digest&authorid=&fid=3&starttime=&endtime=&page={current_page}'
            self.driver.get(page_url)

            # Get all posts on the current page
            articles = self.driver.find_elements(By.XPATH, '//div[@class="dlA"]/dl/dt/a')

            for i, article in enumerate(articles):
                if stop_event.is_set():
                    break
                item["order"] = order
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//li[@id="fid_3"]/a'), "植保技术"))
                except TimeoutException:
                    continue

                # Re-fetch the post list and take the i-th entry
                article = self.driver.find_elements(By.XPATH, '//div[@class="dlA"]/dl/dt/a')[i]
                # Click the post
                article.click()

                # Collect all open window handles
                window_handles = self.driver.window_handles
                # Switch to the new tab
                self.driver.switch_to.window(window_handles[-1])

                # Make sure the post detail tab has loaded
                try:
                    WebDriverWait(self.driver, 20).until(EC.text_to_be_present_in_element((By.XPATH, '//td[@id="td_tpc"]/div[1]/a'), "楼主"))
                except TimeoutException:
                    # Close the stray tab and return to the list before skipping this post
                    if len(self.driver.window_handles) > 1:
                        self.driver.close()
                        self.driver.switch_to.window(self.driver.window_handles[0])
                    continue

                item["title"] = self.driver.find_element(By.XPATH, '//h1[@id="subject_tpc"]').text
                item["date"] = self.driver.find_element(By.XPATH, '//td[@id="td_tpc"]/div[1]/span[2]').text
                content = self.driver.find_elements(By.XPATH, '//div[@id="read_tpc"]')
                content_lists = [c.text.strip() for c in content]
                item["content"] = [''.join(content_lists)]

                # Write to file
                self.save_to_json(item, './results/agri_191.json')

                # Close the new tab
                self.driver.close()

                # Switch back to the original tab
                self.driver.switch_to.window(self.driver.window_handles[0])

                order += 1
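
A spider can also be driven without the GUI. A minimal standalone sketch (it assumes webcrawl.py is importable and a ./results directory exists; farmer is just one example method):

from threading import Event

from webcrawl import AgriInfoSpider

spider = AgriInfoSpider()
stop_event = Event()  # never set here, so the crawl runs until its pages are exhausted
try:
    spider.farmer(stop_event)
finally:
    spider.close()  # always release the ChromeDriver process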

GUI code

main.py

# Crawl all of the websites at the same time
# import traceback
# import concurrent.futures
# from webcrawl import AgriInfoSpider
#
#
# def run_spider(spider_method):
#     spider = AgriInfoSpider()
#
#     try:
#         spider_method()
#     except Exception as e:
#         print(f"{spider_method.__name__} 爬虫发生错误: {str(e)}")
#         traceback.print_exc()
#
#     spider.close()
#
#
# if __name__ == "__main__":
#     spider_methods = [
#         AgriInfoSpider().agronet,
#         AgriInfoSpider().farmer,
#         AgriInfoSpider().agri,
#         AgriInfoSpider().xinnong,
#         AgriInfoSpider().richagri,
#         AgriInfoSpider().jinnong,
#         AgriInfoSpider().xczxfw,
#         AgriInfoSpider().shujuaweb,
#         AgriInfoSpider().agri_12316,
#         AgriInfoSpider().wugu,
#         AgriInfoSpider().mofcom,
#         AgriInfoSpider().cnhnb,
#         AgriInfoSpider().agri_191,
#     ]
#
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         executor.map(run_spider, spider_methods)
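#
# A corrected sketch of this concurrent variant (an assumption-laden rewrite, not the
# original code): each worker owns its own AgriInfoSpider and passes the stop Event
# that the crawl methods expect.
#
# def run_spider(method_name):
#     spider = AgriInfoSpider()  # one driver per worker thread
#     stop_event = Event()
#     try:
#         getattr(spider, method_name)(stop_event)
#     except Exception:
#         traceback.print_exc()
#     finally:
#         spider.close()
#
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     executor.map(run_spider, ["agronet", "farmer", "agri"])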

import tkinter as tk
from tkinter import messagebox
from threading import Thread, Event
from webcrawl import AgriInfoSpider

class App:
    def __init__(self, root):
        self.running_spiders = {}  # Track spiders that are already running
        self.root = root
        self.root.title("Agricultural Info Crawler")
        self.root.geometry("900x400")

        self.create_listbox()
        self.create_stop_button()
        self.create_run_button()

    def create_listbox(self):
        self.listbox = tk.Listbox(self.root, bd=0, highlightthickness=0, bg=self.root.cget('bg'), selectmode=tk.SINGLE, font=('楷体', 12))
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=100, pady=70)

        self.spider_functions = {
            "Agronet - Agricultural Tech": "agronet",
            "China Farmers' Daily - Sannong Headlines": "farmer",
            "agri.cn - Data - Market Trends": "agri",
            "Xinnong - Agricultural News - Industry News": "xinnong",
            "Richagri - Industry News": "richagri",
            "Jinnong - Analysis": "jinnong",
            "Rural Revitalization Service Network - Industry News": "xczxfw",
            "Aweb Data Center - Practical Techniques": "shujuaweb",
            "12316 Platform - Headlines - Updates": "agri_12316",
            "Wugu - Nongjitong": "wugu",
            "New Countryside Business Network - Policies & Regulations": "mofcom",
            "CNHNB - Market News": "cnhnb",
            "191 Agro Forum - Plant Protection": "agri_191"
        }
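        # Each label maps to an AgriInfoSpider method name; run_spider_function dispatches it via getattr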

        for spider_name in self.spider_functions:
            self.listbox.insert(tk.END, spider_name)

        scrollbar = tk.Scrollbar(self.root, command=self.listbox.yview)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.listbox.config(yscrollcommand=scrollbar.set)

    def create_run_button(self):
        self.run_button = tk.Button(self.root, text="Run", command=self.run_spider, font=('黑体', 12))
        self.run_button.pack(side=tk.RIGHT, padx=(0, 20), pady=50)

    def create_stop_button(self):
        self.stop_button = tk.Button(self.root, text="Stop", command=self.stop_spider, state=tk.DISABLED, font=('黑体', 12))
        self.stop_button.pack(side=tk.RIGHT, padx=(20, 150), pady=50)

    def run_spider(self):
        selected_index = self.listbox.curselection()
        if not selected_index:
            messagebox.showwarning("警告", "请先选择一个爬虫")
            return

        selected_item = self.listbox.get(selected_index)
        spider_name = self.spider_functions.get(selected_item)

        if spider_name:
            if spider_name in self.running_spiders:
                messagebox.showinfo("Info", f"{selected_item} is already running")
            else:
                stop_event = Event()  # Event used to signal the spider thread to stop
                spider = AgriInfoSpider()
                thread = Thread(target=self.run_spider_function, args=(spider, spider_name, stop_event))
                thread.start()
                self.running_spiders[spider_name] = {"thread": thread, "stop_event": stop_event}  # Track the newly started spider
                self.stop_button.config(state=tk.NORMAL)  # Enable the stop button
        else:
            messagebox.showwarning("Warning", "The selected spider does not exist")

    def run_spider_function(self, spider, spider_name, stop_event):
        try:
            getattr(spider, spider_name)(stop_event)
        except Exception as e:
            # Marshal the dialog onto the Tk main thread; Tk is not thread-safe
            self.root.after(0, lambda e=e: messagebox.showerror("Error", f"Spider failed: {e}"))
        finally:
            spider.close()  # release the ChromeDriver once the spider finishes or is stopped
            self.root.after(0, self.update_stop_button, spider_name)

    def stop_spider(self):
        selected_index = self.listbox.curselection()
        if not selected_index:
            messagebox.showwarning("警告", "请先选择一个爬虫")
            return

        selected_item = self.listbox.get(selected_index)
        spider_name = self.spider_functions.get(selected_item)

        if spider_name and spider_name in self.running_spiders:
            Thread(target=self.stop_spider_thread, args=(spider_name, selected_item)).start()
        else:
            messagebox.showwarning("警告", "选择的爬虫不存在或未运行")

    def stop_spider_thread(self, spider_name, selected_item):
        spider_info = self.running_spiders[spider_name]
        spider_info["stop_event"].set()
        spider_info["thread"].join()

        self.root.after(0, self.update_stop_button, spider_name)

        # Marshal the dialog onto the Tk main thread
        self.root.after(0, lambda: messagebox.showinfo("Info", f"{selected_item} spider stopped"))

    def update_stop_button(self, spider_name):
        # pop() tolerates being called twice for the same spider (user stop + thread finishing)
        self.running_spiders.pop(spider_name, None)
        if not self.running_spiders:
            self.stop_button.config(state=tk.DISABLED)


if __name__ == "__main__":
    root = tk.Tk()
    app = App(root)
    root.mainloop()
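
Each spider appends one JSON object per line to its file under ./results (JSON Lines rather than a single JSON array), so results can be read back line by line. A minimal reader sketch (the filename is just an example):

import json

with open('./results/farmer.json', encoding='utf-8') as f:
    items = [json.loads(line) for line in f if line.strip()]

print(len(items), "items loaded")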

The GUI and the crawl results look like this:

Initial window:
[Screenshot: main GUI window]
After clicking Run:
[Screenshot: a spider running]
Running a second spider:
[Screenshot: two spiders running]
Stopping a run:
[Screenshot: after clicking Stop]
