**话不多说,直接附源码,可运行!**
import os
import time
from urllib.parse import urlsplit

import requests
from fake_useragent import UserAgent
from lxml import etree
class wallhaven(object):
    """Scrape wallpaper images from wallhaven.cc search-result pages.

    Flow: fetch a search page -> collect links to per-image detail pages ->
    fetch each detail page -> extract the full-size image URL -> download it
    into ./images/other/.
    """

    def __init__(self):
        # Search URL template; {} is filled in with the page number by main().
        # Alternative query (yellow wallpapers):
        # self.url = "https://wallhaven.cc/search?colors=cc6633&page={}"
        # Current query: girl
        self.url = "https://wallhaven.cc/search?q=girl&categories=111&purity=110&sorting=date_added&order=desc&ai_art_filter=0&page={}"
        # Pick one random User-Agent. The original wrapped this assignment in
        # `for i in range(1, 50)`, which just overwrote self.headers 49 times;
        # only the last assignment survived, so the loop was dead code.
        ua = UserAgent()
        self.headers = {
            'User-Agent': ua.random,
        }

    def get_page(self, url):
        """Fetch *url* and return the response body decoded as UTF-8 text."""
        # timeout keeps the crawler from hanging forever on a stalled socket.
        res = requests.get(url=url, headers=self.headers, timeout=30)
        html = res.content.decode("utf-8")
        return html

    def parse_page(self, html):
        """Parse one search-result page and download every image it links to."""
        parse_html = etree.HTML(html)
        image_src_list = parse_html.xpath('//figure//a/@href')
        print("当前图片Url列表:", image_src_list)
        save_dir = "./images/other/"
        # Create the target directory up front; open(..., 'wb') below would
        # otherwise raise FileNotFoundError on a fresh checkout.
        os.makedirs(save_dir, exist_ok=True)
        for image_src in image_src_list:
            html1 = self.get_page(image_src)  # request the detail (second-level) page
            parse_html1 = etree.HTML(html1)
            filename = parse_html1.xpath('//div[@class="scrollbox"]//img/@src')
            # xpath() always returns a list (possibly empty), never None —
            # the original `is None` check could never fire.
            if not filename:
                continue
            for img in filename:
                # Take the file name from the URL path. The original used the
                # fixed-offset slice img[32:], which is off by one for
                # wallhaven full-image URLs (31-char prefix) and silently
                # dropped the first character of every file name.
                dirname = save_dir + os.path.basename(urlsplit(img).path)
                data = requests.get(url=img, headers=self.headers, timeout=30).content
                with open(dirname, 'wb') as f:
                    f.write(data)
                print(f"图片下载成功: {dirname}")

    def main(self):
        """Crawl search pages startPage..endPage inclusive and download images."""
        startPage = 12
        endPage = 99
        for page in range(startPage, endPage + 1):
            print("获取当前页面图片,页码:", page)
            url = self.url.format(page)
            html = self.get_page(url)
            self.parse_page(html)
            time.sleep(1.4)  # throttle requests to be polite to the server
if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    wallhaven().main()