正则表达式:
正则表达式的语法:
元字符: \D \d \w \W . [ ]
量词: + ? *
惰性匹配:
玩儿(?P<name>.*?)游戏:
匹配到第一个游戏结束,name = 匹配的文本。
玩儿(?P<name>.*)游戏:
一直匹配到不符合条件元字符.才结束
正则表达式获取html内容:
获取豆瓣top250网页的电影名字及年份:
import re import requests url = "https://movie.douban.com/top250" head = { "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0" } resp = requests.get(url,headers = head) resp.encoding = "utf-8" regex = re.compile('<div class="item">.*?<span class="title">(?P<name>.*?)</span>.*?<p class="">(?P<year>.*?)</p>',re.S) request = regex.finditer(resp.text) for item in request: print(item.group("name")) year = item.group("year") year_new =re.search("\d+",year) print(year_new.group())
xpath:
依赖下载:
pip install lxml
创建xpath对象et:
from lxml import html
etree = html.etree
file = open('mybaidu.html',mode = 'r',encoding = 'utf-8')
text = file.read()
et = etree.HTML(text)
print(et)
获取标签对象、标签属性、标签内容:
text() 获取文本
@class 获取属性
/* 匹配任意标签
// 当前标签下所有标签
【@class='' ''】限定属性
print(et.xpath("/html/head/title/text()")) #['百度一下,你就知道', '谷歌一下,你就知道'] print(et.xpath("/html/*/title/text()")) #['百度一下,你就知道', '谷歌一下,你就知道'] print(et.xpath("/html/head/title[@class='限定风格']/text()")) #['百度一下,你就知道'] titles = et.xpath("//title/text()") for title in titles: print(title) #百度一下,你就知道 #谷歌一下,你就知道 print(et.xpath("/html/head/title/@name")) #['标题'] print(et.xpath("//@name")) #['referrer', '标题']
<!DOCTYPE html> <!--STATUS OK--> <html> <head> <meta http-equiv=content-type content=text/html;charset=utf-8 class="a"> <meta http-equiv=X-UA-Compatible content=IE=Edge class="b"> <meta content=always name=referrer class="c"> <link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css> <title name="标题" class="限定风格">百度一下,你就知道</title> <title>谷歌一下,你就知道</title> </head> <body link=#0000cc> <a href="">a</a> <a href="">b</a> </body> </html>
from lxml import html etree = html.etree file = open('mybaidu.html',mode = 'r',encoding = 'utf-8') text = file.read() et = etree.HTML(text) print(et) htlm = et.xpath('/html') print(et.xpath("/html/head/title/text()")) #['百度一下,你就知道', '谷歌一下,你就知道'] print(et.xpath("/html/*/title/text()")) #['百度一下,你就知道', '谷歌一下,你就知道'] print(et.xpath("/html/head/title[@class='限定风格']/text()")) #['百度一下,你就知道'] titles = et.xpath("//title/text()") for title in titles: print(title) #百度一下,你就知道 #谷歌一下,你就知道 print(et.xpath("/html/head/title/@name")) #['标题'] print(et.xpath("//@name")) #['referrer', '标题'] metas = et.xpath("/html/head/meta/@class") for meta in metas: print(meta) #a #b #c a_s = et.xpath("/html/body/a") for a in a_s: print(a.xpath("./text()")) #['a'] #['b']