News crawler
import random

import pymysql
import requests
from bs4 import BeautifulSoup

# Collected [category, article-link] pairs gathered by getLink().
links = []

# Desktop browser User-Agent so the site serves the regular pages.
hea = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
}
# chinanews.com channel list pages (not used by main(), which only crawls the Taiwan channel).
urls = [
    "https://www.chinanews.com/china.shtml",
    "https://www.chinanews.com/society.shtml",
    "https://www.chinanews.com/compatriot.shtml",
    "https://www.chinanews.com/wenhua.shtml",
    "https://www.chinanews.com/world.shtml",
    "https://www.chinanews.com/cj/gd.shtml",
    "https://www.chinanews.com/sports.shtml",
    "https://www.chinanews.com/huaren.shtml"
]
# MySQL connection (port 3396 here; the MySQL default is 3306).
db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     port=3396, db='news_recommendation_system')
cursor = db.cursor()
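# The target `news` table is not defined in this post; below is a minimal sketch of a
# schema that matches the INSERT in saveDate() (column names come from that statement,
# the types, sizes and the news_id key are assumptions -- adjust to your setup):
#
#   CREATE TABLE news (
#       news_id        INT AUTO_INCREMENT PRIMARY KEY,
#       news_title     VARCHAR(255),
#       news_content   TEXT,
#       type_id        INT,
#       news_creatTime VARCHAR(64),
#       news_recourse  VARCHAR(255),
#       news_link      VARCHAR(255)
#   );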
def main():
    # Crawl the Taiwan channel, then fetch and store every collected article.
    baseurl = 'https://www.chinanews.com/taiwan.shtml'
    getLink(baseurl)
    getInformationAndSave()
    db.close()
def getInformationAndSave():
    for link in links:
        url = "https://www.chinanews.com" + link[1]
        cur_html = requests.get(url, headers=hea)
        cur_html.encoding = "utf8"
        soup = BeautifulSoup(cur_html.text, 'html.parser')

        # Article title.
        title = soup.find('h1').text.strip()

        # The "left-t" div holds the publish date, time and source; splitting on
        # whitespace yields the date, the time and the source as separate tokens.
        tr = soup.find('div', class_='left-t').text.split()
        time = tr[0] + tr[1]
        recourse = tr[2]

        # Article body.
        content = soup.find('div', class_="left_zw").text.strip()

        print(link[0] + "---" + title + "---" + time + "---" + recourse + "---" + url)
        saveDate(title, content, time, recourse, url)
def deleteDate():
    # Clear the news table (not called by main(); run manually before a fresh crawl).
    sql = "DELETE FROM news"
    try:
        cursor.execute(sql)
        db.commit()
    except Exception:
        db.rollback()
def saveDate(title, content, time, recourse, url):
    try:
        # Parameterized query so quotes in the title or content cannot break the SQL.
        # type_id is filled with a random category id from 1 to 8.
        cursor.execute(
            "INSERT INTO news(news_title, news_content, type_id, news_creatTime, news_recourse, news_link) "
            "VALUES (%s, %s, %s, %s, %s, %s)",
            (title, content, random.randint(1, 8), time, recourse, url))
        db.commit()
        print("Insert succeeded")
    except Exception:
        db.rollback()
        print("Insert failed")
def getLink(baseurl):
    # Fetch one channel list page and collect [category, article link] pairs.
    html = requests.get(baseurl, headers=hea)
    html.encoding = 'utf8'
    soup = BeautifulSoup(html.text, 'html.parser')
    for item in soup.select('div.content_list > ul > li'):
        # Skip separator rows that have no link.
        if item.a is None:
            continue
        # The first div carries the category label; take the two characters inside the brackets.
        type = item.div.text[1:3]
        # The following div holds the article title and its href.
        link = item.div.next_sibling.next_sibling.a['href']
        links.append([type, link])
if __name__ == '__main__':
    main()
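The urls list defined at the top is never used; main() only crawls the Taiwan channel. Below is a minimal sketch of how the same pipeline could be pointed at every channel in that list, assuming each channel page matches the selectors in getLink(); the helper name crawl_all_channels is not part of the original code.

def crawl_all_channels():
    # Collect article links from every channel list page, then fetch and store them all.
    for channel_url in urls:
        getLink(channel_url)
    getInformationAndSave()
    db.close()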