爬取boss直聘上海市人工智能招聘信息
import time
import tqdm
import random
import requests
import json
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
用edge模拟登录boss网站
# Start an Edge WebDriver session and pause five seconds before navigating.
browser = webdriver.Edge()
time.sleep(5)
# Open a BOSS Zhipin job-search page (the query decodes to "智能", page 1).
browser.get(
    'https://www.zhipin.com/web/geek/job'
    '?query=%E6%99%BA%E8%83%BD&city=101020100&page=1'
)
设置搜索结果页的网址,然后遍历结果列表中的每个职位条目,获取其详情页 URL
# Search-result URL template; `{}` receives the page number.
# The query parameter decodes to "深度学习实习生".
base_url = 'https://www.zhipin.com/web/geek/job?query=%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0%E5%AE%9E%E4%B9%A0%E7%94%9F&city=101020100&page={}'
max_clicks = 10  # number of result pages to visit
click_count = 1  # 1-based number of the page currently being visited
urls = []
while click_count <= max_clicks:
    page_url = base_url.format(click_count)
    browser.get(page_url)
    time.sleep(8)  # fixed wait for the result list to render
    try:
        # Selenium 4 removed the find_element(s)_by_* helpers; locate through
        # the generic find_element(s)(By.<strategy>, value) API instead.
        li_list = browser.find_elements(
            By.XPATH, '//div[@class="search-job-result"]/ul/li'
        )
    except Exception as e:
        print("Exception occurred:", str(e))
        li_list = []
    for li in li_list:
        # Handle each card on its own so that one malformed entry does not
        # discard the remaining links of the page.
        try:
            link_element = li.find_element(
                By.XPATH, './/div[@class="job-card-body clearfix"]'
            )
            link_element = link_element.find_element(By.TAG_NAME, 'a')
            link_href = link_element.get_attribute('href')
        except Exception as e:
            print("Exception occurred:", str(e))
            continue
        print(link_href)
        urls.append(link_href)
    click_count += 1  # move on to the next result page
# Persist every collected link in a single-column CSV.
df = pd.DataFrame({'url': urls})
df.to_csv("./urls.csv", index=False)
遍历刚爬取到的职位链接,用 XPath 表达式定位招聘信息正文
# Visit every job link and collect the text of its description section.
contents = []
# NOTE(review): this reads 'urls5.csv', while the listing step in this file
# writes './urls.csv' -- confirm which file is intended.
df = pd.read_csv(r'urls5.csv')
for url in df['url']:
    try:
        browser.get(url)
        print(url)
        time.sleep(6)  # fixed wait for the detail page to render
        # Selenium 4 removed find_element_by_xpath; use find_element(By.XPATH, ...).
        content_e = browser.find_element(By.XPATH, '//div[@class="job-detail"]')
        content_e = content_e.find_element(By.XPATH, './/div[@class="job-detail-section"]')
        content_e = content_e.find_element(By.XPATH, './/div[@class="job-sec-text"]').text
        print(content_e)
        contents.append(content_e)
    except Exception as e:
        # Still best-effort (a failing page is skipped), but the failure is
        # reported instead of being swallowed silently.
        print("Exception occurred:", str(e))
df_ = pd.DataFrame({'content': contents})
df_.to_excel(r'招聘信息5.xlsx', index=False)
LDA主题建模
数据集为爬取的boss直聘上海市的人工智能相关岗位招聘信息。
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
# from gensim.models.word2vec import Word2Vec
import matplotlib
from pylab import xticks,yticks,np
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings
from gensim import corpora, models
from gensim.models import CoherenceModel
warnings.filterwarnings("ignore")
import codecs
import re
import pyLDAvis.gensim
from gensim.models import LdaModel
import pandas as pd
from gensim.corpora import Dictionary
from gensim import corpora, models
# pip install gensim,re,jieba,pylab
# #这里注意gensim的版本要一致,如果报错请查看网上教程
1 读入数据,文本清洗(去除非中文字符,删除只有 1 个字符的词语,删除重复行)
# Load the postings to analyse; the cleaning steps below read its 'content' column.
df = pd.read_csv(r"招聘信息.csv")
# Runs of characters in the range U+4E00..U+9FA5; compiled once at import time
# rather than on every call.
_CHINESE_RUN = re.compile(r'[\u4e00-\u9fa5]+')


def extract_chinese(text):
    """Return the Chinese-character runs of *text* joined by single spaces.

    Every maximal run of characters in U+4E00..U+9FA5 is kept, in order;
    everything else (digits, Latin letters, punctuation, whitespace) is
    dropped.

    Args:
        text: The string to clean. Non-string values (for example the NaN
            that pandas uses for an empty cell, or None) are treated as
            having no text.

    Returns:
        The space-joined runs, or '' when there are none or *text* is not
        a string.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise make findall() raise TypeError.
        return ''
    return ' '.join(_CHINESE_RUN.findall(text))
# Reduce every posting to its Chinese text.
df['content'] = df['content'].apply(extract_chinese)
# Drop the whitespace-separated pieces that are a single character long.
df['content'] = df['content'].apply(
    lambda text: ' '.join(filter(lambda piece: len(piece) > 1, text.split()))
)
# Remove rows that are exact duplicates, modifying df in place.
df.drop_duplicates(inplace=True)
# Notebook-style preview of the first rows.
df.head()
此外,还可以通过 snownlp 库进行简单的情感分类
from snownlp import SnowNLP
# Sentiment labelling helper.
# NOTE: a later definition of `classify_sentiment` in this file rebinds the
# name, so this version is replaced once that definition has run.
def classify_sentiment(sentiment):
    """Return 'positive' when *sentiment* is at least 0.5, else 'negative'."""
    return 'positive' if sentiment >= 0.5 else 'negative'
def classify_sentiments(text):
    """Return the `sentiments` value that SnowNLP computes for *text*."""
    return SnowNLP(text).sentiments
# Score each posting's text with classify_sentiments and store the result in
# a new 'sentiment' column.
df['sentiment'] = df['content'].apply(classify_sentiments)
def classify_sentiment(sentiment_value):
    """Return '正向' when *sentiment_value* is greater than 0.5, else '负向'.

    A value of exactly 0.5 is labelled '负向'.
    """
    if not sentiment_value > 0.5:
        return '负向'
    return '正向'
# Map every score in 'sentiment' to a label with classify_sentiment and store
# it in the '情感类型' column.
df['情感类型'] = df['sentiment'].apply(classify_sentiment)
# Plain Python list of the cleaned posting texts, reused by the later steps.
content = df['content'].values.tolist()
# Notebook-style preview of the first five texts.
content[:5]
2 开启jieba分词
import jieba
# Tokenise every posting with jieba and pool the tokens into one flat list.
segment = []
for line in content:
    try:
        segs = jieba.lcut(line)  # tokenise one posting
    except Exception:
        # Best-effort: show the text that could not be tokenised and move on.
        # `Exception` (instead of a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate.
        print(line)
        continue
    # Keep tokens longer than one character, excluding a bare '\r\n'.
    segment.extend(seg for seg in segs if len(seg) > 1 and seg != '\r\n')
# Notebook-style preview of the first ten tokens.
segment[:10]
3 应用停用词表
# Load the stop-word list, one word per line, into a set.
# The file is read as plain text because np.recfromtxt is no longer in the
# NumPy 2 main namespace and, being a table parser, treats '#' as a comment
# marker and splits lines on whitespace. A set also makes the membership
# tests below and in the later TF-IDF step constant-time.
with open(r'./stop_words.utf8', encoding='utf-8') as stop_file:
    stopwords = {line.strip() for line in stop_file if line.strip()}
# One row per token, minus the stop words.
words_df = pd.DataFrame({'segment': segment})
words_df = words_df[~words_df.segment.isin(stopwords)]
# Notebook-style previews.
words_df.head()
words_df.head(20)
# Occurrences per token (column '计数'), most frequent first.
words_stat = words_df.groupby(by=['segment'])['segment'].size().reset_index(name="计数")
words_stat = words_stat.sort_values(by=["计数"], ascending=False)
# Notebook-style previews.
words_stat.head()
words_stat.iloc[4]
4 文本词云图
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
# Mask image for the cloud, loaded from a local JPEG as an array.
mask_pic = np.array(Image.open(r"./1234.jpeg"))
wordcloud = WordCloud(
    width=300,
    height=200,
    scale=4,
    mask=mask_pic,
    font_path='simhei.ttf',
    background_color='white',
    max_font_size=80,
)
# Word -> count mapping built from the first 220 rows of the frequency table.
top_words = words_stat.head(220)
word_frequence = {row[0]: row[1] for row in top_words.values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.axis('off')
image = wordcloud.to_image()
wordcloud.to_file('词云.png')  # save the rendered cloud
image.show()
# Export the full frequency table.
words_stat.to_csv(r'文本频率表.csv', index=False)
5 TF_IDF词语权重表
comments = content
# Tokenise each text with jieba, drop stop words, and re-join the tokens with
# spaces so the vectorizer can split them again.
segmented_comments = [
    " ".join(token for token in jieba.cut(comment) if token not in stopwords)
    for comment in comments
]
# TF-IDF over at most 1000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(segmented_comments)
# One column per feature term, one row per text.
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
# Mean TF-IDF weight of every term across all texts, largest first.
avg_tfidf = tfidf_df.mean().sort_values(ascending=False)
print(avg_tfidf)
avg_tfidf.to_csv(r'./词语权重表.csv')
6 LDA建模曲线图
# Read the stop-word file into a set, one stripped line per entry.
stop_words_file = './stop_words.utf8'
with codecs.open(stop_words_file, 'r', 'utf-8') as f:
    stop_words = {line.strip() for line in f}
# Tokenise each text (with its spaces removed) and drop the stop words.
texts = [
    [
        token
        for token in jieba.cut(text.replace(" ", "").strip())
        if token not in stop_words
    ]
    for text in content
]
# gensim dictionary and bag-of-words corpus built from the token lists.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
def lda_model_values(num_topics, corpus, dictionary):
    """Train one LDA model per topic count from 1 to *num_topics* and score each.

    For every topic count the model is trained on *corpus*, its
    `log_perplexity(corpus)` value is recorded, and a 'c_v' coherence score is
    computed. The coherence model reads the module-level `texts` list, which
    is not a parameter of this function. Progress is printed per topic count.

    NOTE(review): the `log_perplexity` value is stored under the name
    "perplexity"; presumably it is a log-scale bound and not the perplexity
    itself -- confirm against the gensim documentation before interpreting it.

    Args:
        num_topics: Largest topic count to try.
        corpus: Bag-of-words corpus passed to the models.
        dictionary: Dictionary used as `id2word` and for the coherence model.

    Returns:
        A tuple `(model_list, x, perplexity_values, coherence_values)` of four
        parallel lists: the trained models, their topic counts, and the two
        scores.
    """
    model_list = []  # trained models, kept for the later visualisation
    x = []  # topic counts (x axis)
    perplexity_values = []
    coherence_values = []
    for topic_count in range(1, num_topics + 1):
        print("主题数量:", topic_count)
        lda_model = models.LdaModel(
            corpus=corpus,
            num_topics=topic_count,
            id2word=dictionary,
            chunksize=2000,
            passes=20,
            iterations=400,
        )
        model_list.append(lda_model)
        x.append(topic_count)
        perplexity_values.append(lda_model.log_perplexity(corpus))
        coherence_values.append(
            models.CoherenceModel(
                model=lda_model,
                texts=texts,
                dictionary=dictionary,
                coherence='c_v',
            ).get_coherence()
        )
        print("该主题评价完成\n")
    return model_list, x, perplexity_values, coherence_values
# Train and score the models for 1 to 8 topics.
model_list, x, perplexity_values, coherence_values = lda_model_values(8, corpus, dictionary)
# Draw the two score curves side by side.
fig = plt.figure(figsize=(15, 5))
plt.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
# (values, title, x label, y label) for the left and right panels.
panels = [
    (perplexity_values, "主题建模-困惑度", '主题数目', '困惑度大小'),
    (coherence_values, "主题建模-一致性", "主题数目", "一致性大小"),
]
axes = []
for position, (values, title, x_label, y_label) in enumerate(panels, start=1):
    # add_subplot makes the new axes current, so the pyplot calls below draw on it.
    axes.append(fig.add_subplot(1, 2, position))
    plt.plot(x, values, marker="o")
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    xticks(np.linspace(1, 8, 8, endpoint=True))  # one tick per topic count
ax1, ax2 = axes
plt.show()
# To keep the figure on disk, call
# plt.savefig('主题建模一致性_困惑度曲线.png') before plt.show().
7 LDA气泡图
# Final model. Replace num_topics with the best topic count read off the
# curves above: the elbow where perplexity is low and coherence is high
# (mind the y-axis scale).
# The curves and the bubble chart may differ from run to run, because the
# models trained for the curves do not fix a random_state; this model pins
# random_state=42.
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    passes=10,
    random_state=42,
)
topic_list = lda.print_topics()
print(topic_list)
# For every document keep the id of the topic with the highest probability
# (the first one on ties).
result_list = []
for doc_topics in lda.get_document_topics(corpus)[:]:
    best_topic_id, _ = max(doc_topics, key=lambda pair: pair[1])
    result_list.append(best_topic_id)
print(result_list)
# Parse the formatted topic strings ('prob*"word" + prob*"word" ...') into one
# row per (topic, keyword). Rows are collected in a list and turned into a
# DataFrame once: concatenating onto a frame inside the loop copies it on
# every iteration and starts from an empty frame, which recent pandas
# versions warn about.
topic_rows = []
for topic_num, topic_terms in topic_list:
    for term in topic_terms.split('+'):
        probability, word = term.split('*', 1)
        topic_rows.append({
            '主题': topic_num,
            '关键词': word.strip(),
            '概率分数': probability.strip(),
        })
topic_data = pd.DataFrame(topic_rows, columns=['主题', '关键词', '概率分数'])
# Collapse to one row per topic with its keywords joined by spaces.
# NOTE(review): the '概率分数' column is dropped by this step although the
# output file name mentions it -- confirm whether the scores should be exported.
topic_data = topic_data.groupby('主题')['关键词'].apply(lambda x: ' '.join(x)).reset_index()
# Strip the double quotes that surround each keyword in the formatted strings.
topic_data['关键词'] = topic_data['关键词'].replace('"', '', regex=True)
# Export the table to an Excel file.
topic_data.to_excel('主题关键词概率分数.xlsx', index=False)
# NOTE(review): enable_notebook() presumably only affects inline display in a
# notebook and may be unnecessary when just saving HTML -- confirm before removing.
pyLDAvis.enable_notebook()
# Prepare the visualisation data for the final model and write it to an HTML file.
data = pyLDAvis.gensim.prepare(
    topic_model=lda,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.save_html(data, './topic.html')