Python Security Development
- Python Security Development
- Preface
- I. The edu platform
- II. Usage steps
- 1. Importing the libraries
- 2. Functionality
- Complete code
- Summary
Preface
Goal: quickly collect edu domain names.
I. The edu platform
https://src.sjtu.edu.cn/rank/firm/0/?page=2
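Before writing the crawler, a quick sanity check that the ranking page above is reachable can save debugging time later; a minimal sketch using the URL listed above:

import requests

# Quick reachability check for the ranking page above.
resp = requests.get('https://src.sjtu.edu.cn/rank/firm/0/?page=2', timeout=10)
print(resp.status_code)  # 200 means the page can be fetched and parsed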
II. Usage steps
1. Importing the libraries
The code is as follows (example):
Get_EduName.py
import requests  # HTTP requests (the crawler)
from bs4 import BeautifulSoup  # HTML parsing / data extraction
from concurrent.futures import ThreadPoolExecutor, as_completed  # thread pool for concurrent fetching
import csv  # CSV output
2. Functionality
def get_edu_name(page):
    url = f'https://src.sjtu.edu.cn/rank/firm/0/?page={page}'
    try:
        print(f'Fetching data from page {page}...')
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            edu1 = soup.find_all('td', attrs={'class': 'am-text-center'})
            edu_names = []
            for td in edu1:  # iterate over the matched <td> cells
                if td.a:  # only cells containing an <a> tag hold an institution name
                    institution_name = td.a.get_text()
                    edu_names.append(institution_name)
            return edu_names
        else:
            print(f'Failed to retrieve page {page}. Status code: {response.status_code}')
            return []
    except Exception as e:
        print(f'Failed to retrieve page {page}: {e}')
        return []
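To make the selector logic concrete, here is a small self-contained sketch; the HTML fragment is made up to mirror the structure the function above expects (a td.am-text-center cell whose <a> holds the institution name), so the real page markup may differ:

from bs4 import BeautifulSoup

# Hypothetical fragment mimicking the ranking table; only cells containing an <a> are kept.
sample_html = '''
<table>
  <tr><td class="am-text-center">1</td>
      <td class="am-text-center"><a href="/profile/1">Example University A</a></td></tr>
  <tr><td class="am-text-center">2</td>
      <td class="am-text-center"><a href="/profile/2">Example University B</a></td></tr>
</table>
'''
soup = BeautifulSoup(sample_html, 'lxml')
names = [td.a.get_text() for td in soup.find_all('td', attrs={'class': 'am-text-center'}) if td.a]
print(names)  # ['Example University A', 'Example University B']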
Writing and deduplicating the data
def extract_unique_universities(filename):
    unique_universities = set()
    universities_list = []
    with open(filename, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        for row in reader:
            if row:  # make sure the row is not empty
                university_name = row[0].strip()  # take the first column and strip whitespace
                if university_name not in unique_universities:
                    unique_universities.add(university_name)
                    universities_list.append(university_name)
    return universities_list
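As a side note, the set-plus-list pattern above keeps the first occurrence of each name in its original order; on Python 3.7+ the same order-preserving dedup can be written more compactly (a sketch assuming the same one-column edu.csv):

import csv

# Order-preserving dedup of the first CSV column, equivalent to extract_unique_universities().
with open('edu.csv', 'r', encoding='utf-8-sig') as f:
    names = [row[0].strip() for row in csv.reader(f) if row]
unique_names = list(dict.fromkeys(names))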
if __name__ == '__main__':
    max_pages = 100  # number of pages to fetch
    edu_names = get_all_edu_names(max_pages)
    save_edu_names_to_csv(edu_names, 'edu.csv')
    print("School names have been saved to edu.csv.")
    filename = 'edu.csv'  # replace with the path to your CSV file
    universities = extract_unique_universities(filename)
    print("Extracted university names:")
    for university in universities:
        print(university)
        with open('university.txt', 'a+', encoding='utf-8') as f:
            f.write(university + '\n')
Complete code
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv


def get_edu_name(page):
    url = f'https://src.sjtu.edu.cn/rank/firm/0/?page={page}'
    try:
        print(f'Fetching data from page {page}...')
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            edu1 = soup.find_all('td', attrs={'class': 'am-text-center'})
            edu_names = []
            for td in edu1:
                if td.a:
                    institution_name = td.a.get_text()
                    edu_names.append(institution_name)
            return edu_names
        else:
            print(f'Failed to retrieve page {page}. Status code: {response.status_code}')
            return []
    except Exception as e:
        print(f'Failed to retrieve page {page}: {e}')
        return []


def get_all_edu_names(max_pages):
    all_edu_names = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(get_edu_name, page) for page in range(1, max_pages + 1)]
        for future in as_completed(futures):
            edu_names = future.result()
            all_edu_names.extend(edu_names)
    return all_edu_names


def save_edu_names_to_csv(edu_names, filename):
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        for name in edu_names:
            writer.writerow([name])


def extract_unique_universities(filename):
    unique_universities = set()
    universities_list = []
    with open(filename, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        for row in reader:
            if row:  # make sure the row is not empty
                university_name = row[0].strip()  # take the first column and strip whitespace
                if university_name not in unique_universities:
                    unique_universities.add(university_name)
                    universities_list.append(university_name)
    return universities_list


if __name__ == '__main__':
    max_pages = 100  # number of pages to fetch
    edu_names = get_all_edu_names(max_pages)
    save_edu_names_to_csv(edu_names, 'edu.csv')
    print("School names have been saved to edu.csv.")
    filename = 'edu.csv'  # replace with the path to your CSV file
    universities = extract_unique_universities(filename)
    print("Extracted university names:")
    for university in universities:
        print(university)
        with open('university.txt', 'a+', encoding='utf-8') as f:
            f.write(university + '\n')
Get_Edu_domain.py
The idea: search Bing for each keyword, extract the result URLs, and write them to a CSV.
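The key step is splitting each result URL into subdomain, registered domain, and public suffix so that *.edu.cn hosts can be recognized; a minimal sketch of how tldextract is used for this (the URL below is only an example, and tldextract may download the public suffix list on first run):

import tldextract

extracted = tldextract.extract('https://www.tsinghua.edu.cn/index.htm')  # example URL
print(extracted.subdomain, extracted.domain, extracted.suffix)           # www tsinghua edu.cn
domain = f"{extracted.subdomain}.{extracted.domain}.{extracted.suffix}"  # www.tsinghua.edu.cn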
Complete code
import requests
from bs4 import BeautifulSoup
import csv
import tldextract
import concurrent.futures
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0',
    'cookie': 'MUID=32957CB67A1A615722B972087B656099'
}

university_list = []
with open('university.txt', 'r', encoding='utf-8') as f:
    for line in f:
        university_list.append(line.strip().replace(' ', '+'))

with open('university_results.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['University', 'URL', 'Subdomain'])

    def process_site(site):
        url = f'https://www.bing.com/search?q=intitle%3a{site}&mkt=zh-CN&FPIG=0B6AED8B37BF44B78B8F58E6A949DB10&first=1&FORM=PERE'
        print(f"Searching for: {site}")
        try:
            response = requests.get(url, headers=headers)
            print(f"Status code: {response.status_code}")
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'lxml')
            results = soup.find_all('h2')
            found_valid_result = False
            for result in results:
                try:
                    u = result.a.get('href')
                    print(f"URL: {u}")
                    # extract the subdomain
                    extracted = tldextract.extract(u)
                    if extracted.subdomain:
                        domain = f"{extracted.subdomain}.{extracted.domain}.{extracted.suffix}"
                        if '.edu.cn' in domain:
                            csvwriter.writerow([site, u, domain])
                            found_valid_result = True
                            break
                except Exception as e:
                    print(f"Error while extracting the URL: {e}")
            if not found_valid_result:
                print("No valid search result found.")
                csvwriter.writerow([site, 'No valid result found', ''])
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            csvwriter.writerow([site, f"Request failed: {e}", ''])
        except Exception as e:
            print(f"Error: {e}")
            csvwriter.writerow([site, f"Error: {e}", ''])

    # Note: worker threads share csvwriter, so rows written concurrently may interleave.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(process_site, university_list)
Result
You can write the subdomains collected here into a file, as in the sketch below.
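A minimal sketch of that step, assuming the column layout written by the script above (the output filename subdomains.txt is just an example):

import csv

# Pull the third column (the subdomain) out of university_results.csv, one domain per line.
with open('university_results.csv', 'r', encoding='utf-8-sig') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    domains = {row[2] for row in reader if len(row) > 2 and row[2]}

with open('subdomains.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(domains)) + '\n')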
You can then feed them to subfinder, ksubdomain, and httpx:
./subfinder -d baidu.com -silent|./ksubdomain -verify -silent|./httpx -title -content-length -status-code -o url.html -html
Summary
This lets you test edu domains in bulk and quickly earn points on the edu SRC platform.