发布时间:2024-12-02 17:01
import time
import requests
import re
from multiprocessing import Pool
import os
import pandas as pd
def to_exel(result_list, columns, file):
    """Write the scraped rows to a single-sheet Excel workbook.

    Args:
        result_list: Rows to write; each row supplies one value per column.
        columns: Column header labels, in the same order as the row values.
        file: Destination path of the workbook.
    """
    df = pd.DataFrame(result_list, columns=columns)
    # The context manager saves and closes the workbook exactly once, even if
    # to_excel() raises. The explicit writer.save() call used before was
    # removed in pandas 2.0, and pairing it with close() wrote the file twice.
    with pd.ExcelWriter(file) as writer:
        df.to_excel(writer, startrow=0, startcol=0, sheet_name='院士信息', index=False)
def run(a_url, headers):
# Worker task: scrape one academician's detail page and save the results.
#
# a_url   -- site-relative path of the detail page; it is appended to
#            "https://ysg.ckcest.cn" and is expected to contain
#            "html/details/<id>/index.html".
# headers -- dict handed to every requests.get() call below.
#
# Returns [name, sex, nation, college, year] on success. If a pattern finds
# nothing (IndexError) or the target folder already exists (FileExistsError),
# the error is printed and no list is returned.
#
# NOTE(review): several regex literals below look damaged in this copy (one
# pattern is empty, others are split across lines) -- presumably the HTML
# markup inside them was stripped; restore them from the original script.
# NOTE(review): the second positional parameter of requests.get() is `params`,
# so `headers` appears to be sent as query-string parameters rather than as
# HTTP headers -- confirm, and pass headers=headers if that is the intent.
print(\"子进程ID号:%d\\n\" % (os.getpid())) # os.getpid() gives the ID of the current process
# Fetch the academician's detail page
new_url = \"https://ysg.ckcest.cn\" + a_url
r = requests.get(new_url, headers)
# print(r.text)
try:
# Academician's name
name = re.findall(\'姓名
:(.*?)
\', r.text)[0]
# Sex
sex = re.findall(\'性别
:(.*?)
\', r.text)[0]
# Ethnic group
nation = re.findall(\'民族
:(.*?)
\', r.text)[0]
# Undergraduate institution (text preceding the bachelor's-degree marker)
college = re.findall(\'(.*?) .*?\\s*?学士学位\\s*?\', r.text)[0]
# Year of election as academician
year = re.findall(\'当选年份
:\\s*?\\s*?中国工程院院士 \\s*(.*?)
\\s*.*?
\\s*?\', r.text, re.S)[0]
# Create a folder for this academician; raises FileExistsError if it already exists
os.makedirs(\"./院士/%s\" % name)
# Download the first matched picture URL (scheme-less, hence the "http:" prefix) as <name>.jpg
picture_url = re.findall(\'\', r.text)
picture = requests.get(\"http:\" + picture_url[0], headers)
with open(\"./院士/%s/%s.jpg\" % (name, name), \'wb\') as f:
f.write(picture.content)
# The numeric id from the detail-page path selects the biography sub-page
introduce_num = re.findall(\'html/details/(.*?)/index.html\', a_url)[0]
# print(introduce_num)
introduce_url = \"https://ysg.ckcest.cn/html/details/subnav/content/\" + introduce_num + \"/detail_grxx_grjj\"
introduce_page = requests.get(introduce_url, headers)
introduce = re.findall(\'(.*?)
\', introduce_page.text)[0]
# Save the biography text as UTF-8 in <name>.txt
with open(\"./院士/%s/%s.txt\" % (name, name), \'w\', encoding=\'utf-8\') as f:
f.write(introduce)
print(name, sex, nation, college, year)
print(\"%s院士保存成功!\"%name)
return [name, sex, nation, college, year]
except (FileExistsError, IndexError) as e:
# Report the failure and fall through without returning a row
print(str(e))
if __name__ == \"__main__\":
# Script entry point: walk the paginated list API (pages 1 through 11), fan
# each page's detail links out to a process pool running run(), gather the
# returned rows, and finally write them to one Excel workbook.
#
# NOTE(review): indentation was lost in this copy, so the exact loop nesting
# (e.g. whether the pool is created once per page) must be confirmed against
# the original script.
# Start the wall-clock timer
end1 = time.time()
print(\"父进程启动:%d\" % os.getpid())
headers = {
\'User-Agent\': \'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36\'
}
results = []
for i in range(1, 12):
print(\"第{}轮\".format(i))
url = \"https://ysg.ckcest.cn/ysgList/api/index?pageSize=113&pageNum=\"+str(i)+\"&_=1655211977545\"
# NOTE(review): the second positional parameter of requests.get() is `params`,
# so `headers` appears to be sent as query parameters -- confirm.
req = requests.get(url, headers)
# A regex needs two backslashes to match one literal backslash; because Python
# string literals also use the backslash as an escape character, each of those
# must be doubled again (four backslashes in a non-raw string literal).
# NOTE(review): the pattern below is empty in this copy -- presumably its HTML
# markup was stripped; restore it from the original script.
url_lists = re.findall(\'\', req.text, re.S)
# Remove duplicate links
url_lists = list(set(url_lists))
processes = []
pool = Pool(10) # cap the pool at 10 worker processes
for a_url in url_lists:
# Hand the link to a worker process
try:
p = pool.apply_async(func=run, args=(a_url, headers)) # func: task run by the worker; args: its arguments (tuple)
processes.append(p)
except:
print(\"进程启动失败!\")
# Stop accepting new tasks, then wait for all submitted tasks to finish
pool.close()
pool.join()
# Keep only the tasks that returned a row (run() yields None on failure)
for p in processes:
if p.get() is not None:
results.append(p.get())
# Wait a moment so the site does not treat the crawl as an attack and force the connection closed
time.sleep(1)
# Write the collected rows to Excel
to_exel(results, [\'姓名\', \'性别\', \'族别\', \'本科学校\', \'入选院士年份\'], \"院士信息汇总.xlsx\")
end2 = time.time()
print(\"耗时:%.2f秒\" % (end2 - end1))
三、实验小结
由于本人对本次实验所涉及的爬虫知识学习得不够深入,对相关代码的理解也比较粗浅,所以代码中可能还有很多地方可以优化。
遇到TimeoutError时,可尝试设置自己电脑的防火墙:Windows -> 设置 -> Windows安全中心 -> 防火墙和网络保护 -> 允许应用通过防火墙 -> 更改设置,将PyCharm和Python都勾选上。