今天由于公司需求,需要某查上的一些信息作为参考,所以花了些时间写了下面的爬虫程序,爬取并写入到excel里面,这里是增加了爬取间隔时间的,里面是每爬取1条数据停顿2秒,爬完一个列表页停顿10秒,可以根据自己的情况去进行修改!代码供大家学习参考不要用来干坏事,做不法行为,停顿时间请停顿长一点不要给对方服务器造成困扰!
python代码如下:
#爬取某查数据
import re
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
#excel写入包
import openpyxl as op
# HTTP headers sent with every request: a logged-in session Cookie plus a
# desktop browser User-Agent (the site blocks obviously non-browser clients).
# NOTE(review): the Cookie below embeds a live auth_token / session id — this
# is a credential and should not be committed to source control; load it from
# an environment variable or a local config file instead, and rotate the
# token that was exposed here.
headers = {
    'Cookie': 'jsid=SEO-BAIDU-ALL-SY-000001; TYCID=3723e75055dc11ee842c15939a0a8d0b; ssuid=8751540630; _ga=GA1.2.705797656.1695011533; _gid=GA1.2.1520328598.1695106868; HWWAFSESID=3c089625739a4710d39; HWWAFSESTIME=16951%7D%222%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2213547966975%22%7D; tyc-user-info-save-time=1695187976775; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzU0Nzk2Njk3NSIsImlhdCI6MTY5NTE4Nzk3NiwiZXhwIjoxNjk3Nzc5OTc2fQ.7Tvrvkbrhy8VkOJAO8a2d8QiiqfUfgsg8WeDG8umvBQYFBi7ltuKVEFw10EX8QReQMFadHNVhu2I1_oWjoecmw; searchSessionId=1695189936.62093843; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1695190500',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# Accumulates one row (list of cell texts) per hospital across all pages.
yiyuan_list_all = []
# Listing pages to scrape: pages 240-250 inclusive. Adjust the range to
# cover a different span of the result pages.
for j in range(240, 251):
    url = f'https://www.某查.com/search?key=%E7%9C%BC%E7%A7%91%E5%8C%BB%E9%99%A2&pageNum={j}'
    # timeout keeps the scraper from hanging forever on a stalled connection.
    html = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, 'lxml')
    # Run the selector once and iterate the result list directly: a listing
    # page holds up to 20 companies, and a short final page no longer raises
    # IndexError the way a fixed range(0, 20) index loop did.
    company_links = soup.select('.index_search-box__7YVh6 .index_name__qEdWi a')
    for i, info in enumerate(company_links):
        company_name = info.text
        company_url = info['href']
        # Throttle: sleep 2 seconds before each detail request so we do not
        # hammer the remote server.
        for k in range(1, 3):
            print(f"等待{k}秒...")
            time.sleep(1)
        # Fetch the company's detail page and pull every table cell.
        html_detail = requests.get(company_url, headers=headers, timeout=30)
        soup_detail = BeautifulSoup(html_detail.text, 'lxml')
        data_infos = soup_detail.select('.index_tableBox__ZadJW tr td')
        # One spreadsheet row: the company name followed by each cell's text.
        yiyuan_list = [company_name]
        for cell in data_infos:
            yiyuan_list.append(cell.text)
        yiyuan_list_all.append(yiyuan_list)
        print("采集成功 %d 条" % (((j - 1) * 20) + i))
    # Longer pause (10 seconds) between listing pages.
    for k in range(1, 11):
        print(f"爬取了一页了,等待{k}秒...")
        time.sleep(1)
# Persist every collected row into an Excel workbook.
wb = op.Workbook()   # fresh workbook
# Workbook() always creates one default sheet; wb.active is the idiomatic
# way to get it (wb['Sheet'] breaks if the default sheet title ever differs).
ws = wb.active
for yiyuan in yiyuan_list_all:
    ws.append(yiyuan)  # one hospital per row
wb.save('医院.xlsx')
print("----采集完成------")
原创文章,作者:lichen360,如若转载,请注明出处:https://hhpi.cn/5.html