# -*- coding: utf-8 -*-
# Scraping a novel site with BeautifulSoup in Python: mojibake handling and retrying failed requests
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import time
urls = {
    'http://www.biquge.info/86_86175/'
}
headers = {
    'user-agent': UserAgent().random
}
# Collect all chapter hyperlinks
def get_all_poem_link(urls):
    links = []
    for url in urls:
        response = requests.get(url, headers=headers)
        # Fix mojibake: decode with the detected encoding instead of requests' default guess
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'lxml')  # 1. parse into a BeautifulSoup tree
        dd_list = soup.find('div', id='list').find_all('dd')
        for dd in dd_list:
            urlc = dd.find('a')['href']  # dd.find('a').get('href') also works
            print('http://www.biquge.info/86_86175/' + urlc)
            links.append('http://www.biquge.info/86_86175/' + urlc)
    return links
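
# Illustrative alternative (not in the original script): urljoin from the standard
# library resolves relative hrefs against the page URL, so the base URL would not
# need to be hard-coded twice above:
#     from urllib.parse import urljoin
#     links.append(urljoin(url, urlc))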
# Accumulated results
poem_list = []

# Fetch the text behind one poem/chapter link
def get_poem(url):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # Decode the raw bytes with the detected encoding, dropping undecodable bytes
        html = response.content.decode(response.apparent_encoding, 'ignore')
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find('div', id='content').get_text()
        return content
    else:
        print('Request failed')
        return None
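
# Illustrative helper (an assumption, not part of the original script): a bounded
# retry loop with a fixed delay, instead of the single manual retry used in the
# main block below. The name and the defaults here are hypothetical.
def get_poem_with_retry(url, retries=3, delay=2):
    """Call get_poem up to `retries` times, sleeping `delay` seconds between tries."""
    for _ in range(retries):
        content = get_poem(url)
        if content is not None:
            return content
        time.sleep(delay)
    return None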
if __name__ == '__main__':
    result = ''
    url_list = get_all_poem_link(urls)
    for i, url in enumerate(url_list):
        print('Downloading chapter %d' % (i + 1))
        content = get_poem(url)
        if content is None:
            print('Chapter %d failed, retrying' % (i + 1))
            time.sleep(2)
            content = get_poem(url)
        if content:
            result += content
        else:
            # Guard against the retry failing too, instead of crashing on result += None
            print('Chapter %d still failed after the retry, skipping it' % (i + 1))
    with open('ss.txt', 'w', encoding='utf-8') as f:
        f.write(result)
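
# Illustrative variation (an assumption, not in the original script): opening the
# file in append mode inside the loop writes each chapter as it arrives, so a
# crash midway keeps the chapters downloaded so far:
#     with open('ss.txt', 'a', encoding='utf-8') as f:
#         f.write(content)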