import requests
from bs4 import BeautifulSoup
import smtplib
import re
import time
# User-Agent string of a desktop Chrome browser on Windows.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4068.4 Safari/537.36'

# HTTP request headers used by this script's page fetches.
headers = {'User-Agent': _USER_AGENT}
def _sql_escape(value):
    """Return *value* as text safe to embed in a double-quoted SQL literal."""
    text = '' if value is None else str(value)
    # Escape backslashes first so the backslash added for quotes is not doubled.
    # NOTE(review): assumes the target database honours backslash escapes
    # (the MySQL default) -- confirm against the real database.
    return text.replace('\\', '\\\\').replace('"', '\\"')


def _page_number(url):
    """Return the text between the last '/' and '.html' in *url*, or ''."""
    found = re.findall(r'/([^/]*?)\.html', url)
    return found[-1] if found else ''


def weather(url, timeout=10):
    """Scrape one page and append an SQL INSERT statement to ``index2-2.txt``.

    The page is fetched with the module-level ``headers`` and decoded as
    gb18030.  The following values are extracted and written as one
    ``insert into table_temp`` statement (one statement per line):

    * the page number taken from *url* (the part before ``.html``),
    * the text of the ``<title>`` element,
    * the first text found between ``"> "`` and ``" > "`` in the markup of
      the element with id ``position``,
    * hrefs of the first and third links inside the element with id
      ``siteinfo``,
    * the text of the second and third links inside ``position``,
    * the text of the element with id ``sitetext``.

    Any value whose element is missing from the page is written as an empty
    string instead of raising.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The string ``'null'`` when the response body is shorter than 600
        characters (nothing is written in that case); otherwise ``None``.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = 'gb18030'
    html = response.text
    # Very short bodies are skipped -- presumably placeholder/error pages.
    if len(html) < 600:
        return 'null'

    bs = BeautifulSoup(html, 'html.parser')

    title_tag = bs.find('title')
    site_name = title_tag.get_text() if title_tag is not None else ''
    print(site_name)

    # The first link in "position" is skipped; the second and third are used.
    position = bs.find(id='position')
    crumbs = position.find_all('a') if position is not None else []
    list1 = crumbs[1].text if len(crumbs) > 1 else ''
    list2 = crumbs[2].text if len(crumbs) > 2 else ''

    type_matches = re.findall("""> (.*?) > """, str(position)) if position is not None else []
    site_type = type_matches[0] if type_matches else ''

    site_info = bs.find(id="siteinfo")
    links = site_info.find_all('a') if site_info is not None else []
    print(links)
    # NOTE(review): the third link is only used when there are MORE than three
    # links; with exactly three it is ignored.  Kept as in the original --
    # confirm whether ``> 2`` was intended.
    if len(links) > 3:
        url1 = links[0].get('href') or ''
        url2 = links[2].get('href') or ''
    elif links:
        url1 = links[0].get('href') or ''
        url2 = ''
    else:
        url1 = ''
        url2 = ''

    desc = bs.find(id='sitetext')
    desc_text = desc.text if desc is not None else ''
    print(desc_text)

    values = (_page_number(url), site_name, site_type, url1, url2,
              list1, list2, desc_text)
    # NOTE(review): the column list names ``siteHref`` twice and uses ``desc``
    # (an SQL reserved word) unquoted; the real schema is not visible here, so
    # the template is left unchanged -- verify against the table definition.
    statement = 'insert into table_temp (number,site,type,siteHref,siteHref,list1,list2,desc) values ("{}","{}","{}","{}","{}","{}","{}","{}");'.format(
        *(_sql_escape(v) for v in values))
    with open('index2-2.txt', 'a', encoding='utf-8-sig') as f:
        # One statement per line keeps the generated script readable.
        f.write(statement + '\n')
# Text fused onto the last code line in the original paste (apparently the
# article title): "Python: combining BeautifulSoup and regular expressions to
# crawl world website addresses and generate a text file".
if __name__ == '__main__':
    # Visit page numbers 143 .. 89999 in order.
    for i in range(143, 90000):
        # Wait five seconds before every request (presumably rate limiting).
        time.sleep(5)
        url = 'http://www.k*g-u*o-w-ai.com/html/{}.html'.format(i)
        print(url)
        try:
            weather(url)
        except requests.RequestException as exc:
            # A network failure on one page must not abort the whole crawl.
            print('request failed for {}: {}'.format(url, exc))
# Page metadata from the original post -- views: 4138; entered: 2021-03-29 08:34:51