import time
from selenium import webdriver
import re
# Scrapes numbered article pages from liaotian.puaas.com with headless Chrome
# and appends one SQL VALUES tuple per article to ``sql.txt``.
#
# NOTE(review): this script was recovered from a paste in which every
# ``<...>`` span had been stripped and all indentation removed.  Text that
# survived (URLs, marker strings, thresholds, the Chinese boilerplate phrases)
# is reproduced as-is.  Anything marked "reconstructed" or "lost" below could
# not be read from the paste and must be checked against the live page markup.

BASE_URL = 'http://liaotian.puaas.com/{}.html'
# Page numbers to visit.  The original loop carried a trailing "#11602" note
# whose meaning is not visible here (possibly a resume point).
PAGE_RANGE = range(0, 67892)
# Substring the site shows on its "page does not exist" response.
NOT_FOUND_MARKER = '您访问的页面不存'
# Only pages whose source is longer than this many characters are parsed.
MIN_PAGE_LENGTH = 50000
OUTPUT_PATH = 'sql.txt'

# Collapses all whitespace before the article body is matched.  This is why
# "<ol start" later shows up as "<olstart" and has to be repaired.
WHITESPACE_RE = re.compile(r'[\s]+')
# Pulls the visible text out of a captured tag fragment.
TAG_TEXT_RE = re.compile(r'>(.*?)<')

# Reconstructed: only the literal " » " separator survived from the original
# pattern.  This captures the tag that follows the separator plus its text.
# TODO(review): confirm against the breadcrumb markup of a real page.
BREADCRUMB_RE = re.compile(r' » (<[^>]+>[^<]*<)')

# Lost: the pattern that selected the article body was stripped completely.
# It must be a compiled regex with exactly ONE capture group and must be
# written against whitespace-free HTML (e.g. "<divclass=...>").
# TODO(review): restore from the page markup; main() refuses to run without it.
ARTICLE_BODY_RE = None

# Lost: the original removed several literal strings and tags from the body
# (three ``.replace`` calls and three ``re.sub`` calls whose arguments were
# stripped).  TODO(review): re-add them here once recovered.
STRIP_LITERALS = ()

# Boilerplate sentences removed from the body, in the original order.  The
# Chinese text is from the paste; the ``<p>`` / ``</p>`` delimiters are
# reconstructed from where the paste showed line breaks -- TODO(review) confirm.
STRIP_PATTERNS = (
    re.compile(r'以上,就是关于(.*?)</p>'),
    re.compile(r'学习更多专业恋爱(.*?)</p>'),
    re.compile(r'<p>学习(.*?)</p>'),
)


def sql_quote(value):
    """Return *value* as text that is safe inside a single-quoted SQL literal.

    Single quotes are doubled (standard SQL escaping).  Scraped titles and
    bodies routinely contain quotes, which previously corrupted the output.
    NOTE(review): backslashes are left untouched; if the target database
    treats them as escapes they need handling too.
    """
    return str(value).replace("'", "''")


def format_row(title, list_name, page_number, text):
    """Build one ``('title','list','page','text'),`` tuple for the output file."""
    return "('{}','{}','{}','{}'),".format(
        sql_quote(title),
        sql_quote(list_name),
        sql_quote(page_number),
        sql_quote(text),
    )


def extract_list_name(page_src, breadcrumb_re=BREADCRUMB_RE):
    """Return the text of the first breadcrumb entry after " » ", or ''.

    An empty string is returned when nothing matches, so a page without a
    breadcrumb no longer inherits the previous page's category.
    """
    crumbs = breadcrumb_re.findall(page_src)
    if not crumbs:
        return ''
    labels = TAG_TEXT_RE.findall(crumbs[0])
    return labels[0] if labels else ''


def clean_article_html(fragment, literals=STRIP_LITERALS, patterns=STRIP_PATTERNS):
    """Strip boilerplate from an article fragment and repair ``<ol start``.

    :param fragment: whitespace-free HTML of the article body.
    :param literals: exact substrings to delete.
    :param patterns: compiled regexes whose matches are deleted, in order.
    :return: the cleaned fragment.
    """
    for literal in literals:
        fragment = fragment.replace(literal, '')
    for pattern in patterns:
        fragment = pattern.sub('', fragment)
    # Whitespace removal glued "<ol start=..." together; put the space back.
    return fragment.replace('olstart', 'ol start')


def extract_article(page_src, body_re):
    """Return the cleaned article body from *page_src*, or None if absent.

    :param page_src: full page source.
    :param body_re: compiled regex with one capture group, matched against
        the page source after all whitespace has been removed.
    """
    compact = WHITESPACE_RE.sub('', page_src)
    bodies = body_re.findall(compact)
    if not bodies:
        return None
    return clean_article_html(bodies[0])


def scrape_page(driver, page_number, body_re):
    """Load one page and return its SQL tuple, or None when there is no article.

    :param driver: Selenium WebDriver used to fetch and query the page.
    :param page_number: number substituted into ``BASE_URL``.
    :param body_re: article-body pattern, see ``extract_article``.
    """
    # Fetch the response, then extract the target information.
    driver.get(BASE_URL.format(page_number))
    page_src = driver.page_source
    print('-----'+str(len(page_src))+'byte,page number:'+str(page_number)+'\n')
    if NOT_FOUND_MARKER in page_src:
        print('not found page', page_number)
        return None
    if len(page_src) <= MIN_PAGE_LENGTH:
        return None

    print(str(page_number)+' page number \n')
    # As before, the last <h1> on the page wins; default to '' so a page
    # without any <h1> cannot raise NameError or reuse a stale title.
    title = ''
    for heading in driver.find_elements_by_tag_name('h1'):
        title = heading.text
        print(title)

    list_name = extract_list_name(page_src)
    if list_name:
        print(list_name)

    text = extract_article(page_src, body_re)
    if text is None:
        return None
    return format_row(title, list_name, page_number, text)


def main():
    """Visit every page in ``PAGE_RANGE`` and append article rows to the output.

    :raises RuntimeError: if ``ARTICLE_BODY_RE`` has not been restored.
    """
    if ARTICLE_BODY_RE is None:
        raise RuntimeError(
            'ARTICLE_BODY_RE is not set: the article-body pattern was lost '
            'from the source and must be restored before scraping.'
        )
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    # NOTE(review): executable_path / chrome_options / find_elements_by_* are
    # the Selenium 3 call style; kept because the file targets that API.
    driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
    try:
        for j in PAGE_RANGE:
            row = scrape_page(driver, j, ARTICLE_BODY_RE)
            if row is None:
                continue
            # Append per row so already-scraped rows survive a later crash.
            with open(OUTPUT_PATH, 'a', encoding='utf-8') as f:
                f.write(row)
    finally:
        # Always shut the browser down; otherwise headless Chrome lingers.
        driver.quit()


if __name__ == '__main__':
    main()
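# Page-footer residue captured together with the snippet (not part of the script):
# Scraping example -- Views: 2514  Entered: 2023-02-09 20:48:26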