import time
from selenium import webdriver
import re
# Configure Chrome to run without a visible window.
option = webdriver.ChromeOptions()
option.add_argument('headless')
# `options=` is the supported keyword for passing ChromeOptions; the older
# `chrome_options=` spelling is deprecated in Selenium.
driver = webdriver.Chrome(executable_path='chromedriver', options=option)
# Visit page ids 0..67891 in order, and for each sufficiently large page
# append one formatted record to sql.txt.
for j in range(0,67892):#11602
    url = 'http://liaotian.puaas.com/{}.html'.format(j)
    # Fetch the response and extract the target information.
    driver.get(url)
    page_src = driver.page_source
    # Progress log. Note that len() counts characters of the source string,
    # not bytes, despite the 'byte' label.
    print('-----'+str(len(page_src))+'byte,page number:'+str(j)+'\n')
    
    # Pages containing the "page you visited does not exist" marker text are
    # only reported, not processed.
    if '您访问的页面不存' in page_src:
        print('not found page',j)
    # Only pages whose source is longer than 50000 characters are processed;
    # anything else is silently skipped.
    elif len(page_src)>50000:
        print(str(j)+' page number \n')
        # `title` ends up holding the text of the LAST <h1> on the page.
        # NOTE(review): if the page has no <h1>, `title` keeps the value from
        # a previous iteration (or is unbound on the first one).
        titles=driver.find_elements_by_tag_name('h1')
        for _title in titles:
          title=_title.text
          print(title)
        # NOTE(review): the next line looks damaged -- the regex literal and
        # what was presumably an `if len(__type) > 0:` check appear fused
        # together (markup inside the pattern seems to have been stripped).
        # Restore it from the original source; it is not valid Python as is.
        __type=re.findall(r' » 0:
          _type=re.findall(r'>(.*?)<', __type[0])
          print(_type[0])
          # NOTE(review): `listName` is only assigned on this path, so it can
          # be stale or unbound when the record is written below.
          listName=_type[0]
        #print(page_src)
        # Remove every run of whitespace from the page source.
        page_src_noBRSPACE=re.sub(r'[\s]+', '', page_src)
        #page=re.findall(r"(.*?)(.*?)','','','',""]
        # NOTE(review): `page` is not assigned anywhere in the visible code;
        # its only assignment is inside the commented-out (and apparently
        # damaged) line above.
        if len(page)>0:
          
          # NOTE(review): as written, the search strings / patterns in the
          # replace() and re.sub() calls below are empty, so they leave the
          # text unchanged. The real patterns were presumably lost -- verify
          # against the original source.
          _page=page[0].replace('','').replace('','').replace(r'','')
          #for kill in kills:
          #  _page=page[0].replace(kill,'')
          _text = re.sub(r'','',_page)
          __text = re.sub(r'','',_text)
          # NOTE(review): the raw-string patterns below span physical lines,
          # which a single-quoted literal cannot do; presumably markup around
          # the visible text was stripped from them.
          ___text = re.sub(r'以上,就是关于(.*?)
','',__text)
          ____text = re.sub(r'学习更多专业恋爱(.*?)
','',___text)
          _____text = re.sub(r'','',____text)
          # NOTE(review): `______text` is built from `____text` (not
          # `_____text`) and is never read afterwards in the visible code.
          ______text = re.sub(r' 
','',____text)
          text = re.sub(r' 
学习(.*?)
','',_____text)
          #print(_page)
          # Re-insert the space in 'olstart' -- presumably undoing the effect
          # of the whitespace removal on `<ol start=...>`; TODO confirm.
          text=text.replace('olstart','ol start')
          # Append one "('title','listName','j','text')," record per page.
          # NOTE(review): values are interpolated without any escaping, so a
          # single quote inside `title`, `listName` or `text` will unbalance
          # the quoting of the emitted tuple.
          with open('sql.txt','a',encoding='utf-8') as f:
              f.write("('{}','{}','{}','{}'),".format(title,listName,j,text))