# Scrape LetPub journal detail pages with headless Chrome.
#
# For every journal id in the loop range the script loads the detail page,
# runs three regular expressions over the rendered page source and prints the
# matches followed by the length of the page source.
import re
import time

from selenium import webdriver

option = webdriver.ChromeOptions()
option.add_argument('headless')
# NOTE(review): `executable_path` and `chrome_options` are deprecated keyword
# arguments in Selenium 4 and are rejected by later 4.x releases; switch to
# `service=Service(...)` and `options=` when the installed Selenium is upgraded.
driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option)

# Initial values so these module-level names exist before the first page is
# processed. `EISSN` and `tds` are never reassigned below.
url = 'https://www.letpub.com.cn/index.php?journalid=10&page=journalapp&view=detail#tonglytougjy'
title = ''
short = ''
ISSN = ''
EISSN = ''
tds = ''

try:
    for i in range(1, 2):
        url = 'https://www.letpub.com.cn/index.php?journalid={}&page=journalapp&view=detail#tonglytougjy'.format(i)
        # Pause before each page load (presumably to throttle requests -- confirm).
        time.sleep(2)
        driver.get(url)
        page_src = driver.page_source

        # Text that precedes "期刊基本" on the same line of the page source.
        title = re.findall(r'(.*?)期刊基本', page_src)
        # NOTE(review): this pattern looks like it lost its HTML delimiters; as
        # written it yields an empty string at every position of the page.
        short = re.findall(r'(.*?)', page_src)
        # The pattern contains a literal newline after "期刊ISSN". The trailing
        # lazy group always matches the empty string.
        # NOTE(review): probably also missing its original HTML delimiters.
        ISSN = re.findall('期刊ISSN\n(.*?) (.*?)', page_src)

        # NOTE(review): collecting the 'table_yjfx' element text into `tds` was
        # disabled in the original; the `find_element_by_class_name` helper it
        # used is not available in current Selenium 4 releases.
        print(title)
        print(short, ISSN, EISSN)
        time.sleep(3)
        print(str(i) + ': strlen' + str(len(page_src)) + ' ')
finally:
    # Always shut the browser down, even when a page load or match raises,
    # so no headless Chrome process is left running.
    driver.quit()
Selenium采集，综合应用反爬
阅读:1071 输入:2024-08-04 20:05:35