from selenium import webdriver
import time
from lxml import etree
def get_driver():
    """Create and return a new Selenium Chrome WebDriver instance.

    The caller owns the returned driver and is responsible for closing it
    (for example with ``driver.quit()``) once scraping is finished.
    """
    return webdriver.Chrome()
def get_page_source(driver, url: str, wait_seconds: float = 1) -> str:
    """Load *url* in *driver* and return the page source after a short pause.

    Args:
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute
            (here, the Selenium WebDriver produced by ``get_driver``).
        url: Address of the page to load.
        wait_seconds: How long to pause after navigation before reading the
            page source. Defaults to 1 second, the delay that was previously
            hard-coded; pass 0 to skip the pause.

    Returns:
        The value of ``driver.page_source`` read after the pause.
    """
    driver.get(url)
    # Give the page time to respond and finish rendering before the source
    # is read.
    if wait_seconds > 0:
        time.sleep(wait_seconds)
    return driver.page_source
def get_xhtml(page_source):
    """Parse an HTML string with ``etree.HTML`` and return the result.

    Args:
        page_source: HTML markup, e.g. the string returned by
            ``get_page_source``.

    Returns:
        Whatever ``etree.HTML`` produces for *page_source*; the rest of this
        file queries it through its ``xpath`` method.
    """
    return etree.HTML(page_source)
def parse_city(xhtml):
    """Extract the unique ``(city_name, city_url)`` pairs from a parsed page.

    The name and the link are read from the *same* ``<a>`` element. Running
    one query for all link texts and another for all hrefs and zipping the
    two lists silently shifts every following pair as soon as a single anchor
    has no text, several text nodes, or no ``href``.

    Args:
        xhtml: Parsed document exposing ``xpath(query)``; the matched anchor
            elements must expose ``xpath('text()')`` and ``get('href')``
            (as lxml elements do).

    Returns:
        A tuple of ``(name, url)`` tuples with duplicates removed. Pairs are
        kept in first-seen document order (on Python 3.7+, where dicts are
        insertion-ordered). Anchors without any text or without an ``href``
        attribute are skipped.
    """
    anchors = xhtml.xpath(
        '//div[@class="classify-content"]/div/div[@class="classify-row"]/div/a'
    )
    pairs = []
    for anchor in anchors:
        # Join the anchor's direct text nodes so one anchor always yields
        # exactly one name.
        name = ''.join(anchor.xpath('text()'))
        href = anchor.get('href')
        if not name or href is None:
            continue
        pairs.append((name, href))
    # De-duplicate; dict.fromkeys keeps the first occurrence of each pair,
    # giving a reproducible order instead of a set's arbitrary one.
    return tuple(dict.fromkeys(pairs))
if __name__ == '__main__':
    url = 'https://hotel.meituan.com/guangzhou/'
    driver = get_driver()
    try:
        # Scrape the first page of hotel data for this city and print the
        # de-duplicated (city name, city URL) pairs found on it.
        page_source = get_page_source(driver, url)
        xhtml = get_xhtml(page_source)
        print(parse_city(xhtml))
    finally:
        # Always shut the browser down, even when scraping fails, so no
        # Chrome/chromedriver process is left running.
        driver.quit()
# Article title: "Scraping nationwide Meituan hotel information with selenium:
# collecting nationwide hotel information and removing duplicates"
# Article metadata - views: 3903; entered: 2020-10-08 15:29:07