python应用BeautifulSoup爬汽车之家面向对象程序-网页编程网

当前位置：主页 >> Python 3 >> 正文

python应用BeautifulSoup爬汽车之家面向对象程序

阅读：4330 输入：2020-05-07 18:32:20

import requests
from bs4 import BeautifulSoup
import json
import openpyxl
class QiCheSpider():
    """Scrape article listings from autohome.com.cn into an Excel workbook.

    For every article on a listing page the spider collects the title, the
    summary text, the read count and the comment count (the latter through a
    separate reply-count endpoint), then writes all rows to ``autohome.xlsx``.
    """

    # Seconds to wait for a response before requests gives up; without a
    # timeout a stalled connection would block the crawl indefinitely.
    timeout = 10

    def __init__(self):
        """Set up the URL templates and the request headers."""
        # Listing page template; {0} is replaced with the page number.
        self.url='https://www.autohome.com.cn/all/{0}/#liststart'
        # Endpoint that returns reply counts for a given article id.
        self.comm_url='https://reply.autohome.com.cn/api/getData_ReplyCounts.ashx'

        # Send a browser-like User-Agent so the request looks like it comes
        # from a regular browser.
        self.headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'}

    def send_request(self,url,param=None):
        """Issue a GET request and return the response on HTTP 200.

        Args:
            url: Complete URL to fetch.
            param: Optional dict of query-string parameters.

        Returns:
            The ``requests`` response when the status code is 200,
            otherwise ``None``.
        """
        # ``params`` puts the values into the query string; ``data`` would
        # place them in the request body, which is not how a GET passes
        # arguments.
        resp=requests.get(url,headers=self.headers,params=param,timeout=self.timeout)
        if resp.status_code==200:
            return resp
        return None

    def parse_content(self,resp):
        """Extract one row per article from a listing page and save them.

        Args:
            resp: Response whose ``text`` holds the listing page HTML.
        """
        rows=[['标题','内容','阅读量','评论数']]
        bs=BeautifulSoup(resp.text,'html.parser')

        # Articles live in <li> elements inside every <ul class="article">.
        for ul_tag in bs.find_all('ul',class_='article'):
            for item in ul_tag.find_all('li'):
                h3_tag=item.find('h3')
                title=h3_tag.text if h3_tag is not None else ''
                if title=='':
                    # Entries without a title are never stored, so skip them
                    # before doing the extra comment-count request.
                    continue

                p_tag=item.find('p')
                content=p_tag.text if p_tag is not None else ''

                # Reset per item so a missing field can neither raise
                # NameError nor leak the previous article's numbers.
                r_num=''
                c_num=''

                # The read count sits in the first <em> of span.fn-right.
                span_tag=item.find('span',class_='fn-right')
                if span_tag is not None:
                    em_lst=span_tag.find_all('em')
                    if em_lst:
                        r_num=em_lst[0].text

                    # The comment count is fetched separately, keyed by the
                    # article id taken from the item's first link.
                    link=item.find('a')
                    if link is not None:
                        objid=self._extract_objid(link.get('href',''))
                        if objid:
                            c_num=self.get_comment(objid)

                rows.append([title,content,r_num,c_num])

        self.write_content(rows)

    @staticmethod
    def _extract_objid(href):
        """Return the numeric article id from an article URL, or ''.

        The id is the leading number of the last path segment, e.g.
        ``.../news/202005/100618.html#x`` gives ``'100618'``. The whole
        number is returned, so ids of any length are handled.
        """
        if not href:
            return ''
        # Drop fragment and query string before looking at the path.
        path=href.split('#',1)[0].split('?',1)[0]
        segment=path.rstrip('/').rsplit('/',1)[-1]
        # Strip the extension and any "-<page>" suffix.
        candidate=segment.split('.',1)[0].split('-',1)[0]
        return candidate if candidate.isdigit() else ''

    @staticmethod
    def _parse_reply_count(text):
        """Return the reply count found in the endpoint's response text.

        The text is expected to be a JSON object wrapped in parentheses whose
        ``commentlist`` key may be single-quoted. Returns '' when the payload
        cannot be parsed or has an unexpected shape.
        """
        start=text.find('{')
        end=text.rfind('}')
        if start==-1 or end<start:
            return ''
        # Single-quoted keys are not valid JSON; normalise the one key used.
        payload=text[start:end+1].replace("'commentlist'",'"commentlist"')
        try:
            return json.loads(payload)['commentlist'][0]['replycount']
        except (ValueError,KeyError,IndexError,TypeError):
            return ''

    def get_comment(self,objid):
        """Fetch the comment count for one article.

        Args:
            objid: Article id as used by the reply-count endpoint.

        Returns:
            The ``replycount`` value from the response, or '' when the
            request did not succeed or the payload could not be parsed.
        """
        data={
            'appid': '1',
            'dateType': 'jsonp',
            'objids':objid
        }
        resp=self.send_request(self.comm_url,param=data)
        if resp is None:
            # send_request returns None for any non-200 response.
            return ''
        return self._parse_reply_count(resp.text)

    def write_content(self,lst):
        """Write the rows to the first sheet of a new workbook and save it.

        Args:
            lst: Iterable of rows, each a list of cell values.
        """
        wb=openpyxl.Workbook()
        sheet=wb.active
        sheet.title='汽车之家'

        for row in lst:
            sheet.append(row)
        wb.save('autohome.xlsx')

    def start(self):
        """Run the crawl: fetch each listing page and parse it."""
        for page in range(1,2):  # only the first page for now
            full_url=self.url.format(page)
            resp=self.send_request(full_url)

            # send_request returns None on failure; parse only real pages.
            if resp:
                self.parse_content(resp)

# Script entry point: build the spider and run a crawl.
if __name__ == '__main__':
    QiCheSpider().start()

上一篇：python多线程BeautifulSoup采集诗词网，设置爬多url
下一篇：python用BeautifulSoup爬小说网站，乱码处理与请求失败重试

相关阅读: python应用正则爬铃声网站并实现下载; python正则采集百度瀑布流图片并保存在本地; python应用parsel xpath采集构建代理ip池