Read proxy IPs from a remote API and check each one's availability on the fly, which beats generating a static pool up front, then crawl the data. The cleanup step includes operations such as removing image tags.

# -*- coding: utf-8 -*-
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import re
import time
import random

headers = {
    'user-agent': UserAgent().random
}

result = ''
i = 0
k = 0
def doReplace(text):
    # NOTE: the regex patterns below originally contained literal HTML tags and
    # were lost when the post was rendered; these are plausible reconstructions
    # based on the surrounding comments, so treat them as assumptions.
    # delete images
    p1 = r'<img[^>]*>'
    text = re.sub(p1, "", text)
    # delete images (the original had a second image pattern; form assumed)
    p0 = r'<IMG[^>]*>'
    text = re.sub(p0, "", text)
    # <a> tags: drop the opening and closing tags, keep the link text
    p2 = r'<a[^>]*>'
    text = re.sub(p2, "", text)
    p3 = r'</a>'
    text = re.sub(p3, "", text)
    # delete ads (container selector assumed)
    p4 = r'<div class="ad"[^>]*>.*?</div>'
    text = re.sub(p4, "", text)
    # line-break tags become blank lines; the promoter's WeChat ad is dropped
    # (every key except the last was a literal tag lost in rendering; assumed)
    rep = {'<br>': '\n\n',
           '<br/>': '\n\n',
           '<p>': '\n\n',
           '</p>': '\n\n',
           '(导师微信pualove104)': ''}
    for key in rep:
        text = text.replace(key, rep[key])
    return text


def getIP():
    # the zhima proxy API returns one "ip:port" per line
    url = 'http://webapi.http.zhimacangku.com/getip'
    response = requests.get(url)
    result = []
    for line in response.text.split("\n"):
        # inspectIP(line)
        result.append(line.replace('\r', ''))
    return random.choice(result)


def getOneIP():
    # keep drawing addresses until one passes the availability check
    a = getIP()
    while inspectIP(a) is None:
        a = getIP()
    return a


def inspectIP(ipprot):
    # probe the proxy against Baidu: return it if usable, otherwise None
    time.sleep(1)
    herder = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Upgrade-Insecure-Requests': '1'
    }
    url = 'https://www.baidu.com'
    proxies = {
        "http": "http://" + str(ipprot)
    }
    s = requests.session()
    s.keep_alive = False  # requests were coming too fast; disable keep-alive
    try:
        request = requests.get(url, headers=herder, proxies=proxies, timeout=5)
    except requests.RequestException:
        print('unavailable: ' + ipprot)
        return None
    if request.status_code == 200:
        print('available: ' + ipprot)
        return ipprot
    print('unavailable: ' + ipprot)
    return None
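# --- usage sketch (added for illustration; not part of the original post) ---
# getOneIP() keeps drawing addresses until inspectIP() confirms one answers
# through Baidu, so the result can go straight into a requests call:
#   proxy = getOneIP()                          # e.g. "1.2.3.4:4216" (made up)
#   requests.get('http://example.com', proxies={'https': proxy}, timeout=5)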

def doReplace2(text):
    # turn dialogue line breaks and space entities into blank lines; the
    # original dict keys were literal tags lost in rendering, so the keys
    # here are assumptions
    rep = {'<br>': '\n\n',
           '&nbsp;': ' ',
           '<p>': '\n\n',
           '</p>': '\n\n'}
    for key in rep:
        text = text.replace(key, rep[key])
    return text


def duihua(tags):
    # debug helper: print the text of every node in a dialogue block
    result = ''
    for t in tags:
        print(t.get_text())
    return result


def getData(m):
    global k
    global i
    result = ''
    url = 'http://liaotian.puaas.com/{}.html'.format(m)
    print(url)
    # route the request through a validated proxy, http and https
    proxies = {
        "https": getOneIP()
    }
    print(proxies)
    response = requests.get(url, headers=headers, proxies=proxies)
    time.sleep(5)
    if response.status_code == 200 and len(response.text) > 4000:
        k += 1
        bs = BeautifulSoup(response.text, 'html.parser')
        # print(bs)
        title = bs.find('h1').get_text()
        # the column name was pulled out with a regex like r'(">).+?(<)'; the
        # exact line was garbled, so this is a reconstruction
        p2 = r'">(.+?)<'
        match = re.search(p2, response.text)
        listName = match.group(1) if match else ''
        # walk every <p> on the page (the loop header was garbled in the
        # original; find_all('p') is an assumption)
        for index, p in enumerate(bs.find_all('p')):
            if index > 2:  # drop the first couple of lines (page boilerplate)
                resultS = ''
                if len(p.contents) > 1:  # a dialogue block
                    resultS += '<p>'  # wrapper tag assumed; original literal lost
                    for line in p.contents:
                        if str(line) != ' ':  # filter bare spaces
                            resultS += doReplace2(str(line))
                    result += resultS + '</p>'
                else:
                    result += doReplace(str(p.contents[0]))
        c = ('INSERT INTO zhy_article (id,title,type) VALUES({},"{}","{}")\n'
             'INSERT INTO zhy_article_body (aid,body,redirecturl) VALUES ({},"{}","{}")\n'
             ).format(k, title, listName, k, result, m)
        with open('sql.txt', 'a', encoding='utf-8') as f:
            f.write(c + '\n')


for m in range(900, 19900):
    getData(m)
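
Since the tag-stripping patterns above had to be reconstructed, here is a minimal standalone check of the same cleanup on a made-up fragment; the sample HTML and the patterns are assumptions, not the post's original values.

# -*- coding: utf-8 -*-
# standalone sanity check of the cleanup logic on a fabricated fragment
import re

sample = ('<p>第一句<img src="pic.jpg">接着说'
          '<a href="/x">一个链接</a><br>(导师微信pualove104)</p>')
cleaned = re.sub(r'<img[^>]*>', '', sample)            # drop images
cleaned = re.sub(r'</?a[^>]*>', '', cleaned)           # drop link tags, keep text
cleaned = cleaned.replace('<br>', '\n\n')              # line break -> blank line
cleaned = cleaned.replace('(导师微信pualove104)', '')   # strip the WeChat ad
cleaned = re.sub(r'</?p>', '', cleaned)                # unwrap the paragraph
print(cleaned)  # 第一句接着说一个链接 (plus a trailing blank line from <br>)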