获取代理IP
为了方便初学者快速获取代理IP,对www.89ip.cn及www.xicidaili.com两个网址对外公布的代理IP进行爬取,并通过myip.ipip.net对爬取到的代理IP进行有效的校验。
使用方法:导入包,执行ip_proxy()得到代理ip
最终返回的代理ip格式为['ip:端口', 'ip:端口', 'ip:端口', ……]
#!/usr/bin/env python
# coding: utf-8
import urllib.request
from bs4 import BeautifulSoup
import http.cookiejar
import zlib
import re
import time
import random
import threading
from fake_useragent import UserAgent
ua = UserAgent()  # shared User-Agent factory; ua.random supplies a fresh random UA string per request
# Scrape candidate proxy IPs from two public listing sites.
def _fetch_html(url):
    """Download *url* with a random User-Agent and return the decoded body.

    The response is decompressed with ``32 + zlib.MAX_WBITS`` so zlib
    auto-detects either a gzip or a zlib (deflate) wrapper — both are
    advertised in the Accept-Encoding header.
    """
    header = {'User-Agent': ua.random, 'Accept-Encoding': 'gzip, deflate, sdch'}
    req = urllib.request.Request(url, headers=header)
    cookie = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    raw = opener.open(req, timeout=10).read()
    return zlib.decompress(raw, 32 + zlib.MAX_WBITS).decode()


def ip_proxy():
    """Scrape proxies from www.89ip.cn and www.xicidaili.com and validate them.

    Each candidate is checked against myip.ipip.net via :func:`validation`
    on its own thread.

    Returns:
        list[str]: working proxies as ``'ip:port'`` strings; also exposed as
        the module-level global ``agent_lists`` for backward compatibility.
    """
    candidates = []

    print('对www.89ip.cn进行爬取')
    for page in range(1, 10):  # pages 1-9; adjust the range as needed
        try:
            time.sleep(random.uniform(1, 3))  # polite crawl delay
            # Fixed from the original 'index_%d.html ' (stray trailing space
            # plus a mixed f-string/%-format) to a plain f-string.
            html = _fetch_html(f'http://www.89ip.cn/index_{page}.html')
            for row in BeautifulSoup(html, 'lxml').find_all('tr')[1:]:
                cells = row.find_all('td')
                # Skip rows whose port column is the placeholder '9999'.
                if cells[1].string.replace(' ', '').strip() != '9999':
                    entry = cells[0].string + ':' + cells[1].string
                    candidates.append(entry.replace('\t', '').replace('\n', ''))
        except Exception:
            pass  # best effort: a failed page is skipped, not fatal

    print('对www.xicidaili.com进行爬取')
    for page in range(3, 6):  # pages 3-5; adjust the range as needed
        try:
            time.sleep(random.uniform(1, 10))
            html = _fetch_html('http://www.xicidaili.com/nn/' + str(page))
            table = BeautifulSoup(html, 'lxml').find(id='ip_list')
            for row in table.find_all('tr')[1:]:
                cells = row.find_all('td')
                if cells[2].string.replace(' ', '').strip() != '9999':
                    candidates.append(cells[1].string + ':' + cells[2].string)
        except Exception:
            pass  # best effort: a failed page is skipped, not fatal

    # Seed with a few known proxies so validation has candidates even when
    # both scrapes fail entirely.
    candidates.extend([
        '39.137.69.8:8080', '36.25.243.51:80', '39.137.69.7:80',
        '60.191.11.229:3128', '117.88.176.102:3000', '116.114.19.211:443',
        '101.231.104.82:80', '117.88.4.179:3000',
    ])

    global agent_lists
    agent_lists = []
    print('校验代理IP的有效性')
    threads = []
    for candidate in candidates:
        t = threading.Thread(target=validation, args=(candidate,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    return agent_lists
def validation(agent_ip):
    """Check whether *agent_ip* (``'ip:port'``) is a working, non-transparent proxy.

    Fetches http://myip.ipip.net through the proxy and appends *agent_ip* to
    the module-level ``agent_lists`` only when the origin IP reported by the
    site equals the proxy's own IP. Any failure (timeout, connection error,
    unparsable body) means the proxy is silently dropped — this is a
    deliberate best-effort filter run from worker threads.
    """
    global agent_lists
    try:
        proxies = {'http': agent_ip, 'https': agent_ip}
        proxy_handler = urllib.request.ProxyHandler(proxies)
        header = {'User-Agent': ua.random, 'Accept-Encoding': 'gzip, deflate, sdch'}
        req = urllib.request.Request('http://myip.ipip.net', headers=header)
        cookie = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            proxy_handler, urllib.request.HTTPCookieProcessor(cookie))
        # NOTE(review): the body is decoded without decompression even though
        # gzip is requested; a server that actually gzips would fail here and
        # the proxy would be discarded — confirm whether that is intended.
        body = opener.open(req, timeout=3).read().decode()
        reported = re.compile(r'IP:(.+?) 来').findall(body)
        # Guard against no-match instead of relying on IndexError.
        if reported and reported[0] == agent_ip.split(':')[0]:
            # list.append is atomic under the GIL, so no lock is needed here.
            agent_lists.append(agent_ip)
    except Exception:
        pass  # best effort: unusable proxies are simply not recorded
def main():
    """Entry point: scrape and validate proxies, then print the usable ones."""
    working = ip_proxy()
    print(working)


if __name__ == '__main__':
    main()