Fetching Proxy IPs with a Script


    Getting proxy IPs

    To help beginners obtain proxy IPs quickly, this script scrapes the free proxies published on www.89ip.cn and www.xicidaili.com, then checks each candidate against myip.ipip.net and keeps only the ones that actually work.

    Usage: import the module and call ip_proxy() to get a list of proxy IPs.

    The returned list has the form ['ip:port', 'ip:port', 'ip:port', …].
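
    For example, assuming the full script below is saved as proxy_pool.py (the post does not name the module, so proxy_pool is purely illustrative), a minimal usage sketch:

    # minimal usage sketch; 'proxy_pool' is a hypothetical module name
    from proxy_pool import ip_proxy

    proxies = ip_proxy()    # e.g. ['117.88.176.102:3000', '36.25.243.51:80', ...]
    print(len(proxies), 'proxies passed validation')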

    #!/usr/bin/env python
    # coding: utf-8
    import urllib.request
    from bs4 import BeautifulSoup
    import http.cookiejar
    import zlib
    import re
    import time
    import random
    import threading
    from fake_useragent import UserAgent
    ua = UserAgent()
    
    
    # Scrape candidate proxy IPs from the two listing sites
    def ip_proxy():
        agent_lists_temp = []
        print('Crawling www.89ip.cn')
        for page in range(1, 10):   # adjust the page range as needed; currently pages 1 through 9
            try:
                time.sleep(random.uniform(1, 3))    # random delay to avoid being throttled
                url = f'http://www.89ip.cn/index_{page}.html'
                header = {'User-Agent': ua.random, 'Accept-Encoding': 'gzip, deflate, sdch'}
                req = urllib.request.Request(url, headers=header)
                cookie = http.cookiejar.CookieJar()
                opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
                r = opener.open(req, timeout=10)
                html = zlib.decompress(r.read(), 16 + zlib.MAX_WBITS)   # response body is gzip-compressed
                html = html.decode()
                agent_info = BeautifulSoup(html, 'lxml').find_all('tr')[1:]   # skip the table header row
                for row in agent_info:
                    info = row.find_all('td')
                    if info[1].string.replace(' ', '').strip() != '9999':   # drop bogus port entries
                        agents = info[0].string + ':' + info[1].string
                        agent_lists_temp.append(agents.replace('\t', '').replace('\n', ''))
            except Exception:
                pass    # skip pages that fail to download or parse
        print('Crawling www.xicidaili.com')
        for page in range(3, 6):    # adjust the page range as needed; currently pages 3 through 5
            try:
                time.sleep(random.uniform(1, 10))   # this site bans aggressive crawlers, so wait longer
                url = 'http://www.xicidaili.com/nn/' + str(page)
                header = {'User-Agent': ua.random, 'Accept-Encoding': 'gzip, deflate, sdch'}
                req = urllib.request.Request(url, headers=header)
                cookie = http.cookiejar.CookieJar()
                opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
                r = opener.open(req, timeout=10)
                html = zlib.decompress(r.read(), 16 + zlib.MAX_WBITS)
                html = html.decode()
                agent_info = BeautifulSoup(html, 'lxml').find(id="ip_list").find_all('tr')[1:]
                for row in agent_info:
                    info = row.find_all('td')
                    if info[2].string.replace(' ', '').strip() != '9999':
                        agents = info[1].string + ':' + info[2].string
                        agent_lists_temp.append(agents)
            except Exception:
                pass
        # Seed the candidate pool with a few extra proxies before validation
        for seed in ['39.137.69.8:8080', '36.25.243.51:80', '39.137.69.7:80', '60.191.11.229:3128', '117.88.176.102:3000', '116.114.19.211:443', '101.231.104.82:80', '117.88.4.179:3000']:
            agent_lists_temp.append(seed)
        global agent_lists
        agent_lists = []
        print('Validating the collected proxies')
        # Check every candidate concurrently; each thread appends working proxies to agent_lists
        threads = []
        for candidate in agent_lists_temp:
            t = threading.Thread(target=validation, args=(candidate,))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        return agent_lists
    
    
    def validation(agent_ip):
        """Keep agent_ip only if a request routed through it reaches myip.ipip.net
        and the service reports the proxy's own IP (i.e. the proxy really forwards traffic)."""
        global agent_lists
        try:
            proxies = {'http': agent_ip, 'https': agent_ip}
            proxy = urllib.request.ProxyHandler(proxies)
            header = {'User-Agent': ua.random, 'Accept-Encoding': 'gzip, deflate, sdch'}
            url = 'http://myip.ipip.net'
            req = urllib.request.Request(url, headers=header)
            cookie = http.cookiejar.CookieJar()
            opener = urllib.request.build_opener(proxy, urllib.request.HTTPCookieProcessor(cookie))
            resp = opener.open(req, timeout=3)
            data = resp.read()
            if resp.headers.get('Content-Encoding') == 'gzip':   # we request gzip, so decompress if it was used
                data = zlib.decompress(data, 16 + zlib.MAX_WBITS)
            html = data.decode()
            ip = re.compile(r'IP:(.+?)  来').findall(html)   # extract the IP the service saw
            if ip[0] == agent_ip.split(':')[0]:
                agent_lists.append(agent_ip)                 # list.append is atomic, so this is thread-safe
        except Exception:
            pass    # timeouts and parse failures simply mean the proxy is unusable
    
    
    def main():
        print(ip_proxy())
    
    
    if __name__ == '__main__':
        main()
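
    Once ip_proxy() returns, any entry can be handed to urllib's ProxyHandler. A short sketch of routing a request through one of the validated proxies (the target http://httpbin.org/ip is just an example echo service, not part of the original script):

    # sketch: send one request through a randomly chosen validated proxy
    import random
    import urllib.request

    proxies = ip_proxy()
    if proxies:
        proxy = random.choice(proxies)    # one 'ip:port' string
        handler = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
        opener = urllib.request.build_opener(handler)
        try:
            print(opener.open('http://httpbin.org/ip', timeout=5).read().decode())
        except Exception:
            print('proxy', proxy, 'failed, try another entry')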
    
