获取代理IP
为了方便初学者快速获取代理IP,对www.89ip.cn及www.xicidaili.com两个网址对外公布的代理IP进行爬取,并通过myip.ipip.net对爬取到的代理IP进行有效的校验。
使用方法:导入包,执行ip_proxy()得到代理ip
最终返回的代理ip格式为['ip:端口', 'ip:端口', 'ip:端口', ……]
#!/usr/bin/env python
# coding: utf-8
import urllib.request
from bs4 import BeautifulSoup
import http.cookiejar
import zlib
import re
import time
import random
import threading
from fake_useragent import UserAgent
ua = UserAgent()  # shared User-Agent factory; ua.random supplies a fresh random UA string per request
# Scrape candidate proxy IPs from two public listing sites.
def _fetch_html(url):
    """Download *url* with a random User-Agent and return the decoded body.

    The response is decompressed with ``32 + zlib.MAX_WBITS`` so zlib
    auto-detects either a gzip or a zlib (deflate) wrapper — both are
    advertised in the Accept-Encoding header.
    """
    header = {'User-Agent': ua.random, 'Accept-Encoding': 'gzip, deflate, sdch'}
    req = urllib.request.Request(url, headers=header)
    cookie = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    raw = opener.open(req, timeout=10).read()
    return zlib.decompress(raw, 32 + zlib.MAX_WBITS).decode()


def ip_proxy():
    """Scrape proxies from www.89ip.cn and www.xicidaili.com and validate them.

    Each candidate is checked against myip.ipip.net via :func:`validation`
    on its own thread.

    Returns:
        list[str]: working proxies as ``'ip:port'`` strings; also exposed as
        the module-level global ``agent_lists`` for backward compatibility.
    """
    candidates = []

    print('对www.89ip.cn进行爬取')
    for page in range(1, 10):  # pages 1-9; adjust the range as needed
        try:
            time.sleep(random.uniform(1, 3))  # polite crawl delay
            # Fixed from the original 'index_%d.html ' (stray trailing space
            # plus a mixed f-string/%-format) to a plain f-string.
            html = _fetch_html(f'http://www.89ip.cn/index_{page}.html')
            for row in BeautifulSoup(html, 'lxml').find_all('tr')[1:]:
                cells = row.find_all('td')
                # Skip rows whose port column is the placeholder '9999'.
                if cells[1].string.replace(' ', '').strip() != '9999':
                    entry = cells[0].string + ':' + cells[1].string
                    candidates.append(entry.replace('\t', '').replace('\n', ''))
        except Exception:
            pass  # best effort: a failed page is skipped, not fatal

    print('对www.xicidaili.com进行爬取')
    for page in range(3, 6):  # pages 3-5; adjust the range as needed
        try:
            time.sleep(random.uniform(1, 10))
            html = _fetch_html('http://www.xicidaili.com/nn/' + str(page))
            table = BeautifulSoup(html, 'lxml').find(id='ip_list')
            for row in table.find_all('tr')[1:]:
                cells = row.find_all('td')
                if cells[2].string.replace(' ', '').strip() != '9999':
                    candidates.append(cells[1].string + ':' + cells[2].string)
        except Exception:
            pass  # best effort: a failed page is skipped, not fatal

    # Seed with a few known proxies so validation has candidates even when
    # both scrapes fail entirely.
    candidates.extend([
        '39.137.69.8:8080', '36.25.243.51:80', '39.137.69.7:80',
        '60.191.11.229:3128', '117.88.176.102:3000', '116.114.19.211:443',
        '101.231.104.82:80', '117.88.4.179:3000',
    ])

    global agent_lists
    agent_lists = []
    print('校验代理IP的有效性')
    threads = []
    for candidate in candidates:
        t = threading.Thread(target=validation, args=(candidate,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    return agent_lists
def validation(agent_ip):
    """Check whether *agent_ip* (``'ip:port'``) is a working, non-transparent proxy.

    Fetches http://myip.ipip.net through the proxy and appends *agent_ip* to
    the module-level ``agent_lists`` only when the origin IP reported by the
    site equals the proxy's own IP. Any failure (timeout, connection error,
    unparsable body) means the proxy is silently dropped — this is a
    deliberate best-effort filter run from worker threads.
    """
    global agent_lists
    try:
        proxies = {'http': agent_ip, 'https': agent_ip}
        proxy_handler = urllib.request.ProxyHandler(proxies)
        header = {'User-Agent': ua.random, 'Accept-Encoding': 'gzip, deflate, sdch'}
        req = urllib.request.Request('http://myip.ipip.net', headers=header)
        cookie = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            proxy_handler, urllib.request.HTTPCookieProcessor(cookie))
        # NOTE(review): the body is decoded without decompression even though
        # gzip is requested; a server that actually gzips would fail here and
        # the proxy would be discarded — confirm whether that is intended.
        body = opener.open(req, timeout=3).read().decode()
        reported = re.compile(r'IP:(.+?) 来').findall(body)
        # Guard against no-match instead of relying on IndexError.
        if reported and reported[0] == agent_ip.split(':')[0]:
            # list.append is atomic under the GIL, so no lock is needed here.
            agent_lists.append(agent_ip)
    except Exception:
        pass  # best effort: unusable proxies are simply not recorded
def main():
    """Entry point: scrape and validate proxies, then print the usable ones."""
    working = ip_proxy()
    print(working)


if __name__ == '__main__':
    main()