commit 541edc4: showing 11 changed files with 925 additions and 0 deletions.
.pyc
# AutoProxyMiddleware

## Introduction
An automatic proxy middleware for Scrapy spiders. It fetches proxies on its own and switches between them automatically, with customizable rules for both fetching and switching.

## Usage
Place the middleware module in your project and register it in the project settings file, e.g.
```python
DOWNLOADER_MIDDLEWARES = {
    'projectname.autoproxy.AutoProxyMiddleware': 543,
}
```
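
To bypass the proxy for a particular request, put the `dont_proxy` key into that request's `meta`; the middleware ignores any request whose meta contains this key. A minimal sketch (the spider name and URL are placeholders):
```python
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'

    def start_requests(self):
        # AutoProxyMiddleware skips requests whose meta contains 'dont_proxy'
        yield scrapy.Request('http://example.com', meta={'dont_proxy': True})
```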

## Configuration
The middleware is configured through the `AUTO_PROXY` setting in the project settings file, e.g.
```python
AUTO_PROXY = {
    'test_urls': [('http://upaiyun.com', 'online'), ('http://huaban.com', '33010602001878')],
    'ban_code': [500, 502, 503, 504],
}
```
**All available options**
- `'enable'`: boolean; whether the middleware is enabled. Defaults to `True`.
- `'test_urls'`: list of two-element tuples, each a URL plus a signature (a string that must appear in the returned page body), used to test proxy connectivity. Defaults to `[('http://www.w3school.com.cn', '06004630'), ]`.
- `'test_proxy_timeout'`: positive integer; connection timeout in seconds when testing a proxy. Defaults to `5`.
- `'download_timeout'`: positive integer; same meaning as Scrapy's `download_timeout`, applied whenever the middleware is enabled. Defaults to `60`.
- `'test_threadnums'`: positive integer; number of threads started to test proxies. Defaults to `20`.
- `'ban_code'`: list of HTTP status codes that indicate the proxy is banned; a response with a status in this list triggers an automatic proxy switch. Defaults to `[503, ]`.
- `'ban_re'`: regular-expression string; if the returned page body matches it, the proxy is treated as banned and switched. An empty string disables this check. Defaults to `r''`.
- `'proxy_least'`: positive integer; when the number of usable proxies in the pool drops below this value, new proxies are fetched automatically. Defaults to `3`.
- `'init_valid_proxys'`: positive integer; number of usable proxies to wait for when the spider initializes. Larger values slow down startup; the remaining proxies keep being tested while the spider runs. Defaults to `1`.
- `'invalid_limit'`: positive integer; each proxy is counted every time it successfully downloads a page. When a proxy suddenly fails to connect or is rejected by the site it would normally be invalidated, but if its page count exceeds this value the middleware only switches to another proxy and reduces the count instead. Defaults to `200`. A full configuration listing every option is shown below.
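
For reference, here is a sketch of a configuration spelling out every option at its built-in default (values taken from the middleware's `_settings` table):
```python
AUTO_PROXY = {
    'enable': True,                 # master switch for the middleware
    'test_urls': [('http://www.w3school.com.cn', '06004630')],
    'test_proxy_timeout': 5,        # seconds allowed per proxy test request
    'download_timeout': 60,         # download timeout set on proxied requests
    'test_threadnums': 20,          # number of proxy-testing threads
    'ban_code': [503],              # response codes that trigger a proxy switch
    'ban_re': r'',                  # body pattern that triggers a switch ('' = off)
    'proxy_least': 3,               # refill the pool below this many valid proxies
    'init_valid_proxys': 1,         # valid proxies to wait for at startup
    'invalid_limit': 200,           # page-count grace threshold before invalidation
}
```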

# -*- coding: utf-8 -*-
# Automatic proxy middleware for Scrapy (Python 2: uses urllib2 and iteritems).
import urllib2
import logging
import threading
import math
import re

from bs4 import BeautifulSoup
from twisted.internet import defer
from twisted.internet.error import TimeoutError, ConnectionRefusedError, \
    ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone

logger = logging.getLogger(__name__)


class AutoProxyMiddleware(object):

    # Network failures that cause the current proxy to be invalidated and the
    # request to be retried through another proxy.
    EXCEPTIONS_TO_CHANGE = (
        defer.TimeoutError, TimeoutError, ConnectionRefusedError,
        ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone)

    # (key, default) pairs; every key can be overridden via the AUTO_PROXY setting.
    _settings = [
        ('enable', True),
        ('test_urls', [('http://www.w3school.com.cn', '06004630'), ]),
        ('test_proxy_timeout', 5),
        ('download_timeout', 60),
        ('test_threadnums', 20),
        ('ban_code', [503, ]),
        ('ban_re', r''),
        ('proxy_least', 3),
        ('init_valid_proxys', 1),
        ('invalid_limit', 200),
    ]

    def __init__(self, proxy_set=None):
        self.proxy_set = proxy_set or {}
        for k, v in self._settings:
            setattr(self, k, self.proxy_set.get(k, v))

        # self.proxy is the ordered proxy list, self.proxy_index points at the
        # proxy currently in use, self.proxies maps proxy -> validity flag, and
        # counter_proxy counts pages successfully downloaded through each proxy.
        self.proxy = []
        self.proxy_index = 0
        self.proxies = {}
        self.counter_proxy = {}

        self.fetch_new_proxy()
        self.test_proxies(self.proxies, wait=True)
        logger.info('Using proxies: %s', self.proxy)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getdict('AUTO_PROXY'))

    def process_request(self, request, spider):
        if not self._is_enabled_for_request(request):
            return

        if self.len_valid_proxy() > 0:
            self.set_proxy(request)
            request.meta['download_timeout'] = self.download_timeout
        else:
            # No valid proxy available: connect directly.
            if 'proxy' in request.meta:
                del request.meta['proxy']

    def process_response(self, request, response, spider):
        if not self._is_enabled_for_request(request):
            return response

        # Responses fetched without a proxy need no ban handling.
        if 'proxy' not in request.meta:
            return response

        if response.status in self.ban_code:
            self.invalid_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] banned: got HTTP status [%s].", request.meta['proxy'], str(response.status))
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request

        if self.ban_re:
            try:
                pattern = re.compile(self.ban_re)
            except TypeError:
                logger.error('Wrong "ban_re", please check settings')
                return response
            match = re.search(pattern, response.body)
            if match:
                self.invalid_proxy(request.meta['proxy'])
                logger.debug("Proxy[%s] banned: pattern match [%s].", request.meta['proxy'], str(match))
                new_request = request.copy()
                new_request.dont_filter = True
                return new_request

        p = request.meta['proxy']
        self.counter_proxy[p] = self.counter_proxy.setdefault(p, 1) + 1
        return response

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
                and request.meta.get('proxy', False):
            self.invalid_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] connect exception[%s].", request.meta['proxy'], exception)
            # Retry by rescheduling a copy of the request; the next
            # process_request() call will route it through another proxy.
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request

    def invalid_proxy(self, proxy):
        """
        Mark a proxy as invalid. If the proxy has already downloaded more than
        `invalid_limit` pages (200 by default), keep it for now: just switch to
        another proxy and reduce its page count.
        """
        if self.counter_proxy.get(proxy, 0) > self.invalid_limit:
            self.counter_proxy[proxy] = self.counter_proxy.get(proxy, 0) - 50
            if self.counter_proxy[proxy] < 0:
                self.counter_proxy[proxy] = 0
            self.change_proxy()
        else:
            self.proxies[proxy] = False

    def change_proxy(self):
        """
        Switch to the next valid proxy in the pool.
        """
        while True:
            self.proxy_index = (self.proxy_index + 1) % len(self.proxy)
            proxy_valid = self.proxies[self.proxy[self.proxy_index]]
            if proxy_valid:
                break
            if self.len_valid_proxy() == 0:
                logger.info('No proxy available. Waiting to fetch new proxies.')
                break
        logger.info('Change proxy to %s', self.proxy[self.proxy_index])
        logger.info('Valid proxies[%s]: %s', self.len_valid_proxy(), self.valid_proxies())

        # Fetch more proxies when the pool runs low.
        if self.len_valid_proxy() < self.proxy_least:
            self.extend_proxy()

    def set_proxy(self, request):
        """
        Attach the current proxy to the request, switching first if it is invalid.
        """
        proxy_valid = self.proxies[self.proxy[self.proxy_index]]
        if not proxy_valid:
            self.change_proxy()

        request.meta['proxy'] = self.proxy[self.proxy_index]

    def len_valid_proxy(self):
        """
        Number of valid proxies in the pool.
        """
        count = 0
        for p in self.proxy:
            if self.proxies[p]:
                count += 1
        return count

    def valid_proxies(self):
        """
        List of currently valid proxies.
        """
        proxies = []
        for p in self.proxy:
            if self.proxies[p]:
                proxies.append(p)
        return proxies

    def extend_proxy(self):
        """
        Extend the proxy pool. Proxy testing runs asynchronously.
        """
        self.fetch_new_proxy()
        self.test_proxies(self.proxies)

    def append_proxy(self):
        """
        Helper: append proxies that passed the test to the proxy list.
        """
        for k, v in self.proxies.iteritems():
            if v and k not in self.proxy:
                self.proxy.append(k)

    def fetch_new_proxy(self):
        """
        Fetch new proxies. They are currently scraped from three sites,
        one thread per site.
        """
        logger.info('Starting to fetch new proxies.')
        sources = ['xici', 'ip3336', 'kxdaili']
        threads = []
        for source in sources:
            t = ProxyFetch(self.proxies, source)
            threads.append(t)
            t.start()
        for t in threads:
            t.join()

    def test_proxies(self, proxies, wait=False):
        """
        Test proxy connectivity. Test URLs, signature strings and the number
        of testing threads are all configurable.
        """
        list_proxy = proxies.items()
        threads = []
        # Float division so the per-thread share rounds up under Python 2.
        n = int(math.ceil(len(list_proxy) / float(self.test_threadnums)))
        for i in range(self.test_threadnums):
            # Spread the proxies to be tested evenly across the threads.
            list_part = list_proxy[i * n: (i + 1) * n]
            part = {k: v for k, v in list_part}
            t = ProxyValidate(self, part)
            threads.append(t)
            t.start()

        # On middleware initialization, block until enough proxies are valid.
        if wait:
            while True:
                for t in threads:
                    t.join(0.2)
                    if self._has_valid_proxy():
                        break
                if self._has_valid_proxy():
                    break

    def _has_valid_proxy(self):
        return self.len_valid_proxy() >= self.init_valid_proxys

    def _is_enabled_for_request(self, request):
        return self.enable and 'dont_proxy' not in request.meta


class ProxyValidate(threading.Thread):
    """
    Worker thread that tests a share of the proxy pool.
    """

    def __init__(self, autoproxy, part):
        super(ProxyValidate, self).__init__()
        self.autoproxy = autoproxy
        self.part = part

    def run(self):
        self.test_proxies(self.part)

    def test_proxies(self, proxies):
        for proxy, valid in proxies.iteritems():
            if self.check_proxy(proxy):
                self.autoproxy.proxies.update({proxy: True})
                self.autoproxy.append_proxy()

    def check_proxy(self, proxy):
        # A proxy passes only if every test URL returns a page containing
        # its signature string within the configured timeout.
        proxy_handler = urllib2.ProxyHandler({'http': proxy})
        opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
        try:
            for url, code in self.autoproxy.test_urls:
                resbody = opener.open(url, timeout=self.autoproxy.test_proxy_timeout).read()
                if code not in resbody:
                    return False
            return True
        except Exception:
            return False


class ProxyFetch(threading.Thread):

    def __init__(self, proxies, source):
        super(ProxyFetch, self).__init__()
        self.proxies = proxies
        self.source = source

    def run(self):
        # Dispatch to fetch_proxy_from_<source>() and merge the results.
        self.proxies.update(getattr(self, 'fetch_proxy_from_' + self.source)())

    def fetch_proxy_from_xici(self):
        proxies = {}
        url = "http://www.xicidaili.com/nn/"
        try:
            for i in range(1, 4):
                soup = self.get_soup(url + str(i))
                trs = soup.find("table", attrs={"id": "ip_list"}).find_all("tr")
                for j, tr in enumerate(trs):
                    if 0 == j:  # skip the header row
                        continue
                    tds = tr.find_all('td')
                    ip = tds[1].text
                    port = tds[2].text
                    proxy = ''.join(['http://', ip, ':', port]).encode('utf-8')
                    proxies[proxy] = False
        except Exception as e:
            logger.error('Failed in fetch_proxy_from_xici. Exception[%s]', e)

        return proxies

    def fetch_proxy_from_ip3336(self):
        proxies = {}
        url = 'http://www.ip3366.net/free/?stype=1&page='
        try:
            for i in range(1, 6):
                soup = self.get_soup(url + str(i))
                trs = soup.find("div", attrs={"id": "list"}).table.find_all("tr")
                for j, tr in enumerate(trs):
                    if 0 == j:  # skip the header row
                        continue
                    tds = tr.find_all("td")
                    ip = tds[0].string.strip().encode('utf-8')
                    port = tds[1].string.strip().encode('utf-8')
                    proxy = ''.join(['http://', ip, ':', port])
                    proxies[proxy] = False
        except Exception as e:
            logger.error('Failed in fetch_proxy_from_ip3336. Exception[%s]', e)

        return proxies

    def fetch_proxy_from_kxdaili(self):
        proxies = {}
        url = 'http://www.kxdaili.com/dailiip/1/%d.html'
        try:
            for i in range(1, 11):
                soup = self.get_soup(url % i)
                trs = soup.find("table", attrs={"class": "ui table segment"}).find_all("tr")
                for j, tr in enumerate(trs):
                    if 0 == j:  # skip the header row
                        continue
                    tds = tr.find_all("td")
                    ip = tds[0].string.strip().encode('utf-8')
                    port = tds[1].string.strip().encode('utf-8')
                    proxy = ''.join(['http://', ip, ':', port])
                    proxies[proxy] = False
        except Exception as e:
            logger.error('Failed in fetch_proxy_from_kxdaili. Exception[%s]', e)

        return proxies

    def get_soup(self, url):
        request = urllib2.Request(url)
        request.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36")
        html_doc = urllib2.urlopen(request).read()

        soup = BeautifulSoup(html_doc, 'html.parser')

        return soup


if __name__ == '__main__':
    # Quick standalone smoke test: fetch and validate proxies outside Scrapy.
    logging.basicConfig(level=logging.INFO)
    AutoProxyMiddleware()