Commit 541edc4: First Release

cocoakekeyu committed Sep 15, 2016
Showing 11 changed files with 925 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.pyc
32 changes: 32 additions & 0 deletions README.MD
@@ -0,0 +1,32 @@
# AutoProxyMiddleware

## Introduction
An automatic proxy middleware for scrapy spiders. It fetches and rotates proxies automatically, with customizable rules for fetching and switching.

## Usage
Place the middleware module in your project and register it in the project settings file, e.g.:
```python
DOWNLOADER_MIDDLEWARES = {
    'projectname.autoproxy.AutoProxyMiddleware': 543,
}
```
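
Individual requests can also bypass the proxy: the middleware skips any request whose `meta` contains a `dont_proxy` key (see `_is_enabled_for_request` in autoproxy.py below). A minimal sketch, with a hypothetical spider name:
```python
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'  # hypothetical spider, for illustration only

    def start_requests(self):
        # A 'dont_proxy' key in meta makes AutoProxyMiddleware leave this request alone.
        yield scrapy.Request('http://example.com', meta={'dont_proxy': True})
```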

## Configuration
The middleware is configured through the `AUTO_PROXY` setting in the project settings file, e.g.:
```python
AUTO_PROXY = {
    'test_urls': [('http://upaiyun.com', 'online'), ('http://huaban.com', '33010602001878')],
    'ban_code': [500, 502, 503, 504],
}
```
**All available settings** (a complete configuration sketch follows this list)
- `'enable'`: a boolean; whether the middleware is enabled. Defaults to `True`.
- `'test_urls'`: a list of two-item tuples, each a URL plus a signature (a specific value that must appear in the returned page body), used to test proxy connectivity. Defaults to `[('http://www.w3school.com.cn', '06004630'), ]`.
- `'test_proxy_timeout'`: an integer greater than 0; the connection timeout, in seconds, used when testing proxies. Defaults to `5`.
- `'download_timeout'`: an integer greater than 0; the same as scrapy's `download_timeout`, set on requests while the middleware is enabled. Defaults to `60`.
- `'test_threadnums'`: an integer greater than 0; the number of threads started to test proxies. Defaults to `20`.
- `'ban_code'`: a list of HTTP status codes that indicate the proxy is banned. A response whose status code is in this list triggers an automatic proxy switch. Defaults to `[503, ]`.
- `'ban_re'`: a regular-expression string. If the returned page body contains a match for this pattern, the proxy is switched; an empty string disables the check. Defaults to `r''`.
- `'proxy_least'`: an integer greater than 0; when the number of usable proxies in the pool drops below this value, new proxies are fetched automatically. Defaults to `3`.
- `'init_valid_proxys'`: an integer greater than 0; the number of usable proxies to wait for when the spider initializes. A large value makes initialization slow; saved proxies keep being tested while the spider runs. Defaults to `1`.
- `'invalid_limit'`: an integer greater than 0. Each proxy's successfully downloaded pages are counted. When a proxy suddenly fails to connect or is rejected by the site it would normally be invalidated, but if its page count exceeds this value it is spared for now: the middleware just switches to another proxy and decreases the count. Defaults to `200`.
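
For reference, a configuration sketch spelling out every option at its documented default (values copied from the list above; adjust per project):
```python
AUTO_PROXY = {
    'enable': True,
    'test_urls': [('http://www.w3school.com.cn', '06004630'), ],
    'test_proxy_timeout': 5,
    'download_timeout': 60,
    'test_threadnums': 20,
    'ban_code': [503, ],
    'ban_re': r'',
    'proxy_least': 3,
    'init_valid_proxys': 1,
    'invalid_limit': 200,
}
```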
342 changes: 342 additions & 0 deletions autoproxy.py
@@ -0,0 +1,342 @@
# -*- coding: utf-8 -*-
import urllib2
import logging
import threading
import math
import re

from bs4 import BeautifulSoup
from twisted.internet import defer
from twisted.internet.error import TimeoutError, ConnectionRefusedError, \
ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone

logger = logging.getLogger(__name__)


class AutoProxyMiddleware(object):

    # Connection-level failures that trigger a proxy switch and a retried request
    EXCEPTIONS_TO_CHANGE = (defer.TimeoutError, TimeoutError, ConnectionRefusedError, ConnectError, ConnectionLost, TCPTimedOutError, ConnectionDone)

    # (setting name, default value) pairs; each may be overridden via the AUTO_PROXY dict
    _settings = [
('enable', True),
('test_urls', [('http://www.w3school.com.cn', '06004630'), ]),
('test_proxy_timeout', 5),
('download_timeout', 60),
('test_threadnums', 20),
('ban_code', [503, ]),
('ban_re', r''),
('proxy_least', 3),
('init_valid_proxys', 1),
('invalid_limit', 200),
]

def __init__(self, proxy_set=None):
self.proxy_set = proxy_set or {}
for k, v in self._settings:
setattr(self, k, self.proxy_set.get(k, v))

        # Proxy list and index of the current proxy; counter_proxy counts the pages downloaded through each proxy
self.proxy = []
self.proxy_index = 0
self.proxyes = {}
self.counter_proxy = {}

        self.fetch_new_proxy()
        self.test_proxyes(self.proxyes, wait=True)
        logger.info('Using proxies: %s', self.proxy)

@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings.getdict('AUTO_PROXY'))

def process_request(self, request, spider):
if not self._is_enabled_for_request(request):
return

if self.len_valid_proxy() > 0:
self.set_proxy(request)
            request.meta['download_timeout'] = self.download_timeout
else:
            # No usable proxy available; connect directly
if 'proxy' in request.meta:
del request.meta['proxy']

    def process_response(self, request, response, spider):
if not self._is_enabled_for_request(request):
return response

if response.status in self.ban_code:
            self.invalid_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] banned because the response returned HTTP status code [%s].", request.meta['proxy'], str(response.status))
new_request = request.copy()
new_request.dont_filter = True
return new_request

        if self.ban_re:
            try:
                pattern = re.compile(self.ban_re)
            except (re.error, TypeError):
                logger.error('Wrong "ban_re", please check settings')
return response
match = re.search(pattern, response.body)
if match:
                self.invalid_proxy(request.meta['proxy'])
                logger.debug("Proxy[%s] banned because the pattern matched: [%s].", request.meta['proxy'], str(match))
new_request = request.copy()
new_request.dont_filter = True
return new_request

p = request.meta['proxy']
self.counter_proxy[p] = self.counter_proxy.setdefault(p, 1) + 1
return response

def process_exception(self, request, exception, spider):
if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
and request.meta.get('proxy', False):
            self.invalid_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] connection exception [%s].", request.meta['proxy'], exception)
new_request = request.copy()
new_request.dont_filter = True
return new_request

    def invalid_proxy(self, proxy):
        """
        Mark a proxy as invalid. If the proxy has already downloaded more than 200 pages
        (by default; see 'invalid_limit'), spare it for now: just switch proxies and decrease its count.
        """
if self.counter_proxy.get(proxy, 0) > self.invalid_limit:
self.counter_proxy[proxy] = self.counter_proxy.get(proxy, 0) - 50
if self.counter_proxy[proxy] < 0:
self.counter_proxy[proxy] = 0
self.change_proxy()
else:
self.proxyes[proxy] = False
        # logger.info('Set proxy[%s] invalid.', proxy)

def change_proxy(self):
"""
        Switch to the next usable proxy.
"""
while True:
self.proxy_index = (self.proxy_index + 1) % len(self.proxy)
proxy_valid = self.proxyes[self.proxy[self.proxy_index]]
if proxy_valid:
break
if self.len_valid_proxy() == 0:
                logger.info('No usable proxies available. Waiting to fetch new proxies.')
break
        logger.info('Changed proxy to %s', self.proxy[self.proxy_index])
        logger.info('Usable proxies[%s]: %s', self.len_valid_proxy(), self.valid_proxyes())

        # Fetch more proxies when the number of usable ones drops below the threshold
if self.len_valid_proxy() < self.proxy_least:
self.extend_proxy()

def set_proxy(self, request):
"""
        Attach the current proxy to the request.
"""
proxy_valid = self.proxyes[self.proxy[self.proxy_index]]
if not proxy_valid:
self.change_proxy()

request.meta['proxy'] = self.proxy[self.proxy_index]
# logger.info('Set proxy. request.meta: %s', request.meta)

def len_valid_proxy(self):
"""
        Count the usable proxies.
"""
        return sum(1 for p in self.proxy if self.proxyes[p])

def valid_proxyes(self):
"""
        Return the list of usable proxies.
"""
        return [p for p in self.proxy if self.proxyes[p]]

def extend_proxy(self):
"""
        Extend the proxy pool. Proxy testing runs asynchronously.
"""
        self.fetch_new_proxy()
self.test_proxyes(self.proxyes)

def append_proxy(self):
"""
        Helper: append proxies that passed testing to the proxy list.
"""
for k, v in self.proxyes.iteritems():
if v and k not in self.proxy:
self.proxy.append(k)

    def fetch_new_proxy(self):
        """
        Fetch new proxies. Currently scrapes three sites, with one thread per site.
        """
        logger.info('Starting to fetch new proxies.')
urls = ['xici', 'ip3336', 'kxdaili']
threads = []
for url in urls:
            t = ProxyFetch(self.proxyes, url)
threads.append(t)
t.start()
for t in threads:
t.join()

def test_proxyes(self, proxyes, wait=False):
"""
        Test proxy connectivity. The test URLs, signatures and number of test threads are all configurable.
"""
list_proxy = proxyes.items()
threads = []
        n = int(math.ceil(len(list_proxy) / float(self.test_threadnums)))
for i in range(self.test_threadnums):
            # Split the proxies to test evenly across the test threads
list_part = list_proxy[i * n: (i + 1) * n]
part = {k: v for k, v in list_part}
t = ProxyValidate(self, part)
threads.append(t)
t.start()

        # When initializing the middleware, wait until some proxies are usable
if wait:
while True:
for t in threads:
t.join(0.2)
if self._has_valid_proxy():
break
if self._has_valid_proxy():
break

    def _has_valid_proxy(self):
        return self.len_valid_proxy() >= self.init_valid_proxys

def _is_enabled_for_request(self, request):
return self.enable and 'dont_proxy' not in request.meta


class ProxyValidate(threading.Thread):
"""
    Worker thread that tests proxies.
"""

def __init__(self, autoproxy, part):
super(ProxyValidate, self).__init__()
self.autoproxy = autoproxy
self.part = part

def run(self):
self.test_proxyes(self.part)

def test_proxyes(self, proxyes):
        for proxy in proxyes:
            if self.check_proxy(proxy):
self.autoproxy.proxyes.update({proxy: True})
self.autoproxy.append_proxy()

def check_proxy(self, proxy):
proxy_handler = urllib2.ProxyHandler({'http': proxy})
opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
try:
for url, code in self.autoproxy.test_urls:
resbody = opener.open(url, timeout=self.autoproxy.test_proxy_timeout).read()
if code not in resbody:
return False
return True
except Exception:
return False


class ProxyFetch(threading.Thread):

def __init__(self, proxyes, url):
        super(ProxyFetch, self).__init__()
self.proxyes = proxyes
self.url = url

def run(self):
        self.proxyes.update(getattr(self, 'fetch_proxy_from_' + self.url)())

    def fetch_proxy_from_xici(self):
proxyes = {}
url = "http://www.xicidaili.com/nn/"
try:
for i in range(1, 4):
soup = self.get_soup(url + str(i))
trs = soup.find("table", attrs={"id": "ip_list"}).find_all("tr")
for i, tr in enumerate(trs):
                    if i == 0:
                        continue
tds = tr.find_all('td')
ip = tds[1].text
port = tds[2].text
proxy = ''.join(['http://', ip, ':', port]).encode('utf-8')
proxyes[proxy] = False
except Exception as e:
            logger.error('Failed to fetch_proxy_from_xici. Exception[%s]', e)

return proxyes

    def fetch_proxy_from_ip3336(self):
proxyes = {}
url = 'http://www.ip3366.net/free/?stype=1&page='
try:
for i in range(1, 6):
soup = self.get_soup(url + str(i))
trs = soup.find("div", attrs={"id": "list"}).table.find_all("tr")
for i, tr in enumerate(trs):
                    if i == 0:
                        continue
tds = tr.find_all("td")
ip = tds[0].string.strip().encode('utf-8')
port = tds[1].string.strip().encode('utf-8')
proxy = ''.join(['http://', ip, ':', port])
proxyes[proxy] = False
except Exception as e:
            logger.error('Failed to fetch_proxy_from_ip3336. Exception[%s]', e)

return proxyes

    def fetch_proxy_from_kxdaili(self):
proxyes = {}
url = 'http://www.kxdaili.com/dailiip/1/%d.html'
try:
for i in range(1, 11):
soup = self.get_soup(url % i)
trs = soup.find("table", attrs={"class": "ui table segment"}).find_all("tr")
for i, tr in enumerate(trs):
                    if i == 0:
                        continue
tds = tr.find_all("td")
ip = tds[0].string.strip().encode('utf-8')
port = tds[1].string.strip().encode('utf-8')
proxy = ''.join(['http://', ip, ':', port])
proxyes[proxy] = False
except Exception as e:
            logger.error('Failed to fetch_proxy_from_kxdaili. Exception[%s]', e)

return proxyes

def get_soup(self, url):
request = urllib2.Request(url)
        request.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36")
html_doc = urllib2.urlopen(request).read()

        soup = BeautifulSoup(html_doc, 'html.parser')

return soup


if __name__ == '__main__':

AutoProxyMiddleware()
Empty file added huaban/__init__.py
Empty file.