
Commit c9d80d4

Merge e193737 into c6030ce (2 parents: c6030ce + e193737)

5 files changed: +77 additions, -29 deletions

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -8,3 +8,4 @@ six>=1.5.2
 PyDispatcher>=2.0.5
 service_identity
 parsel>=1.4
+functools32

scrapy/downloadermiddlewares/httpproxy.py

Lines changed: 32 additions & 24 deletions
@@ -1,24 +1,49 @@
 import base64
-from six.moves.urllib.request import getproxies, proxy_bypass
-from six.moves.urllib.parse import unquote
+try:
+    from functools import lru_cache
+except ImportError:
+    from functools32 import lru_cache
 try:
     from urllib2 import _parse_proxy
 except ImportError:
     from urllib.request import _parse_proxy
-from six.moves.urllib.parse import urlunparse
 
-from scrapy.utils.httpobj import urlparse_cached
+from six.moves.urllib.parse import urlunparse, unquote
+from six.moves.urllib.request import getproxies, proxy_bypass
+
 from scrapy.exceptions import NotConfigured
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.python import to_bytes
 
 
+@lru_cache(maxsize=128)
+def basic_auth_header(auth_encoding, username, password):
+    user_pass = to_bytes(
+        '%s:%s' % (unquote(username), unquote(password)),
+        encoding=auth_encoding)
+    return base64.b64encode(user_pass).strip()
+
+
+@lru_cache(maxsize=128)
+def get_proxy(auth_encoding, url, orig_type):
+    proxy_type, user, password, hostport = _parse_proxy(url)
+    proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
+
+    if user:
+        creds = basic_auth_header(auth_encoding, user, password)
+    else:
+        creds = None
+
+    return creds, proxy_url
+
+
 class HttpProxyMiddleware(object):
 
     def __init__(self, auth_encoding='latin-1'):
         self.auth_encoding = auth_encoding
         self.proxies = {}
-        for type, url in getproxies().items():
-            self.proxies[type] = self._get_proxy(url, type)
+        for type_, url in getproxies().items():
+            self.proxies[type_] = get_proxy(self.auth_encoding, url, type_)
 
     @classmethod
     def from_crawler(cls, crawler):
@@ -27,30 +52,13 @@ def from_crawler(cls, crawler):
         auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
         return cls(auth_encoding)
 
-    def _basic_auth_header(self, username, password):
-        user_pass = to_bytes(
-            '%s:%s' % (unquote(username), unquote(password)),
-            encoding=self.auth_encoding)
-        return base64.b64encode(user_pass).strip()
-
-    def _get_proxy(self, url, orig_type):
-        proxy_type, user, password, hostport = _parse_proxy(url)
-        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
-
-        if user:
-            creds = self._basic_auth_header(user, password)
-        else:
-            creds = None
-
-        return creds, proxy_url
-
     def process_request(self, request, spider):
         # ignore if proxy is already set
         if 'proxy' in request.meta:
             if request.meta['proxy'] is None:
                 return
             # extract credentials if present
-            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
+            creds, proxy_url = get_proxy(self.auth_encoding, request.meta['proxy'], '')
             request.meta['proxy'] = proxy_url
             if creds and not request.headers.get('Proxy-Authorization'):
                 request.headers['Proxy-Authorization'] = b'Basic ' + creds
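
For context beyond the diff: basic_auth_header and get_proxy are hoisted to module level so that functools.lru_cache (with functools32 supplying the backport on Python 2) can memoize their results across calls and across middleware instances. A minimal sketch of the caching behavior this relies on; the parse function and URLs below are illustrative, not part of the commit:

from functools import lru_cache  # on Python 2: from functools32 import lru_cache

calls = []

@lru_cache(maxsize=128)
def parse(auth_encoding, url):
    # Body runs only on a cache miss; every argument must be hashable,
    # which is why the middleware passes auth_encoding as a plain string.
    calls.append(url)
    return url.rsplit(':', 1)

parse('latin-1', 'https://proxy:3128')
parse('latin-1', 'https://proxy:3128')    # identical args: answered from the cache
assert len(calls) == 1
assert parse.cache_info().hits == 1

parse.cache_clear()                       # explicit reset, as the updated tests do
parse('latin-1', 'https://proxy:3128')
assert len(calls) == 2

Because the cache key is the full argument tuple, the same proxy URL requested under a different auth_encoding gets its own cache entry, so memoization cannot mix encodings.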

tests/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -10,4 +10,4 @@ brotlipy
 testfixtures
 # optional for shell wrapper tests
 bpython
-ipython
+ipython<6

tests/test_downloadermiddleware_httpproxy.py

Lines changed: 42 additions & 4 deletions
@@ -3,7 +3,7 @@
 from functools import partial
 from twisted.trial.unittest import TestCase, SkipTest
 
-from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
+from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware, basic_auth_header, get_proxy
 from scrapy.exceptions import NotConfigured
 from scrapy.http import Response, Request
 from scrapy.spiders import Spider
@@ -13,7 +13,7 @@
 spider = Spider('foo')
 
 
-class TestDefaultHeadersMiddleware(TestCase):
+class TestHttpProxyMiddleware(TestCase):
 
     failureException = AssertionError
 
@@ -30,9 +30,12 @@ def test_not_enabled(self):
 
     def test_no_environment_proxies(self):
         os.environ = {'dummy_proxy': 'reset_env_and_do_not_raise'}
-        mw = HttpProxyMiddleware()
 
         for url in ('http://e.com', 'https://e.com', 'file:///tmp/a'):
+            mw = HttpProxyMiddleware()
+            basic_auth_header.cache_clear()
+            get_proxy.cache_clear()
+
             req = Request(url)
             assert mw.process_request(req, spider) is None
             self.assertEqual(req.url, url)
@@ -42,11 +45,15 @@ def test_environment_proxies(self):
         os.environ['http_proxy'] = http_proxy = 'https://proxy.for.http:3128'
         os.environ['https_proxy'] = https_proxy = 'http://proxy.for.https:8080'
         os.environ.pop('file_proxy', None)
-        mw = HttpProxyMiddleware()
 
         for url, proxy in [('http://e.com', http_proxy),
                            ('https://e.com', https_proxy), ('file://tmp/a', None)]:
             req = Request(url)
+
+            mw = HttpProxyMiddleware()
+            basic_auth_header.cache_clear()
+            get_proxy.cache_clear()
+
             assert mw.process_request(req, spider) is None
             self.assertEqual(req.url, url)
             self.assertEqual(req.meta.get('proxy'), proxy)
@@ -61,11 +68,16 @@ def test_proxy_precedence_meta(self):
     def test_proxy_auth(self):
         os.environ['http_proxy'] = 'https://user:pass@proxy:3128'
         mw = HttpProxyMiddleware()
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://scrapytest.org')
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'})
         self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic dXNlcjpwYXNz')
         # proxy from request.meta
+        mw = HttpProxyMiddleware()
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://scrapytest.org', meta={'proxy': 'https://username:password@proxy:3128'})
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'})
@@ -74,11 +86,16 @@ def test_proxy_auth(self):
     def test_proxy_auth_empty_passwd(self):
         os.environ['http_proxy'] = 'https://user:@proxy:3128'
         mw = HttpProxyMiddleware()
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://scrapytest.org')
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'})
         self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic dXNlcjo=')
         # proxy from request.meta
+        mw = HttpProxyMiddleware()
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://scrapytest.org', meta={'proxy': 'https://username:@proxy:3128'})
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'})
@@ -88,25 +105,35 @@ def test_proxy_auth_encoding(self):
         # utf-8 encoding
         os.environ['http_proxy'] = u'https://m\u00E1n:pass@proxy:3128'
         mw = HttpProxyMiddleware(auth_encoding='utf-8')
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://scrapytest.org')
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'})
         self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic bcOhbjpwYXNz')
 
         # proxy from request.meta
+        mw = HttpProxyMiddleware(auth_encoding='utf-8')
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://scrapytest.org', meta={'proxy': u'https://\u00FCser:pass@proxy:3128'})
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'})
         self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic w7xzZXI6cGFzcw==')
 
         # default latin-1 encoding
         mw = HttpProxyMiddleware(auth_encoding='latin-1')
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://scrapytest.org')
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'})
         self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic beFuOnBhc3M=')
 
         # proxy from request.meta, latin-1 encoding
+        mw = HttpProxyMiddleware(auth_encoding='latin-1')
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://scrapytest.org', meta={'proxy': u'https://\u00FCser:pass@proxy:3128'})
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'})
@@ -122,24 +149,35 @@ def test_proxy_already_seted(self):
     def test_no_proxy(self):
         os.environ['http_proxy'] = 'https://proxy.for.http:3128'
         mw = HttpProxyMiddleware()
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
 
         os.environ['no_proxy'] = '*'
         req = Request('http://noproxy.com')
         assert mw.process_request(req, spider) is None
         assert 'proxy' not in req.meta
 
         os.environ['no_proxy'] = 'other.com'
+        mw = HttpProxyMiddleware()
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://noproxy.com')
         assert mw.process_request(req, spider) is None
         assert 'proxy' in req.meta
 
         os.environ['no_proxy'] = 'other.com,noproxy.com'
+        mw = HttpProxyMiddleware()
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://noproxy.com')
         assert mw.process_request(req, spider) is None
         assert 'proxy' not in req.meta
 
         # proxy from meta['proxy'] takes precedence
         os.environ['no_proxy'] = '*'
+        mw = HttpProxyMiddleware()
+        basic_auth_header.cache_clear()
+        get_proxy.cache_clear()
         req = Request('http://noproxy.com', meta={'proxy': 'http://proxy.com'})
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.meta, {'proxy': 'http://proxy.com'})
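
Why every test now re-instantiates the middleware and calls cache_clear(): the lru_cache state belongs to the module-level functions, not to the HttpProxyMiddleware instance, so a fresh middleware still sees entries computed under a previous os.environ. A small sketch of that persistence, with a toy function standing in for get_proxy (not from the test suite):

from functools import lru_cache

@lru_cache(maxsize=128)
def lookup(url):
    # Stand-in for get_proxy: the result is cached on the function itself.
    return len(url)

class Middleware(object):
    # Stand-in for HttpProxyMiddleware; it holds no cache of its own.
    pass

lookup('https://proxy:3128')
mw = Middleware()                       # a new instance resets nothing
assert lookup.cache_info().currsize == 1

lookup.cache_clear()                    # the explicit reset the tests perform
assert lookup.cache_info().currsize == 0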

tox.ini

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ deps =
     Pillow==2.6.1
     cssselect==0.9.1
     zope.interface==4.1.1
+    functools32
     -rtests/requirements.txt
 
 [testenv:trunk]
