这是用于创建 SocksiPyConnection 类的代码(来源:http://blog.databigbang.com/distributed-scraping-with-multiple-tor-circuits/):
class SocksiPyConnection(httplib.HTTPConnection):
    """HTTP connection whose socket is opened through a SOCKS proxy.

    The proxy settings are captured at construction time and applied to a
    ``socks.socksocket`` when :meth:`connect` runs. Every remaining
    positional/keyword argument is forwarded unchanged to
    ``httplib.HTTPConnection.__init__``.
    """

    def __init__(self, proxytype, proxyaddr, proxyport=None, rdns=True,
                 username=None, password=None, *args, **kwargs):
        """Store the proxy settings and initialise the base connection.

        The six proxy parameters are kept, in this order, as the positional
        arguments later passed to ``socksocket.setproxy``.
        """
        self.proxyargs = (proxytype, proxyaddr, proxyport, rdns,
                          username, password)
        httplib.HTTPConnection.__init__(self, *args, **kwargs)

    def connect(self):
        """Create the proxied socket and connect it to ``(host, port)``."""
        self.sock = socks.socksocket()
        self.sock.setproxy(*self.proxyargs)
        # Accept integer timeouts as well as floats: checking for ``float``
        # alone silently dropped values such as ``timeout=10``. Non-numeric
        # values (presumably the base class's "no timeout given" default --
        # TODO confirm) are still left untouched.
        if isinstance(self.timeout, (int, float)):
            self.sock.settimeout(self.timeout)
        self.sock.connect((self.host, self.port))
由于 Scrapy 代码中对 Twisted reactor 的使用非常复杂,我无法弄清楚如何将 SocksiPy 接入其中。有什么想法吗?
请不要用 privoxy 之类的替代方案来回答,也不要发布“Scrapy 不支持 SOCKS 代理”这样的答案——这一点我知道,这正是我要编写一个自定义 Downloader、用 SocksiPy 来发出请求的原因。
执行 pip install txsocksx 之后,需要把 Scrapy 的 ScrapyAgent 替换为使用 txsocksx.http.SOCKS5Agent 的实现。
pip install txsocksx
scrapy
ScrapyAgent
txsocksx.http.SOCKS5Agent
我只是从 scrapy/core/downloader/handlers/http.py 中复制了 HTTP11DownloadHandler 和 ScrapyAgent 的代码,对它们进行子类化,并写出了如下代码:
HTTP11DownloadHandler
scrapy/core/downloader/handlers/http.py
class TorProxyDownloadHandler(HTTP11DownloadHandler):
    """Download handler that delegates every request to ``ScrapyTorAgent``."""

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download.

        A new ``ScrapyTorAgent`` is built per request from this handler's
        context factory and connection pool; ``spider`` is not used here.
        """
        agent = ScrapyTorAgent(contextFactory=self._contextFactory,
                               pool=self._pool)
        return agent.download_request(request)


class ScrapyTorAgent(ScrapyAgent):
    """``ScrapyAgent`` variant that can reach a proxy via ``SOCKS5Agent``."""

    def _get_agent(self, request, timeout):
        """Pick the agent used to perform ``request``.

        * No ``proxy`` in ``request.meta``: the regular ``self._Agent``.
        * Proxy set, ``https`` URL, and ``noconnect`` absent from the proxy
          parameters: ``self._TunnelingAgent``.
        * Any other proxied request: a ``SOCKS5Agent`` whose proxy endpoint
          is a TCP connection to the proxy host/port.
        """
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
            scheme = _parse(request.url)[0]
            omitConnectTunnel = proxyParams.find('noconnect') >= 0
            if scheme == 'https' and not omitConnectTunnel:
                # NOTE(review): this branch goes through the tunneling agent
                # rather than SOCKS5Agent -- confirm that is intended when the
                # configured proxy is a SOCKS endpoint.
                proxyConf = (proxyHost, proxyPort,
                             request.headers.get('Proxy-Authorization', None))
                return self._TunnelingAgent(
                    reactor, proxyConf,
                    contextFactory=self._contextFactory,
                    connectTimeout=timeout,
                    bindAddress=bindaddress,
                    pool=self._pool)
            # The previous code re-parsed request.url here into names that
            # were never read (and overwrote proxyParams); that dead
            # statement has been dropped.
            proxyEndpoint = TCP4ClientEndpoint(
                reactor, proxyHost, proxyPort,
                timeout=timeout, bindAddress=bindaddress)
            # The SOCKS agent is created with only the reactor and endpoint;
            # the context factory and pool are not passed to it.
            return SOCKS5Agent(reactor, proxyEndpoint=proxyEndpoint)

        return self._Agent(reactor,
                           contextFactory=self._contextFactory,
                           connectTimeout=timeout,
                           bindAddress=bindaddress,
                           pool=self._pool)
在settings.py中,需要执行以下操作:
# Map the 'http' scheme to the custom handler class, given by dotted path.
DOWNLOAD_HANDLERS = {
    'http': 'crawler.http.TorProxyDownloadHandler',
}
现在 Scrapy 就可以通过 Tor 之类的 SOCKS 代理发出请求了。