    import scrapy
    from selenium import webdriver

    class ProductSpider(scrapy.Spider):
        name = "product_spider"
        allowed_domains = ['ebay.com']
        start_urls = ['http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40']

        def __init__(self):
            self.driver = webdriver.Firefox()

        def parse(self, response):
            self.driver.get(response.url)

            while True:
                try:
                    # Locate and follow the "next page" link; on the last page
                    # the element is missing, the lookup raises, and the loop ends.
                    next_page = self.driver.find_element_by_xpath('//td[@class="pagn-next"]/a')
                    next_page.click()
                    # get the data and write it to scrapy items
                except Exception:
                    break

            self.driver.close()
This solution works, but it requests the same URL twice: once from the Scrapy scheduler and again from the Selenium web driver. So the job takes twice as long as a plain Scrapy crawl without Selenium. How can this be avoided?
Here is a useful trick to solve this problem.
*Create a web service for Selenium and run it locally.*
    from flask import Flask, request, make_response
    from flask_restful import Resource, Api
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    app = Flask(__name__)
    api = Api(app)

    class Selenium(Resource):
        _driver = None

        @staticmethod
        def getDriver():
            # Lazily create a single shared headless Chrome instance.
            if not Selenium._driver:
                chrome_options = Options()
                chrome_options.add_argument("--headless")
                Selenium._driver = webdriver.Chrome(options=chrome_options)
            return Selenium._driver

        @property
        def driver(self):
            return Selenium.getDriver()

        def get(self):
            # Render the requested URL in the browser and return its HTML.
            url = str(request.args['url'])
            self.driver.get(url)
            return make_response(self.driver.page_source)

    api.add_resource(Selenium, '/')

    if __name__ == '__main__':
        app.run(debug=True)
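The driver is created lazily and held as a class-level singleton, so every incoming request reuses the same headless browser instead of paying the startup cost of a new one. (Note that `webdriver.Chrome(chrome_options=...)` is deprecated; recent Selenium releases expect the `options=` keyword used above.)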
The service will now return the rendered web page through the Selenium Chrome/Firefox driver.
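To sanity-check the service, you can ask it to render a page directly; a minimal sketch using `requests` (the target URL here is just an example):

    import requests
    from urllib.parse import quote

    # Percent-encode the target URL and pass it to the local Selenium service.
    target = 'http://www.ebay.com/sch/i.html?_nkw=python'
    resp = requests.get('http://127.0.0.1:5000/?url={}'.format(quote(target)))
    print(resp.status_code, len(resp.text))  # expect 200 and non-empty HTML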
Now, here is what our spider will look like:
    import scrapy
    from urllib.parse import quote

    class ProductSpider(scrapy.Spider):
        name = 'products'
        allowed_domains = ['ebay.com']
        urls = [
            'http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40',
        ]

        def start_requests(self):
            for url in self.urls:
                # Route the request through the local Selenium service so the
                # response already contains the rendered HTML.
                url = 'http://127.0.0.1:5000/?url={}'.format(quote(url))
                yield scrapy.Request(url)

        def parse(self, response):
            yield {
                'field': response.xpath('//td[@class="pagn-next"]/a').get(),
            }
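With the web service listening on 127.0.0.1:5000, the spider runs like any other. For completeness, a minimal sketch of launching it from a script rather than a full Scrapy project (the output file name is just an example):

    from scrapy.crawler import CrawlerProcess

    # Assumes ProductSpider is defined/imported in this module and the
    # Flask Selenium service above is already running.
    process = CrawlerProcess(settings={
        'FEEDS': {'products.json': {'format': 'json'}},  # export scraped items
    })
    process.crawl(ProductSpider)
    process.start()  # blocks until the crawl finishes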