小能豆

Problem downloading files with Selenium + Scrapy


This code is meant to download documents that it has to locate among a given set of links. It appears to find the link to the PDF file, but it fails to download it. Where might the problem be?

import os
import time

import scrapy
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from fuzzywuzzy import fuzz  # or thefuzz/rapidfuzz, whichever provides fuzz here


class DownloaderSpider(scrapy.Spider):

    def __init__(self, *args, **kwargs):
        super(DownloaderSpider, self).__init__(*args, **kwargs)

        # Configure Chrome WebDriver with download preferences
        options = webdriver.ChromeOptions()
        prefs = {
            "download.default_directory": "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads",
            "download.prompt_for_download": False,
            "plugins.always_open_pdf_externally": True
        }
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    def parse(self, response):
        query = response.meta['query']
        summary = response.meta['summary']
        date = response.meta['date']
        deadline = response.meta['deadline']
        pdf_link = None

        self.driver.get(response.url)

        # Wait until the document is fully loaded
        WebDriverWait(self.driver, 15).until(
            lambda driver: driver.execute_script("return document.readyState") == "complete"
        )

        if not self.wait_for_stability():
            self.log("Page did not stabilize in time.")
            return

        response = HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

        elements = response.xpath("//tr | //div[not(div)]")
        self.log(f"Found {len(elements)} elements containing text.")

        best_match = None
        highest_score = 0
        for element in elements:
            element_text = element.xpath("string(.)").get().strip()
            score = fuzz.partial_ratio(summary.lower(), element_text.lower())
            # Accept element if it contains a matching date (or deadline) in any format
            if score > highest_score and (self.contains_matching_date(element_text, date) or self.contains_matching_date(element_text, deadline)):
                highest_score = score
                best_match = element

        if best_match and highest_score >= 0:  # Adjust threshold as needed
            self.log(f"Best match found with score {highest_score}")
            pdf_link = best_match.xpath(".//a[contains(@href, '.pdf')]/@href").get()
            if pdf_link:
                self.log(f"Found PDF link: {pdf_link}")
        if pdf_link:
            pdf_link = response.urljoin(pdf_link)
            try:
                # Use Selenium to click the PDF link and trigger the download
                pdf_element = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, f"//a[contains(@href, '{pdf_link.split('/')[-1]}')]"))
                )
                pdf_element.click()

                # Wait for the file to appear in the download directory
                download_dir = "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads"
                local_filename = query.replace(' ', '_') + ".pdf"
                local_filepath = os.path.join(download_dir, local_filename)

                timeout = 30  # seconds
                start_time = time.time()
                while not os.path.exists(local_filepath):
                    if time.time() - start_time > timeout:
                        raise Exception("Download timed out.")
                    time.sleep(1)

                self.log(f"Downloaded file {local_filepath}")
            except Exception as e:
                self.log(f"Failed to download file from {pdf_link}: {e}")
        else:
            self.log("No direct PDF link found, checking for next page link.")
            next_page = best_match.xpath(".//a/@href").get() if best_match else None
            if next_page:
                next_page = response.urljoin(next_page)
                self.log(f"Following next page link: {next_page}")
                yield scrapy.Request(next_page, self.parse_next_page, meta={'query': query})

    def parse_next_page(self, response):
        query = response.meta['query']

        self.driver.get(response.url)

        WebDriverWait(self.driver, 15).until(
            lambda driver: driver.execute_script("return document.readyState") == "complete"
        )

        if not self.wait_for_stability():
            self.log("Page did not stabilize in time.")
            return

        response = HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

        pdf_link = response.xpath("//a[contains(@href, '.pdf')]/@href").get()
        if pdf_link:
            pdf_link = response.urljoin(pdf_link)
            try:
                # Use Selenium to click the PDF link and trigger the download
                pdf_element = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, f"//a[contains(@href, '{pdf_link.split('/')[-1]}')]"))
                )
                pdf_element.click()

                # Wait for the file to appear in the download directory
                download_dir = "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads"
                local_filename = query.replace(' ', '_') + ".pdf"
                local_filepath = os.path.join(download_dir, local_filename)

                timeout = 30  # seconds
                start_time = time.time()
                while not os.path.exists(local_filepath):
                    if time.time() - start_time > timeout:
                        raise Exception("Download timed out.")
                    time.sleep(1)

                self.log(f"Downloaded file {local_filepath}")
            except Exception as e:
                self.log(f"Failed to download file from {pdf_link}: {e}")
        else:
            self.log("No PDF link found on next page.")

The pipeline is as follows: the code creates and configures the Selenium driver and works through the links in a .csv file. It finds the most likely matching entry on the page; if that entry is not a download link, it navigates to the new page, finds the first download link there, and downloads it (or tries to). This is the log corresponding to the download:

2025-03-29 17:31:14 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying <GET https://oshanarc.gov.na/procurement> (failed 6 times): [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
2025-03-29 17:31:14 [scrapy.core.scraper] ERROR: Error downloading <GET https://oshanarc.gov.na/procurement>
Traceback (most recent call last):
  File "C:\Users\marti\Downloads\Web Scraper\venv\Lib\site-packages\twisted\internet\defer.py", line 2013, in _inlineCallbacks
    result = context.run(
        cast(Failure, result).throwExceptionIntoGenerator, gen
    )
  File "C:\Users\marti\Downloads\Web Scraper\venv\Lib\site-packages\twisted\python\failure.py", line 467, in throwExceptionIntoGenerator
    return g.throw(self.value.with_traceback(self.tb))
           ~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\marti\Downloads\Web Scraper\venv\Lib\site-packages\scrapy\core\downloader\middleware.py", line 68, in process_request
    return (yield download_func(request, spider))
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
twisted.web._newclient.ResponseNeverReceived: [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
2025-03-29 17:31:14 [scrapy.core.engine] INFO: Closing spider (finished)

Any idea what the problem might be?
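For reference, the entry point that feeds these links looks roughly like the sketch below; the file name and column names here are placeholders, not the real ones:

# Rough sketch of the entry point that feeds links from the .csv
# (file name and column names are placeholders):
import csv

def start_requests(self):
    with open("links.csv", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            yield scrapy.Request(
                row["url"],
                callback=self.parse,
                meta={
                    "query": row["query"],
                    "summary": row["summary"],
                    "date": row["date"],
                    "deadline": row["deadline"],
                },
            )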



1 answer

小能豆

Your code runs into several potential issues when trying to download the PDF file. Based on the error log and a review of the code, here are the likely causes and solutions:

Analysis of the main problems

  1. Connection lost error
    Connection to the other side was lost in a non-clean fashion: Connection lost
    This indicates that the connection to the target site is unstable or was interrupted.

  2. Possible reasons the PDF download fails:
    - Clicking the PDF link directly with Selenium may not actually trigger a download
    - File path or permission problems
    - The site may have anti-scraping measures
    - Wait times that are too short, or inaccurate element locators
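
On the connection-lost error specifically: the traceback shows Scrapy's own downloader failing to fetch the page, before Selenium is even involved. Sites sometimes drop connections from Scrapy's default user agent, so a cheap first thing to try is a browser-like User-Agent in the settings; a minimal sketch (the exact UA string is only an example):

# In settings.py or the spider's custom_settings: present a browser-like
# User-Agent to the site (the string below is just an example)
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)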

Solutions

1. Improve the download logic

if pdf_link:
    try:
        # Method 1: download directly with requests (recommended)
        import requests
        headers = {'User-Agent': 'Mozilla/5.0'}
        # Use a separate name so we don't shadow the Scrapy HtmlResponse `response`
        pdf_response = requests.get(pdf_link, headers=headers, stream=True)

        if pdf_response.status_code == 200:
            # Same directory as configured in the Chrome prefs
            download_dir = "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads"
            local_filename = query.replace(' ', '_') + ".pdf"
            local_filepath = os.path.join(download_dir, local_filename)

            with open(local_filepath, 'wb') as f:
                for chunk in pdf_response.iter_content(1024):
                    f.write(chunk)

            self.log(f"Successfully downloaded {local_filepath}")
        else:
            self.log(f"Failed to download: HTTP {pdf_response.status_code}")

    except Exception as e:
        self.log(f"Download failed: {str(e)}")

2. Add robustness measures

# Add these options when initializing the driver
prefs = {
    "download.default_directory": "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,
    "profile.default_content_settings.popups": 0,
    "profile.content_settings.exceptions.automatic_downloads.*.setting": 1
}
options.add_experimental_option("prefs", prefs)

# Also add these Chrome command-line arguments
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-extensions')
options.add_argument('--disable-popup-blocking')
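
Also note that Chrome will not necessarily create download.default_directory for you; depending on the version it may silently fall back to the normal Downloads folder if the path does not exist. Creating it up front is a cheap safeguard:

import os

download_dir = "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads"
# Make sure the target directory exists before handing it to Chrome
os.makedirs(download_dir, exist_ok=True)
prefs["download.default_directory"] = download_dir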

3. Improve the wait strategy

# Replace the simple wait logic
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

try:
    # Wait until the link is visible and clickable
    pdf_element = WebDriverWait(self.driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, '.pdf')]"))
    )

    # Clicking via JavaScript is often more reliable
    self.driver.execute_script("arguments[0].click();", pdf_element)

    # Give the download time to start
    time.sleep(5)

except TimeoutException:
    self.log("PDF link not found or not clickable")

4. Handle the connection problems

# Add these settings to the Scrapy configuration
custom_settings = {
    'DOWNLOAD_TIMEOUT': 60,
    'RETRY_TIMES': 5,
    'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
    'DOWNLOAD_DELAY': 2,
    'CONCURRENT_REQUESTS': 1,
    'HTTPCACHE_ENABLED': True
}
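
Note that custom_settings is a class attribute in Scrapy, so it belongs directly on the spider class, for example:

class DownloaderSpider(scrapy.Spider):
    name = "downloader"  # assumed name; not shown in the original code
    custom_settings = {
        'DOWNLOAD_TIMEOUT': 60,
        'RETRY_TIMES': 5,
        'DOWNLOAD_DELAY': 2,
    }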

Other suggestions

  1. Check the target site
    - Make sure the site does not have anti-scraping measures
    - Check whether you need to log in or accept terms first

  2. Verify the PDF link
    - Print the PDF link you found and test it manually in a browser
    - Make sure it points directly to a PDF file rather than an intermediate page

  3. File system permissions
    - Make sure the download directory exists and is writable
    - Consider using absolute paths

  4. Better logging

    self.log(f"Attempting to download from: {pdf_link}")
    self.log(f"Download directory: {download_dir}")
    self.log(f"Local filepath will be: {local_filepath}")

  5. Fallback download methods
    - If the Selenium click does not work, try downloading directly with requests or urllib
    - Or combine Scrapy's Request with yield (see the sketch after this list)
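
A minimal sketch of that Scrapy-native fallback: yield a Request for the PDF URL and write the response body to disk in the callback (save_pdf is a made-up name):

def parse(self, response):
    # ... after pdf_link has been found and urljoin()ed ...
    yield scrapy.Request(pdf_link, callback=self.save_pdf,
                         meta={'query': response.meta['query']})

def save_pdf(self, response):
    # For a direct PDF URL, the response body is the file itself
    download_dir = "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads"
    local_filepath = os.path.join(
        download_dir, response.meta['query'].replace(' ', '_') + ".pdf")
    with open(local_filepath, 'wb') as f:
        f.write(response.body)
    self.log(f"Saved PDF to {local_filepath}")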

With these improvements, your PDF download should be considerably more reliable. If the problem persists, debug step by step: first confirm that the PDF link is being found correctly, then work on the download itself.
