This code is meant to download documents that it has to locate among a set of given links. It appears to find the link to the PDF file, but it fails to download it. Where might the problem be?
import os
import time

import scrapy
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from thefuzz import fuzz  # or: from fuzzywuzzy import fuzz
from webdriver_manager.chrome import ChromeDriverManager

class DownloaderSpider(scrapy.Spider):
    name = "downloader"

    def __init__(self, *args, **kwargs):
        super(DownloaderSpider, self).__init__(*args, **kwargs)
        # Configure Chrome WebDriver with download preferences
        options = webdriver.ChromeOptions()
        prefs = {
            "download.default_directory": "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads",
            "download.prompt_for_download": False,
            "plugins.always_open_pdf_externally": True
        }
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    def parse(self, response):
        query = response.meta['query']
        summary = response.meta['summary']
        date = response.meta['date']
        deadline = response.meta['deadline']
        pdf_link = None
        self.driver.get(response.url)
        # Wait until the document is fully loaded
        WebDriverWait(self.driver, 15).until(
            lambda driver: driver.execute_script("return document.readyState") == "complete"
        )
        # wait_for_stability() and contains_matching_date() are helper methods defined elsewhere in the spider
        if not self.wait_for_stability():
            self.log("Page did not stabilize in time.")
            return
        response = HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        elements = response.xpath("//tr | //div[not(div)]")
        self.log(f"Found {len(elements)} elements containing text.")
        best_match = None
        highest_score = 0
        for element in elements:
            element_text = element.xpath("string(.)").get().strip()
            score = fuzz.partial_ratio(summary.lower(), element_text.lower())
            # Accept element if it contains a matching date (or deadline) in any format
            if score > highest_score and (self.contains_matching_date(element_text, date) or self.contains_matching_date(element_text, deadline)):
                highest_score = score
                best_match = element
        if best_match and highest_score >= 0:  # Adjust threshold as needed
            self.log(f"Best match found with score {highest_score}")
            pdf_link = best_match.xpath(".//a[contains(@href, '.pdf')]/@href").get()
            if pdf_link:
                self.log(f"Found PDF link: {pdf_link}")
        if pdf_link:
            pdf_link = response.urljoin(pdf_link)
            try:
                # Use Selenium to click the PDF link and trigger the download
                pdf_element = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, f"//a[contains(@href, '{pdf_link.split('/')[-1]}')]"))
                )
                pdf_element.click()
                # Wait for the file to appear in the download directory
                download_dir = "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads"
                local_filename = query.replace(' ', '_') + ".pdf"
                local_filepath = os.path.join(download_dir, local_filename)
                timeout = 30  # seconds
                start_time = time.time()
                while not os.path.exists(local_filepath):
                    if time.time() - start_time > timeout:
                        raise Exception("Download timed out.")
                    time.sleep(1)
                self.log(f"Downloaded file {local_filepath}")
            except Exception as e:
                self.log(f"Failed to download file from {pdf_link}: {e}")
        else:
            self.log("No direct PDF link found, checking for next page link.")
            next_page = best_match.xpath(".//a/@href").get() if best_match else None
            if next_page:
                next_page = response.urljoin(next_page)
                self.log(f"Following next page link: {next_page}")
                yield scrapy.Request(next_page, self.parse_next_page, meta={'query': query})
    def parse_next_page(self, response):
        query = response.meta['query']
        self.driver.get(response.url)
        WebDriverWait(self.driver, 15).until(
            lambda driver: driver.execute_script("return document.readyState") == "complete"
        )
        if not self.wait_for_stability():
            self.log("Page did not stabilize in time.")
            return
        response = HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        pdf_link = response.xpath("//a[contains(@href, '.pdf')]/@href").get()
        if pdf_link:
            pdf_link = response.urljoin(pdf_link)
            try:
                # Use Selenium to click the PDF link and trigger the download
                pdf_element = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, f"//a[contains(@href, '{pdf_link.split('/')[-1]}')]"))
                )
                pdf_element.click()
                # Wait for the file to appear in the download directory
                download_dir = "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads"
                local_filename = query.replace(' ', '_') + ".pdf"
                local_filepath = os.path.join(download_dir, local_filename)
                timeout = 30  # seconds
                start_time = time.time()
                while not os.path.exists(local_filepath):
                    if time.time() - start_time > timeout:
                        raise Exception("Download timed out.")
                    time.sleep(1)
                self.log(f"Downloaded file {local_filepath}")
            except Exception as e:
                self.log(f"Failed to download file from {pdf_link}: {e}")
        else:
            self.log("No PDF link found on next page.")
The pipeline is as follows: the code creates and configures the Selenium driver and walks through the links listed in a .csv file. On each page it finds the most likely matching entry; if that entry is not itself a download link, it navigates to the new page, finds the first download link there, and downloads it (or tries to). The step that feeds the spider from the CSV is not shown in the snippet; a sketch of it follows, and after that the log corresponding to one such download attempt.
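A minimal sketch of what that feeding step might look like, assuming (hypothetically) that the CSV has url, query, summary, date and deadline columns matching the meta keys that parse() reads; the file name and column names are assumptions, not taken from the original code:

import csv

# Hypothetical sketch: "links.csv" and the column names are assumptions,
# chosen only to match the meta keys parse() expects.
def start_requests(self):
    with open("links.csv", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            yield scrapy.Request(
                row["url"],
                callback=self.parse,
                meta={
                    "query": row["query"],
                    "summary": row["summary"],
                    "date": row["date"],
                    "deadline": row["deadline"],
                },
            )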
2025-03-29 17:31:14 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying <GET https://oshanarc.gov.na/procurement> (failed 6 times): [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
2025-03-29 17:31:14 [scrapy.core.scraper] ERROR: Error downloading <GET https://oshanarc.gov.na/procurement>
Traceback (most recent call last):
  File "C:\Users\marti\Downloads\Web Scraper\venv\Lib\site-packages\twisted\internet\defer.py", line 2013, in _inlineCallbacks
    result = context.run(
        cast(Failure, result).throwExceptionIntoGenerator, gen
    )
  File "C:\Users\marti\Downloads\Web Scraper\venv\Lib\site-packages\twisted\python\failure.py", line 467, in throwExceptionIntoGenerator
    return g.throw(self.value.with_traceback(self.tb))
           ~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\marti\Downloads\Web Scraper\venv\Lib\site-packages\scrapy\core\downloader\middleware.py", line 68, in process_request
    return (yield download_func(request, spider))
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
twisted.web._newclient.ResponseNeverReceived: [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
2025-03-29 17:31:14 [scrapy.core.engine] INFO: Closing spider (finished)
Any idea what the problem might be?
Your code runs into several potential issues when it tries to download the PDF file. Based on the error log and an analysis of the code, here are the likely causes and solutions:

Connection lost error:

Connection to the other side was lost in a non-clean fashion: Connection lost

This indicates that the connection to the target website is unstable or was interrupted.
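If the Selenium browser can load the page while Scrapy's own requests keep failing like this, the server may be rejecting Scrapy's default client. One thing worth trying (an assumption, not something visible in the log) is sending browser-like headers with the Scrapy request:

# Hypothetical mitigation: present a browser-like User-Agent to the server.
# The UA string is only an example; url and meta stand for the values the
# spider already uses when it yields requests.
BROWSER_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
              "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36")

yield scrapy.Request(url, headers={"User-Agent": BROWSER_UA},
                     callback=self.parse, meta=meta)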
Possible reasons the PDF download fails:
if pdf_link:
    try:
        # Option 1: download directly with requests (recommended)
        import requests
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(pdf_link, headers=headers, stream=True)
        if response.status_code == 200:
            local_filename = query.replace(' ', '_') + ".pdf"
            local_filepath = os.path.join(download_dir, local_filename)
            with open(local_filepath, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            self.log(f"Successfully downloaded {local_filepath}")
        else:
            self.log(f"Failed to download: HTTP {response.status_code}")
    except Exception as e:
        self.log(f"Download failed: {str(e)}")
# Add these options at initialization
prefs = {
    "download.default_directory": "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,
    "profile.default_content_settings.popups": 0,
    "profile.content_settings.exceptions.automatic_downloads.*.setting": 1
}
options.add_experimental_option("prefs", prefs)
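Also make sure the download directory actually exists before the driver starts; if it does not, Chrome may fall back to the default Downloads folder or fail silently:

import os

download_dir = "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads"
# exist_ok avoids an error when the directory is already there
os.makedirs(download_dir, exist_ok=True)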
# Add these Chrome arguments as well
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-extensions')
options.add_argument('--disable-popup-blocking')
# Replace the naive wait logic
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

try:
    # Wait until the link is visible and clickable
    pdf_element = WebDriverWait(self.driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, '.pdf')]"))
    )
    # Clicking via JavaScript tends to be more reliable
    self.driver.execute_script("arguments[0].click();", pdf_element)
    # Give the download a moment to start
    time.sleep(5)
except TimeoutException:
    self.log("PDF link not found or not clickable")
# Add these settings to the Scrapy configuration
custom_settings = {
    'DOWNLOAD_TIMEOUT': 60,
    'RETRY_TIMES': 5,
    'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
    'DOWNLOAD_DELAY': 2,
    'CONCURRENT_REQUESTS': 1,
    'HTTPCACHE_ENABLED': True
}
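Note that custom_settings is read when the crawler is created, so it has to be defined as a class attribute on the spider, not assigned at runtime:

class DownloaderSpider(scrapy.Spider):
    name = "downloader"
    custom_settings = {
        'DOWNLOAD_TIMEOUT': 60,
        'RETRY_TIMES': 5,
        'RETRY_HTTP_CODES': [500, 502, 503, 504, 408, 429],
        'DOWNLOAD_DELAY': 2,
        'CONCURRENT_REQUESTS': 1,
        'HTTPCACHE_ENABLED': True,
    }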
Login or terms of use: check whether the site requires logging in or accepting terms before serving the file.

Validate the PDF link: make sure the link points directly to a PDF file rather than to an intermediate page.

File-system permissions: make sure the process can write to the download directory, and consider using absolute paths.

Enhanced logging:

self.log(f"Attempting to download from: {pdf_link}")
self.log(f"Download directory: {download_dir}")
self.log(f"Local filepath will be: {local_filepath}")

Fallback download method: download with requests or urllib, or yield a scrapy Request for the PDF and save the response body; see the sketch below.

With these improvements, your PDF download should be considerably more reliable. If the problem persists, debug step by step: first make sure the PDF link is found correctly, then tackle the download itself.
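A minimal sketch of that last option, letting Scrapy fetch the PDF itself and writing the bytes to disk in the callback (save_pdf is a hypothetical helper name):

import os

import scrapy

# Inside parse(), instead of clicking with Selenium:
#     yield scrapy.Request(pdf_link, callback=self.save_pdf, meta={'query': query})

def save_pdf(self, response):
    # Scrapy has already fetched the PDF; response.body holds the raw bytes
    download_dir = "c:\\Users\\marti\\Downloads\\Web Scraper\\downloads"
    local_filepath = os.path.join(
        download_dir, response.meta['query'].replace(' ', '_') + ".pdf"
    )
    with open(local_filepath, 'wb') as f:
        f.write(response.body)
    self.log(f"Saved {local_filepath}")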