I'm new to Scrapy, and I have this spider code:
class Example_spider(BaseSpider):
    name = "example"
    allowed_domains = ["www.example.com"]

    def start_requests(self):
        yield self.make_requests_from_url("http://www.example.com/bookstore/new")

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        urls = hxs.select('//div[@class="bookListingBookTitle"]/a/@href').extract()
        for i in urls:
            yield Request(urljoin("http://www.example.com/", i[1:]),
                          callback=self.parse_url)

    def parse_url(self, response):
        hxs = HtmlXPathSelector(response)
        main = hxs.select('//div[@id="bookshelf-bg"]')
        items = []
        for i in main:
            item = Exampleitem()
            item['book_name'] = i.select('div[@class="slickwrap full"]/div[@id="bookstore_detail"]/div[@class="book_listing clearfix"]/div[@class="bookstore_right"]/div[@class="title_and_byline"]/p[@class="book_title"]/text()')[0].extract()
            item['price'] = i.select('div[@id="book-sidebar-modules"]/div[@class="add_to_cart_wrapper slickshadow"]/div[@class="panes"]/div[@class="pane clearfix"]/div[@class="inner"]/div[@class="add_to_cart 0"]/form/div[@class="line-item"]/div[@class="line-item-price"]/text()').extract()
            items.append(item)
        return items
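(For reference, the spider above assumes imports roughly like these; this is a sketch based on the old, pre-1.0 Scrapy API, and the example.items module path is a guess from a typical project layout:)

# Assumed imports for the spider (old Scrapy / Python 2 era).
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from urlparse import urljoin             # Python 2 stdlib location
from example.items import Exampleitem    # hypothetical items module path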
The pipeline code is:
class examplePipeline(object):

    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                                            db='blurb',
                                            user='root',
                                            passwd='redhat',
                                            cursorclass=MySQLdb.cursors.DictCursor,
                                            charset='utf8',
                                            use_unicode=True)

    def process_item(self, spider, item):
        # run db query in thread pool
        assert isinstance(item, Exampleitem)
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        print "db connected-=========>"
        # create record if doesn't exist.
        tx.execute("select * from example_book_store where book_name = %s",
                   (item['book_name']))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute("""INSERT INTO example_book_store (book_name, price)
                          VALUES (%s, %s)""",
                       (item['book_name'], item['price']))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
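(For context, the pipeline module as posted would need roughly these imports to run at all; the paths follow Twisted and the old Scrapy log API. Note that Exampleitem is not among them, which is relevant to the first error below:)

# Imports the pipeline code appears to rely on (Twisted / old Scrapy).
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy import log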
After running this, I get the following error:
exceptions.NameError: global name 'Exampleitem' is not defined
I get the error above when I add the following line to the process_item method:
assert isinstance(item, Exampleitem)
and without that line I get:

exceptions.TypeError: 'Example_spider' object is not subscriptable
Can anyone run this code and make sure all the items get saved to the database?
Try the following code in your pipeline. Both of your errors come from the same mistake: Scrapy calls the pipeline method as process_item(item, spider), but your signature swaps the two arguments, so the name item is actually bound to the spider (hence 'Example_spider' object is not subscriptable), and the assert fails with a NameError simply because Exampleitem is never imported in the pipeline module. A simpler, blocking pipeline that sidesteps both problems:
import MySQLdb

class MySQLStorePipeline(object):

    def __init__(self):
        # connect() positional arguments are host, user, passwd, db
        self.conn = MySQLdb.connect('host', 'user', 'passwd', 'dbname',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # note the argument order: Scrapy passes (item, spider)
        try:
            self.cursor.execute("""INSERT INTO example_book_store (book_name, price)
                                   VALUES (%s, %s)""",
                                (item['book_name'].encode('utf-8'),
                                 item['price'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
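For the pipeline to run at all, it also has to be registered in settings.py. A minimal sketch, assuming your project package is named example (list-style setting from the same Scrapy era as the code above; newer Scrapy versions use a dict with an order value):

# settings.py -- register the pipeline so Scrapy actually invokes it.
# The 'example' package name is an assumption based on the spider above.
ITEM_PIPELINES = ['example.pipelines.MySQLStorePipeline']
# Newer Scrapy versions instead expect:
# ITEM_PIPELINES = {'example.pipelines.MySQLStorePipeline': 300}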