我们从Python开源项目中,提取了以下30个代码示例,用于说明如何使用scrapy.log.DEBUG。
def process_item(self, item, spider):
    """Insert the item into MongoDB and log at DEBUG level.

    Fix: the original kept a ``valid`` flag that was dead code -- the
    ``raise`` fired before the flag could ever be read as False.  The
    trailing commented-out ``testdb`` experiment was removed as well.
    """
    # NOTE(review): this iterates the item's *keys*, so it only drops on a
    # falsy field name, never on a falsy value -- confirm that is intended.
    for data in item:
        if not data:
            raise DropItem('Missing{0}!'.format(data))
    self.collection.insert(dict(item))
    log.msg('??????!', level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Store a scraped app item in MongoDB.

    Fix: removed the dead ``valid`` flag (a falsy field name raises
    immediately, so the flag could never be False at the check) and the
    commented-out pudb / file-writing debug lines.
    """
    for data in item:
        if not data:
            raise DropItem("Missing {0}!".format(data))
    self.collection.insert(dict(item))
    log.msg("new app added to MongoDB database!", level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Route each item to the right Mongo collection based on the spider."""
    if spider.name == 'baiduTopStockSpider':
        stock_coll = self.db[settings['stock']]
        record = dict(item)
        # a stock is identified by its (num, source) pair
        existing = list(stock_coll.find({'num': record["num"], 'source': record["source"]}))
        if existing:
            # already stored -- overwrite the matching document in place
            stock_coll.update({'_id': existing[0]['_id']}, record)
        else:
            stock_coll.insert(record)
        log.msg("stock added to MongoDB database!", level=log.DEBUG, spider=spider)
    elif spider.name == 'xueqiuPostSpider':
        self.db['post'].save(dict(item))
        log.msg("post added to MongoDB database!", level=log.DEBUG, spider=spider)
    return item
def _retry(self, request, reason, spider):
    """Build a retry copy of *request*, or give up after max_retry_times."""
    attempt = request.meta.get('retry_times', 0) + 1
    if attempt > self.max_retry_times:
        # exhausted -- log the failure and return None so the request dies
        log.msg(format="Gave up retrying %(request)s "
                       "(failed %(retries)d times): %(reason)s",
                level=log.DEBUG, spider=spider,
                request=request, retries=attempt, reason=reason)
        return
    log.msg(format="Retrying %(request)s "
                   "(failed %(retries)d times): %(reason)s",
            level=log.DEBUG, spider=spider,
            request=request, retries=attempt, reason=reason)
    retryreq = request.copy()
    retryreq.meta['retry_times'] = attempt
    retryreq.dont_filter = True
    # our priority setup is different from super
    retryreq.meta['priority'] = retryreq.meta['priority'] - 10
    return retryreq
def process_item(self, item, spider):
    """Clean Qua items field-by-field, then persist the item to MongoDB.

    Fix: the five copy-pasted ``if item[f]: item[f] = wash(item[f])``
    blocks are collapsed into one loop over the field names.
    """
    if item['site'] == 'Qua':
        # wash every displayed field that has a truthy value
        for field in ('company', 'flight_time', 'airports', 'passtime', 'price'):
            if item[field]:
                item[field] = wash(item[field])
        # drops the item when any *field name* is falsy
        for data in item:
            if not data:
                raise DropItem("Missing data!")
        self.collection.insert(dict(item))
        log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
    elif item['site'] == 'Ctrip':
        self.collection.insert(dict(item))
        log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
    return item
def parse(self, response):
    # Scrape a JD.com product page.  Static fields are loaded from the
    # scrapy response; price and review counts are fetched with a Browser
    # session, since they are rendered client-side.
    el = JDspiderLoader(response=response)
    el.add_xpath('title', '//*[@id="name"]/h1/text()')
    with Browser() as browser:
        url = response.url
        browser.visit(url)
        price = browser.find_by_id('jd-price')
        if price == []:
            # fallback selector when no element with id "jd-price" exists
            price = browser.find_by_xpath('//*[@id="price"]/strong')
        # self.log(price[0].value, level=log.DEBUG)
        # [1:] drops the first character (presumably a currency symbol -- confirm)
        el.add_value('price', price[0].value[1:])
    with Browser() as browser:
        # product number: last URL path segment with its extension stripped
        number = response.url.split('/')[-1].split('.')[0]
        url = 'http://club.jd.com/review/' + number + '-2-1.html'
        browser.visit(url)
        # each count below uses [1:-1], which strips one character from each
        # end (presumably surrounding parentheses -- confirm on a live page)
        shaitu = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[5]/a/em')
        el.add_value('shaitu', shaitu[0].value[1:-1])
        haoping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[2]/a/em')
        el.add_value('haoping', haoping[0].value[1:-1])
        zhongping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[3]/a/em')
        el.add_value('zhongping', zhongping[0].value[1:-1])
        chaping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[4]/a/em')
        el.add_value('chaping', chaping[0].value[1:-1])
    return el.load_item()
def dropped(self, item, exception, response, spider):
    """Return the DEBUG-level log entry describing a dropped item."""
    details = {
        'exception': exception,
        'item': item,
    }
    return {'level': log.DEBUG, 'msg': logformatter.DROPPEDMSG, 'args': details}
def process_item(self, item, spider): valid = True print '--'*40 for data in item: if not data: valid = False raise DropItem("Missing {0}!".format(data)) if valid: try: self.collection.insert(dict(item)) log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider) except: print 'ggggg'*40 return item
def process_item(self, item, spider):
    """Insert the item into MongoDB; drop it when any field name is falsy.

    Fixes: removed the dead ``valid`` flag (unreachable after the raise)
    and corrected the "Missming" typo in the DropItem message.
    """
    for data in item:
        if not data:
            raise DropItem('Missing{}!'.format(data))
    self.coll.insert(dict(item))
    log.msg('item added to mongodb database !', level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Persist an event item to MongoDB.

    Fix: the ``valid`` flag was dead code -- a falsy field name raises
    before the flag is ever consulted.
    """
    for data in item:
        if not data:
            raise DropItem("Missing {0}!".format(data))
    self.collection.insert(dict(item))
    log.msg("Event added to MongoDB database!", level=log.DEBUG, spider=spider)
    return item
def process_request(self, request, spider):
    """Attach a randomly chosen User-Agent header to the outgoing request."""
    chosen_agent = random.choice(self.user_agent_list)
    if not chosen_agent:
        return
    # setdefault keeps an explicitly supplied User-Agent untouched
    request.headers.setdefault('User-Agent', chosen_agent)
    spider.log(
        u'User-Agent: {} {}'.format(request.headers.get('User-Agent'), request),
        level=log.DEBUG
    )
def start_listening(self):
    """Bind the web service to a port from the configured range and log it."""
    self.port = listen_tcp(self.portrange, self.host, self)
    bound = self.port.getHost()
    log.msg(format='Web service listening on %(host)s:%(port)d',
            level=log.DEBUG, host=bound.host, port=bound.port)
def process_item(self, item, spider):
    """Insert the item into MongoDB; drop it when any field name is falsy.

    Fix: removed the dead ``valid`` flag -- the ``raise`` makes the
    ``valid = False`` assignment unreachable as an observable state.
    """
    for data in item:
        if not data:
            raise DropItem('Missing{0}!'.format(data))
    self.collection.insert(dict(item))
    log.msg('question added to mongodb database!', level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Insert the item into MongoDB and pass it on down the pipeline.

    Fix: the original ended with ``return None``, which hands ``None`` to
    every later pipeline stage; Scrapy's pipeline contract is to return
    the item (or raise DropItem).
    """
    for data in item:
        if not data:
            raise DropItem("Missing data!")
    self.collection.insert(dict(item))
    log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
    return item
def _download_request(self, request, spider):
    """Download a request URL using webdriver."""
    log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
    driver = request.manager.webdriver
    driver.get(request.url)
    # optionally persist a screenshot of the rendered page
    want_shot = getattr(settings, 'TAKE_SCREENSHOT', None)
    shot_dir = getattr(settings, 'SCREENSHOT_LOCATION', None)
    if want_shot and shot_dir:
        # random numeric suffix distinguishes successive screenshots
        shot_path = shot_dir + str(randint(10000, 10000000)) + '.png'
        driver.save_screenshot(shot_path)
        request.meta['screenshot'] = shot_path
    request.meta['User-Agent'] = request.headers.get('User-Agent')
    request.meta['Referer'] = request.headers.get('Referer')
    return WebdriverResponse(request.url, driver)
def _do_action_request(self, request, spider):
    """Replay the queued webdriver actions on the already-loaded page."""
    log.msg('Running webdriver actions %s' % request.url, level=log.DEBUG)
    pending = request.actions
    pending.perform()
    return WebdriverResponse(request.url, request.manager.webdriver)
def process_item(self, item, spider):
    """Upsert the item into MongoDB keyed by its URL."""
    for field_name in item:
        if not field_name:
            raise DropItem("Missing data!")
    # keyed on url so a re-crawl refreshes the document instead of duplicating
    self.collection.update({'url': item['url']}, dict(item), upsert=True)
    log.msg("Question added to MongoDB !", level=log.DEBUG, spider=spider)
    return item
def debug(msg):
    # Convenience wrapper: stringify *msg* and emit it at scrapy's DEBUG
    # level.  (The original trailing comment here was mojibake.)
    log.msg(str(msg), level=log.DEBUG)
def process_item(self, item, spider):
    """Write the item to MongoDB, upserting on the configured unique key.

    Fix: ``self.__get_uniq_key()`` was called three times per item; the
    result is now computed once and reused.
    """
    uniq_key = self.__get_uniq_key()
    if uniq_key is None:
        # no unique key configured -- plain insert, duplicates allowed
        self.collection.insert(dict(item))
    else:
        self.collection.update(
            {uniq_key: item[uniq_key]},
            dict(item),
            upsert=True)
    log.msg("Item wrote to MongoDB database %s/%s" %
            (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
            level=log.DEBUG, spider=spider)
    return item
def debug(msg):
    """Emit *msg* (coerced to str) at scrapy's DEBUG log level."""
    text = str(msg)
    log.msg(text, level=log.DEBUG)
def open_spider(self, spider):
    """Connect to MongoDB and preload the set of already-stored nids.

    Fix: the manual loop that filled ``self.itemlist`` element by element
    is replaced with a set comprehension.
    """
    self.connection = pymongo.MongoClient(
        settings['MONGODB_SERVER'],
        settings['MONGODB_PORT']
    )
    self.db = self.connection[settings['MONGODB_DB']]
    self.collection = self.db[settings['MONGODB_COLLECTION']]
    log.msg('Load nid from MongoDB database!', level=log.DEBUG, spider=spider)
    # nids already in the collection; used for duplicate detection later
    self.itemlist = {doc['nid'] for doc in self.collection.find()}
def process_item(self, item, spider):
    """Insert the item unless its nid was seen when the spider opened."""
    nid = item['nid']
    if nid in self.itemlist:
        raise DropItem('Duplication data!')
    self.collection.insert(dict(item))
    log.msg('Goods added to MongoDB database!', level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Persist StackOverflow question items; pass other item types through.

    Fix: removed the dead ``valid`` flag -- the ``raise`` fires before the
    flag could ever be read as False.
    """
    if not isinstance(item, StackOverflowItem):
        return item
    for data in item:
        if not data:
            raise DropItem("Missing {0}!".format(data))
    self.collection.insert(dict(item))
    log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Persist StackOverflow job items; pass other item types through.

    Fix: removed the dead ``valid`` flag -- the ``raise`` fires before the
    flag could ever be read as False.
    """
    if not isinstance(item, StackOverflowItemJobs):
        return item
    for data in item:
        if not data:
            raise DropItem("Missing {0}!".format(data))
    self.collection.insert(dict(item))
    log.msg("Jobs added to MongoDB database!", level=log.DEBUG, spider=spider)
    return item
def process_item(self, item, spider):
    """Store the item in the collection matching its type.

    Fix: four copy-pasted isinstance/try/insert branches are replaced by
    a single table-driven dispatch; behavior (first matching type wins,
    insert failures silently ignored) is unchanged.
    """
    routes = [
        (TravelCrawlItem, self.spot_review),
        (TravelnoteItem, self.note),
        (TravelfoodItem, self.food_review),
        (TravelhotelItem, self.hotel_review),
    ]
    for item_cls, collection in routes:
        if isinstance(item, item_cls):
            try:
                collection.insert(dict(item))
                log.msg("News added to MongoDB database!",
                        level=log.DEBUG, spider=spider)
            except Exception:
                # best-effort insert, as in the original: failures ignored
                pass
            break
    return item
def parse(self, response):
    """Queue the next listing page and every item detail URL into Redis."""
    el = Pbdnof58Loader(response=response)
    next_page = response.xpath('//a[contains(@class, "next")]/@href').extract()
    self.log(next_page, level=log.DEBUG)
    redis_conn = Redis()
    if next_page != []:
        next_url = self.url + next_page[0]
        redis_conn.lpush('myspider:58_urls', next_url)
        # brief pause between listing-page pushes
        sleep(1)
        el.add_value('UrlofPage', next_url)
    for row in response.xpath('//table[contains(@class, "tbimg")]/tr'):
        href = row.xpath('td[contains(@class, "t")]/a/@href').extract()
        if len(href) == 1 and 'zhuan' not in href[0]:
            redis_conn.lpush('myspider:start_urls', href[0])
    return el.load_item()
def __do__insert(self, conn, item, spider):
    """Insert one 58pbdndb row; log (not raise) on a MySQL error.

    Fix: the legacy ``except MySQLdb.Error, e`` syntax is replaced with
    ``except MySQLdb.Error as e`` (valid on Python 2.6+ and Python 3).
    """
    try:
        conn.execute("""
            insert into 58pbdndb
            set title = %s, area = %s, price = %s, quality = %s, time = %s
            """, (item['title'], item['area'], item['price'],
                  item['quality'], item['time']))
    except MySQLdb.Error as e:
        spider.log("Mysql Error %d: %s" % (e.args[0], e.args[1]), level=log.DEBUG)
def file_path(self, request, response=None, info=None):
    """Build the stored image path: <account>/<album_name>/<guid>.jpg."""
    meta_item = request.meta['item']
    # the third-from-last URL path segment serves as a per-image identifier
    guid = request.url.split('/')[-3]
    log.msg(guid, level=log.DEBUG)
    return u'{0[account]}/{0[album_name]}/{1}.jpg'.format(meta_item, guid)
def process_item(self, item, spider):
    """Filter and persist crawled articles.

    Drops the item when: any field name is falsy, title/content is empty,
    the title contains an excluded keyword, it matches none of the
    configured keywords/authors, or an identical title is already stored.
    Surviving items are inserted and announced by mail.

    Fix: ``DropItem("title have invalid keywords")`` was constructed but
    never raised, so excluded titles silently slipped through the filter.
    The dead ``valid`` bookkeeping that this made redundant is removed.
    """
    for data in item:
        if not data:
            raise DropItem("Missing {0}!".format(data))
    if item['title'] == '':
        raise DropItem("title is empty")
    if item['content'] == '':
        raise DropItem("content is empty")
    for keyword in settings['EXCLUDE']:
        if keyword in item['title']:
            # BUG FIX: original forgot the ``raise`` here
            raise DropItem("title have invalid keywords")
    # keep only items matching a configured keyword or author
    iskey = False
    for key in settings['KEYS']:
        if key in item['title']:
            iskey = True
            break
    for author in settings['AUTHOR']:
        if author == item['author']:
            iskey = True
            break
    if not iskey:
        raise DropItem("item do not have keywords")
    # reject exact-title duplicates already stored in MongoDB
    for info in self.db.items.find({}, {"title": 1}):
        infoTitle = info["title"].encode("utf-8")
        if infoTitle == item["title"]:
            raise DropItem("item exist!")
    self.collection.insert(dict(item))
    send_mail(item['title'], item['content'], item['href'])
    return item