The following 49 code examples, extracted from open-source Python projects, illustrate how to use scrapy.FormRequest().
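Before the project examples, here is a minimal, self-contained sketch of typical FormRequest usage; the spider name, URL, form fields, and callback in it are illustrative placeholders rather than code taken from any of the projects below.

import scrapy

class LoginExampleSpider(scrapy.Spider):
    # minimal sketch: every name, URL, and field below is a placeholder
    name = "login_example"
    start_urls = ["https://example.com/login"]

    def parse(self, response):
        # FormRequest sends a POST with URL-encoded form data by default
        yield scrapy.FormRequest(
            url="https://example.com/login",
            formdata={"username": "alice", "password": "secret"},
            callback=self.after_login,
        )

    def after_login(self, response):
        # the response to the submitted form is handled here
        self.logger.info("login response status: %s", response.status)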
def parseNotFirstPage(self, response):
    sipo = response.meta['sipo']
    soup = BeautifulSoup(response.body_as_unicode(), 'lxml')
    itemList = soup.find_all(attrs={"class": "item"})
    for item in itemList:
        sipocrawler = SipoCrawlerItem()
        itemSoup = BeautifulSoup(item.prettify(), 'lxml')
        patentid = itemSoup.find(attrs={'name': 'idHidden'}).get('value')
        nrdAn = itemSoup.find(attrs={'name': 'nrdAnHidden'}).get('value')
        nrdPn = itemSoup.find(attrs={'name': 'nrdPnHidden'}).get('value')
        sipocrawler['patent_id'] = str(patentid)
        formdata = url_config.detailSearch.get('formdata')
        formdata['nrdAn'] = str(patentid).split('.')[0]
        formdata['cid'] = str(patentid)
        formdata['sid'] = str(patentid)
        yield FormRequest(
            url=url_config.detailSearch.get('url'),
            formdata=formdata,
            callback=self.parsePatentDetail,
            meta={'sipo': sipo,
                  'sipocrawler': sipocrawler,
                  'lawinfo': {'nrdAn': nrdAn, 'nrdPn': nrdPn}}
        )
def login_after_captcha(self, response):
    with open("captcha.jpg", "wb") as f:
        f.write(response.body)
    from PIL import Image
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except:
        pass
    captcha = input("Please enter the captcha:\n>")
    post_data = response.meta.get("post_data", {})
    post_url = "https://www.zhihu.com/login/phone_num"
    post_data["captcha"] = captcha
    return [scrapy.FormRequest(
        url=post_url,
        formdata=post_data,
        headers=self.headers,
        callback=self.check_login
    )]
def parse_dates(self, response):
    """
    The data is organized by date; the spider fetches the data for the
    entire current year.
    """
    for date in response.css('select[name="mesano"] option'):
        mesano = date.css('::attr(value)').extract_first()
        if re.search(r"(\d{4})", mesano).group(1) == time.strftime("%Y"):
            request = scrapy.FormRequest(
                url=BASE_URL + 'holerite/index.html',
                formdata={
                    'acao': '',
                    'grupo': GRUPO,
                    'mesano': mesano,
                    'tipo': '1'
                },
                callback=self.parse_entities
            )
            request.meta['mesano'] = mesano
            yield request
def parseAfterSetting(self, response):
    print(response.body_as_unicode())
    for sipo in self.sipoList:
        mainSearch = url_config.mainSearch
        headers = mainSearch.get('headers')
        searchExpCn = sipo.search_exp_cn
        print('search expression --- ', searchExpCn)
        formData = mainSearch.get('formdata')
        formData['searchCondition.searchExp'] = searchExpCn
        yield FormRequest(
            url=url_config.mainSearch.get('url'),
            callback=self.parseFirstPage,
            method="POST",
            headers=headers,
            formdata=formData,
            meta={'sipo': sipo}
        )
def parsePatentDetail(self, response):
    sipo = response.meta['sipo']
    sipocrawler = response.meta['sipocrawler']
    detail = json.loads(response.body_as_unicode())
    sipocrawler['abstract'] = BeautifulSoup(
        detail.get('abstractInfoDTO').get('abIndexList')[0].get('value'),
        'lxml').text.replace('\n', '').strip()
    sipocrawler['invention_name'] = detail.get('abstractInfoDTO').get('tioIndex').get('value')
    for abitem in detail.get('abstractInfoDTO').get('abstractItemList'):
        ItemCollection.resolveData(sipocrawler, abitem.get('indexCnName'), abitem.get('value'))
    lawinfo = response.meta.get('lawinfo')
    formdata = url_config.relatedInfo.get('formdata')
    formdata['literaInfo.nrdAn'] = lawinfo.get('nrdAn')
    formdata['literaInfo.nrdPn'] = lawinfo.get('nrdPn')
    yield FormRequest(
        url=url_config.relatedInfo.get('url'),
        method='POST',
        dont_filter=True,  # the same URL is requested for every patent, so skip the duplicate filter
        formdata=formdata,
        callback=self.parseRelatedInfo,
        meta={'sipo': sipo, 'sipocrawler': sipocrawler}
    )
def start_requests(self):
    settings = get_project_settings()
    city_list = settings["CITY_LIST"]
    if self.city:
        city_cn_name = city_list.get(self.city)
        yield scrapy.FormRequest(
            url=self.base_url + self.city + "_gongyu",
            formdata={"startDate": self.start_date, "endDate": self.end_date},
            callback=self.parse,
            meta={'city_en_name': self.city, "city_cn_name": city_cn_name}
        )
    else:
        for city_en_name, city_cn_name in city_list.items():
            yield scrapy.FormRequest(
                url=self.base_url + city_en_name + "_gongyu",
                formdata={"startDate": self.start_date, "endDate": self.end_date},
                callback=self.parse,
                meta={'city_en_name': city_en_name, "city_cn_name": city_cn_name}
            )
def start_requests(self):
    if self.FIRST_TIME_RUNNING:
        self.FIRST_TIME_RUNNING = False
        for sid in (list(range(2014020000, 2014040000)) +
                    list(range(2015020000, 2015040000)) +
                    list(range(2016020000, 2016040000))):
            yield scrapy.FormRequest(self.domain + self.login_url,
                                     formdata={'zjh': str(sid), 'mm': '1'},
                                     callback=self.parse,
                                     meta={'sid': sid, 'password': '1', 'cookiejar': sid},
                                     dont_filter=True)
    else:
        for password in self.load_passwords():
            for sid in self.get_sids():
                yield scrapy.FormRequest(self.domain + self.login_url,
                                         formdata={'zjh': str(sid), 'mm': password},
                                         callback=self.parse,
                                         meta={'sid': sid, 'password': password, 'cookiejar': sid},
                                         dont_filter=True)
def start_requests(self):
    for start_url in self.database_urls:
        url, body = start_url.split("?POST_BODY=", 1)
        yield scrapy.FormRequest(
            url,
            method="POST",
            headers={'Content-Type': "application/x-www-form-urlencoded"},
            body=body,
            meta={'source_url': url, 'source_anchor': body},
            callback=self.parse
        )
def contruct_request(self, response, post_data=None, next_page=False, other_info=None):
    if post_data is not None:
        encryptor = MeituanEncryptor(post_data, response.url)
    else:
        encryptor = response.meta["encryptor"]
        post_data = encryptor.data
    if next_page:
        post_data["page_index"] = str(int(post_data["page_index"]) + 1)
        encryptor.data = post_data
    token = encryptor.get_token()
    url = self.base_url2 + token
    meta = {
        "encryptor": encryptor,
        "cookiejar": response.meta["cookiejar"],
        "geo_point": response.meta["geo_point"],
        "other_info": other_info if other_info is not None else {}
    }
    return scrapy.FormRequest(
        url,
        meta=meta,
        formdata=post_data,
        callback=self.parse_restaurant
    )
def contruct_request(self, response, post_data=None, other_info=None):
    if post_data is not None:
        encryptor = MeituanEncryptor(post_data, response.url)
    else:
        encryptor = response.meta["encryptor"]
        post_data = encryptor.data
    token = encryptor.get_token(100010)
    url = self.base_url2 + token
    meta = {
        "encryptor": encryptor,
        "cookiejar": response.meta["cookiejar"],
        "other_info": other_info if other_info is not None else {}
    }
    return scrapy.FormRequest(
        url,
        meta=meta,
        formdata=post_data,
        callback=self.parse_menu
    )
def parse_page(self, response):
    next_page = response.meta.get('page') + 1
    json_data = json.loads(response.text)
    if json_data.get('type') != 'success':
        return
    articles = scrapy.Selector(text=json_data.get('html')).css('article')
    for article in articles:
        yield {
            'author': article.css('div.author-meta a ::text').extract_first(),
            'date': article.css('div.clock-meta a ::text').extract_first(),
            'title': article.css('h1.entry-title ::text').extract_first()
        }
    yield scrapy.FormRequest(
        self.scrolling_url,
        formdata={'action': 'infinite_scroll', 'page': str(next_page), 'order': 'DESC'},
        callback=self.parse_page,
        meta={'page': next_page}
    )
def parse(self, response):
    topic_xpath_rule = '//li[@class="zm-topic-cat-item"]/a/text()'
    topic_names = response.selector.xpath(topic_xpath_rule).extract()
    topic_xpath_rule = '//li[@class="zm-topic-cat-item"]/@data-id'
    topic_ids = response.selector.xpath(topic_xpath_rule).extract()
    # for i in range(10):
    for i in range(len(topic_ids)):
        params = {"topic_id": int(topic_ids[i]),
                  "offset": 0,
                  "hash_id": "d17ff3d503b2ebce086d2f3e98944d54"}
        yield FormRequest(
            url='https://www.zhihu.com/node/TopicsPlazzaListV2',
            method='POST',
            # headers=self.set_headers2('https://www.zhihu.com/topics'),
            headers=self.set_headers('https://www.zhihu.com/topics'),
            cookies=cookielib.LWPCookieJar(filename='cookies'),
            # formdata={'method': 'next', 'params': '{"topic_id":988,"offset":0,"hash_id":"d17ff3d503b2ebce086d2f3e98944d54"}'},
            formdata={'method': 'next', 'params': str(params).replace("\'", "\"").replace(" ", "")},
            callback=self.topic_parse,
            meta={'topic_name': topic_names[i]}
        )
def start_requests(self):
    username = self.spider_settings.get('username')
    password = self.spider_settings.get('password')
    if username and password:
        yield scrapy.FormRequest(
            url='https://{}/login'.format(self.name),
            formdata={'Username': username, 'Password': password,
                      'target': '/MyAccount/', 'submit': 'Log+in'},
            callback=self._after_login,
            meta={'dont_cache': True},
        )
    else:
        # Username, password or section not found in feeds.cfg.
        self.logger.info('Login failed: No username or password given. '
                         'Only free articles are available in full text.')
        yield self._start_requests()
def start_requests(self):
    abonr = self.spider_settings.get('abonr')
    password = self.spider_settings.get('password')
    if abonr and password:
        yield scrapy.FormRequest(
            url='https://www.{}/falter/e-paper/login'.format(self.name),
            formdata={'login[abonr]': abonr, 'login[password]': password,
                      'redirect_url': '/archiv/'},
            callback=self.parse_archive,
            meta={'dont_cache': True},
        )
    else:
        # Username, password or section falter.at not found in feeds.cfg.
        self.logger.info('Login failed: No username or password given. '
                         'Only free articles are available in full text.')
        yield scrapy.Request('https://www.{}/archiv/'.format(self.name),
                             self.parse_archive, meta={'dont_cache': True})
def parse_room_first(self, response):
    id = re.findall(r'\d{3,10}', response.url)[0]
    name = response.css('#listing_name::text').extract_first()
    # equipment = response.css(
    #     'div.row.row-condensed.text-muted.text-center.hide-sm > div > div.col-sm-3.icon--small-margin > span.text-small::text').extract()
    # img = response.css('.cover-img::attr(style)').extract_first().replace('ackground-image:url', '')[1:-1]
    # description = response.css('div.simple-format-container > p > span::text').extract()
    # comment_num = response.css('div.col-md-8.review-header > div > h4 > span > span::text').extract_first()
    owner = response.css(
        'div.host-info.pull-left > div > span > a.link-reset::attr(href)').extract_first().split('?')[-1]
    owner_id = response.css(
        'div.host-info.pull-left > div > span > a.link-reset > span::text').extract_first()
    f = furl(response.url)
    f.path.add('personalization.json')
    try:
        del f.args['location']
    except KeyError:
        pass
    f.args.addlist('review_ids[]', ['144474925', '141633062', '140450604', '139913674',
                                    '138701100', '138102086', '137690239'])
    url = f.url
    path = str(f.path) + str(f.query)
    return scrapy.FormRequest(url=url, callback=self.parse_room_second,
                              meta={'room_id': id, 'name': name, 'owner': owner,
                                    'owner_id': owner_id, 'parse': True})
def login(self, response):
    response_text = response.text
    match_obj = re.match('.*name="_xsrf" value="(.*?)"', response_text, re.DOTALL)
    xsrf = ''
    if match_obj:
        xsrf = match_obj.group(1)
    if xsrf:
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data = {
            "_xsrf": xsrf,
            "phone_num": "18782902568",
            "password": "admin123"
        }
        return [scrapy.FormRequest(
            url=post_url,
            formdata=post_data,
            headers=self.headers,
            callback=self.check_login
        )]
def yield_formrequest(self, param, index, code, category):
    """
    :param param: "POST" parameters
    :param index: page number
    :param code: company code
    :param category: whether company_code holds the abbreviated ("abbr") or the full ("full") company name
    :return:
    """
    post_data = {
        "Param": param,
        "Index": repr(index),
        "Page": repr(self.cases_per_page),
        "Order": "????",
        "Direction": "asc",
    }
    data = copy.deepcopy(post_data)
    data["case_parties"] = code
    data["abbr_full_category"] = category
    return scrapy.FormRequest(url=self.url, formdata=post_data,
                              callback=lambda response: self.parse(response, data),
                              dont_filter=True)
def login(self, response):
    captcha = "captcha.jpg"
    with open(captcha, "wb") as f:
        f.write(response.body)
    try:
        Image.open(captcha).show()
    except:
        pass
    post_data = response.meta.get("post_data", {})
    post_url = "https://www.zhihu.com/login/{}".format(self.user_type)
    post_data["captcha"] = input("Please input the captcha: ")
    return [scrapy.FormRequest(
        url=post_url,
        formdata=post_data,
        headers=self.headers,
        callback=self.check_login
    )]
def parse(self, response):
    # collect the "see more" URLs on the page
    for url in response.xpath('//a[@class="see-more play-button small id-track-click apps id-responsive-see-more"]'):
        targetURL = "https://play.google.com" + url.xpath('@href')[0].extract()
        # POST to fetch up to 100 entries at once
        yield scrapy.FormRequest(
            targetURL,
            formdata={'start': '0',
                      'num': '100',
                      'numChildren': '0',
                      'cctcss': 'square-cover',
                      'cllayout': 'NORMAL',
                      'ipf': '1',
                      'xhr': '1',
                      'token': 'zNTXc17yBEzmbkMlpt4eKj14YOo:1458833715345'},
            callback=self.parse_data
        )
def parse_login(self, response):
    self._check_login_params()
    self._login = False
    form_data = {
        self.username_field: self.username,
        self.password_field: self.password
    }
    if hasattr(self, 'form_xpath'):
        return scrapy.FormRequest.from_response(
            response,
            formxpath=self.form_xpath,
            formdata=form_data,
            callback=self.parse_after_login
        )
    elif hasattr(self, 'form_url'):
        return scrapy.FormRequest(
            self.form_url,
            formdata=form_data,
            callback=self.parse_after_login
        )
def parse_video(self, response):
    item = response.meta['item']
    # item['info']['play_count'] = response.xpath(xpath).extract_first(default='')
    # if (item['info']['play_count'] == '') and (not re.findall(r'????', response.body)):
    #     item['info']['play_count'] = (response.xpath('//em[@id="mod_cover_playnum"]/text()')
    #                                   .extract_first(default=''))
    if not self.__get_json(response):
        return
    if not self.__get_media_urls(item):
        return
    item['media_urls'] = self.media_urls
    item['file_name'] = self.file_name
    url = 'https://v.qq.com/x/page/{}.html'.format(self.kwargs['vid'])
    meta = {
        'item': item,
        'vid': self.kwargs['vid'],
    }
    yield scrapy.FormRequest(url, method='GET', meta=meta, callback=self.parse_play_count)
def parse_video(self, response):
    item = response.meta['item']
    url = 'https://interface.bilibili.com/playurl'
    if not self.__get_json(response):
        return
    try:
        item['info']['play_count'] = self.json_data['play']
        item['info']['intro'] = self.json_data['description']
        item['info']['date'] = self.json_data['created_at']
        item['info']['author'] = self.json_data['author']
    except:
        pass
    try:
        cid = self.json_data['list'][0]['cid']
    except Exception as err:
        self.logger.error('url: {}, error: {}'.format(self.page_url, str(err)))
        return
    params = self.bilibili_common.get_params(cid)
    yield scrapy.FormRequest(url=url, method='GET', meta={'item': item},
                             formdata=params, callback=self.parse_video_urls)
def parse_video_custom(self, response):
    item = response.meta['item']
    json_data = json.loads(response.body[response.body.find('{'): response.body.rfind('}') + 1])
    vid = self.url.split('/')[-1]
    url = 'https://ups.youku.com/ups/get.json'
    params = {
        'vid': vid,
        'ccode': '0590',
        'client_ip': '0.0.0.0',
        'client_ts': str(int(time.time())),
        'utid': 'aKCuEcCdq38CAbaWLjWeW3TI',
        'r': json_data['stealsign'],
        'callback': 'json' + str(int(time.time() * 1000)),
    }
    yield scrapy.FormRequest(url=url, method='GET', meta={'item': item},
                             formdata=params, callback=self.parse_video_urls)
def parse_item(self, response):
    item = MultimediaCrawlerItem()
    item['host'] = 'baozoumanhua'
    item['media_type'] = 'video'
    item['stack'] = []
    item['download'] = 0
    item['extract'] = 0
    item['file_dir'] = os.path.join(settings['FILES_STORE'], item['media_type'], self.name)
    item['url'] = response.url
    item['info'] = {
        'link': item['url'],
        'title': (response.xpath(r'//h1[@class="v-title"]/text()')
                  .extract_first(default='').strip()),
        'intro': '',
        'author': 'baozoumanhua',
    }
    player = self.__get_player(item['url'], response)
    if player is None:
        self.logger.error('url: {}, error: does not match any player'.format(item['url']))
        return
    yield scrapy.FormRequest(url=player.url, method=player.method, meta={'item': item},
                             formdata=player.params, callback=player.parse_video)
def parse(self, response):
    item = MultimediaCrawlerItem()
    item['host'] = 'ergeng'
    item['media_type'] = 'video'
    item['stack'] = []
    item['download'] = 0
    item['extract'] = 0
    item['file_dir'] = os.path.join(settings['FILES_STORE'], item['media_type'], self.name)
    item['url'] = response.url
    timestamp = re.search(r'"create_at"\s*:\s*(\d+),|$', response.body).group(1)
    item['info'] = {
        'link': item['url'],
        'title': (response.xpath(r'//div[contains(@class, "new-video-info")]/h3/text()')
                  .extract_first(default='').strip()),
        'intro': response.xpath(r'//div[contains(@class, "tj")]/text()').extract_first(default='').strip(),
        'date': time.strftime('%Y-%m-%d', time.localtime(int(timestamp))) if timestamp is not None else '',
        'author': re.search(r'"user_nickname"\s*:\s*"(.*?)"|$', response.body).group(1),
    }
    player = self.__get_player(item['url'], response)
    if player is None:
        self.logger.error('url: {}, error: does not match any player'.format(item['url']))
        return
    yield scrapy.FormRequest(url=player.url, method=player.method, meta={'item': item},
                             formdata=player.params, callback=player.parse_video)
def parse_video_url(self, response):
    item = response.meta['item']
    vid = re.search(r'id_(.*?).html|$', response.url).group(1)
    if vid is None:
        self.logger.error('url: {}, error: failed to find vid'.format(response.url))
        return
    params = {
        'vid': vid,
        'ccode': '0401',
        'client_ip': '192.168.1.1',
        'utid': 'tB2PEWHIKgECAbaWLjUeiFyE',
        'client_ts': str(round(time.time())),
    }
    url = 'https://ups.youku.com/ups/get.json'
    yield scrapy.FormRequest(url, method='GET', meta={'item': item},
                             formdata=params, callback=self.parse_download_url)
def parse(self, response):
    page_size = 30
    user = response.meta['user']
    url = 'https://space.bilibili.com/ajax/member/getSubmitVideos'
    json_data = json.loads(response.body)
    total = json_data['data']['video']
    pages = total // page_size if not (total % page_size) else (total // page_size + 1)
    for page in range(1, pages + 1):
        params = {
            'mid': user.id,
            'pagesize': str(page_size),
            'tid': '0',
            'page': str(page),
            'keyword': '',
            'order': 'pubdate',
        }
        yield scrapy.FormRequest(url=url, method='GET', meta={'user': user},
                                 formdata=params, callback=self.parse_items)
def parse(self, response):
    user = response.meta['user']
    num = 24
    count = response.xpath('//div[@id="headBgMod"]//ul[@class="user_count"]/li[3]/span[2]/text()').extract()[0]
    for page in range(1, int(math.ceil(int(count) / num)) + 1):
        aa = "1.9.1"
        callback = ''.join(['jQuery', re.sub(r'\D', '', aa + str(random.random())),
                            '_', str(int(time.time() * 1000))])
        params = {
            'otype': 'json',
            'pagenum': str(page),
            'callback': callback,
            'qm': '1',
            'num': str(num),
            'sorttype': '0',
            'orderflag': '0',
            'low_login': '1',
            'uin': re.search(r'data-vuin="(.*?)"', response.body).group(1),
            '_': str(int(time.time() * 1000)),
        }
        url = 'http://c.v.qq.com/vchannelinfo'
        yield scrapy.FormRequest(url, method='GET', meta={'user': user},
                                 formdata=params, callback=self.parse_page)
def parse_video_or_audio(self, response):
    item = response.meta['item']
    item['media_type'], result = self.__video_or_audio(response.body)
    item['file_dir'] = os.path.join(settings['FILES_STORE'], item['media_type'], self.name)
    self.logger.info('type: {}, result: {} url: {}'.format(item['media_type'], result, response.url))
    if item['media_type'] == 'video':
        url = 'https://v.qq.com/x/page/{}.html'.format(result)
        meta = {
            'item': item,
            'vid': result,
        }
        yield scrapy.FormRequest(url, method='GET', meta=meta, callback=self.parse_info)
    elif item['media_type'] == 'audio':
        item['media_urls'] = [result]
        t = urlparse(result).path.split('.')
        item['file_name'] += ('.' + t[1]) if ((len(t) >= 2) and t[1]) else '.mp3'
        yield item
def parse(self, response):
    user = response.meta['user']
    count = int(response.xpath('//h3[@node-type="hdTitle"]/following-sibling::span/text()'
                               ).extract()[0][1:-1].replace(',', ''))
    params = {
        'spm': 'a2hzp.8253869.0.0',
        'order': '1',
        'last_item': '',
        # 'last_vid': re.search(r'last_vid=(\d+)', response.body),
    }
    page, current, num = 1, 0, 50
    while current < count:
        params['page'] = str(page)
        # params['last_pn'] = i
        yield scrapy.FormRequest(url=response.url.split('?')[0], method='GET',
                                 meta={'user': user}, formdata=params,
                                 callback=self.parse_items)
        current = num * page
        page += 1
def pass_valid(self, response):
    print("Captcha verification required")
    i = Image.open(BytesIO(response.body))
    i.save("yz.png")
    validcode_value = input("Open yz.png and enter the captcha: ")
    data = {
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": response.meta['view_state'],
        "__EVENTVALIDATION": response.meta['event_validation'],
        "txt_ValidCode": validcode_value,
        "btnSubmit": "? ?"
    }
    func = self.parse_zz if response.meta['type'] == 'zz' else self.parse_bid
    yield scrapy.FormRequest(response.meta['last_url'],
                             meta={"cookiejar": response.meta["cookiejar"]},
                             formdata=data, callback=func, dont_filter=True)
def lohin_after_captcha(self, response):
    """Ask the user to type in the captcha, then submit the login form."""
    with open("captcha.jpg", "wb") as f:
        f.write(response.body)
    # from PIL import Image
    # try:
    #     im = Image.open('captcha.jpg')
    #     im.show()
    # except:
    #     pass
    captcha = input('Please enter the captcha: ')
    post_data = response.meta.get('post_data', {})  # fall back to an empty dict if missing
    post_url = "https://www.zhihu.com/login/phone_num"
    post_data['captcha'] = captcha
    return [scrapy.FormRequest(
        url=post_url,
        formdata=post_data,
        headers=self.headers,
        callback=self.check_login
    )]
def parse_(self, response):
    detail = response.xpath('//table[@bordercolor="lightgray"]/tr')
    for temp in detail[:-1]:
        item = SiteItem()
        item['title'] = temp.xpath('td/span/@title').extract_first().strip()
        if temp.xpath('td/span/@onclick').extract_first():
            item['link'] = 'http://www.chinaunicombidding.cn' + \
                           (temp.xpath('td/span/@onclick').extract_first()).split(',')[0].split('(')[1][1:-1].strip()
        item['pubtime'] = temp.xpath('td[@width="15%"]/text()').extract_first().strip()
        yield item
    nowPage = str(int(response.xpath('//span[@id="nowPage"]/text()').extract_first()) + 1)
    print('nowpage======================================' + str(nowPage))
    if item['pubtime'] == date.get_curdate():
        yield scrapy.FormRequest(
            "http://www.chinaunicombidding.cn/jsp/cnceb/web/info1/infoList.jsp?page=" + nowPage,
            formdata={
                "type": "",
                "province": "",
                "city": "",
                "notice": "",
                "time1": "",
                "time2": ""
            },
            callback=self.parse_)
def parse(self, response): detail = response.xpath('//ul[@class="lby-list"]//li') pubtime = None for temp in detail[:20]: item = SiteItem() temp_pubtime = temp.xpath('span/text()').extract_first().strip()[1:11] if temp_pubtime: item['pubtime'] = temp.xpath('span/text()').extract_first().strip()[1:11] pubtime = item['pubtime'] item['title'] = temp.xpath('a//text()').extract_first() print "------------------------------{}----".format(item['title']) if temp.xpath('a/@href').extract_first(): item['link'] = "http://www.zycg.gov.cn" + temp.xpath('a//@href').extract_first() yield item # ??????????????? # print ('-----------------------??-------------------------------') # print ('-------pubtime----------------{}-------------------------------'.format(pubtime)) # print ('------date.get_curdate-----------------{}-------------------------------'.format(date.get_curdate())) if pubtime == date.get_curdate(): # ????? # print "-----------------??-----------------" next_page_href = "http://www.zycg.gov.cn" + ( str(response.xpath('//a[@class="next_page"]//@href').extract_first())) yield scrapy.FormRequest(next_page_href, callback=self.parse)
def parse(self, response): detail = response.xpath('//ul[@class="m_m_c_list"]/li') for temp in detail: item = SiteItem() item['title'] = temp.xpath('a/text()').extract_first().strip() item['link'] = "http://www.gdgpo.gov.cn" + temp.xpath('a/@href').extract_first().strip() item['pubtime'] = temp.xpath('em/text()').extract_first().strip()[0:10] print("------------------------------------------------------------------------------") yield item if date.get_curdate() == (item['pubtime']): pageindex = response.xpath('//input[@id="pointPageIndexId"]/@value').extract_first() self.iipage += 1 last_page = response.xpath( u'//a/span[contains(text(),"? ?")]/../@href').extract_first() total_pagenum = last_page.split('(')[1][:-1] if int(self.iipage) < int(total_pagenum): yield scrapy.FormRequest("http://www.gdgpo.gov.cn/queryMoreInfoList.do", formdata={ "sitewebId": "4028889705bebb510105bec068b00003", "channelCode": '0005', 'pageIndex': str(self.iipage), 'pageSize': "15", 'pointPageIndexId': "1" }, callback=self.parse)
def login_after_captcha(self, response):
    with open('captcha.jpg', 'wb') as f:
        f.write(response.body)
    from PIL import Image
    try:
        img = Image.open('captcha.jpg')
        img.show()
        img.close()
    except:
        pass
    captcha = input('Please enter the captcha: ')
    post_data = response.meta.get('post_data', {})
    post_url = 'https://www.zhihu.com/login/phone_num'
    post_data['captcha'] = captcha
    return scrapy.FormRequest(post_url, formdata=post_data, headers=self.headers,
                              callback=self.check_login)
def parse_single_song(self, response):
    loader = response.meta['loader']
    selector = Selector(response)
    singer = selector.xpath('//title/text()').extract()
    loader.add_value('singer', singer)
    loader.add_value('_id', response.meta['song_id'])
    comment_data, comment_url = api_comment(response.meta['song_id'], 0, 100)
    source_data, source_url = api_song_url(response.meta['song_id'])
    comment_id = generate_comment_index()['comment_index']
    loader.add_value('comment_id', comment_id)
    yield scrapy.FormRequest(url=comment_url, method='POST', headers=self.headers,
                             formdata=comment_data, callback=self.parse_comments,
                             meta={'comment_id': comment_id})
    yield scrapy.FormRequest(url=source_url, method='POST', headers=self.headers,
                             formdata=source_data, meta={'loader': loader},
                             callback=self.get_source_url)
def generate_firm_susong(self, response):
    if len(response.body) < 10:
        return
    qitem = response.meta["item"]
    page_n = response.meta["page_n"]
    self.append_susong_detail({"????": self.clean_content(response.body)}, qitem._id)
    anjian_list = response.xpath("//table[@class='m_changeList']//a[@class='c_a']/@onclick").extract()
    anjian_name = response.xpath("//table[@class='m_changeList']//tr//td[2]//a[@class='c_a']/text()").extract()
    for i in range(0, len(anjian_list)):
        yield scrapy.FormRequest(
            "http://www.qichacha.com/company_wenshuView",
            callback=self.generate_firm_anjian,
            cookies=self.qicha_cookie,
            method='POST',
            dont_filter="true",
            formdata={"id": self.generate_anjian_id(anjian_list[i])},
            meta={"item_id": qitem._id, "anjian_name": anjian_name[i]}
        )
    # request the next page of litigation records
    yield scrapy.Request(
        response.meta["chacha_url_pre"] + '&tab=susong&box=wenshu&p=' + str(page_n),
        encoding='utf-8',
        callback=self.generate_firm_susong,
        cookies=self.qicha_cookie,
        meta={"item": qitem,
              "chacha_url_pre": response.meta["chacha_url_pre"],
              "page_n": int(page_n) + 1}
    )
def start_requests(self): return [scrapy.FormRequest("http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/Indice%20de%20Iniciativas?_piref73_1335505_73_1335500_1335500.next_page=/wc/cambioLegislatura", formdata = {'idLegislatura':'12'} , callback = self.parse)]
def parse(self, response): """ :param response: :return:???????post?? post??? inslider page pagesize Content-Type:application/x-www-form-urlencoded """ soup = BeautifulSoup(response.body) menu = soup.find_all("a",class_="ui-more") #???????? if menu: for topic in menu: topic_name = topic.text.replace(u"??","") topic_url = topic.get("href") self.flag.setdefault(topic_url,0) page="1" #post_data????? post_data = { "inslider":"0", "page":page, "pagesize":"10" } # yield scrapy.Request(topic_url, # callback=self.parse_topic, # method="POST", # headers={"Content-Type":"application/x-www-form-urlencoded"}, # body=json.dumps(post_data) # ) yield scrapy.FormRequest( url=topic_url, formdata=post_data, callback=self.parse_topic, meta={"page":page,"topic_name":topic_name} )
def start_requests(self):
    return [
        scrapy.Request("http://www.ctcnn.com/", callback=self.parse),
        # scrapy.FormRequest(self.start_url, formdata={'page': '1'}, callback=self.parse_newest),  # TODO something wrong
    ]
def parse(self, response):
    yield scrapy.FormRequest(self.start_url, formdata={'page': '1'}, callback=self.parse_newest)
    soup = BeautifulSoup(response.body, "lxml")
    index_list = soup.find(class_="index-first-list")("li") if soup.find(class_="index-first-list") else None
    for news in index_list:
        title = news.h2.a.string if news.h2.a else None
        abstract = news.p.string if news.p else None
        news_url = self.domain + news.a.get("href", None) if news.a else None
        item = NewsItem(title=title, abstract=abstract, news_url=news_url, catalogue=u"????")
        request = scrapy.Request(news_url, self.parse_news, dont_filter=True)
        request.meta["item"] = item
        yield request
def parse_newest(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    page = response.request.body.split('=')[-1]
    li = soup.find_all('li')
    if li:
        for news in li:
            news_date = news.find(class_="time").string[2:] if news.find(class_="time") else None
            struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M")
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            title = news.find(class_="title").string if news.find(class_="title") else None
            news_url = self.domain + news.find(class_="title").a.get("href", None) if news.find(class_="title") else None
            abstract = news.find(class_="info").string if news.find(class_="info") else None
            pic = self.domain + news.find('img').get('src', None) if news.find('img') else None
            topic = news.find(class_="type").string if news.find(class_="type") else None
            item = NewsItem(catalogue=u"????",
                            title=title,
                            news_url=news_url,
                            abstract=abstract,
                            pic=pic,
                            topic=topic,
                            news_date=news_date)
            item = judge_news_crawl(item)
            if item:
                request = scrapy.Request(news_url, callback=self.parse_news, dont_filter=True)
                request.meta["item"] = item
                yield request
            else:
                self.flag = page
    else:
        logger.info("can't find news list")
    # next page
    if not self.flag:
        new_request = scrapy.FormRequest(self.start_url,
                                         formdata={'page': str(int(page) + 1)},
                                         callback=self.parse_newest)
        yield new_request
def start_requests(self): return [scrapy.FormRequest("https://bbs.byr.cn/user/ajax_login.json", formdata=LOGIN_FORMDATA, meta={'cookiejar': 1}, headers=HEADERS, callback=self.logged_in)] # ???(hour??????????????????????????????)
def start_requests(self): return [scrapy.FormRequest("http://bbs.byr.cn/user/ajax_login.json", formdata=LOGIN_FORMDATA, meta={'cookiejar': 1}, headers=HEADERS, callback=self.logged_in)]