The following 50 code examples, extracted from open-source Python projects, illustrate how to use bs4.BeautifulSoup().
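Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: build a BeautifulSoup object from markup plus a parser name, then query the tree with find()/find_all()/select(). The HTML snippet and variable names below are illustrative only and do not come from any of the projects.

# Minimal BeautifulSoup usage sketch (illustrative HTML, not from the projects below).
from bs4 import BeautifulSoup

html_doc = """
<html><body>
  <div class="post"><a href="/item/1">First</a></div>
  <div class="post"><a href="/item/2">Second</a></div>
</body></html>
"""

# 'html.parser' ships with the standard library; 'lxml' also works if installed.
soup = BeautifulSoup(html_doc, "html.parser")

# Iterate over matching tags and read their text and attributes.
for post in soup.find_all("div", class_="post"):
    link = post.find("a")
    print(link.get_text(), link["href"])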
def make_nsfw_safe(text):
    """Make NSFW safer by adding click-to-show class to images."""
    soup = BeautifulSoup(text, "lxml")
    images = soup.find_all("img")
    for image in images:
        if image.get("class"):
            image["class"] = "%s nsfw" % " ".join(image.get("class"))
        else:
            image["class"] = "nsfw"
        image.replace_with(image)
    result = str(soup)
    # We don't want html/body, which BeautifulSoup kindly wraps our new HTML in
    if result.startswith("<html><body>") and result.endswith("</body></html>"):
        result = result[len("<html><body>"):-len("</body></html>")]
    return result
def mathjax(s):
    with open("temp.log", "w") as f:
        f.write(s)
    p = Popen([app.config['mjpage'], '--dollars',
               '--output', "CommonHTML",
               '--fontURL',
               ("https://cdnjs.cloudflare.com/ajax/libs/"
                "mathjax/2.7.0/fonts/HTML-CSS")],
              stdout=PIPE, stdin=PIPE, stderr=PIPE)
    #filename = hashlib.sha256(s.encode('utf-8')).hexdigest()
    #with open(filename, 'w') as f:
    #    print(s, file=f)
    res = p.communicate(input=s.encode('utf-8'))
    out = res[0].decode('utf-8')
    err = res[1].decode('utf-8')
    soup = BeautifulSoup(out, 'html.parser')
    style = str(soup.style)
    body = "".join(str(s) for s in soup.body.children)
    return style, body
def get_best(url):
    url = 'http://www.infoarena.ro' + url
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    name = soup.find('span', {'class': 'username'}).find('a')['href'][35:]
    tests = soup.find_all('td', {'class': 'number'})
    max_ms = -1
    for test in tests:
        test = test.string
        if test.endswith('ms'):
            time = int(test.strip('ms'))
            max_ms = max(max_ms, time)
    if name not in d or max_ms < d[name][0]:
        d[name] = (max_ms, url)
        print(max_ms, name, url)
def decrypt(hash, tipo):
    global word
    try:
        if(tipo == 0):
            url = BeautifulSoup(urllib.urlopen("https://md5.gromweb.com/?md5=" + hash), "html.parser")
        else:
            url = BeautifulSoup(urllib.urlopen("https://sha1.gromweb.com/?hash=" + hash), "html.parser")
        password = url.find("em", {"class": "long-content string"})
        password = re.sub(re.compile("<.*?>"), "", str(password)).strip()
        if str(password) == "None":
            print word + "\t\t\t\t[-] Senha nao encontrada! :-("
        else:
            print word + "\t\t\t\t[+] Senha encontrada: " + password
    except IOError:
        decryptwl(hash, tipo)
def add_afsc_links(full_afsc_dict, reddit):
    """
    Add links to /r/AirForce wiki from given filename into the dictionary.

    :param dict: either enlisted_dict or officer_dict
    :param reddit: PRAW reddit object
    """
    # gets dict of AFSC to link on /r/AirForce wiki
    wiki_page = reddit.subreddit("AirForce").wiki["index"]
    wiki_soup = BeautifulSoup(wiki_page.content_html, "html.parser")
    links = wiki_soup.find_all("a")

    # currently all wiki AFSC are enlisted
    for link in links:
        # not all links have /r/AirForce/wiki/jobs so this is more generalized
        # using only /r/AirForce/ wiki links
        if "www.reddit.com/r/AirForce/wiki/" in link["href"]:
            AFSC_code = link["href"].split("/")[-1].upper()
            base_afsc = AFSC_code[:5]  # shaves off any prefixes
            if base_afsc in full_afsc_dict["enlisted"].keys():
                full_afsc_dict["enlisted"][base_afsc]["link"] = link["href"]
def process_POST_request(request):
    dict_ = urlparse.parse_qs(request.text)

    def htmlify(thing):
        try:
            html = dict_[thing][0]
        except KeyError as e:
            html = ''
        return '<html>' + html + '</html>'

    uri = dict_['uri'][0]
    head = htmlify('head')
    body = htmlify('body')
    try:
        text = dict_['data'][0]
    except KeyError as e:
        text = ''
    headsoup = BeautifulSoup(head, 'lxml')
    bodysoup = BeautifulSoup(body, 'lxml')
    target_uri = getUri(uri, headsoup, bodysoup)
    doi = getDoi(headsoup, bodysoup)
    return target_uri, doi, head, body, text
def getRosiItem():
    start = time.time()
    index = 1
    while True:
        url = "http://www.mmxyz.net/category/rosi/page/{}/".format(index)
        res = requests.get(url, timeout=10)
        if res.status_code == 404:
            print("+ Time: {:.2f} S +".format(time.time() - start))
            print("+ Total Pages: {} +".format(index - 1))
            print("+ Total Numbers: {} +".format(len(RosiItems)))
            print("+-------------------------+\r\n\r\n")
            return
        soup = BeautifulSoup(res.content, "html.parser")
        rosiList = soup.find_all("a", class_="inimg")
        for rosi in rosiList:
            RosiItems.append(rosi['href'])
        index += 1
def hltb(bot, trigger):
    if not trigger.group(2):
        return bot.say("Enter a game name to search.")
    game = trigger.group(2)
    url = "http://howlongtobeat.com/search_main.php?page=1"
    payload = {"queryString": game, "t": "games", "sorthead": "popular",
               "sortd": "Normal Order", "length_type": "main", "detail": "0"}
    test = {'Content-type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36',
            'origin': 'https://howlongtobeat.com',
            'referer': 'https://howlongtobeat.com'}
    session = requests.Session()
    session.post(url, headers=test, data=payload)
    r = session.post(url, headers=test, data=payload)
    if len(r.content) < 250:
        return bot.say("No results.")
    bs = BeautifulSoup(r.content)
    first = bs.findAll("div", {"class": "search_list_details"})[0]
    name = first.a.text
    time = first.findAll('div')[3].text
    bot.say('{} - {}'.format(name, time))
def craw_last_index(ptt_class_name):
    # ptt_class_name = 'Soft_Job'
    index_url = 'https://www.ptt.cc/bbs/' + ptt_class_name + '/index.html'
    res = requests.get(index_url, verify=True)
    soup3 = BeautifulSoup(res.text, "lxml")
    x = soup3('', {'class': "btn wide"}, text=re.compile('??'))
    last_index = x[0]['href']
    last_index = last_index.replace('/bbs/' + ptt_class_name + '/index', '')
    last_index = int(last_index.replace('.html', '')) + 1
    return last_index

#---------------------------------------------------------------------------------
# ?? ubuntu - crontab-e, ????, ??????? data
# ?? PTT ????, ???????, ??????,
# ??????DATA, ???? index ??????, ??????? data,
# ?????, ??????
def addToCart(self):
    print '\nADD TO CART -----------------'
    session_get = self.user_session.get(self.URL_product_url, headers=self.get_headers)
    #print session_get.content
    soup = BeautifulSoup(session_get.content, 'lxml')
    results = soup.find_all('select', class_='size-select')
    #print results
    for item in results[0].select('option'):
        re_result = re.sub(self.sub_pattern, '', item.string)
        #print re_result
        matchObj = re.search(r"^%s+$" % self.user_size, re_result)
        if matchObj:
            self.post_data_addToCart['pid'] = item['value']
            self.post_data_addToCart['masterPID'] = item['value'].partition("_")[0]
            print self.post_data_addToCart
            break
    session_post = self.user_session.post(url=self.URL_cart_post_url,
                                          headers=self.post_headers,
                                          data=self.post_data_addToCart)
    print 'Add To Cart Status: ' + str(session_post.status_code)
def finalBoss(self):
    print '\nEntering Payment Info -----------------------------'
    self.get_headers['Referer'] = self.URL_checkout_url
    self.post_headers['Referer'] = self.URL_pay_url
    #print json.dumps(self.get_headers, indent=1)
    session_get = self.user_session.get(self.URL_pay_url, headers=self.get_headers)
    savePage(session_get, 'finalCheckout.html')
    soup = BeautifulSoup(session_get.content, 'lxml')
    pay_secure_key = soup.find('input', {'name': 'dwfrm_payment_securekey'})
    print pay_secure_key
    #NOTE: Visa, Mastercard, etc...correspond to different types. Find how they get set
    #NOTE: Visa = 001, Mastercard = 002, AE = 003, Discover = 004
    post_data_payInfo = {
        'dwfrm_payment_creditCard_type': '002',
        'dwfrm_payment_creditCard_owner': 'Bob McFlymo',
        'dwfrm_payment_creditCard_number': '5105105105105100',
        'dwfrm_payment_creditCard_month': '01',
        'dwfrm_payment_creditCard_year': '2018',
        'dwfrm_payment_creditCard_cvn': '002',
        'dwfrm_payment_securekey': pay_secure_key,
        'dwfrm_payment_signcreditcardfields': 'sign'
    }
    #savePage(session_get, 'finalCheckout.html')
def checkItemDirect(self):
    #NOTE: this function will most likely hamper performance but in some cases may
    #      improve it, leave it up to user choice to run this before checkout
    #Basic Steps:
    #Use BS to parse for <ul class="size options"
    #Size marked as follows: <li class="8 available" data-option-title="8"
    #Therefore, match data-option-title with user_size, then check the class for available keyword
    session_get = self.user_session.get(self.URL_product)
    print 'Status of requests.get: ' + str(session_get.status_code)
    soup = BeautifulSoup(session_get.content, "lxml")
    #Check that the lxml parser works for html
    #Look to use SoupStrainer to improve parsing efficiency
    for li in soup.select('li[data-option-title]'):
        #print li['class']
        #print type(li['class'])
        if (self.user_size in li['class']) & ('available' in li['class']):
            print 'Size ' + self.user_size + ' Available'
def getMoreInfo(self, nzb):
    """
    Get details about a torrent.

    .. seealso:: MovieSearcher.correctRelease
    """
    data = self.getHTMLData(nzb['detail_url'])
    soup = BeautifulSoup(data, 'html.parser')

    description = soup.find(id='description')
    if description:
        nzb['description'] = description.prettify()

    line = soup.find(text='Date de publication').parent.parent
    pub = line.find_all('td')[1]
    added = datetime.strptime(pub.getText().split('(')[0].strip(), '%d/%m/%Y %H:%M')
    nzb['age'] = (datetime.now() - added).days
    self.log.debug(nzb['age'])
def test_parse_html2(self):
    parser = QqParser(allowed_tags={'chapter', 'section', 'subsection', 'subsubsection',
                                    'eq', 'eqref', 'ref', 'equation', 'label', 'idx'})
    doc = r"""\chapter \label h1:label
Hello

This is a \ref{h1:label}.
"""
    tree = parser.parse(doc)
    html = QqHTMLFormatter(tree)
    s = html.do_format()
    soup = BeautifulSoup(s, 'html.parser')
    self.assertEqual(soup.h1['id'], 'label_h1_label')
    self.assertEqual(soup.span['class'], ['section__number'])
    self.assertEqual(soup.span.string, "1")
    self.assertEqual(soup("a")[1].attrs, {'class': ['a-ref'], 'title': '', 'href': '#label_h1_label'})
    self.assertEqual(soup("a")[1].string, "1")
def test_parse_html3(self):
    parser = QqParser(allowed_tags={'h1', 'h2', 'h3', 'h4', 'eq', 'eqref', 'ref',
                                    'equation', 'label', 'idx'})
    doc = r"""\equation \label eq:x2y2
    x^2 + y^2 = z^2

See \ref{eq:x2y2}.
"""
    tree = parser.parse(doc)
    html = QqHTMLFormatter(tree)
    html.counters['equation'].showparents = False
    s = html.do_format()
    soup = BeautifulSoup(s, 'html.parser')
    self.assertEqual(soup.div.attrs, {'id': "label_eq_x2y2", 'class': ["latex_equation"]})
    self.assertEqual(soup.span['class'], ['ref'])
    self.assertEqual(soup.a['class'], ['a-ref'])
    self.assertEqual(soup.a['href'], '#mjx-eqn-1')
    self.assertEqual(soup.a.string, "(1)")
def test_refs_with_separator(self):
    doc = r"""\chapter Hello \label sec:first
\chapter World \label sec:other

See \ref[section][sec:first] and \ref[section][sec:other] for details.
"""
    parser = QqParser()
    formatter = QqHTMLFormatter()
    parser.allowed_tags.update(formatter.uses_tags())
    tree = parser.parse(doc)
    formatter.root = tree
    print(tree.as_list())
    html = formatter.do_format()
    soup = BeautifulSoup(html, "html.parser")
    self.assertEqual(soup("a")[2].contents[0], "section 1")
def test_missing_label(self):
    doc = r"""\chapter Hello \label sec:first
\chapter World \label sec:other

See \ref[section][sec:third] and \ref[zection][sec:another] for details.
"""
    parser = QqParser()
    formatter = QqHTMLFormatter()
    parser.allowed_tags.update(formatter.uses_tags())
    tree = parser.parse(doc)
    formatter.root = tree
    print(tree.as_list())
    html = formatter.do_format()
    soup = BeautifulSoup(html, "html.parser")
    self.assertEqual(soup("a")[2].contents[0], "section ???")
    self.assertEqual(soup("a")[3].contents[0], "zection ???")
def getpixivfollow():
    """Get pixiv bookmark."""
    users = ['1789300']
    page = 1
    userlist = {}
    bookmark_url = u'https://www.pixiv.net/bookmark.php'
    while len(users) > 0:
        page_params = (
            ('type', 'user'),
            ('rest', 'show'),
            ('p', str(page)))
        bookmark_page = PIXIV_SESSION.get(
            bookmark_url, params=page_params, proxies=PROXY).text
        bookmark_content = BeautifulSoup(bookmark_page, 'lxml')
        print(u'Get Pixiv bookmark page {0} ...'.format(page))
        users = bookmark_content.select("div[class=usericon]")
        if len(users) == 0:
            break
        for user in users:
            user_info = user.find('a', attrs={'class': 'ui-profile-popup'})
            user_name = user_info.attrs['data-user_name']
            user_id = user_info.attrs['data-user_id']
            userlist[user_id] = user_name
        page += 1
    return userlist
def pixiv2pawoo(pixivid):
    """Pixiv -> Pawoo."""
    pawoourl = u'https://pawoo.net/oauth_authentications/{0}?provider=pixiv'
    pawoolink = pawoourl.format(pixivid)
    pawoopage = PAWOO_SESSION.get(pawoolink, proxies=PROXY)
    if pawoopage.status_code == 200:
        pawooname = pawoopage.headers.get('link').split(';')[0]
        pawooname = pawooname.replace(
            '<https://pawoo.net/.well-known/webfinger?resource=acct%3A', '')
        pawooname = pawooname.replace('%40pawoo.net>', '')
        csrf_token = BeautifulSoup(pawoopage.text, 'lxml')
        csrf_token = csrf_token.select(
            "meta[name=csrf-token]")[0].attrs.get('content')
        with open('pawoolist.txt', 'a', encoding='utf-8-sig') as pawoofile:
            pawoofile.write(
                '{1},https://pawoo.net/@{0}\n'.format(pawooname, pixivid))
        followpawoo(pawooname, csrf_token)
        return 1
    else:
        return 0
def get_book(url):
    """ ????? PDF ??? """
    # ????????
    print('???????……')
    nav_page = CONNECTION.get(url).text
    shelves = set(re.findall(r'/courses/.+/pdfbook/\d/', nav_page))
    for shelf_count, shelf in enumerate(shelves, 1):
        res = CONNECTION.get(BASE_URL + shelf).text
        soup = BeautifulSoup(res, 'lxml')
        save_dir = os.path.join(BASE_DIR, 'Books', str(shelf_count))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        for book_count, book in enumerate(soup.select('#booknav a'), 1):
            print('------>', book.string)
            file_name = REG_FILE.sub(' ', book.string) + '.pdf'
            pdf = CONNECTION.get(BASE_URL + book['rel'][0]).content
            with open(os.path.join(save_dir, file_name), 'wb') as pdf_file:
                pdf_file.write(pdf)
def read_captcha():
    header = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
        'Host': 'login.weibo.cn'
    }
    url_login = 'http://login.weibo.cn/login/'
    html = requests.get(url_login, headers=header).content  # ????
    soup = BeautifulSoup(html, 'lxml')
    code_img = str(soup.find('img'))[24:-3]  # ?????????
    print(code_img)
    urlretrieve(code_img, r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif')
    show_img(r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif')
    remove_line(r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif',
                r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha/')
    pic_cut('captcha_removeline.gif',
            'E:/????/??????/1 ???/captcha_master1/captcha_master/main_captcha/',
            'E:/????/??????/1 ???/captcha_master1/captcha_master/word/')
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//div[@class="authi"]//em').extract()
    for indexi, content in enumerate(response.xpath('//td[@class="t_f"]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        if soup.find('div', class_='attach_nopermission') is not None:
            soup.find('div', class_='attach_nopermission').clear()
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_unicode(soup.get_text())
        comments_data.append({'content': c, 'reply_time': self.format_rep_date(rep_time_list[indexi])})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//div[@class="authi"]//em').extract()
    for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]//table[1]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        if indexi >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[indexi])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = re.findall(u'\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}', response.body)
    for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]//table[1]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        if indexi >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[indexi])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def article_detail(aitem, response):
    for a_content in response.xpath('//script').extract():
        if a_content.find("detailArticle|post") == -1:
            continue
        a_content = a_content.split("props=")[1]
        a_content = a_content.split(",location")[0]
        a_content = json.loads(a_content).get("detailArticle|post")
        aitem.content = BeautifulSoup(a_content.get("content"), 'lxml').get_text()
        aitem.time = a_content.get('published_at')
        aitem.last_reply_time = aitem.time
        aitem.views = a_content.get('counters').get('view_count')
        aitem.replies = a_content.get('counters').get('comment')
        aitem.author = a_content.get('user').get('name')
        aitem.title = a_content.get('title')
        category_tags = json.loads(a_content.get('extraction_tags'))
        category = ''
        for category_tag in category_tags:
            category += category_tag[0] + ' '
        aitem.category = category
    return aitem
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//span[@class="time"]').extract()
    for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]/table[1]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        if soup.find('div', class_='attach_nopermission') is not None:
            soup.find('div', class_='attach_nopermission').clear()
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_unicode(soup.get_text())
        comments_data.append({'content': c, 'reply_time': self.format_rep_date(rep_time_list[indexi])})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response, is_first=False):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//span[@class="date"]/text()').extract()
    for indexi, content in enumerate(response.xpath('//div[@class="replycontent"]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        time_index = indexi
        if is_first:
            time_index += 1
        if time_index >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[time_index])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response, is_first=False):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//div[@class="authi"]/em').extract()
    if len(rep_time_list) == 0:
        return comment
    for indexi, content in enumerate(response.xpath('//div[@class="pct"]//table[1]').extract()):
        if is_first and indexi == 0:
            continue
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        time_index = indexi
        if time_index >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[time_index])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def download_lyrics(artist, url):
    print url
    time.sleep(random() + 2)
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, 'html.parser')

    # Get the song title
    song_title = soup.find('title').get_text().split(' - ')[1].lower().replace('/', ' ').replace(' ', '_')

    # Get the lyrics div
    lyrics = soup.findAll('div', {'class': ''})

    for i in lyrics:
        lyrics = i.get_text().strip()
        if len(lyrics) > 10:
            with open('artists/' + artist + '/' + song_title + '.txt', 'wb') as w:
                # collapse double spaces after replacing line breaks
                cleaned_lyrics = lyrics.replace('\r\n', ' *BREAK* ').replace('\n', ' *BREAK* ').replace('  ', ' ')
                w.write(cleaned_lyrics.encode('utf-8'))
def download_songs(url):
    time.sleep(random.random() * 0.5)
    try:
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')

        # Get the artist name
        artist_name = soup.findAll('h1')[0].get_text()[:-7].lower().replace(' ', '_')

        # Store all songs for a given artist
        with open('artist_data/' + artist_name + '.txt', 'wb') as w:
            for song in soup.findAll('a', {'target': '_blank'}):
                if 'lyrics/' in song['href']:
                    song_url = song['href'][1:].strip()
                    w.write(song_url + '\n')
    except urllib2.HTTPError:
        print '404 not found'
def packages(self):
    """ Parse XML file to locate packages. """
    xml = requests.get(self._xml_url).content
    soup = BeautifulSoup(xml, "html.parser")
    nltk_packages, third_party = defaultdict(dict), defaultdict(dict)
    for pack in soup.find_all('package'):
        package_attributes = pack.attrs
        name = package_attributes['id']
        # Keeps track of nltk_data packages vs third_party packages.
        if package_attributes['url'].startswith(self._nltk_data_url):
            nltk_packages[name] = package_attributes
        else:
            third_party[name] = package_attributes
    return nltk_packages, third_party
async def _online_tibia(self):
    """Get total players playing"""
    url = "http://www.tibia.com/community/?subtopic=worlds"
    try:
        async with aiohttp.get(url) as response:
            soup = BeautifulSoup(await response.text(), "html.parser")
            div1 = soup.find('div', attrs={'id': 'RightArtwork'})
            div2 = div1.find('div', attrs={'id': 'PlayersOnline'})
            test = div2.get_text()
            test1 = test.replace("Players Online", "")
            new = "Players currently playing Tibia: " + test1
            # div2 = div1.find('div', attrs={'class': 'Border_2'})
            # div3 = div2.find('div', attrs={'class': 'Border_3'})
            # table = div3.find_all('table', attrs={'class': 'Table1'})
            # tr = table.find_all('tr')
            # tbody = div4.find('div', attrs={'class': 'CaptionInnerContainer'})
            await self.bot.say(str(new))
    except:
        await self.bot.say("Could not retrive data. The webserver may be offline.")
async def _server_tibia(self, servername):
    """Get Server Info"""
    servername = servername.title()
    url = "https://secure.tibia.com/community/?subtopic=worlds&world=" + str(servername)
    try:
        async with aiohttp.get(url) as response:
            soup = BeautifulSoup(await response.text(), "html5lib")
            b = soup.find_all("table", attrs={'class': 'Table1'})
            new = []
            rows = b[1].tbody.div.find_all('td')
            for row in rows:
                new.append(row.get_text())
            k = new[::2]
            l = new[1::2]
            zipped = list(zip(k, l))
            t = tabulate(zipped, headers=["Category", "Info"])
            await self.bot.say("```Python" + "\n" + str(t) + "```")
    except:
        await self.bot.say("Unable to retrive server data. The webserver may be offline.")
def get_course_status(course_num):
    client = Client()
    subject = client.get_course_subject(course_num)
    if subject is None:
        return None
    semester = get_semester()
    subject_url = "http://classes.cornell.edu/browse/roster/" + semester + "/subject/" + subject
    subject_page = requests.get(subject_url)
    subject_page.raise_for_status()
    subject_bs4 = bs4.BeautifulSoup(subject_page.text, "html.parser")
    course_code_tags = subject_bs4.find_all("strong", class_="tooltip-iws")
    for tag in course_code_tags:
        course_code = int(tag.getText().strip())
        if course_num == course_code:
            section = tag.parent.parent.parent
            status = section.find_all('li', class_="open-status")[0].i["class"][-1]
            if "open-status-open" in status:
                return "open"
            if "open-status-closed" in status:
                return "closed"
            if "open-status-warning" in status:
                return "waitlist"
            if "open-status-archive" in status:
                return "archive"
def get_soup_from_url(url, params=None):
    '''
    url? parameter? ???? ?? URL? GET??? ?? ??(HTML text)? BeautifulSoup??? ??? ??
    :param url: GET??? ?? URL string
    :param params: GET?? ???? dict
    :return: BeautifulSoup object
    '''
    # requests.get??? ?? ???(response??)? r??? ??
    r = requests.get(url, params=params)
    # response???? text???? ??? ??? html_doc??? ??
    html_doc = r.text
    # BeautifulSoup??? ??, ??? html text
    soup = BeautifulSoup(html_doc, 'lxml')
    return soup
def test_tag_inherits_self_closing_rules_from_builder(self):
    if XML_BUILDER_PRESENT:
        xml_soup = BeautifulSoup("", "xml")
        xml_br = xml_soup.new_tag("br")
        xml_p = xml_soup.new_tag("p")

        # Both the <br> and <p> tag are empty-element, just because
        # they have no contents.
        self.assertEqual(b"<br/>", xml_br.encode())
        self.assertEqual(b"<p/>", xml_p.encode())

    html_soup = BeautifulSoup("", "html")
    html_br = html_soup.new_tag("br")
    html_p = html_soup.new_tag("p")

    # The HTML builder uses HTML's rules about which tags are
    # empty-element tags, and the new tags reflect these rules.
    self.assertEqual(b"<br/>", html_br.encode())
    self.assertEqual(b"<p></p>", html_p.encode())
def Scrape(url):
    timeout = 10
    socket.setdefaulttimeout(timeout)

    # Collecting html content.
    headers = {'User-Agent': 'TorScrapper - Onion scrapper | github.com/ConanKapoor/TorScrapper.git'}
    req = urllib.request.Request(url, None, headers)
    response = urllib.request.urlopen(req)

    # Using BeautifulSoup to parse html object response.
    page = BeautifulSoup(response.read(), 'html.parser')

    # Saving output
    token = re.sub(r'[^\w]', '', url)
    name = os.path.abspath("") + '/Output/Scraped-' + token + '.html'
    file = open(name, 'w')
    file.write(str(page))
    file.close()

# Taking input.
def getJournalURL(jname):
    # get journal URL given the journal name for retrieving article PIIs
    urlstr = "http://api.elsevier.com/sitemap/page/sitemap/" + jname[0].lower() + ".html"
    retl = ""
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                if link.text.lower() == jname.lower():
                    #print(link["href"])
                    retl = link["href"]
                    break
            linkcnt += 1
    return retl
def get_url(self, query):
    site1 = urllib.urlopen('http://www.youtube.com/results?search_query=%s' % query)
    html = site1.read()
    soup = BS(html)
    links = soup.findAll('a')
    vidlinks = [link.get('href') for link in links if link.get('href') is not None]
    vlink = [i for i in vidlinks if '/watch?v=' in i][0]
    img_link = soup.findAll('img', {'alt': 'Thumbnail', 'width': '185'})[0].get('src')
    img_url = 'http:%s' % img_link
    imagethread = threading.Thread(target=lambda: urllib.urlretrieve(img_url, 'Files\image.jpg'))
    imagethread.start()
    return vlink
def run(self):
    ind = self.qu.get()
    url = self.url + str(ind)
    soup = bs.BeautifulSoup(''.join(ul.urlopen(url).readlines()))
    bu = up.urlsplit(self.url)
    print 'started with the ', str(url).split('/')[-1],
    for i in soup.find_all(attrs={"class": "recipe-title"}):
        sp = up.urlsplit(i.a.get('href'))
        path = sp.path
        print path
        if re.search(pat, path):
            path = bu.scheme + '://' + bu.netloc + path
            filename = str(path).split('/')[-2]
            filename = op.join(op.abspath(op.curdir), filename + '.py')
            # recipe will be stored in given location
            # filename = op.join(op.abspath(op.curdir), filename + '.html')
            # uncomment the above line if downloading the web page for the recipe
            print path
            self.q.put((path, filename))
    self.fetch_data()
    time.sleep(1)
    self.qu.task_done()
    self.q.join()
    print 'done with the ', str(url).split('/')[-1],
def get_all_key_signatures(cfg, keyid):
    """
    Get all signatures for a specific key. We exclude self signed signatures
    because this is not helpful for us.
    """
    content, status_code = make_sks_request(
        cfg, requests.get, "lookup",
        {"op": "vindex", "search": "0x{}".format(keyid)}, None
    )
    if status_code != 200:
        return status_code, content

    elem = BeautifulSoup(content, HTML_PARSER).span
    ids = []
    while (elem.findNext().name != "strong" and elem.findNext()):
        elem = elem.findNext()
        if "op=get" in elem["href"] and elem.text != keyid:
            ids.append(elem.text)
    return ids
def search_key(cfg, search_str):
    """
    Search for a key by a given string
    """
    content, status_code = make_sks_request(
        cfg, requests.get, "lookup", {"op": "index", "search": search_str}, None
    )
    if status_code != 200:
        return content, status_code

    bs = BeautifulSoup(content, HTML_PARSER)
    regex = re.compile(r"^pub *\d{3,4}\w\/([\w\d]{8})")
    ids = []
    for pre in bs.findAll("pre"):
        match = regex.search(pre.text.strip("\r\n"))
        if match and not "KEY REVOKED" in pre.text:
            ids.append(match.groups()[0])
    return {"ids": ids}, status_code
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body.decode('gbk'))
    pic = soup.find('p', class_='f_center').find('img').get('src') \
        if soup.find('p', class_='f_center') and soup.find('p', class_='f_center').find('img') else None
    referer_web = soup.find('a', id='ne_article_source').text if soup.find('a', id='ne_article_source') else None
    referer_url = soup.find('a', id='ne_article_source').get('href') if soup.find('a', id='ne_article_source') else None
    author = soup.find('span', class_='ep-editor').text if soup.find('span', class_='ep-editor') else None
    if u"?" in author:
        author = author.split(u"?")[-1]
    crawl_date = NOW
    read_num = soup.find('div', class_='post_comment_joincount').find('a').text \
        if soup.find('div', class_='post_comment_tiecount') else 0
    comment_num = soup.find('div', class_='post_comment_tiecount').find('a').text \
        if soup.find('div', class_='post_comment_tiecount') else 0
    content = soup.find('div', class_='post_text').get_text(strip=True) if soup.find('div', class_='post_text') else None
    item['referer_web'] = referer_web
    item['content'] = content
    item['referer_url'] = referer_url
    item['author'] = author
    item['crawl_date'] = crawl_date
    item['pic'] = pic
    item['comment_num'] = int(comment_num)
    item['read_num'] = int(read_num)
    yield item
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body.decode("utf-8").encode("utf-8"), "lxml")
    pic = soup.find("p", class_="detailPic").find("img").get("src") if soup.find("p", class_="detailPic") else None
    referer_web = soup.find("span", class_="ss03").text if soup.find("span", class_="ss03") else None
    author = soup.find("span", itemprop="author").find("span").text if soup.find("span", itemprop="author") else None
    temp = soup.find("div", id="main_content")
    if temp:
        ps = temp.find_all("p") if temp.find_all("p") else None
        content = "\n\n".join([p.text.strip() for p in ps])
    else:
        content = None
    item['pic'] = pic
    item['referer_web'] = referer_web
    item['author'] = author
    item['content'] = content
    item['crawl_date'] = NOW
    yield item
def parse_news(self, response):
    item = response.meta.get("item", None)
    # #??????????????????????
    # news_date = item.get("news_date", None)
    # if news_date:
    #     struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d")
    #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    #
    #     delta = self.end_now - struct_date
    #     if delta.days == self.end_day:
    #         # pass
    #         raise CloseSpider('today scrapy end')
    soup = BeautifulSoup(response.body)
    news_content_group = soup.find("div", class_="entry-content group")
    # ??????
    news_content_group.find("div", class_="related_posts").replace_with("")
    content = news_content_group.text.strip()
    item["content"] = content
    item["catalogue"] = u"????"
    yield item
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    soup = BeautifulSoup(response.body, 'lxml')
    origin_date = soup.find("td", class_="time").text.strip()
    struct_date = datetime.datetime.strptime(origin_date, "%Y-%m-%d %H:%M")
    news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    content = soup.find("div", class_="lph-article-comView").text.strip() if soup.find("div", class_="lph-article-comView").text.strip() else None
    item["news_date"] = news_date
    item["crawl_date"] = NOW
    item["content"] = content
    item["catalogue"] = u"????"
    item = judge_news_crawl(item)
    if item:
        yield item
    else:
        self.flag = int(pageindex)
def parse(self, response):
    origin_url = response.url
    if "index" not in origin_url:
        soup = BeautifulSoup(response.body, "lxml")
        catalogue = soup.find("a", class_="blue CurrChnlCls").get("title").strip()
        news_list = soup.find("div", class_="lie_main_m").find_all("li")
        for news in news_list:
            title = news.find("a").text.strip()
            news_url = "http://www.cnta.gov.cn/xxfb" + news.find("a").get("href")[2:]
            news_no = news_url.rsplit("/", 1)[-1].split(".")[0]
            item = NewsItem(
                news_url=news_url,
                title=title,
                news_no=news_no,
                catalogue=catalogue,
            )
            yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={'item': item})
    else:
        topic_url = origin_url.rsplit(".", 1)[0]
        self.flag.setdefault(topic_url, 0)
        yield scrapy.Request(origin_url, callback=self.parse_topic)
def parse(self, response):
    origin_url = response.url
    # http://money.163.com/special/002526O5/transport_02.html
    search_result = re.search(r"_(\d)*?\.", origin_url)
    # ????
    pageindex = search_result.group(1) if search_result else 1
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup("div", class_="list_item clearfix")
    for news in news_list:
        news_date = news.find("span", class_="time").text if news.find("span", class_="time") else None
        title = news.find("h2").text if news.find("h2") else None
        news_url = news.find("h2").a.get("href", None) if news.find("h2") else None
        abstract = news.find("p").contents[0] if news.find("p") else None
        item = NewsItem(title=title, news_url=news_url, abstract=abstract, news_date=news_date)
        item = judge_news_crawl(item)  # ??????????
        if item:
            request = scrapy.Request(news_url, callback=self.parse_news, meta={"item": item})
            yield request
        else:
            self.flag = int(pageindex)
    if not self.flag:
        next_url = self.next_url % (int(pageindex) + 1)
        yield scrapy.Request(next_url)
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body)
    referer_web = soup.find("a", id="ne_article_source").text if soup.find("a", id="ne_article_source") else None
    referer_url = soup.find("a", id="ne_article_source").get("href", None) if soup.find("a", id="ne_article_source") else None
    comment_num = soup.find("a", class_="post_cnum_tie").text if soup.find("a", id="ne_article_source") else None
    content = soup.find("div", class_="post_text").text.strip() if soup.find("div", class_="post_text") else None
    # ??: ?????????-????? ??????
    author_source = soup.find("span", class_="left").text if soup.find("span", class_="left") else None
    # TODO ??????
    # import pdb; pdb.set_trace()
    # author = re.search(u"??(.*)", author_source).group(1)[1:] if author_source else None
    # item["author"] = author
    item["referer_web"] = referer_web
    item["referer_url"] = referer_url
    item["comment_num"] = comment_num
    item["content"] = content
    item["crawl_date"] = NOW
    yield item