The following 50 code examples, extracted from open-source Python projects, illustrate how to use bs4.BeautifulSoup().
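Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: build a BeautifulSoup object from markup plus a parser name, then query the tree with find()/find_all()/select(). The HTML snippet and variable names below are illustrative only and do not come from any of the projects.

# Minimal BeautifulSoup usage sketch (illustrative HTML, not from the projects below).
from bs4 import BeautifulSoup

html_doc = """
<html><body>
  <div class="post"><a href="/item/1">First</a></div>
  <div class="post"><a href="/item/2">Second</a></div>
</body></html>
"""

# 'html.parser' ships with the standard library; 'lxml' also works if installed.
soup = BeautifulSoup(html_doc, "html.parser")

# Iterate over matching tags and read their text and attributes.
for post in soup.find_all("div", class_="post"):
    link = post.find("a")
    print(link.get_text(), link["href"])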
def make_nsfw_safe(text):
    """Make NSFW safer by adding click-to-show class to images."""
    soup = BeautifulSoup(text, "lxml")
    images = soup.find_all("img")
    for image in images:
        if image.get("class"):
            image["class"] = "%s nsfw" % " ".join(image.get("class"))
        else:
            image["class"] = "nsfw"
        image.replace_with(image)
    result = str(soup)
    # We don't want html/body, which BeautifulSoup kindly wraps our new HTML in
    if result.startswith("<html><body>") and result.endswith("</body></html>"):
        result = result[len("<html><body>"):-len("</body></html>")]
    return result
def mathjax(s):
    with open("temp.log", "w") as f:
        f.write(s)
    p = Popen([app.config['mjpage'], '--dollars',
               '--output', "CommonHTML",
               '--fontURL',
               ("https://cdnjs.cloudflare.com/ajax/libs/"
                "mathjax/2.7.0/fonts/HTML-CSS")],
              stdout=PIPE, stdin=PIPE, stderr=PIPE)
    #filename = hashlib.sha256(s.encode('utf-8')).hexdigest()
    #with open(filename, 'w') as f:
    #    print(s, file=f)
    res = p.communicate(input=s.encode('utf-8'))
    out = res[0].decode('utf-8')
    err = res[1].decode('utf-8')
    soup = BeautifulSoup(out, 'html.parser')
    style = str(soup.style)
    body = "".join(str(s) for s in soup.body.children)
    return style, body
def get_best(url):
    url = 'http://www.infoarena.ro' + url
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    name = soup.find('span', {'class': 'username'}).find('a')['href'][35:]
    tests = soup.find_all('td', {'class': 'number'})
    max_ms = -1
    for test in tests:
        test = test.string
        if test.endswith('ms'):
            time = int(test.strip('ms'))
            max_ms = max(max_ms, time)
    if name not in d or max_ms < d[name][0]:
        d[name] = (max_ms, url)
        print(max_ms, name, url)
def decrypt(hash, tipo):
    global word
    try:
        if(tipo == 0):
            url = BeautifulSoup(urllib.urlopen("https://md5.gromweb.com/?md5=" + hash), "html.parser")
        else:
            url = BeautifulSoup(urllib.urlopen("https://sha1.gromweb.com/?hash=" + hash), "html.parser")
        password = url.find("em", {"class": "long-content string"})
        password = re.sub(re.compile("<.*?>"), "", str(password)).strip()
        if str(password) == "None":
            print word + "\t\t\t\t[-] Senha nao encontrada! :-("
        else:
            print word + "\t\t\t\t[+] Senha encontrada: " + password
    except IOError:
        decryptwl(hash, tipo)
def add_afsc_links(full_afsc_dict, reddit):
    """
    Add links to /r/AirForce wiki from given filename into the dictionary.

    :param dict: either enlisted_dict or officer_dict
    :param reddit: PRAW reddit object
    """
    # gets dict of AFSC to link on /r/AirForce wiki
    wiki_page = reddit.subreddit("AirForce").wiki["index"]
    wiki_soup = BeautifulSoup(wiki_page.content_html, "html.parser")
    links = wiki_soup.find_all("a")

    # currently all wiki AFSC are enlisted
    for link in links:
        # not all links have /r/AirForce/wiki/jobs so this is more generalized
        # using only /r/AirForce/ wiki links
        if "www.reddit.com/r/AirForce/wiki/" in link["href"]:
            AFSC_code = link["href"].split("/")[-1].upper()
            base_afsc = AFSC_code[:5]  # shaves off any prefixes
            if base_afsc in full_afsc_dict["enlisted"].keys():
                full_afsc_dict["enlisted"][base_afsc]["link"] = link["href"]
def process_POST_request(request):
    dict_ = urlparse.parse_qs(request.text)

    def htmlify(thing):
        try:
            html = dict_[thing][0]
        except KeyError as e:
            html = ''
        return '<html>' + html + '</html>'

    uri = dict_['uri'][0]
    head = htmlify('head')
    body = htmlify('body')
    try:
        text = dict_['data'][0]
    except KeyError as e:
        text = ''
    headsoup = BeautifulSoup(head, 'lxml')
    bodysoup = BeautifulSoup(body, 'lxml')
    target_uri = getUri(uri, headsoup, bodysoup)
    doi = getDoi(headsoup, bodysoup)
    return target_uri, doi, head, body, text
def getRosiItem():
    start = time.time()
    index = 1
    while True:
        url = "http://www.mmxyz.net/category/rosi/page/{}/".format(index)
        res = requests.get(url, timeout=10)
        if res.status_code == 404:
            print("+ Time: {:.2f} S +".format(time.time() - start))
            print("+ Total Pages: {} +".format(index - 1))
            print("+ Total Numbers: {} +".format(len(RosiItems)))
            print("+-------------------------+\r\n\r\n")
            return
        soup = BeautifulSoup(res.content, "html.parser")
        rosiList = soup.find_all("a", class_="inimg")
        for rosi in rosiList:
            RosiItems.append(rosi['href'])
        index += 1
def hltb(bot, trigger):
    if not trigger.group(2):
        return bot.say("Enter a game name to search.")
    game = trigger.group(2)
    url = "http://howlongtobeat.com/search_main.php?page=1"
    payload = {"queryString": game, "t": "games", "sorthead": "popular",
               "sortd": "Normal Order", "length_type": "main", "detail": "0"}
    test = {'Content-type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36',
            'origin': 'https://howlongtobeat.com',
            'referer': 'https://howlongtobeat.com'}
    session = requests.Session()
    session.post(url, headers=test, data=payload)
    r = session.post(url, headers=test, data=payload)
    if len(r.content) < 250:
        return bot.say("No results.")
    bs = BeautifulSoup(r.content)
    first = bs.findAll("div", {"class": "search_list_details"})[0]
    name = first.a.text
    time = first.findAll('div')[3].text
    bot.say('{} - {}'.format(name, time))
def craw_last_index(ptt_class_name):
    # ptt_class_name = 'Soft_Job'
    index_url = 'https://www.ptt.cc/bbs/' + ptt_class_name + '/index.html'
    res = requests.get(index_url, verify=True)
    soup3 = BeautifulSoup(res.text, "lxml")
    x = soup3('', {'class': "btn wide"}, text=re.compile('??'))
    last_index = x[0]['href']
    last_index = last_index.replace('/bbs/' + ptt_class_name + '/index', '')
    last_index = int(last_index.replace('.html', '')) + 1
    return last_index

#---------------------------------------------------------------------------------
# ?? ubuntu - crontab-e, ????, ??????? data
# ?? PTT ????, ???????, ??????,
# ??????DATA, ???? index ??????, ??????? data,
# ?????, ??????
def addToCart(self):
    print '\nADD TO CART -----------------'
    session_get = self.user_session.get(self.URL_product_url, headers=self.get_headers)
    #print session_get.content
    soup = BeautifulSoup(session_get.content, 'lxml')
    results = soup.find_all('select', class_='size-select')
    #print results
    for item in results[0].select('option'):
        re_result = re.sub(self.sub_pattern, '', item.string)
        #print re_result
        matchObj = re.search(r"^%s+$" % self.user_size, re_result)
        if matchObj:
            self.post_data_addToCart['pid'] = item['value']
            self.post_data_addToCart['masterPID'] = item['value'].partition("_")[0]
            print self.post_data_addToCart
            break
    session_post = self.user_session.post(url=self.URL_cart_post_url,
                                          headers=self.post_headers,
                                          data=self.post_data_addToCart)
    print 'Add To Cart Status: ' + str(session_post.status_code)
def finalBoss(self):
    print '\nEntering Payment Info -----------------------------'
    self.get_headers['Referer'] = self.URL_checkout_url
    self.post_headers['Referer'] = self.URL_pay_url
    #print json.dumps(self.get_headers, indent=1)
    session_get = self.user_session.get(self.URL_pay_url, headers=self.get_headers)
    savePage(session_get, 'finalCheckout.html')
    soup = BeautifulSoup(session_get.content, 'lxml')
    pay_secure_key = soup.find('input', {'name': 'dwfrm_payment_securekey'})
    print pay_secure_key
    #NOTE: Visa, Mastercard, etc...correspond to different types. Find how they get set
    #NOTE: Visa = 001, Mastercard = 002, AE = 003, Discover = 004
    post_data_payInfo = {
        'dwfrm_payment_creditCard_type': '002',
        'dwfrm_payment_creditCard_owner': 'Bob McFlymo',
        'dwfrm_payment_creditCard_number': '5105105105105100',
        'dwfrm_payment_creditCard_month': '01',
        'dwfrm_payment_creditCard_year': '2018',
        'dwfrm_payment_creditCard_cvn': '002',
        'dwfrm_payment_securekey': pay_secure_key,
        'dwfrm_payment_signcreditcardfields': 'sign'
    }
    #savePage(session_get, 'finalCheckout.html')
def checkItemDirect(self):
    #NOTE: this function will most likely hamper performance but in some cases may
    #      improve it, leave it up to user choice to run this before checkout
    #Basic Steps:
    #Use BS to parse for <ul class="size options"
    #Size marked as follows: <li class="8 available" data-option-title="8"
    #Therefore, match data-option-title with user_size, then check the class for available keyword
    session_get = self.user_session.get(self.URL_product)
    print 'Status of requests.get: ' + str(session_get.status_code)
    soup = BeautifulSoup(session_get.content, "lxml")
    #Check that the lxml parser works for html
    #Look to use SoupStrainer to improve parsing efficiency
    for li in soup.select('li[data-option-title]'):
        #print li['class']
        #print type(li['class'])
        if (self.user_size in li['class']) & ('available' in li['class']):
            print 'Size ' + self.user_size + ' Available'
def getMoreInfo(self, nzb):
    """
    Get details about a torrent.

    .. seealso:: MovieSearcher.correctRelease
    """
    data = self.getHTMLData(nzb['detail_url'])
    soup = BeautifulSoup(data, 'html.parser')

    description = soup.find(id='description')
    if description:
        nzb['description'] = description.prettify()

    line = soup.find(text='Date de publication').parent.parent
    pub = line.find_all('td')[1]
    added = datetime.strptime(pub.getText().split('(')[0].strip(), '%d/%m/%Y %H:%M')
    nzb['age'] = (datetime.now() - added).days
    self.log.debug(nzb['age'])
def test_parse_html2(self):
    parser = QqParser(allowed_tags={'chapter', 'section', 'subsection', 'subsubsection',
                                    'eq', 'eqref', 'ref', 'equation', 'label', 'idx'})
    doc = r"""\chapter \label h1:label
Hello

This is a \ref{h1:label}.
"""
    tree = parser.parse(doc)
    html = QqHTMLFormatter(tree)
    s = html.do_format()
    soup = BeautifulSoup(s, 'html.parser')
    self.assertEqual(soup.h1['id'], 'label_h1_label')
    self.assertEqual(soup.span['class'], ['section__number'])
    self.assertEqual(soup.span.string, "1")
    self.assertEqual(soup("a")[1].attrs, {'class': ['a-ref'], 'title': '', 'href': '#label_h1_label'})
    self.assertEqual(soup("a")[1].string, "1")
def test_parse_html3(self):
    parser = QqParser(allowed_tags={'h1', 'h2', 'h3', 'h4', 'eq', 'eqref', 'ref',
                                    'equation', 'label', 'idx'})
    doc = r"""\equation \label eq:x2y2
    x^2 + y^2 = z^2

See \ref{eq:x2y2}.
"""
    tree = parser.parse(doc)
    html = QqHTMLFormatter(tree)
    html.counters['equation'].showparents = False
    s = html.do_format()
    soup = BeautifulSoup(s, 'html.parser')
    self.assertEqual(soup.div.attrs, {'id': "label_eq_x2y2", 'class': ["latex_equation"]})
    self.assertEqual(soup.span['class'], ['ref'])
    self.assertEqual(soup.a['class'], ['a-ref'])
    self.assertEqual(soup.a['href'], '#mjx-eqn-1')
    self.assertEqual(soup.a.string, "(1)")
def test_refs_with_separator(self):
    doc = r"""\chapter Hello \label sec:first
\chapter World \label sec:other

See \ref[section][sec:first] and \ref[section][sec:other] for details.
"""
    parser = QqParser()
    formatter = QqHTMLFormatter()
    parser.allowed_tags.update(formatter.uses_tags())
    tree = parser.parse(doc)
    formatter.root = tree
    print(tree.as_list())
    html = formatter.do_format()
    soup = BeautifulSoup(html, "html.parser")
    self.assertEqual(soup("a")[2].contents[0], "section 1")
def test_missing_label(self):
    doc = r"""\chapter Hello \label sec:first
\chapter World \label sec:other

See \ref[section][sec:third] and \ref[zection][sec:another] for details.
"""
    parser = QqParser()
    formatter = QqHTMLFormatter()
    parser.allowed_tags.update(formatter.uses_tags())
    tree = parser.parse(doc)
    formatter.root = tree
    print(tree.as_list())
    html = formatter.do_format()
    soup = BeautifulSoup(html, "html.parser")
    self.assertEqual(soup("a")[2].contents[0], "section ???")
    self.assertEqual(soup("a")[3].contents[0], "zection ???")
def getpixivfollow():
    """Get pixiv bookmark."""
    users = ['1789300']
    page = 1
    userlist = {}
    bookmark_url = u'https://www.pixiv.net/bookmark.php'
    while len(users) > 0:
        page_params = (
            ('type', 'user'),
            ('rest', 'show'),
            ('p', str(page)))
        bookmark_page = PIXIV_SESSION.get(
            bookmark_url, params=page_params, proxies=PROXY).text
        bookmark_content = BeautifulSoup(bookmark_page, 'lxml')
        print(u'Get Pixiv bookmark page {0} ...'.format(page))
        users = bookmark_content.select("div[class=usericon]")
        if len(users) == 0:
            break
        for user in users:
            user_info = user.find('a', attrs={'class': 'ui-profile-popup'})
            user_name = user_info.attrs['data-user_name']
            user_id = user_info.attrs['data-user_id']
            userlist[user_id] = user_name
        page += 1
    return userlist
def pixiv2pawoo(pixivid):
    """Pixiv -> Pawoo."""
    pawoourl = u'https://pawoo.net/oauth_authentications/{0}?provider=pixiv'
    pawoolink = pawoourl.format(pixivid)
    pawoopage = PAWOO_SESSION.get(pawoolink, proxies=PROXY)
    if pawoopage.status_code == 200:
        pawooname = pawoopage.headers.get('link').split(';')[0]
        pawooname = pawooname.replace(
            '<https://pawoo.net/.well-known/webfinger?resource=acct%3A', '')
        pawooname = pawooname.replace('%40pawoo.net>', '')
        csrf_token = BeautifulSoup(pawoopage.text, 'lxml')
        csrf_token = csrf_token.select(
            "meta[name=csrf-token]")[0].attrs.get('content')
        with open('pawoolist.txt', 'a', encoding='utf-8-sig') as pawoofile:
            pawoofile.write(
                '{1},https://pawoo.net/@{0}\n'.format(pawooname, pixivid))
        followpawoo(pawooname, csrf_token)
        return 1
    else:
        return 0
def get_book(url):
    """ ????? PDF ??? """
    # ????????
    print('???????……')
    nav_page = CONNECTION.get(url).text
    shelves = set(re.findall(r'/courses/.+/pdfbook/\d/', nav_page))
    for shelf_count, shelf in enumerate(shelves, 1):
        res = CONNECTION.get(BASE_URL + shelf).text
        soup = BeautifulSoup(res, 'lxml')
        save_dir = os.path.join(BASE_DIR, 'Books', str(shelf_count))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        for book_count, book in enumerate(soup.select('#booknav a'), 1):
            print('------>', book.string)
            file_name = REG_FILE.sub(' ', book.string) + '.pdf'
            pdf = CONNECTION.get(BASE_URL + book['rel'][0]).content
            with open(os.path.join(save_dir, file_name), 'wb') as pdf_file:
                pdf_file.write(pdf)
def read_captcha():
    header = {
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
        'Host': 'login.weibo.cn'
    }
    url_login = 'http://login.weibo.cn/login/'
    html = requests.get(url_login, headers=header).content  # ????
    soup = BeautifulSoup(html, 'lxml')
    code_img = str(soup.find('img'))[24:-3]  # ?????????
    print(code_img)
    urlretrieve(code_img, r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif')
    show_img(r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif')
    remove_line(r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif',
                r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha/')
    pic_cut('captcha_removeline.gif',
            'E:/????/??????/1 ???/captcha_master1/captcha_master/main_captcha/',
            'E:/????/??????/1 ???/captcha_master1/captcha_master/word/')
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//div[@class="authi"]//em').extract()
    for indexi, content in enumerate(response.xpath('//td[@class="t_f"]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        if soup.find('div', class_='attach_nopermission') is not None:
            soup.find('div', class_='attach_nopermission').clear()
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_unicode(soup.get_text())
        comments_data.append({'content': c, 'reply_time': self.format_rep_date(rep_time_list[indexi])})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//div[@class="authi"]//em').extract()
    for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]//table[1]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        if indexi >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[indexi])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = re.findall(u'\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}', response.body)
    for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]//table[1]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        if indexi >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[indexi])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def article_detail(aitem, response):
    for a_content in response.xpath('//script').extract():
        if a_content.find("detailArticle|post") == -1:
            continue
        a_content = a_content.split("props=")[1]
        a_content = a_content.split(",location")[0]
        a_content = json.loads(a_content).get("detailArticle|post")
        aitem.content = BeautifulSoup(a_content.get("content"), 'lxml').get_text()
        aitem.time = a_content.get('published_at')
        aitem.last_reply_time = aitem.time
        aitem.views = a_content.get('counters').get('view_count')
        aitem.replies = a_content.get('counters').get('comment')
        aitem.author = a_content.get('user').get('name')
        aitem.title = a_content.get('title')
        category_tags = json.loads(a_content.get('extraction_tags'))
        category = ''
        for category_tag in category_tags:
            category += category_tag[0] + ' '
        aitem.category = category
    return aitem
def gen_item_comment(self, response):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//span[@class="time"]').extract()
    for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]/table[1]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        if soup.find('div', class_='attach_nopermission') is not None:
            soup.find('div', class_='attach_nopermission').clear()
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_unicode(soup.get_text())
        comments_data.append({'content': c, 'reply_time': self.format_rep_date(rep_time_list[indexi])})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response, is_first=False):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//span[@class="date"]/text()').extract()
    for indexi, content in enumerate(response.xpath('//div[@class="replycontent"]').extract()):
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        time_index = indexi
        if is_first:
            time_index += 1
        if time_index >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[time_index])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def gen_item_comment(self, response, is_first=False):
    comment = []
    new_comment = {}
    comments_data = []
    rep_time_list = response.xpath('//div[@class="authi"]/em').extract()
    if len(rep_time_list) == 0:
        return comment
    for indexi, content in enumerate(response.xpath('//div[@class="pct"]//table[1]').extract()):
        if is_first and indexi == 0:
            continue
        soup = BeautifulSoup(content, 'lxml')
        [s.extract() for s in soup('script')]  # remove script tag
        c = StrClean.clean_comment(soup.get_text())
        time_index = indexi
        if time_index >= len(rep_time_list):
            rep_time = self.format_rep_date(rep_time_list[-1])
        else:
            rep_time = self.format_rep_date(rep_time_list[time_index])
        comments_data.append({'content': c, 'reply_time': rep_time})
    new_comment['url'] = response.url
    new_comment['comments_data'] = comments_data
    comment.append(new_comment)
    return comment
def download_lyrics(artist, url):
    print url
    time.sleep(random() + 2)
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, 'html.parser')

    # Get the song title
    song_title = soup.find('title').get_text().split(' - ')[1].lower().replace('/', ' ').replace(' ', '_')

    # Get the lyrics div
    lyrics = soup.findAll('div', {'class': ''})

    for i in lyrics:
        lyrics = i.get_text().strip()
        if len(lyrics) > 10:
            with open('artists/' + artist + '/' + song_title + '.txt', 'wb') as w:
                # collapse double spaces after replacing line breaks
                cleaned_lyrics = lyrics.replace('\r\n', ' *BREAK* ').replace('\n', ' *BREAK* ').replace('  ', ' ')
                w.write(cleaned_lyrics.encode('utf-8'))
def download_songs(url):
    time.sleep(random.random() * 0.5)
    try:
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')

        # Get the artist name
        artist_name = soup.findAll('h1')[0].get_text()[:-7].lower().replace(' ', '_')

        # Store all songs for a given artist
        with open('artist_data/' + artist_name + '.txt', 'wb') as w:
            for song in soup.findAll('a', {'target': '_blank'}):
                if 'lyrics/' in song['href']:
                    song_url = song['href'][1:].strip()
                    w.write(song_url + '\n')
    except urllib2.HTTPError:
        print '404 not found'
def packages(self):
    """ Parse XML file to locate packages. """
    xml = requests.get(self._xml_url).content
    soup = BeautifulSoup(xml, "html.parser")
    nltk_packages, third_party = defaultdict(dict), defaultdict(dict)
    for pack in soup.find_all('package'):
        package_attributes = pack.attrs
        name = package_attributes['id']
        # Keeps track of nltk_data packages vs third_party packages.
        if package_attributes['url'].startswith(self._nltk_data_url):
            nltk_packages[name] = package_attributes
        else:
            third_party[name] = package_attributes
    return nltk_packages, third_party
async def _online_tibia(self):
    """Get total players playing"""
    url = "http://www.tibia.com/community/?subtopic=worlds"
    try:
        async with aiohttp.get(url) as response:
            soup = BeautifulSoup(await response.text(), "html.parser")
            div1 = soup.find('div', attrs={'id': 'RightArtwork'})
            div2 = div1.find('div', attrs={'id': 'PlayersOnline'})
            test = div2.get_text()
            test1 = test.replace("Players Online", "")
            new = "Players currently playing Tibia: " + test1
            # div2 = div1.find('div', attrs={'class': 'Border_2'})
            # div3 = div2.find('div', attrs={'class': 'Border_3'})
            # table = div3.find_all('table', attrs={'class': 'Table1'})
            # tr = table.find_all('tr')
            # tbody = div4.find('div', attrs={'class': 'CaptionInnerContainer'})
            await self.bot.say(str(new))
    except:
        await self.bot.say("Could not retrive data. The webserver may be offline.")
async def _server_tibia(self, servername):
    """Get Server Info"""
    servername = servername.title()
    url = "https://secure.tibia.com/community/?subtopic=worlds&world=" + str(servername)
    try:
        async with aiohttp.get(url) as response:
            soup = BeautifulSoup(await response.text(), "html5lib")
            b = soup.find_all("table", attrs={'class': 'Table1'})
            new = []
            rows = b[1].tbody.div.find_all('td')
            for row in rows:
                new.append(row.get_text())
            k = new[::2]
            l = new[1::2]
            zipped = list(zip(k, l))
            t = tabulate(zipped, headers=["Category", "Info"])
            await self.bot.say("```Python" + "\n" + str(t) + "```")
    except:
        await self.bot.say("Unable to retrive server data. The webserver may be offline.")
def get_course_status(course_num):
    client = Client()
    subject = client.get_course_subject(course_num)
    if subject is None:
        return None
    semester = get_semester()
    subject_url = "http://classes.cornell.edu/browse/roster/" + semester + "/subject/" + subject
    subject_page = requests.get(subject_url)
    subject_page.raise_for_status()
    subject_bs4 = bs4.BeautifulSoup(subject_page.text, "html.parser")
    course_code_tags = subject_bs4.find_all("strong", class_="tooltip-iws")
    for tag in course_code_tags:
        course_code = int(tag.getText().strip())
        if course_num == course_code:
            section = tag.parent.parent.parent
            status = section.find_all('li', class_="open-status")[0].i["class"][-1]
            if "open-status-open" in status:
                return "open"
            if "open-status-closed" in status:
                return "closed"
            if "open-status-warning" in status:
                return "waitlist"
            if "open-status-archive" in status:
                return "archive"
def get_soup_from_url(url, params=None):
    '''
    url? parameter? ???? ?? URL? GET??? ?? ??(HTML text)? BeautifulSoup??? ??? ??
    :param url: GET??? ?? URL string
    :param params: GET?? ???? dict
    :return: BeautifulSoup object
    '''
    # requests.get??? ?? ???(response??)? r??? ??
    r = requests.get(url, params=params)
    # response???? text???? ??? ??? html_doc??? ??
    html_doc = r.text
    # BeautifulSoup??? ??, ??? html text
    soup = BeautifulSoup(html_doc, 'lxml')
    return soup
def test_tag_inherits_self_closing_rules_from_builder(self):
    if XML_BUILDER_PRESENT:
        xml_soup = BeautifulSoup("", "xml")
        xml_br = xml_soup.new_tag("br")
        xml_p = xml_soup.new_tag("p")

        # Both the <br> and <p> tag are empty-element, just because
        # they have no contents.
        self.assertEqual(b"<br/>", xml_br.encode())
        self.assertEqual(b"<p/>", xml_p.encode())

    html_soup = BeautifulSoup("", "html")
    html_br = html_soup.new_tag("br")
    html_p = html_soup.new_tag("p")

    # The HTML builder uses HTML's rules about which tags are
    # empty-element tags, and the new tags reflect these rules.
    self.assertEqual(b"<br/>", html_br.encode())
    self.assertEqual(b"<p></p>", html_p.encode())
def Scrape(url):
    timeout = 10
    socket.setdefaulttimeout(timeout)

    # Collecting html content.
    headers = {'User-Agent': 'TorScrapper - Onion scrapper | github.com/ConanKapoor/TorScrapper.git'}
    req = urllib.request.Request(url, None, headers)
    response = urllib.request.urlopen(req)

    # Using BeautifulSoup to parse html object response.
    page = BeautifulSoup(response.read(), 'html.parser')

    # Saving output
    token = re.sub(r'[^\w]', '', url)
    name = os.path.abspath("") + '/Output/Scraped-' + token + '.html'
    file = open(name, 'w')
    file.write(str(page))
    file.close()

# Taking input.
def getJournalURL(jname):
    # get journal URL given the journal name for retrieving article PIIs
    urlstr = "http://api.elsevier.com/sitemap/page/sitemap/" + jname[0].lower() + ".html"
    retl = ""
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                if link.text.lower() == jname.lower():
                    #print(link["href"])
                    retl = link["href"]
                    break
            linkcnt += 1
    return retl
def get_url(self, query):
    site1 = urllib.urlopen('http://www.youtube.com/results?search_query=%s' % query)
    html = site1.read()
    soup = BS(html)
    links = soup.findAll('a')
    vidlinks = [link.get('href') for link in links if link.get('href') is not None]
    vlink = [i for i in vidlinks if '/watch?v=' in i][0]
    img_link = soup.findAll('img', {'alt': 'Thumbnail', 'width': '185'})[0].get('src')
    img_url = 'http:%s' % img_link
    imagethread = threading.Thread(target=lambda: urllib.urlretrieve(img_url, 'Files\image.jpg'))
    imagethread.start()
    return vlink
def run(self):
    ind = self.qu.get()
    url = self.url + str(ind)
    soup = bs.BeautifulSoup(''.join(ul.urlopen(url).readlines()))
    bu = up.urlsplit(self.url)
    print 'started with the ', str(url).split('/')[-1],
    for i in soup.find_all(attrs={"class": "recipe-title"}):
        sp = up.urlsplit(i.a.get('href'))
        path = sp.path
        print path
        if re.search(pat, path):
            path = bu.scheme + '://' + bu.netloc + path
            filename = str(path).split('/')[-2]
            filename = op.join(op.abspath(op.curdir), filename + '.py')
            # recipe will be stored in given location
            # filename = op.join(op.abspath(op.curdir), filename + '.html')
            # uncomment the above line if downloading the web page for the recipe
            print path
            self.q.put((path, filename))
    self.fetch_data()
    time.sleep(1)
    self.qu.task_done()
    self.q.join()
    print 'done with the ', str(url).split('/')[-1],
def get_all_key_signatures(cfg, keyid):
    """
    Get all signatures for a specific key. We exclude self signed signatures
    because this is not helpful for us.
    """
    content, status_code = make_sks_request(
        cfg, requests.get, "lookup",
        {"op": "vindex", "search": "0x{}".format(keyid)}, None
    )
    if status_code != 200:
        return status_code, content

    elem = BeautifulSoup(content, HTML_PARSER).span
    ids = []
    while (elem.findNext().name != "strong" and elem.findNext()):
        elem = elem.findNext()
        if "op=get" in elem["href"] and elem.text != keyid:
            ids.append(elem.text)
    return ids
def search_key(cfg, search_str):
    """
    Search for a key by a given string
    """
    content, status_code = make_sks_request(
        cfg, requests.get, "lookup", {"op": "index", "search": search_str}, None
    )
    if status_code != 200:
        return content, status_code

    bs = BeautifulSoup(content, HTML_PARSER)
    regex = re.compile(r"^pub *\d{3,4}\w\/([\w\d]{8})")
    ids = []
    for pre in bs.findAll("pre"):
        match = regex.search(pre.text.strip("\r\n"))
        if match and not "KEY REVOKED" in pre.text:
            ids.append(match.groups()[0])
    return {"ids": ids}, status_code
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body.decode('gbk'))
    pic = soup.find('p', class_='f_center').find('img').get('src') \
        if soup.find('p', class_='f_center') and soup.find('p', class_='f_center').find('img') else None
    referer_web = soup.find('a', id='ne_article_source').text if soup.find('a', id='ne_article_source') else None
    referer_url = soup.find('a', id='ne_article_source').get('href') if soup.find('a', id='ne_article_source') else None
    author = soup.find('span', class_='ep-editor').text if soup.find('span', class_='ep-editor') else None
    if u"?" in author:
        author = author.split(u"?")[-1]
    crawl_date = NOW
    read_num = soup.find('div', class_='post_comment_joincount').find('a').text \
        if soup.find('div', class_='post_comment_tiecount') else 0
    comment_num = soup.find('div', class_='post_comment_tiecount').find('a').text \
        if soup.find('div', class_='post_comment_tiecount') else 0
    content = soup.find('div', class_='post_text').get_text(strip=True) if soup.find('div', class_='post_text') else None
    item['referer_web'] = referer_web
    item['content'] = content
    item['referer_url'] = referer_url
    item['author'] = author
    item['crawl_date'] = crawl_date
    item['pic'] = pic
    item['comment_num'] = int(comment_num)
    item['read_num'] = int(read_num)
    yield item
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body.decode("utf-8").encode("utf-8"), "lxml")
    pic = soup.find("p", class_="detailPic").find("img").get("src") if soup.find("p", class_="detailPic") else None
    referer_web = soup.find("span", class_="ss03").text if soup.find("span", class_="ss03") else None
    author = soup.find("span", itemprop="author").find("span").text if soup.find("span", itemprop="author") else None
    temp = soup.find("div", id="main_content")
    if temp:
        ps = temp.find_all("p") if temp.find_all("p") else None
        content = "\n\n".join([p.text.strip() for p in ps])
    else:
        content = None
    item['pic'] = pic
    item['referer_web'] = referer_web
    item['author'] = author
    item['content'] = content
    item['crawl_date'] = NOW
    yield item
def parse_news(self, response):
    item = response.meta.get("item", None)
    # #??????????????????????
    # news_date = item.get("news_date", None)
    # if news_date:
    #     struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d")
    #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    #
    #     delta = self.end_now - struct_date
    #     if delta.days == self.end_day:
    #         # pass
    #         raise CloseSpider('today scrapy end')
    soup = BeautifulSoup(response.body)
    news_content_group = soup.find("div", class_="entry-content group")
    # ??????
    news_content_group.find("div", class_="related_posts").replace_with("")
    content = news_content_group.text.strip()
    item["content"] = content
    item["catalogue"] = u"????"
    yield item
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    soup = BeautifulSoup(response.body, 'lxml')
    origin_date = soup.find("td", class_="time").text.strip()
    struct_date = datetime.datetime.strptime(origin_date, "%Y-%m-%d %H:%M")
    news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    content = soup.find("div", class_="lph-article-comView").text.strip() if soup.find("div", class_="lph-article-comView").text.strip() else None
    item["news_date"] = news_date
    item["crawl_date"] = NOW
    item["content"] = content
    item["catalogue"] = u"????"
    item = judge_news_crawl(item)
    if item:
        yield item
    else:
        self.flag = int(pageindex)
def parse(self, response):
    origin_url = response.url
    if "index" not in origin_url:
        soup = BeautifulSoup(response.body, "lxml")
        catalogue = soup.find("a", class_="blue CurrChnlCls").get("title").strip()
        news_list = soup.find("div", class_="lie_main_m").find_all("li")
        for news in news_list:
            title = news.find("a").text.strip()
            news_url = "http://www.cnta.gov.cn/xxfb" + news.find("a").get("href")[2:]
            news_no = news_url.rsplit("/", 1)[-1].split(".")[0]
            item = NewsItem(
                news_url=news_url,
                title=title,
                news_no=news_no,
                catalogue=catalogue,
            )
            yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={'item': item})
    else:
        topic_url = origin_url.rsplit(".", 1)[0]
        self.flag.setdefault(topic_url, 0)
        yield scrapy.Request(origin_url, callback=self.parse_topic)
def parse(self, response):
    origin_url = response.url
    # http://money.163.com/special/002526O5/transport_02.html
    search_result = re.search(r"_(\d)*?\.", origin_url)
    # ????
    pageindex = search_result.group(1) if search_result else 1
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup("div", class_="list_item clearfix")
    for news in news_list:
        news_date = news.find("span", class_="time").text if news.find("span", class_="time") else None
        title = news.find("h2").text if news.find("h2") else None
        news_url = news.find("h2").a.get("href", None) if news.find("h2") else None
        abstract = news.find("p").contents[0] if news.find("p") else None
        item = NewsItem(title=title, news_url=news_url, abstract=abstract, news_date=news_date)
        item = judge_news_crawl(item)  # ??????????
        if item:
            request = scrapy.Request(news_url, callback=self.parse_news, meta={"item": item})
            yield request
        else:
            self.flag = int(pageindex)
    if not self.flag:
        next_url = self.next_url % (int(pageindex) + 1)
        yield scrapy.Request(next_url)
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body)
    referer_web = soup.find("a", id="ne_article_source").text if soup.find("a", id="ne_article_source") else None
    referer_url = soup.find("a", id="ne_article_source").get("href", None) if soup.find("a", id="ne_article_source") else None
    comment_num = soup.find("a", class_="post_cnum_tie").text if soup.find("a", id="ne_article_source") else None
    content = soup.find("div", class_="post_text").text.strip() if soup.find("div", class_="post_text") else None
    # ??: ?????????-????? ??????
    author_source = soup.find("span", class_="left").text if soup.find("span", class_="left") else None
    # TODO ??????
    # import pdb; pdb.set_trace()
    # author = re.search(u"??(.*)", author_source).group(1)[1:] if author_source else None
    # item["author"] = author
    item["referer_web"] = referer_web
    item["referer_url"] = referer_url
    item["comment_num"] = comment_num
    item["content"] = content
    item["crawl_date"] = NOW
    yield item