我们从Python开源项目中,提取了以下44个代码示例,用于说明如何使用selenium.webdriver.common.desired_capabilities.DesiredCapabilities.PHANTOMJS。
def main(number):
    """Render one bilibili video page with PhantomJS and hand it to ``getInfo``.

    The page ``http://www.bilibili.com/video/av<number>/`` is loaded with image
    loading disabled and a desktop Firefox user agent, parsed with
    BeautifulSoup (lxml) and passed to ``getInfo``. Errors while fetching or
    parsing are reported on stdout rather than raised.

    Args:
        number: Video id appended to the ``av`` URL prefix.
    """
    url = 'http://www.bilibili.com/video/av' + str(number) + '/'
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
    )
    dcap["phantomjs.page.settings.loadImages"] = False
    driver = webdriver.PhantomJS(
        executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
        desired_capabilities=dcap)
    try:
        try:
            driver.get(url)
            content = driver.page_source
        finally:
            # Shut the browser down exactly once, whether or not the fetch worked.
            driver.quit()
        getInfo(BeautifulSoup(content, 'lxml'))
    except Exception as err:
        # Still best-effort, but no longer silent.
        print("get page error: %s" % err)
def getSoup(start, stop):
    """Fetch bilibili video pages ``av<start>`` .. ``av<stop>`` and parse each.

    Every page is rendered by a fresh PhantomJS instance (images disabled,
    desktop Firefox user agent), parsed with BeautifulSoup (lxml) and passed
    to ``getInfo``. Processing stops at the first failure, which is reported
    on stdout rather than raised.

    Args:
        start: First video id (inclusive).
        stop: Last video id (inclusive).
    """
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
        )
        dcap["phantomjs.page.settings.loadImages"] = False
        for number in range(start, stop + 1):
            url = 'http://www.bilibili.com/video/av' + str(number) + '/'
            driver = webdriver.PhantomJS(
                executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
                desired_capabilities=dcap)
            try:
                driver.get(url)
                content = driver.page_source
            finally:
                # Stop the PhantomJS process even when the fetch raises.
                driver.quit()
            getInfo(BeautifulSoup(content, 'lxml'))
    except Exception as err:
        print("get page error: %s" % err)
def get_webdriver(self):
    """Create a PhantomJS driver configured from ``self.webdriver_config``.

    Reads ``proxy``, ``header``, ``phantomjs_path`` and ``timeout`` from the
    config object. A configured proxy is passed as an HTTP proxy with SSL
    errors ignored; a configured header supplies the User-Agent.

    Returns:
        The PhantomJS driver with its page-load timeout already applied.
    """
    phantom_flags = []
    if self.webdriver_config.proxy:
        phantom_flags += [
            "--proxy=" + self.webdriver_config.proxy,
            "--proxy-type=http",
            "--ignore-ssl-errors=true",
        ]

    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    if self.webdriver_config.header:
        # The same User-Agent goes into both the page setting and the header.
        for cap_key in ("phantomjs.page.settings.userAgent",
                        "phantomjs.page.customHeaders.User-Agent"):
            capabilities[cap_key] = self.webdriver_config.header['User-Agent']
    capabilities["takesScreenshot"] = True

    browser = webdriver.PhantomJS(
        self.webdriver_config.phantomjs_path,
        service_args=phantom_flags,
        desired_capabilities=capabilities,
    )
    browser.set_page_load_timeout(self.webdriver_config.timeout)
    return browser
def process_request(self, request, spider):
    """Render selected requests with PhantomJS and return the resulting page.

    Only requests whose URL has the character ``'c'`` at index 26 are handled;
    for every other request (including URLs shorter than 27 characters)
    ``None`` is returned.

    Args:
        request: Request object exposing a ``url`` attribute.
        spider: Unused.

    Returns:
        An ``HtmlResponse`` built from the rendered page, or ``None`` when the
        request is not handled here.
    """
    # Slicing instead of indexing so that short URLs do not raise IndexError.
    if request.url[26:27] != 'c':
        return None

    ua = random.choice(self.user_agent_list)
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = ua
    dcap["phantomjs.page.settings.loadImages"] = False
    # Raw string: same path as before, without invalid escape sequences.
    driver = webdriver.PhantomJS(
        executable_path=r'E:\Webdriver\phantomjs-2.1.1-windows\bin\phantomjs.exe',
        desired_capabilities=dcap)
    try:
        driver.get(request.url)
        # Fixed random pause before reading the page (presumably to let
        # scripts finish -- confirm whether this long a delay is needed).
        time.sleep(random.randint(15, 22))
        try:
            detail = driver.find_element_by_xpath(
                '//a[@ng-click="showDetail = btnOnClick(showDetail)"]')
            detail.click()
        except Exception:
            # The detail link is optional; carry on with the page as it is.
            pass
        body = driver.page_source
        url = driver.current_url
    finally:
        # Always stop the PhantomJS process, even when rendering fails.
        driver.quit()
    return HtmlResponse(url=url, body=body, request=request, encoding='utf-8')
def __init__(self, url):
    """Start a PhantomJS session prepared for reading the book at ``url``.

    Sets placeholder book metadata, launches PhantomJS with a generic user
    agent, adds two ``.ebookjapan.jp`` cookies, sizes the window and creates
    a 60-second ``WebDriverWait``.

    Args:
        url: Address of the book; stored on ``self.url``.
    """
    self.url = url
    self.book_name = "N/A"
    self.book_volume = None

    # Present a generic user agent.
    capabilities = dict(DC.PHANTOMJS)
    capabilities["phantomjs.page.settings.userAgent"] = USER_AGENT
    phantom_flags = [
        "--ignore-ssl-errors=true",
        "--ssl-protocol=any",
        "--web-security=false",
        "--ssl-protocol=TLSv1",
    ]
    self.d = webdriver.PhantomJS(desired_capabilities=capabilities,
                                 service_args=phantom_flags)

    # Cookies that make the site believe the terms of service were accepted.
    for cookie_name, cookie_value in (("tachiyomi_auto_reader", "Browser"),
                                      ("tachiyomi_user_policy", "on")):
        self.d.add_cookie({"name": cookie_name, "value": cookie_value,
                           "domain": ".ebookjapan.jp", "path": "/"})

    self.d.set_window_size(1120, 550)
    # Shared explicit-wait helper.
    self.wait = WebDriverWait(self.d, 60)
def get_pages(self):
    """Collect the page URLs of the chapter at ``self.chapter_url``.

    The chapter page is rendered with PhantomJS (images disabled). The
    ``<select onchange="select_page()">`` element is located in the page
    source and each ``<option>`` becomes one ``(page_num, page_url)`` tuple.
    The result is stored on ``self.pages`` and returned.

    Returns:
        list: ``(page_num, page_url)`` tuples; an empty list when the driver
        cannot be started or the page cannot be fetched or parsed (the
        traceback is printed in that case).
    """
    r_slt = r'onchange="select_page\(\)">([\s\S]*?)</select>'
    r_p = r'<option value="(.*?)".*?>?(\d*?)?<'
    # Bound before the try so the cleanup below is safe when PhantomJS
    # fails to start.
    driver = None
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(desired_capabilities=dcap)
        driver.get(self.chapter_url)
        text = driver.page_source
        st = re.findall(r_slt, text)[0]
        self.pages = [(int(p[-1]), p[0]) for p in re.findall(r_p, st)]
    except Exception:
        # KeyboardInterrupt is not an Exception and propagates untouched.
        traceback.print_exc()
        self.pages = []
    finally:
        if driver is not None:
            driver.quit()
    print('Got {l} pages in chapter {ch}'.format(l=len(self.pages),
                                                 ch=self.chapter_title))
    return self.pages
def _get_PhantomJS(self):
    """Start PhantomJS and store it on ``self.webdriver``.

    When ``self.proxy`` is set, its host, port and protocol (and credentials,
    if both username and password are present) are passed as PhantomJS
    command-line options. A random non-mobile user agent is used.

    Returns:
        bool: ``True`` when the driver was created, ``False`` when a
        connection error or ``WebDriverException`` was logged instead.
    """
    try:
        phantom_flags = []
        if self.proxy:
            phantom_flags += [
                '--proxy={}:{}'.format(self.proxy.host, self.proxy.port),
                '--proxy-type={}'.format(self.proxy.proto),
            ]
            if self.proxy.username and self.proxy.password:
                phantom_flags.append(
                    '--proxy-auth={}:{}'.format(self.proxy.username,
                                                self.proxy.password))

        agent = random_user_agent(mobile=False)
        logger.info('useragent: {}'.format(agent))

        capabilities = dict(DesiredCapabilities.PHANTOMJS)
        capabilities["phantomjs.page.settings.userAgent"] = agent

        try:
            self.webdriver = webdriver.PhantomJS(
                executable_path=self.config['executable_path'],
                service_args=phantom_flags,
                desired_capabilities=capabilities,
            )
        except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
            logger.error(err)
            return False
        return True
    except WebDriverException as e:
        logger.error(e)
        return False
def getSoup(start, stop):
    """Scrape bilibili user spaces ``start`` .. ``stop`` and save their fans.

    For every uid the space page is rendered with PhantomJS, the user name is
    extracted with ``getInfo``, the fan uids are collected through
    ``GetFansUid.GetFansUid`` and the record is stored with ``saveData``.

    A uid that fails is reported with ``"get page error"`` and skipped. The
    loop is iterative and never runs past ``stop``: the earlier recursive
    retry extended the upper bound by one on each failure and could exhaust
    the recursion limit.

    Args:
        start: First uid (inclusive).
        stop: Last uid (inclusive).
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
    )
    dcap["phantomjs.page.settings.loadImages"] = False

    for number in range(start, stop + 1):
        try:
            url = 'http://space.bilibili.com/' + str(number) + '/#!/'
            driver = webdriver.PhantomJS(desired_capabilities=dcap)
            try:
                driver.get(url)
                content = driver.page_source
            finally:
                # quit() (not just close()) so the PhantomJS process is stopped.
                driver.quit()
            soup = BeautifulSoup(content, 'lxml')
            username = getInfo(soup)
            uid = number  # the loop counter is the user id
            get_fans_uid = GetFansUid.GetFansUid(number)
            fansuid, fansnumber = get_fans_uid.get_uids()
            print(uid, username, fansnumber)
            saveData(uid, username, fansnumber, fansuid)
        except Exception:
            print("get page error")
def main(number):
    """Scrape one bilibili user space and save the user's fan list.

    The space page for ``number`` is rendered with PhantomJS (images
    disabled, desktop Firefox user agent); the user name comes from
    ``getInfo``, the fan uids from ``GetFansUid`` and the record is stored
    with ``saveData``. Errors are reported on stdout rather than raised.

    Args:
        number: User id; used both in the URL and as the saved uid.
    """
    url = 'http://space.bilibili.com/' + str(number) + '/#!/'
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
    )
    dcap["phantomjs.page.settings.loadImages"] = False
    driver = webdriver.PhantomJS(
        executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
        desired_capabilities=dcap)
    try:
        try:
            driver.get(url)
            content = driver.page_source
        finally:
            # Shut the browser down exactly once, before the slower fan lookup.
            driver.quit()
        soup = BeautifulSoup(content, 'lxml')
        username = getInfo(soup)
        uid = number  # the argument is the user id
        get_fans_uid = GetFansUid(number)
        fansuid, fansnumber = get_fans_uid.get_uids()
        saveData(uid, username, fansnumber, fansuid)
    except Exception as err:
        # Still best-effort, but no longer silent.
        print("get page error: %s" % err)
def getSoup(start, stop):
    """Fetch bilibili user-space pages ``start`` .. ``stop`` and parse each.

    Every page is rendered by a fresh PhantomJS instance (images disabled,
    desktop Firefox user agent), parsed with BeautifulSoup (lxml) and passed
    to ``getInfo``. Processing stops at the first failure, which is reported
    on stdout rather than raised.

    Args:
        start: First uid (inclusive).
        stop: Last uid (inclusive).
    """
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
        )
        dcap["phantomjs.page.settings.loadImages"] = False
        for number in range(start, stop + 1):
            url = 'http://space.bilibili.com/' + str(number) + '/#!/'
            driver = webdriver.PhantomJS(
                executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
                desired_capabilities=dcap)
            try:
                driver.get(url)
                content = driver.page_source
            finally:
                # Stop the PhantomJS process even when the fetch raises.
                driver.quit()
            getInfo(BeautifulSoup(content, 'lxml'))
    except Exception as err:
        print("get page error: %s" % err)
def getSoup(start, stop):
    """Scrape bilibili user spaces ``start`` .. ``stop`` and save their fans.

    For every uid the space page is rendered with PhantomJS, the user name is
    extracted with ``getInfo``, the fan uids are collected through
    ``GetFansUid`` and the record is stored with ``saveData``.

    A uid that fails is reported with ``"get page error"`` and skipped. The
    loop is iterative and never runs past ``stop``: the earlier recursive
    retry extended the upper bound by one on each failure and could exhaust
    the recursion limit.

    Args:
        start: First uid (inclusive).
        stop: Last uid (inclusive).
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
    )
    dcap["phantomjs.page.settings.loadImages"] = False

    for number in range(start, stop + 1):
        try:
            url = 'http://space.bilibili.com/' + str(number) + '/#!/'
            driver = webdriver.PhantomJS(
                executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
                desired_capabilities=dcap)
            try:
                driver.get(url)
                content = driver.page_source
            finally:
                # Release the browser before the slower fan lookup, even on error.
                driver.quit()
            soup = BeautifulSoup(content, 'lxml')
            username = getInfo(soup)
            uid = number  # the loop counter is the user id
            get_fans_uid = GetFansUid(number)
            fansuid, fansnumber = get_fans_uid.get_uids()
            saveData(uid, username, fansnumber, fansuid)
        except Exception:
            print("get page error")
def getSoup(start, stop):
    """Scrape bilibili user spaces ``start`` .. ``stop`` and save who they follow.

    For every uid the space page is rendered with PhantomJS, the user name is
    extracted with ``getInfo``, the followed uids are collected through
    ``GetFollowUid`` and the record is stored with ``saveData``.

    A uid that fails is reported with ``"get page error"`` and skipped. The
    loop is iterative and never runs past ``stop``: the earlier recursive
    retry extended the upper bound by one on each failure and could exhaust
    the recursion limit.

    Args:
        start: First uid (inclusive).
        stop: Last uid (inclusive).
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
    )
    dcap["phantomjs.page.settings.loadImages"] = False

    for number in range(start, stop + 1):
        try:
            url = 'http://space.bilibili.com/' + str(number) + '/#!/'
            driver = webdriver.PhantomJS(
                executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
                desired_capabilities=dcap)
            try:
                driver.get(url)
                content = driver.page_source
            finally:
                # Release the browser before the slower follow lookup, even on error.
                driver.quit()
            soup = BeautifulSoup(content, 'lxml')
            username = getInfo(soup)
            uid = number  # the loop counter is the user id
            get_gz_uid = GetFollowUid(number)
            gzsuid, gznumber = get_gz_uid.get_uids()
            saveData(uid, username, gznumber, gzsuid)
        except Exception:
            print("get page error")
def __init__(self, executable_path="phantomjs", port=0,
             desired_capabilities=DesiredCapabilities.PHANTOMJS,
             service_args=None, service_log_path=None):
    """Launch the PhantomJS / GhostDriver service and open a session on it.

    :Args:
     - executable_path - path to the PhantomJS executable; the default
       relies on it being found on $PATH.
     - port - port for the service; 0 lets a free port be chosen.
     - desired_capabilities - dictionary of non-browser-specific
       capabilities, such as "proxy" or "loggingPref".
     - service_args - list of command-line arguments passed to PhantomJS.
     - service_log_path - path the PhantomJS service logs to.
    """
    phantom_service = Service(executable_path,
                              port=port,
                              service_args=service_args,
                              log_path=service_log_path)
    self.service = phantom_service
    phantom_service.start()

    try:
        RemoteWebDriver.__init__(
            self,
            command_executor=phantom_service.service_url,
            desired_capabilities=desired_capabilities)
    except BaseException:
        # Tear down whatever was started before propagating the failure.
        self.quit()
        raise

    self._is_remote = False
def setUpClass(cls):
    """Start one PhantomJS browser shared by every test in the class.

    The browser is stored on ``cls.driver``, given a 1920x1080 window and a
    10-second implicit wait.
    """
    super(TestEditHandler, cls).setUpClass()
    cls.driver = webdriver.PhantomJS(
        desired_capabilities=DesiredCapabilities.PHANTOMJS)
    browser = cls.driver
    browser.set_window_size(1920, 1080)
    browser.implicitly_wait(10)
def initialize_driver(self, driver=None):
    """Choose and start the browser driver, storing it on ``self.driver``.

    Selection order:
      1. ``self.command_executor`` set: a remote driver with Chrome options.
      2. ``self.which_driver == 'phantomjs'``: local PhantomJS, 1400x1000.
      3. ``self.which_driver == 'chrome'``: local Chrome.
      4. otherwise: the ``driver`` argument is used as given.

    A page-load timeout of 240 is applied to whichever driver was chosen.

    Args:
        driver: Ready-made driver used only in the fallback case.
    """
    def build_chrome_options():
        # Chrome options shared by the remote and the local Chrome paths.
        opts = Options()
        opts.add_argument("--disable-notifications")
        if self.proxy:
            opts.add_argument('--proxy-server=%s' % self.proxy)
        return opts

    if self.command_executor:
        remote_options = build_chrome_options()
        self.driver = webdriver.Remote(
            command_executor=self.command_executor,
            desired_capabilities=remote_options.to_capabilities()
        )
    elif self.which_driver == 'phantomjs':
        capabilities = dict(DesiredCapabilities.PHANTOMJS)
        capabilities["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
            "(KHTML, like Gecko) Chrome/15.0.87"
        )
        phantom = webdriver.PhantomJS(desired_capabilities=capabilities)
        phantom.set_window_size(1400, 1000)
        self.driver = phantom
    elif self.which_driver == 'chrome':
        self.driver = webdriver.Chrome(chrome_options=build_chrome_options())
    else:
        # Fall back to the driver supplied by the caller.
        self.driver = driver

    self.driver.set_page_load_timeout(time_to_wait=240)
def open_driver(self):
    """Quit any current driver and start a freshly configured PhantomJS.

    ``self.quit_driver()`` is called first. If ``self.driver`` is still a
    PhantomJS ``WebDriver`` afterwards nothing else happens; otherwise a new
    driver is built from ``self.user_agent``, ``self.timeout``,
    ``self.proxy`` and ``self.phantomjs_binary_path`` and stored on
    ``self.driver``.

    Background reading on the chosen settings:
      http://engineering.shapesecurity.com/2015/01/detecting-phantomjs-based-visitors.html
      https://coderwall.com/p/9jgaeq/set-phantomjs-user-agent-string
      http://phantomjs.org/api/webpage/property/settings.html
      http://stackoverflow.com/questions/23390974/phantomjs-keeping-cache
    """
    self.quit_driver()
    if hasattr(self, 'driver') and isinstance(
            self.driver, webdriver.phantomjs.webdriver.WebDriver):
        return

    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities.update({
        'phantomjs.page.settings.userAgent': self.user_agent,
        'phantomjs.page.settings.loadImages': 'false',
        'phantomjs.page.settings.clearMemoryCaches': 'true',
        # Resource timeout in milliseconds, never below two seconds.
        'phantomjs.page.settings.resourceTimeout': max(2000, int(self.timeout * 1000)),
        'acceptSslCerts': True,
        'applicationCacheEnabled': True,
        'handlesAlerts': False,
        'phantomjs.page.customHeaders': {
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, sdch',
        },
    })

    cli_flags = ['--disk-cache=false', '--ignore-ssl-errors=false', '--ssl-protocol=TLSv1.2']
    if self.proxy is not None:
        cli_flags = ['--proxy={}'.format(self.proxy)] + cli_flags

    # The binary path is passed positionally only when one is configured.
    binary = () if self.phantomjs_binary_path is None else (self.phantomjs_binary_path,)
    browser = webdriver.PhantomJS(*binary,
                                  desired_capabilities=capabilities,
                                  service_args=cli_flags)

    browser.set_window_size(1296, 1018)  # Tor browser size on Linux
    browser.implicitly_wait(self.timeout)
    browser.set_page_load_timeout(self.timeout)
    browser.set_script_timeout(self.timeout)
    self.driver = browser
def __init__(self, executable_path="phantomjs", port=0,
             desired_capabilities=DesiredCapabilities.PHANTOMJS,
             service_args=None, service_log_path=None):
    """Bring up PhantomJS / GhostDriver, then connect a driver session to it.

    :Args:
     - executable_path - location of the PhantomJS executable; with the
       default it is expected to be on $PATH.
     - port - port the service should use; 0 means pick a free one.
     - desired_capabilities - dictionary of non-browser-specific
       capabilities, such as "proxy" or "loggingPref".
     - service_args - list of command-line arguments for PhantomJS.
     - service_log_path - file the PhantomJS service writes its log to.
    """
    ghost_service = Service(
        executable_path,
        port=port,
        service_args=service_args,
        log_path=service_log_path)
    self.service = ghost_service
    ghost_service.start()

    try:
        RemoteWebDriver.__init__(
            self,
            command_executor=ghost_service.service_url,
            desired_capabilities=desired_capabilities)
    except Exception:
        # Tear down whatever was started before propagating the failure.
        self.quit()
        raise

    self._is_remote = False
def load_phantomjs(config):
    """Start PhantomJS webdriver with the given configuration.

    SSL errors are ignored and web security is disabled. When ``HTTPS_PROXY``
    or ``HTTP_PROXY`` is set to a non-empty value (the former wins), the proxy
    is passed to PhantomJS as an HTTP proxy; a leading ``http://`` or
    ``https://`` scheme is stripped and an address without a scheme is used
    as-is. The service log is discarded.

    Args:
        config (dict): The configuration loaded previously in Cabu. A truthy
            ``'HEADERS'`` entry makes ``Headers`` add custom headers to the
            capabilities.

    Returns:
        webdriver (selenium.webdriver): An instance of phantomJS webdriver.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    service_args = [
        '--ignore-ssl-errors=true',
        '--ssl-protocol=any',
        '--web-security=false'
    ]

    # ``or`` rather than a get() default, so an empty HTTPS_PROXY falls
    # through to HTTP_PROXY instead of producing an empty address.
    proxy_address = os.environ.get('HTTPS_PROXY') or os.environ.get('HTTP_PROXY')
    if proxy_address:
        # PhantomJS expects host[:port]; drop the scheme when there is one.
        proxy_ip = re.sub(r'^https?://', '', proxy_address)
        service_args.append('--proxy=%s' % proxy_ip)
        service_args.append('--proxy-type=http')

    if 'HEADERS' in config and config['HEADERS']:
        dcap = Headers(config).set_headers(dcap)

    return webdriver.PhantomJS(
        desired_capabilities=dcap,
        service_args=service_args,
        service_log_path=os.path.devnull
    )
def test_phantomjs_headers_loading(self):
    """The configured User-Agent ends up in the PhantomJS custom headers."""
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    headers = Headers(self.config).set_headers(dcap)
    # assertEqual: the assertEquals alias is deprecated and gone in 3.12.
    self.assertEqual(
        headers['phantomjs.page.customHeaders.User-Agent'],
        'Mozilla/6.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36'
        ' (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36'
    )
def test_chrome_headers_loading(self):
    """Setting headers raises once the configured driver name is 'Chrome'."""
    self.app.config['DRIVER_NAME'] = 'Chrome'
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    # The lambda keeps the Headers construction inside the checked call.
    self.assertRaises(
        Exception,
        lambda: Headers(self.config).set_headers(capabilities))
def create_selenium_driver(self):
    """Return a PhantomJS driver with a desktop Chrome user agent.

    The window is sized to 1024x768 before the driver is returned.
    """
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities.update({
        "phantomjs.page.settings.userAgent":
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    })
    browser = webdriver.PhantomJS(desired_capabilities=capabilities)
    browser.set_window_size(1024, 768)
    return browser
def get_browser():
    """Return a 1920x1080 PhantomJS browser with custom request headers.

    The headers and the user agent are set on a private copy of the default
    PhantomJS capabilities. Previously they were written to the shared
    ``DesiredCapabilities.PHANTOMJS`` dict after the copy had been taken, so
    the first browser never received them and every later PhantomJS driver
    in the process inherited them.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap.update({
        'phantomjs.page.customHeaders.Accept-Language': 'zh-CN,zh;q=0.8',
        'phantomjs.page.customHeaders.Connection': 'keep-alive',
        'phantomjs.page.customHeaders.Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'phantomjs.page.customHeaders.Accept-Encoding': 'gzip, deflate, sdch',
        'phantomjs.page.customHeaders.Cache-Control': 'max-age=0',
    })
    dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36")

    phantomjs_path = "G:\\programeSoftwares\\python2.7\\Scripts\\phantomjs.exe"
    browser = webdriver.PhantomJS(desired_capabilities=dcap,
                                  executable_path=phantomjs_path)
    browser.set_window_size(1920, 1080)
    return browser
def visit(self, url, xpath=None, timeout=60, retry=1, load_images=False, **kwargs):
    """Open ``url`` in a fresh PhantomJS browser and return the page source.

    Any browser left on ``self.browser`` is quit first. The new browser uses
    ``self.ua`` (or a default Firefox user agent) and ``self.proxy`` when set,
    and is stored on ``self.browser`` once the page has been read.

    Capabilities are built on a private copy, so the shared
    ``DesiredCapabilities.PHANTOMJS`` dict is no longer modified.

    Args:
        url: Address to load.
        xpath: Optional XPath looked up after loading; the implicit wait is
            set to ``timeout`` for that lookup. A missing element is only
            printed.
        timeout: Implicit wait used with ``xpath``.
        retry: Extra load attempts after the first one fails.
        load_images: When false, PhantomJS starts with ``--load-images=false``.
        **kwargs: Accepted and ignored.

    Returns:
        The page source, or ``None`` when the browser could not be started or
        the page is the empty document.
    """
    if self.browser:
        self.browser.quit()
        # Do not keep a reference to a browser that is already shut down.
        self.browser = None

    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities['phantomjs.page.settings.userAgent'] = self.ua if self.ua else 'Mozilla/5.0 (Windows NT 6.1; rv:42.0) Gecko/20100101 Firefox/42.0'

    service_args = []
    if not load_images:
        service_args.append('--load-images=false')
    if self.proxy:
        service_args.append('--proxy=%s' % self.proxy)

    try:
        browser = webdriver.PhantomJS(
            service_args=service_args if service_args else None,
            desired_capabilities=capabilities)
    except Exception as e:
        print(str(e))
        return None

    count = 0
    while (retry + 1) > count:
        count += 1
        try:
            browser.get(url)
            break
        except Exception as e:
            print(str(e))

    if xpath:
        browser.implicitly_wait(timeout)
        try:
            browser.find_element_by_xpath(xpath)
        except Exception as e:
            print(str(e))

    self.browser = browser
    result = browser.page_source
    return result if result != '<html><head></head><body></body></html>' else None
def get(self, url, xpath, timeout, retry, service_args, desired_capabilities):
    """Load ``url`` in a throw-away PhantomJS browser and return JSON.

    Capabilities are built on a private copy of the PhantomJS defaults, so
    options passed in one call no longer leak into the shared
    ``DesiredCapabilities.PHANTOMJS`` dict and later calls. The browser is
    always quit before returning.

    Args:
        url: Address to load.
        xpath: Optional XPath looked up after loading; the implicit wait is
            set to ``timeout`` for that lookup. A missing element is only
            printed.
        timeout: Implicit wait used with ``xpath``.
        retry: Extra load attempts after the first one fails.
        service_args: JSON-encoded list of PhantomJS command-line arguments,
            or a falsy value for none.
        desired_capabilities: JSON-encoded dict of extra capabilities, or a
            falsy value for none.

    Returns:
        A JSON string with ``cookies`` and ``text`` keys, or ``''`` when the
        page is the empty document or any error occurs (the error is printed).
    """
    browser = None
    try:
        capabilities = dict(DesiredCapabilities.PHANTOMJS)
        if desired_capabilities:
            capabilities.update(json.loads(desired_capabilities))
        browser = webdriver.PhantomJS(
            service_args=json.loads(service_args) if service_args else None,
            desired_capabilities=capabilities)

        count = 0
        while (retry + 1) > count:
            count += 1
            try:
                browser.get(url)
                break
            except Exception as e:
                print(str(e))

        if xpath:
            browser.implicitly_wait(timeout)
            try:
                browser.find_element_by_xpath(xpath)
            except Exception as e:
                print(str(e))

        text = browser.page_source
        if text == '<html><head></head><body></body></html>':
            return ''

        result = dict()
        result['cookies'] = browser.get_cookies()
        # NOTE(review): on Python 3 encode() yields bytes, which json.dumps
        # rejects -- confirm the target interpreter before relying on this.
        result['text'] = text.encode('utf-8')
        return json.dumps(result)
    except Exception as e:
        print(str(e))
        return ''
    finally:
        # Single cleanup point for every exit path.
        if browser:
            browser.quit()
def download_articles_ph(self, url):
    """Download a page with PhantomJS and return its HTML.

    Images are not loaded, the page-load timeout is 30 and the source is read
    one second after loading. Failures are printed together with a timestamp
    and the URL.

    :param url: address to download; ``None`` is returned for ``None``
    :return: the page source, or ``None`` when the driver could not be
        started or the page could not be loaded
    """
    if url is None:
        return None

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = UA
    dcap["takesScreenshot"] = False

    try:
        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=['--load-images=no'])
    except Exception as e:
        print(datetime.datetime.now())
        print(url)
        print(e)
        return None

    try:
        driver.set_page_load_timeout(30)
        driver.get(url)
        time.sleep(1)
        return driver.page_source
    except Exception as e:
        # Narrowed from a bare except so Ctrl-C still works; the error is
        # now printed like the start-up failure above.
        print(datetime.datetime.now())
        print(url)
        print(e)
        return None
    finally:
        driver.quit()
def maintain_cookies_ph(self):
    """Collect five cookie sets from http://weixin.sogou.com/.

    A new PhantomJS browser (images disabled, user agent ``UA``) is started
    for each visit so that every visit yields an independent set of cookies.

    Returns:
        list: five entries, each the result of ``driver.get_cookies()``.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = UA
    cookie = []
    for _ in range(5):
        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=['--load-images=no', ])
        try:
            driver.get("http://weixin.sogou.com/")
            cookie.append(driver.get_cookies())
        finally:
            # Do not leave a PhantomJS process behind when the visit fails.
            driver.quit()
    return cookie
def host_worker(hostQueue, fileQueue, timeout, user_agent, verbose):
    """Screenshot every host taken from ``hostQueue`` with one PhantomJS.

    Hosts that already start with ``http://`` or ``https://`` are fetched
    as-is; for any other host both schemes are tried. A successful screenshot
    is saved under ``output/images/<uuid>.png`` and reported on ``fileQueue``
    as ``{url: filename}``. The browser is quit when the queue is drained or
    an error ends the worker.

    Args:
        hostQueue: Queue of host names or URLs to process.
        fileQueue: Queue receiving ``{url: filename}`` for each saved image.
        timeout: Page-load timeout; also passed to ``host_reachable``.
        user_agent: User agent string for PhantomJS.
        verbose: When true, progress and failures are printed.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = user_agent
    dcap["accept_untrusted_certs"] = True
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'],
                                 desired_capabilities=dcap)

    def capture(target):
        # Screenshot one URL and report the file, or log why it was skipped.
        filename = os.path.join("output", "images", str(uuid4()) + ".png")
        if verbose:
            print("Fetching %s" % target)
        if host_reachable(target, timeout) and save_image(target, filename, driver):
            fileQueue.put({target: filename})
        elif verbose:
            print("%s is unreachable or timed out" % target)

    try:
        driver.set_window_size(1024, 768)
        driver.set_page_load_timeout(timeout)
        while not hostQueue.empty():
            host = hostQueue.get()
            if host.startswith(("http://", "https://")):
                capture(host)
            else:
                capture("http://" + host)
                capture("https://" + host)
    finally:
        # The worker owns the browser, so it must also shut it down.
        driver.quit()
def __init__(self, user_agent=None, cookies_file=None):
    """Initialize the PhantomJS selenium driver.

    Images are not loaded, web security is turned off and local-to-remote
    URL access is allowed (see
    http://phantomjs.org/api/webpage/property/settings.html). The executable
    path comes from ``config['general']['phantomjs']``. Cookies are loaded
    through ``self.load_cookies()`` before the 30-second implicit wait and
    the 1024x768 window size are applied.

    :param user_agent: optional user agent string for the browser
    :param cookies_file: stored on ``self.cookies_file``
    """
    self.conf = config
    self.user_agent = user_agent
    self.cookies_file = cookies_file

    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities.update({
        'phantomjs.page.settings.loadImages': False,
        'phantomjs.page.settings.webSecurityEnabled': False,
        'phantomjs.page.settings.localToRemoteUrlAccessEnabled': True,
    })
    if user_agent:
        capabilities['phantomjs.page.settings.userAgent'] = user_agent

    self.driver = webdriver.PhantomJS(
        desired_capabilities=capabilities,
        executable_path=self.conf['general']['phantomjs'],
    )
    self.load_cookies()

    browser = self.driver
    browser.implicitly_wait(30)
    browser.set_window_size(1024, 768)
def handle_phantomjs(self):
    """Create a PhantomJS driver that goes through the configured proxy.

    Proxy host, port, user and password are read as ``key=value`` lines from
    ``..//..//abuyun.conf``. Spaces are removed from every line; blank lines
    and lines without ``=`` are skipped, and a value may itself contain
    ``=``. The user agent comes from ``self.rad_ua()``.

    :return: driver
    """
    conf = {}
    try:
        for line in fileinput.input("..//..//abuyun.conf"):
            entry = line.replace(' ', '').replace('\n', '')
            if "=" not in entry:
                continue
            # Split on the first '=' only, so values keep any later '='.
            key, value = entry.split("=", 1)
            conf[key] = value
    finally:
        # Reset fileinput's module-level state even when reading fails, so
        # the next fileinput.input() call is not rejected as already active.
        fileinput.close()
    print('??')

    proxyHost = conf["proxyHost"]
    proxyPort = conf["proxyPort"]
    proxyUser = conf["proxyUser"]
    proxyPass = conf["proxyPass"]
    service_args = [
        "--proxy-type=http",
        "--proxy=%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
        },
        "--proxy-auth=%(user)s:%(pass)s" % {
            "user": proxyUser,
            "pass": proxyPass,
        },
    ]

    phantomjs_path = r"phantomjs"
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    ua = self.rad_ua()
    dcap["phantomjs.page.settings.userAgent"] = ua
    driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                 executable_path=phantomjs_path,
                                 service_args=service_args)
    return driver
def __init__(self, executable_path="phantomjs", port=0,
             desired_capabilities=DesiredCapabilities.PHANTOMJS,
             service_args=None, service_log_path=None):
    """Start the PhantomJS / GhostDriver service and create the driver on it.

    :Args:
     - executable_path - where the PhantomJS executable lives; the default
       assumes it can be found on $PATH.
     - port - port to run the service on; with 0 a free port is found.
     - desired_capabilities - dictionary of non-browser-specific
       capabilities, such as "proxy" or "loggingPref".
     - service_args - list of command-line arguments handed to PhantomJS.
     - service_log_path - path for the PhantomJS service log.
    """
    driver_service = Service(
        executable_path,
        port=port,
        service_args=service_args,
        log_path=service_log_path)
    self.service = driver_service
    driver_service.start()

    try:
        RemoteWebDriver.__init__(
            self,
            command_executor=driver_service.service_url,
            desired_capabilities=desired_capabilities)
    except BaseException:
        # Tear down whatever was started before propagating the failure.
        self.quit()
        raise

    self._is_remote = False
def get_driver_phantomjs():
    """Return a PhantomJS driver with a randomly chosen User-Agent.

    The User-Agent is picked from
    ``RotateUserAgentMiddleware.user_agent_list``; page-load and script
    timeouts are both set to ``TIMEOUT``. Proxy support through
    ``webdriver.Proxy`` is not enabled here.

    References:
    PhantomJS:
    1. http://smilejay.com/2013/12/set-user-agent-for-phantomjs/
    2. [Selenium 2 - Setting user agent for IE and Chrome](http://stackoverflow.com/questions/6940477/selenium-2-setting-user-agent-for-ie-and-chrome)
    """
    capabilities = dict(DesiredCapabilities.PHANTOMJS)

    chosen_agent = random.choice(RotateUserAgentMiddleware.user_agent_list)
    if chosen_agent:
        print("Current User-Agent is:", chosen_agent)
        capabilities["phantomjs.page.settings.userAgent"] = chosen_agent

    browser = webdriver.PhantomJS(
        executable_path=r"/home/lxw/Downloads/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
        desired_capabilities=capabilities)

    # Bound both page loads and injected scripts.
    for apply_timeout in (browser.set_page_load_timeout,
                          browser.set_script_timeout):
        apply_timeout(TIMEOUT)
    return browser
def createHeadlessBrowser(proxy=None, XResolution=1024, YResolution=768):
    """Create a PhantomJS browser with a desktop Chrome user agent.

    Args:
        proxy: Proxy address for ``--proxy``; ``None`` means no proxy. With a
            proxy, SSL errors are ignored and web security is disabled too.
        XResolution: Window width.
        YResolution: Window height.

    Returns:
        The driver, with the window size and a page-load timeout of 20 set.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.86 Safari/537.36')

    driver_kwargs = {'desired_capabilities': dcap}
    # Identity test: None is a singleton and must not go through __ne__.
    if proxy is not None:
        driver_kwargs['service_args'] = [
            '--proxy={}'.format(proxy),
            '--proxy-type=https',
            '--ignore-ssl-errors=true',
            '--ssl-protocol=any',
            '--web-security=false',
        ]
    driver = webdriver.PhantomJS(**driver_kwargs)

    driver.set_window_size(XResolution, YResolution)
    driver.set_page_load_timeout(20)
    return driver
def _init_browser(self):
    """Set up a selenium PhantomJS browser.

    A user agent is picked at random from a fixed list and the PhantomJS
    binary at random from ``config.PHANTOM_PATH``; ``self.service_args`` is
    passed through to PhantomJS.

    Returns:
        The browser with a page-load timeout of 45, or ``None`` when it could
        not be created (the reason is logged).
    """
    uas = [
        "Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20100101 Firefox/31.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36",
    ]
    ua = random.choice(uas)

    # One of the configured PhantomJS binaries.
    phantoms = config.PHANTOM_PATH
    phantompath = random.choice(phantoms)

    dc = dict(DesiredCapabilities.PHANTOMJS)
    dc["phantomjs.page.settings.userAgent"] = ua

    try:
        browser = webdriver.PhantomJS(
            phantompath,
            service_args=self.service_args,
            desired_capabilities=dc
        )
    except WebDriverException as err:
        logging.error("Could not create browser. Check path")
        logging.error(err)
        return None
    except Exception:
        # Narrowed from a bare except: KeyboardInterrupt and SystemExit must
        # not be turned into a quiet ``None``.
        logging.error("Major problem with webdriver. "
                      "Could be related to performance."
                      "Decrease the number of threads.")
        return None

    browser.set_page_load_timeout(45)
    return browser
def download(self, link, name, url):
    """Download ``link`` with PhantomJS, watching for a verification page.

    The user agent and the Cookie header are picked at random from
    ``self.agents`` and ``self.cookie``; images are not loaded. When the
    loaded page contains an element with class ``page_verify``,
    ``self.ocr4wechat(link)`` is called and the download counts as failed.
    Every failure appends ``name`` to ``list_error.txt``.

    :param link: address that is actually loaded
    :param name: identifier written to ``list_error.txt`` on failure
    :param url: address printed in error output
    :return: ``(link, html)`` on success, otherwise ``None``
    """
    def record_failure():
        # NOTE(review): writing encoded bytes to a text-mode file only works
        # on Python 2 -- confirm the target interpreter.
        with open(r'list_error.txt', 'a') as f:
            f.write(name.encode('utf-8'))
            f.write('\n')

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = random.choice(self.agents)
    dcap["takesScreenshot"] = False
    dcap["phantomjs.page.customHeaders.Cookie"] = random.choice(self.cookie)

    try:
        driver1 = webdriver.PhantomJS(desired_capabilities=dcap,
                                      service_args=['--load-images=no', ])
    except Exception as e:
        record_failure()
        print(datetime.datetime.now())
        print(url)
        print(e)
        return None

    try:
        driver1.set_page_load_timeout(20)
        driver1.get(link)
        try:
            driver1.find_element_by_class_name('page_verify')
        except Exception:
            # Element absent: an ordinary page. (Narrowed from a bare except
            # so Ctrl-C is not mistaken for "no verification needed".)
            needs_verify = False
        else:
            needs_verify = True

        if not needs_verify:
            html = driver1.page_source
            return link, html

        print('page needs verify, stop the program')
        print('the last weixinNUM is %s\n' % name)
        self.ocr4wechat(link)
        time.sleep(5)
        record_failure()
        return None
    except Exception as e:
        record_failure()
        print(url)
        print(datetime.datetime.now())
        print(e)
        return None
    finally:
        driver1.quit()
def downloader_html_ph(url, up_num): ##??PhantomJS?????? ''' url :??????url up_num :????? ''' # print driver.service print '????????! URL?', url, ' ?????:', up_num conf = {} for line in fileinput.input("..//..//abuyun.conf"): lines = line.replace(' ', '').replace('\n', '').split("=") conf[lines[0]] = lines[1] # ????? proxyHost = conf["proxyHost"] proxyPort = conf["proxyPort"] # ??????????? proxyUser = conf["proxyUser"] proxyPass = conf["proxyPass"] service_args = [ "--proxy-type=http", "--proxy=%(host)s:%(port)s" % { "host": proxyHost, "port": proxyPort, }, "--proxy-auth=%(user)s:%(pass)s" % { "user": proxyUser, "pass": proxyPass, }, ] phantomjs_path = r"phantomjs" dcap = dict(DesiredCapabilities.PHANTOMJS) # ?????UA?????????? ua = rad_ua() ##?????UA dcap["phantomjs.page.settings.userAgent"] = ua # ,service_args=service_args ????? driver = webdriver.PhantomJS(desired_capabilities=dcap, executable_path=phantomjs_path) driver.get(url) time.sleep(2) ##??????????????? dian = '' print '?????', for i in range(up_num): driver.execute_script('window.scrollTo(0,document.body.scrollHeight)') ##?????????????? time.sleep(2) dian = dian + '.' print '.', print driver.current_url, '?????????????' data = driver.page_source.encode("utf-8") # ?????? html_parser = HTMLParser.HTMLParser() data = html_parser.unescape(data) return data
def get_browser(self, browser_name):
    """Return a webdriver browser instance for ``browser_name``.

    Supported names are ``'firefox'``, ``'chrome'``, ``'chrome-headless'``
    and ``'phantomjs'``. The name is remembered on ``self._browser_name``.
    Firefox gets ``DISPLAY=:0`` exported when no display is set; PhantomJS
    uses ``self.user_agent`` and ``self._cookie_file``.

    Raises:
        SystemExit: for any other browser name.
    """
    self._browser_name = browser_name

    if browser_name == 'firefox':
        logger.debug("getting Firefox browser (local)")
        if 'DISPLAY' not in os.environ:
            logger.debug("exporting DISPLAY=:0")
            os.environ['DISPLAY'] = ":0"
        browser = webdriver.Firefox()
    elif browser_name in ('chrome', 'chrome-headless'):
        if browser_name == 'chrome':
            logger.debug("getting Chrome browser (local)")
            browser = webdriver.Chrome()
        else:
            logger.debug('getting Chrome browser (local) with --headless')
            headless_options = Options()
            headless_options.add_argument("--headless")
            browser = webdriver.Chrome(chrome_options=headless_options)
        # Both Chrome flavours share the same window size and implicit wait.
        browser.set_window_size(1920, 1080)
        browser.implicitly_wait(2)
    elif browser_name == 'phantomjs':
        logger.debug("getting PhantomJS browser (local)")
        capabilities = dict(DesiredCapabilities.PHANTOMJS)
        capabilities["phantomjs.page.settings.userAgent"] = self.user_agent
        phantom_flags = [
            '--cookies-file={c}'.format(c=self._cookie_file),
            '--ssl-protocol=any',
            '--ignore-ssl-errors=true',
            '--web-security=false',
        ]
        browser = webdriver.PhantomJS(desired_capabilities=capabilities,
                                      service_args=phantom_flags)
        browser.set_window_size(1024, 768)
    else:
        raise SystemExit(
            "ERROR: browser type must be one of 'firefox', 'chrome', "
            "'chrome-headless' or 'phantomjs', not '{b}'".format(
                b=browser_name
            )
        )

    logger.debug("returning browser")
    return browser
def selenium_request(url, isscreen=False):
    """Load ``url`` in PhantomJS and return the rendered page source.

    Args:
        url: Address of the page to load.
        isscreen: When true, take a screenshot, crop a fixed region out of
            it, submit the crop through ``RClient.create`` and type the
            returned ``'Result'`` value into the element with id ``input``
            before clicking the element with id ``bt``.

    Returns:
        ``driver.page_source`` as read after the optional step above.

    The browser is always shut down with ``quit()``, including when any step
    raises, so no PhantomJS process is left behind.
    """
    osurl = '%s/xici/validateimg/' % os.path.dirname(os.path.abspath("scrapy.cfg"))
    ua_list = [
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/48.0.2564.82 Chrome/48.0.2564.82 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    ]
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.resourceTimeout"] = 15
    dcap["phantomjs.page.settings.loadImages"] = True
    dcap["phantomjs.page.settings.userAgent"] = choice(ua_list)

    driver = webdriver.PhantomJS(executable_path='/Users/felixchan/Tool/phantomjs',
                                 desired_capabilities=dcap)
    try:
        driver.get(url)
        if isscreen:
            stamp = int(time.time())
            imgURL = '%s%s.png' % (osurl, stamp)
            uploadimg = '%s%s_2.png' % (osurl, stamp)
            driver.save_screenshot(imgURL)
            time.sleep(1)

            ocr = RClient(VALIDATE['username'], VALIDATE['password'],
                          VALIDATE['soft_id'], VALIDATE['soft_key'])
            # Fixed pixel box cut out of the screenshot.
            left, top, right, bottom = 260, 12, 396, 70
            im = Image.open(imgURL)
            im = im.crop((left, top, right, bottom))
            im.save(uploadimg)
            with open(uploadimg, 'rb') as cropped:
                ims = cropped.read()

            post_result = ocr.create(uploadimg, ims, 3040)
            varidate_code = post_result['Result']
            print(post_result)

            elem = driver.find_element_by_id('input')
            elem.send_keys(varidate_code)
            driver.find_element_by_id('bt').click()
            # NOTE(review): the source's indentation was lost; the refresh is
            # assumed to belong to this branch -- confirm.
            driver.refresh()
        driver.implicitly_wait(2)
        time.sleep(1)
        return driver.page_source
    finally:
        # quit() (not close()) also stops the PhantomJS process.
        driver.quit()
def __init__(self,
             url="http://www.gsxt.gov.cn/index.html",
             search_text=u"????????????",
             input_id='keyword',
             search_element_id='btn_query',
             gt_element_class_name='gt_box',
             gt_slider_knob_name='gt_slider_knob',
             result_numbers_xpath='/html/body/div[2]/div[3]/div[1]/span',
             result_list_verify_id=None,
             result_list_verify_class=None,
             is_gap_every_broad=True,
             chromedriver_path="/home/hee/driver/chromedriver"):
    """Store the page/selector settings and start a Chrome WebDriver.

    Every argument except ``chromedriver_path`` is copied unchanged onto the
    attribute of the same name; how the attributes are used is not visible
    in this method, so the descriptions below are assumptions to confirm
    against the rest of the class.

    Args:
        url: Page address (presumably the search page to open).
        search_text: Text value (presumably the query to type).
        input_id: Element id (presumably of the search input box).
        search_element_id: Element id (presumably of the search button).
        gt_element_class_name: CSS class (presumably of the captcha box).
        gt_slider_knob_name: CSS class (presumably of the slider knob).
        result_numbers_xpath: XPath (presumably of the result-count node).
        result_list_verify_id: Optional element id, ``None`` by default.
        result_list_verify_class: Optional CSS class, ``None`` by default.
        is_gap_every_broad: Boolean flag, ``True`` by default.
        chromedriver_path: Location of the chromedriver executable handed to
            ``webdriver.Chrome``. Defaults to the previously hard-coded path.
    """
    self.url = url
    self.search_text = search_text
    self.input_id = input_id
    self.search_element_id = search_element_id
    self.gt_element_class_name = gt_element_class_name
    self.gt_slider_knob_name = gt_slider_knob_name
    self.result_numbers_xpath = result_numbers_xpath
    self.result_list_verify_id = result_list_verify_id
    self.result_list_verify_class = result_list_verify_class
    self.is_gap_every_broad = is_gap_every_broad

    self.driver = webdriver.Chrome(chromedriver_path)
    # Random 2-3 second pause right after the browser starts.
    time.sleep(random.uniform(2.0, 3.0))
def __init__(self,
             url="http://www.gsxt.gov.cn/index.html",
             search_text=u"????????????",
             input_id='keyword',
             search_element_id='btn_query',
             gt_element_class_name='gt_box',
             gt_slider_knob_name='gt_slider_knob',
             result_numbers_xpath='/html/body/div[2]/div[3]/div[1]/span',
             result_list_verify_id=None,
             result_list_verify_class=None,
             is_gap_every_broad=True,
             chromedriver_path=r"/home/lxw/Software/chromedirver_selenium/chromedriver"):
    """Store the page/selector settings and start a Chrome WebDriver.

    Every argument except ``chromedriver_path`` is copied unchanged onto the
    attribute of the same name; how the attributes are used is not visible
    in this method, so the descriptions below are assumptions to confirm
    against the rest of the class.

    Args:
        url: Page address (presumably the search page to open).
        search_text: Text value (presumably the query to type).
        input_id: Element id (presumably of the search input box).
        search_element_id: Element id (presumably of the search button).
        gt_element_class_name: CSS class (presumably of the captcha box).
        gt_slider_knob_name: CSS class (presumably of the slider knob).
        result_numbers_xpath: XPath (presumably of the result-count node).
        result_list_verify_id: Optional element id, ``None`` by default.
        result_list_verify_class: Optional CSS class, ``None`` by default.
        is_gap_every_broad: Boolean flag, ``True`` by default.
        chromedriver_path: Location of the chromedriver executable handed to
            ``webdriver.Chrome``. Defaults to the previously hard-coded path,
            so other machines can pass their own instead of editing the code.
    """
    self.url = url
    self.search_text = search_text
    self.input_id = input_id
    self.search_element_id = search_element_id
    self.gt_element_class_name = gt_element_class_name
    self.gt_slider_knob_name = gt_slider_knob_name
    self.result_numbers_xpath = result_numbers_xpath
    self.result_list_verify_id = result_list_verify_id
    self.result_list_verify_class = result_list_verify_class
    self.is_gap_every_broad = is_gap_every_broad

    self.driver = webdriver.Chrome(chromedriver_path)
    # Random 2-3 second pause right after the browser starts.
    time.sleep(random.uniform(2.0, 3.0))
def __init__(self,
             url="http://www.gsxt.gov.cn/index.html",
             search_text=u"????????????",
             input_id='keyword',
             search_element_id='btn_query',
             gt_element_class_name='gt_box',
             gt_slider_knob_name='gt_slider_knob',
             result_numbers_xpath='/html/body/div[2]/div[3]/div[1]/span',
             result_list_verify_id=None,
             result_list_verify_class=None,
             is_gap_every_broad=True,
             chromedriver_path="/home/hee/driver/chromedriver"):
    """Store the page/selector settings and start a Chrome WebDriver.

    Every argument except ``chromedriver_path`` is copied unchanged onto the
    attribute of the same name; how the attributes are used is not visible
    in this method, so the descriptions below are assumptions to confirm
    against the rest of the class.

    Args:
        url: Page address (presumably the search page to open).
        search_text: Text value (presumably the query to type).
        input_id: Element id (presumably of the search input box).
        search_element_id: Element id (presumably of the search button).
        gt_element_class_name: CSS class (presumably of the captcha box).
        gt_slider_knob_name: CSS class (presumably of the slider knob).
        result_numbers_xpath: XPath (presumably of the result-count node).
        result_list_verify_id: Optional element id, ``None`` by default.
        result_list_verify_class: Optional CSS class, ``None`` by default.
        is_gap_every_broad: Boolean flag, ``True`` by default.
        chromedriver_path: Location of the chromedriver executable handed to
            ``webdriver.Chrome``. Defaults to the previously hard-coded path,
            so each developer can pass a local path instead of toggling
            commented-out lines.
    """
    self.url = url
    self.search_text = search_text
    self.input_id = input_id
    self.search_element_id = search_element_id
    self.gt_element_class_name = gt_element_class_name
    self.gt_slider_knob_name = gt_slider_knob_name
    self.result_numbers_xpath = result_numbers_xpath
    self.result_list_verify_id = result_list_verify_id
    self.result_list_verify_class = result_list_verify_class
    self.is_gap_every_broad = is_gap_every_broad

    self.driver = webdriver.Chrome(chromedriver_path)
    # Random 2-3 second pause right after the browser starts.
    time.sleep(random.uniform(2.0, 3.0))
def findTrip():
    """Fetch a fixed Ctrip flight-list page in Firefox and extract its rows.

    Returns:
        A list with one dict per entry under ``div#J_flightlist2`` that is
        not marked as a train connection. Each dict has the keys
        ``company``, ``flight_time`` (``[from, to]``), ``airports``
        (``[from, to]``) and ``price``; every value is the raw list of text
        nodes returned by the corresponding XPath query. The list is also
        printed before it is returned.

    The browser is shut down with ``quit()`` as soon as the page source has
    been read, including when loading the page raises.
    """
    url = "http://flights.ctrip.com/booking/XMN-BJS-day-1.html?DDate1=2016-04-18"

    driver = webdriver.Firefox()
    try:
        driver.get(url)
        driver.implicitly_wait(3)
        time.sleep(5)  # fixed pause before the page source is read
        page = driver.page_source
    finally:
        driver.quit()

    html = etree.HTML(page)
    fligint_div = "//div[@id='J_flightlist2']/div"
    entry_count = len(html.xpath(fligint_div))

    detail = []
    # XPath positions are 1-based, hence the range starting at 1.
    for position in range(1, entry_count + 1):
        flight_tr = fligint_div + '[' + str(position) + ']' + '//tr'

        if html.xpath(flight_tr + "//div[@class='train_flight_tit']"):
            continue  # entries carrying the train marker are skipped

        company = html.xpath(flight_tr + "//div[@class='info-flight J_flight_no']//text()")
        flight_time_from = html.xpath(flight_tr + "//td[@class='right']/div[1]//text()")
        flight_time_to = html.xpath(flight_tr + "//td[@class='left']/div[1]//text()")
        airports_from = html.xpath(flight_tr + "//td[@class='right']/div[2]//text()")
        airports_to = html.xpath(flight_tr + "//td[@class='left']/div[2]//text()")
        price = html.xpath(flight_tr + "[1]//td[@class='price middle ']/span//text()")

        detail.append(dict(
            company=company,
            flight_time=[flight_time_from, flight_time_to],
            airports=[airports_from, airports_to],
            price=price,
        ))

    print(detail)
    return detail