Python — PhantomJS not extracting links with Selenium


I'm scraping a website using Selenium, Scrapy, and PhantomJS. The problem with the code is that, although it scrolls the page, it only extracts links up to a certain limit. Beyond that, it ignores the result of scrolling. When I use the Firefox webdriver, it works perfectly. However, since I'm running the code on a server, I used PhantomJS and encountered this problem. Below is the code:

# -*- coding: utf-8 -*-
"""Scrapy spider that drives Selenium/PhantomJS to scrape doctor profiles
from dukemedicine.org and append one CSV row per doctor."""
from scrapy.spider import BaseSpider
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import time
import csv
import re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait


class DukeSpider(BaseSpider):
    name = "dspider"
    allowed_domains = ["dukemedicine.org"]
    start_urls = ["http://www.dukemedicine.org/find-doctors-physicians"]

    def __init__(self):
        # --ignore-ssl-errors lets PhantomJS load pages with bad certificates.
        self.driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
        self.driver.maximize_window()

    def parse(self, response):
        out = open('doc_data_duke.csv', 'a')
        writer = csv.writer(out, lineterminator='\n')

        self.driver.get(response.url)
        time.sleep(10)
        wait = WebDriverWait(self.driver, 10)

        # Open the "specialty" filter and pick the second doctor type.
        self.driver.find_element_by_xpath("//span[@id='specialty']").click()
        self.driver.find_element_by_xpath(
            "//ul[@class='doctor-type']/li[@class='ng-binding ng-scope'][2]").click()
        time.sleep(25)

        act = ActionChains(self.driver)
        act.move_to_element(
            self.driver.find_element_by_id('doctor-matrix-section')).click()

        # Page the result list down repeatedly so the Angular app lazy-loads
        # more doctors.  NOTE(review): a fixed iteration count is fragile --
        # comparing against the page's total-result counter is more reliable.
        for _ in range(75):
            act.send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(2)

        links = self.driver.find_elements_by_xpath(
            "//div[@class = 'result-information']/div[@class='name']/a")
        for link in links:
            doc_url = link.get_attribute('href')
            if not re.match(
                    r'https:\/\/www\.dukemedicine\.org\/find-doctors-physicians\/#!\/(.*)',
                    doc_url):
                continue

            # One fresh browser per profile page.
            dr = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
            dr.maximize_window()
            dr.get(doc_url)

            # "Name, Title" header; partition() never raises when the comma
            # is missing (the original split()[1] could IndexError).
            try:
                name_title = dr.find_element_by_xpath(
                    '//div[@class="header1 ng-binding"]').text
                name, _, title = name_title.partition(",")
                name = name.encode('utf-8')
                title = title.strip().encode('utf-8')
            except NoSuchElementException:
                name = ''
                title = ''
            try:
                speciality = dr.find_element_by_xpath(
                    '//p[@class="specialties ng-scope"]').text
            except NoSuchElementException:
                speciality = ''
            try:
                language = dr.find_element_by_xpath(
                    '//div[@class="lang ng-scope"]/div[@class="plaintext inline ng-binding"]').text
            except NoSuchElementException:
                language = ''

            # Up to three practice locations; strip the "directions" link text
            # and embedded newlines from each.  (The original two-location
            # branch had a copy/paste bug: locationb was derived from
            # locationa instead of the second location.)
            locations = dr.find_elements_by_xpath('//div[@class="location-info"]')
            cleaned = [
                loc.text.encode('utf-8').replace('directions', '').replace('\n', '')
                for loc in locations[:3]
            ]
            while len(cleaned) < 3:
                cleaned.append('')
            locationa, locationb, locationc = cleaned

            dr.close()
            data = [title, name, speciality, language,
                    locationa, locationb, locationc]
            writer.writerow(data)

        out.close()

No matter how high a value I set in `range`, it ignores results beyond a certain point.

Let's use the fact that there is an element on the page that holds the total result count:

The idea is to iteratively scroll the last found doctor into view until all the doctors are loaded.

Implementation (with clarifying comments, keeping only the relevant Selenium-specific part):

# -*- coding: utf-8 -*- import time  selenium import webdriver selenium.common.exceptions import nosuchelementexception   driver = webdriver.phantomjs(service_args=['--ignore-ssl-errors=true', '--load-images=false']) # driver = webdriver.chrome() driver.maximize_window()  driver.get("http://www.dukemedicine.org/find-doctors-physicians")  # close optional survey popup if exists try:     driver.find_element_by_css_selector("area[alt=close]").click() except nosuchelementexception:     pass  # open filter dropdown click = driver.find_element_by_id("specialty") click.click()  # choose specialist specialist = driver.find_element_by_xpath("//ul[@class = 'doctor-type']/li[contains(., 'specialist')]") specialist.click()  # artificial delay: todo: fix? time.sleep(15)  # read total results count total_count = int(driver.find_element_by_id("doctor-number").text)  # initial results count results = driver.find_elements_by_css_selector("div.doctor-result") current_count = len(results)  # iterate while of results not loaded while current_count < total_count:     driver.execute_script("arguments[0].scrollintoview();", results[-1])      results = driver.find_elements_by_css_selector("div.doctor-result")     current_count = len(results)     print "current results count: %d" % current_count  # report total results print "----" print "total results loaded: %d" % current_count  driver.quit() 

This works for me in both PhantomJS and Chrome. Here is what I get on the console:

current results count: 36 current results count: 54 current results count: 72 current results count: 90 ... current results count: 1656 current results count: 1674 current results count: 1692 current results count: 1708 ---- total results loaded: 1708 

Additionally, note that I've added the `--load-images=false` command-line argument, which speeds things up dramatically.


Comments