I'm scraping a website using Selenium, Scrapy and PhantomJS. The problem with the code below is that, although it scrolls the page, it only extracts links up to a certain limit; beyond that, it ignores the results of further scrolling. When I use the Firefox webdriver it works perfectly, but since I'm running the code on a server I switched to PhantomJS and ran into this problem. Here is the code:
# -*- coding: utf-8 -*-
"""Scrapy spider that drives a headless PhantomJS browser to scroll the
dukemedicine.org doctor directory, then opens each doctor profile and
appends the scraped fields to ``doc_data_duke.csv``."""
from scrapy.spider import BaseSpider
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv
import re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait


class DukeSpider(BaseSpider):
    name = "dspider"
    allowed_domains = ["dukemedicine.org"]
    start_urls = ["http://www.dukemedicine.org/find-doctors-physicians"]

    # Matches only doctor-profile deep links on the directory page.
    # Compiled once instead of re-matching the pattern per link.
    PROFILE_RE = re.compile(
        r'https:\/\/www\.dukemedicine\.org\/find-doctors-physicians\/#!\/(.*)')

    def __init__(self):
        # PhantomJS so the spider can run headless on a server;
        # --ignore-ssl-errors avoids failures on the site's certificates.
        self.driver = webdriver.PhantomJS(
            service_args=['--ignore-ssl-errors=true'])
        self.driver.maximize_window()

    def parse(self, response):
        """Scroll the result matrix, collect profile links, scrape each one.

        Writes one CSV row per matching doctor profile to
        ``doc_data_duke.csv`` (opened in append mode).
        """
        # `with` guarantees the CSV file is closed even on an exception
        # (the original left the handle open).
        with open('doc_data_duke.csv', 'a') as out:
            writer = csv.writer(out, lineterminator='\n')
            self.driver.get(response.url)
            time.sleep(10)
            # Open the specialty filter and pick the second doctor type.
            self.driver.find_element_by_xpath(
                "//span[@id='specialty']").click()
            self.driver.find_element_by_xpath(
                "//ul[@class='doctor-type']/li[@class='ng-binding ng-scope'][2]"
            ).click()
            time.sleep(25)
            act = ActionChains(self.driver)
            act.move_to_element(
                self.driver.find_element_by_id('doctor-matrix-section')
            ).click()
            # The page lazy-loads results on scroll: page down repeatedly.
            # NOTE(review): a fixed iteration count is fragile; scrolling
            # until the page-reported total is reached is more robust.
            for _ in range(75):
                act.send_keys(Keys.PAGE_DOWN).perform()
                time.sleep(2)
            links = self.driver.find_elements_by_xpath(
                "//div[@class = 'result-information']/div[@class='name']/a")
            for link in links:
                href = link.get_attribute('href')
                if self.PROFILE_RE.match(href):
                    writer.writerow(self._scrape_profile(href))

    def _scrape_profile(self, url):
        """Open *url* in a fresh PhantomJS session and return the CSV row
        ``[title, name, speciality, language, loc_a, loc_b, loc_c]``.

        Missing elements yield empty strings rather than aborting the row.
        """
        dr = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
        dr.maximize_window()
        dr.get(url)
        try:
            # Header reads "Name, Title"; split on the first comma only.
            name_title = dr.find_element_by_xpath(
                '//div[@class="header1 ng-binding"]').text
            name, _, title = name_title.partition(',')
            title = title.strip()
        except Exception:  # header missing on some profiles
            name = ''
            title = ''
        try:
            speciality = dr.find_element_by_xpath(
                '//p[@class="specialties ng-scope"]').text
        except Exception:
            speciality = ''
        try:
            language = dr.find_element_by_xpath(
                '//div[@class="lang ng-scope"]'
                '/div[@class="plaintext inline ng-binding"]').text
        except Exception:
            language = ''
        locations = self._location_texts(
            dr.find_elements_by_xpath('//div[@class="location-info"]'))
        dr.close()
        return [title, name, speciality, language] + locations

    @staticmethod
    def _location_texts(elements):
        """Clean up to three location blocks and pad the list to length 3.

        Fixes the original copy/paste bug where, with exactly two
        locations, the second location text was derived from the first
        (``locationb = locationa.replace(...)``).
        """
        cleaned = [
            el.text.replace('directions', '').replace('\n', '')
            for el in elements[:3]
        ]
        return cleaned + [''] * (3 - len(cleaned))
No matter how high a value I set in `range`, it ignores any results beyond a certain point.
Let's use the fact that there is an element on the page holding the total result count:
The idea is to iteratively scroll the last found doctor into view until all of the doctors are loaded.
Implementation (with clarifying comments, keeping only the relevant Selenium-specific part):
# -*- coding: utf-8 -*-
"""Load every doctor result on dukemedicine.org by repeatedly scrolling
the last loaded result into view until the page-reported total count is
reached."""
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# --load-images=false skips image downloads and speeds things up
# dramatically; --ignore-ssl-errors avoids certificate failures.
driver = webdriver.PhantomJS(
    service_args=['--ignore-ssl-errors=true', '--load-images=false'])
# driver = webdriver.Chrome()
driver.maximize_window()

driver.get("http://www.dukemedicine.org/find-doctors-physicians")

# Close the optional survey popup if it exists.
try:
    driver.find_element_by_css_selector("area[alt=close]").click()
except NoSuchElementException:
    pass

# Open the specialty filter dropdown.
driver.find_element_by_id("specialty").click()

# Choose the "specialist" doctor type.
driver.find_element_by_xpath(
    "//ul[@class = 'doctor-type']/li[contains(., 'specialist')]").click()

# Artificial delay while the result matrix loads.
# TODO: replace with an explicit WebDriverWait condition.
time.sleep(15)

# Total number of doctors the page reports.
total_count = int(driver.find_element_by_id("doctor-number").text)

# Results loaded so far.
results = driver.find_elements_by_css_selector("div.doctor-result")
current_count = len(results)

# Keep scrolling until every result is loaded.  JavaScript is
# case-sensitive: the method must be spelled scrollIntoView().
while current_count < total_count:
    driver.execute_script("arguments[0].scrollIntoView();", results[-1])
    results = driver.find_elements_by_css_selector("div.doctor-result")
    current_count = len(results)
    print("current results count: %d" % current_count)

# Report the final total.
print("----")
print("total results loaded: %d" % current_count)
driver.quit()
This works for me in both PhantomJS and Chrome. Here is what I get on the console:
current results count: 36 current results count: 54 current results count: 72 current results count: 90 ... current results count: 1656 current results count: 1674 current results count: 1692 current results count: 1708 ---- total results loaded: 1708
Additionally, note that I've added the `--load-images=false` command-line argument, which speeds things up dramatically.
Comments
Post a Comment