Web scraping with python and selenium

Refresh

December 2018

Views

881 time

2

New to stack and been learning Python for a couple of months now. I am in the process of writing a script which logs on to a website (which I am a subscriber of) and scrape article titles and text.

So far I have been able to log on to the website and get to the page with the article titles, and pull the titles for the first page. However, I am having trouble cycling through the pages.

from selenium import webdriver

chrome_path = r"C:\Users\user.name\Desktop\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)

driver.get("http://www.WEBSITE.co.uk/")
driver.find_element_by_name("ctl00$LoginView1$Login1$UserName").send_keys('USERNAME')  # Enters username
driver.find_element_by_name("ctl00$LoginView1$Login1$Password").send_keys('PASSWORD')  # Enters password
driver.find_element_by_name("ctl00$LoginView1$Login1$Submit").click()  # Submits username/password
driver.find_element_by_xpath('//*[@id="middle_col"]/div[2]/div[1]/a[1]').click()  # Clicks on more articles


def title_scraper(max_pages):  # A loop to cycle through xpaths of various pages (?)
    page = 2  # Set at 2 for test circa 40 in total
    while page < max_pages:
        newPage = '//*[@id="ctl00_mainContentArea_ArticleListing1_gvwArticles"]/tbody/tr[11]/td/table/tbody/tr/td[' + str(page) + ']/a'  # xpath = //*[@id="ctl00_mainContentArea_ArticleListing1_gvwArticles"]/tbody/tr[11]/td/table/tbody/tr/td[1]/a - it is td[1] which increases depending on page number

driver.find_element_by_xpath(newPage).click()  # Scrapes article titles, currently only does the first page

titles = driver.find_elements_by_class_name("articletitle")
for title in titles:
    print(title.text)

Sorry if this has already been answered, I have had no luck with online resources so far!

Update:

def title_scraper(max_pages):
    page = 2
    while page < max_pages:
        path = '//*[@id="ctl00_mainContentArea_ArticleListing1_gvwArticles"]/tbody/tr[11]/td/table/tbody/tr/td[' + str(
            max_pages) + ']/a'
        driver.find_element_by_xpath(path)

    titles = driver.find_elements_by_class_name("articletitle")
    for title in titles:
        print(title.text)

0 answers