In this video, we are going to write a Python program to web scrape free course information from Udemy's website and save the information in a CSV file.

If you are new to web scraping or are simply looking for more practice, then you will find this tutorial useful. Before getting started, make sure you have 1) the web driver downloaded, and 2) the web scraping Python libraries installed.

Udemy’s website: https://www.udemy.com/courses/free/



Check out Kite (free AI Coding Assistant) → Link

 

Buy Me a Coffee? Your support is much appreciated!

PayPal Me: https://www.paypal.me/jiejenn/5
Venmo: @Jie-Jenn



Source Code:

import time
import pandas as pd # pip install pandas
# pip install selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup # pip install beautifulsoup4

# Sort order for the course listing (intended for the URL's `sort` query parameter).
sort_by_type = 'newest'

# Path to the Firefox WebDriver binary (geckodriver), expected alongside this script.
firefox_driver_path = 'geckodriver.exe'
# Maximum number of seconds WebDriverWait polls for the course list to appear.
delay = 15

# NOTE(review): `executable_path` was removed in Selenium 4 (replaced by a
# Service object) — confirm the installed Selenium version supports this call.
driver = webdriver.Firefox(executable_path=firefox_driver_path)

def extract_text(soup_obj, tag, attribute_name, attribute_value):
    """Return the stripped text of the first matching element, or '' if absent.

    Args:
        soup_obj: a BeautifulSoup tag/soup object exposing ``.find()``.
        tag: tag name to search for (e.g. 'span').
        attribute_name: attribute to filter on (e.g. 'class', 'data-purpose').
        attribute_value: required value of that attribute.

    Returns:
        The element's whitespace-stripped text, or an empty string when no
        matching element exists.
    """
    # Look the element up once — the original called find() twice with the
    # same arguments (once for the truthiness check, once for the text).
    element = soup_obj.find(tag, {attribute_name: attribute_value})
    return element.text.strip() if element else ''

rows = []

# Scrape the first three pages of Udemy's free-course listing.
for page_number in range(1, 4):
    # Use the sort_by_type setting rather than a hard-coded sort value
    # (the original defined sort_by_type but never used it; its value is
    # 'newest', so the generated URL is unchanged).
    page_url = f'https://www.udemy.com/courses/free/?lang=en&p={page_number}&sort={sort_by_type}'
    driver.get(page_url)
    time.sleep(5)  # give client-side rendering a head start before the explicit wait

    try:
        # Block until the course-list container appears in the DOM (up to `delay` seconds).
        WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'course-list--container--3zXPS'))
        )
    except TimeoutException:
        print('Loading exceeds delay time')
        break
    else:
        # Parse the fully rendered page with BeautifulSoup.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        course_list = soup.find('div', {'class': 'course-list--container--3zXPS'})
        courses = course_list.find_all('a', {'class': 'udlite-custom-focus-visible browse-course-card--link--3KIkQ'})

        for course in courses:
            course_url = '{0}{1}'.format('https://www.udemy.com', course['href'])
            course_title = course.select('div[class*="course-card--course-title"]')[0].text
            course_headline = extract_text(course, 'p', 'data-purpose', 'safely-set-inner-html:course-card:course-headline')
            author = extract_text(course, 'div', 'data-purpose', 'safely-set-inner-html:course-card:visible-instructors')
            course_rating = extract_text(course, 'span', 'data-purpose', 'rating-number')
            # Strip the surrounding parentheses, e.g. '(1,234)' -> '1,234'.
            number_of_ratings = extract_text(course, 'span', 'class', 'udlite-text-xs course-card--reviews-text--12UpL')[1:-1]
            # Detail spans appear in order: course length, lecture count, difficulty.
            # Guard each index — the original raised IndexError on cards that
            # render fewer than three detail spans.
            course_detail = course.find_all('span', {'class': 'course-card--row--1OMjg'})
            course_length = course_detail[0].text if len(course_detail) > 0 else ''
            number_of_lectures = course_detail[1].text if len(course_detail) > 1 else ''
            difficulty = course_detail[2].text if len(course_detail) > 2 else ''

            rows.append(
                [course_url, course_title, course_headline, author, course_rating,
                 number_of_ratings, course_length, number_of_lectures, difficulty]
            )

# Build the output table and save it. Fixes two output-text typos from the
# original: the 'Difficulity' column header and the 'Udmey' output filename.
columns = ['url', 'Course Title', 'Course Headline', 'Instructor', 'Rating',
           'Number of Ratings', 'Course Length', 'Number of Lectures', 'Difficulty']
df = pd.DataFrame(data=rows, columns=columns)
df.to_csv('Udemy Free Courses.csv', index=False)
driver.quit()  # release the browser session once scraping is complete