
app.py

import os
import time
import datetime
from collections import namedtuple
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementNotInteractableException, NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd

# spoof a regular desktop Firefox user agent so Craigslist serves the normal page
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0'
# geckodriver.exe is expected in the current working directory
firefox_driver_path = os.path.join(os.getcwd(), 'geckodriver.exe')
firefox_service = Service(firefox_driver_path)
firefox_option = Options()
firefox_option.set_preference('general.useragent.override', user_agent)
browser = webdriver.Firefox(service=firefox_service, options=firefox_option)
# wait up to 7 seconds for elements to appear before raising NoSuchElementException
browser.implicitly_wait(7)
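# implicitly_wait() sets one global timeout for every find_element call. If a
# single step needs a longer or more targeted wait, Selenium's explicit waits
# are an alternative (a minimal sketch; the 10-second timeout and the 'query'
# target are illustrative assumptions):
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, 'query')))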

url = 'https://chicago.craigslist.org/'
browser.get(url)

# click a hyperlink
for_sale_element = browser.find_element(By.XPATH, "//a[@data-alltitle='all for sale']")
# print(for_sale_element.text)
# print(for_sale_element.location)
# print(for_sale_element.is_enabled())
for_sale_element.click()
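# the same link could also be located by its text instead of an attribute
# (assuming the visible link text contains 'for sale'):
# for_sale_element = browser.find_element(By.PARTIAL_LINK_TEXT, 'for sale')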

# # select from dropdown (by displayed text)
# dropdown_neighborhood = browser.find_element(By.ID, 'subArea')
# select_neighborhood = Select(dropdown_neighborhood)
# select_neighborhood.select_by_visible_text('city of chicago')

# # select from dropdown (by index position)
# dropdown_neighborhood = browser.find_element(By.ID, 'subArea')
# select_neighborhood = Select(dropdown_neighborhood)
# select_neighborhood.select_by_index(5)

# select from dropdown (by option value)
dropdown_neighborhood = browser.find_element(By.ID, 'subArea')
select_neighborhood = Select(dropdown_neighborhood)
select_neighborhood.select_by_value('chc')

# select the "computers" category (by displayed text)
dropdown_category = browser.find_element(By.ID, 'subcatAbb')
select_category = Select(dropdown_category)
select_category.select_by_visible_text('computers')
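# if you are unsure which values a dropdown accepts, the Select object can
# list them first, e.g.:
# for option in select_category.options:
#     print(option.get_attribute('value'), option.text)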

# search query (enter on input)
search_query = 'laptop'
search_field = browser.find_element(By.ID, 'query')
search_field.clear()
search_field.send_keys(search_query)
search_field.send_keys(Keys.ENTER)
time.sleep(1)
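# pressing ENTER inside the input submits the search; calling submit() on the
# element is an equivalent alternative when the field sits inside a <form>:
# search_field.submit()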

# store listings into a CSV and Excel file
posts_html = []
to_stop = False

while not to_stop:
    # grab the current results page and parse its HTML with BeautifulSoup
    search_results = browser.find_element(By.ID, 'search-results')
    soup = BeautifulSoup(search_results.get_attribute('innerHTML'), 'html.parser')
    posts_html.extend(soup.find_all('li', {'class': 'result-row'}))

    try:
        # scroll back to the top where the pagination buttons live
        browser.execute_script('window.scrollTo(0, 0)')
        button_next = browser.find_element(By.XPATH, '//a[@class="button next"]')
        button_next.click()
        time.sleep(0.5)
    except (ElementNotInteractableException, NoSuchElementException):
        # the "next" button is disabled (or gone) on the last page of results
        to_stop = True

print(f'Collected {len(posts_html)} listings')
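
# BeautifulSoup parses each page's HTML here for convenience; the same rows
# could be collected with Selenium alone, e.g.:
# rows = browser.find_elements(By.CSS_SELECTOR, '#search-results li.result-row')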
    
# clean up & organize records
CraigslistPost = namedtuple('CraigslistPost', ['title', 'price', 'post_timestamp', 'location', 'post_url', 'image_url'])
craigslist_posts = []
for post_html in posts_html:
    title = post_html.find('a', 'result-title').text
    # price, location, and image are optional on Craigslist, so guard against missing tags
    price_tag = post_html.find('span', 'result-price')
    price = price_tag.text if price_tag else ''
    post_timestamp = post_html.find('time').get('datetime')
    hood_tag = post_html.find('span', 'result-hood')
    location = hood_tag.text.replace('(', '').replace(')', '').strip() if hood_tag else ''
    post_url = post_html.find('a', 'result-title').get('href')
    image_url = post_html.find('img').get('src') if post_html.find('img') else ''
    craigslist_posts.append(CraigslistPost(title, price, post_timestamp, location, post_url, image_url))
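
# quick sanity check on the first parsed record (optional):
# if craigslist_posts:
#     print(craigslist_posts[0])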

# pandas takes the column names straight from the CraigslistPost fields
df = pd.DataFrame(craigslist_posts)
df.to_csv(f'{search_query} ({datetime.datetime.now().strftime("%Y_%m_%d %H_%M_%S")}).csv', index=False)

# add a clickable link column using Excel's HYPERLINK formula
df['link'] = df.apply(lambda row: f'=HYPERLINK("{row["post_url"]}","Link")', axis=1)
# writing .xlsx requires an Excel engine such as openpyxl (pip install openpyxl)
df.to_excel(f'{search_query} ({datetime.datetime.now().strftime("%Y_%m_%d %H_%M_%S")}).xlsx', index=False)
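
# the scraped price column holds strings like '$250'; for numeric sorting in
# the sheet, a derived column is one option (a sketch; 'price_num' is an
# illustrative name):
# df['price_num'] = pd.to_numeric(df['price'].str.replace('[$,]', '', regex=True), errors='coerce')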

# quit() ends the WebDriver session and shuts down geckodriver; close() alone
# would only close the window and leave the driver process running
browser.quit()