app.py
import os
import time
import datetime
from collections import namedtuple
import selenium.webdriver as webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementNotInteractableException, NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0'
firefox_driver_path = os.path.join(os.getcwd(), 'geckodriver.exe')
firefox_service = Service(firefox_driver_path)
firefox_option = Options()
firefox_option.set_preference('general.useragent.override', user_agent)
browser = webdriver.Firefox(service=firefox_service, options=firefox_option)
browser.implicitly_wait(7)
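# Note: the implicit wait above applies to every find_element call. An explicit
# wait on a specific condition is often more precise; a minimal sketch using
# the 'search-results' container this script scrapes later:
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# WebDriverWait(browser, 7).until(EC.presence_of_element_located((By.ID, 'search-results')))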
url = 'https://chicago.craigslist.org/'
browser.get(url)
# click a hyperlink
for_sale_element = browser.find_element(By.XPATH, "//a[@data-alltitle='all for sale']")
# print(for_sale_element.text)
# print(for_sale_element.location)
# print(for_sale_element.is_enabled())
for_sale_element.click()
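# If the click is ever intercepted by an overlay, a JavaScript click is a
# common fallback (sketch; not needed in the normal flow):
# browser.execute_script('arguments[0].click();', for_sale_element)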
# # select from dropdown (by displayed text)
# dropdown_neighborhood = browser.find_element(By.ID, 'subArea')
# select_neighborhood = Select(dropdown_neighborhood)
# select_neighborhood.select_by_visible_text('city of chicago')
# # select from dropdown (by index position)
# dropdown_neighborhood = browser.find_element(By.ID, 'subArea')
# select_neighborhood = Select(dropdown_neighborhood)
# select_neighborhood.select_by_index(5)
# select from dropdown (by option value)
dropdown_neighborhood = browser.find_element(By.ID, 'subArea')
select_neighborhood = Select(dropdown_neighborhood)
select_neighborhood.select_by_value('chc')
# select category from dropdown (by displayed text)
dropdown_category = browser.find_element(By.ID, 'subcatAbb')
select_category = Select(dropdown_category)
select_category.select_by_visible_text('computers')
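# To inspect what a dropdown offers before choosing, Select exposes its
# options as a list of elements (sketch):
# print([option.text for option in select_category.options])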
# search query (enter on input)
search_query = 'laptop'
search_field = browser.find_element(By.ID, 'query')
search_field.clear()
search_field.send_keys(search_query)
search_field.send_keys(Keys.ENTER)
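# If ENTER doesn't trigger the search in some layouts, submitting the
# enclosing form is an alternative (sketch):
# search_field.submit()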
time.sleep(1)
# collect the raw listing HTML from every page of results
posts_html = []
to_stop = False
while not to_stop:
    search_results = browser.find_element(By.ID, 'search-results')
    soup = BeautifulSoup(search_results.get_attribute('innerHTML'), 'html.parser')
    posts_html.extend(soup.find_all('li', {'class': 'result-row'}))
    try:
        # scroll back to the top so the pagination controls are in view
        browser.execute_script('window.scrollTo(0, 0)')
        button_next = browser.find_element(By.XPATH, '//a[@class="button next"]')
        button_next.click()
        time.sleep(0.5)
    except (ElementNotInteractableException, NoSuchElementException):
        # the "next" button is disabled (or missing) on the last page
        to_stop = True
print('Collected {0} listings'.format(len(posts_html)))
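# The fixed sleeps above are a simple pacing hack. A more robust variant waits
# for the old results container to go stale after clicking "next" (sketch,
# using the WebDriverWait imports noted earlier):
# WebDriverWait(browser, 10).until(EC.staleness_of(search_results))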
# clean up & organize records
CraigslistPost = namedtuple('CraigslistPost', ['title', 'price', 'post_timestamp', 'location', 'post_url', 'image_url'])
craigslist_posts = []
for post_html in posts_html:
    title = post_html.find('a', 'result-title').text
    # price and location are optional on Craigslist rows, so guard against None
    price_tag = post_html.find('span', 'result-price')
    price = price_tag.text if price_tag else ''
    post_timestamp = post_html.find('time').get('datetime')
    hood_tag = post_html.find('span', 'result-hood')
    location = hood_tag.text.replace('(', '').replace(')', '') if hood_tag else ''
    post_url = post_html.find('a', 'result-title').get('href')
    image_url = post_html.find('img').get('src') if post_html.find('img') else ''
    craigslist_posts.append(CraigslistPost(title, price, post_timestamp, location, post_url, image_url))
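# quick sanity check, in the spirit of the commented-out prints above (sketch):
# if craigslist_posts:
#     print(craigslist_posts[0])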
# store listings into a CSV and Excel file (one shared timestamp so the names match)
df = pd.DataFrame(craigslist_posts)
timestamp = datetime.datetime.now().strftime('%Y_%m_%d %H_%M_%S')
df.to_csv(f'{search_query} ({timestamp}).csv', index=False)
# add a clickable HYPERLINK formula column for the Excel version
df['link'] = df.apply(lambda row: f'=HYPERLINK("{row["post_url"]}","Link")', axis=1)
df.to_excel(f'{search_query} ({timestamp}).xlsx', index=False)
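# Note: to_excel needs an Excel engine installed (e.g. pip install openpyxl).
# A sketch for widening the title column via pandas' ExcelWriter, assuming
# openpyxl and the default 'Sheet1' name:
# with pd.ExcelWriter(f'{search_query} ({timestamp}).xlsx', engine='openpyxl') as writer:
#     df.to_excel(writer, index=False)
#     writer.sheets['Sheet1'].column_dimensions['A'].width = 60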
browser.quit()  # quit() ends the whole session and stops geckodriver; close() would only close the window