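# Scrape job listings (title, location, type, URL, application link) from the
# CMR Surgical job search page and save them to a CSV.
# Requires: pip install selenium webdriver-manager pandas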
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
# Set up Chrome options
chrome_options = Options()
# chrome_options.add_argument("--headless") # Run in headless mode (no browser UI)
chrome_options.add_argument("--disable-gpu")
# Set up the Chrome WebDriver using webdriver_manager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
# Open the website
driver.get("https://cmrsurgical.com/job-search")
# Wait for the job listing to load using explicit wait
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.search_result_list .list-item a.search-item')))
# Extract job titles, locations, job types, and URLs
jobs = driver.find_elements(By.CSS_SELECTOR, '.search_result_list .list-item a.search-item')
job_data = []
for job in jobs:
    title = job.find_element(By.CLASS_NAME, 'search-item__title').text
    location = job.find_element(By.CLASS_NAME, 'search-item__location').text
    job_type = job.find_element(By.CLASS_NAME, 'search-item__discipline').text
    url = job.get_attribute('href')
    job_data.append({
        'Job Title': title,
        'Location': location,
        'Job Type': job_type,
        'URL': url,
    })
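# Second pass: visit each job page and merge the extra details back into job_data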
for i in range(len(job_data)):
    try:
        # Open each job page
        driver.get(job_data[i]['URL'])
        # Wait for the required sections to load and scrape the details
        try:
            application_url = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.apply-cta-container a'))).get_attribute('href')
        except TimeoutException:
            application_url = 'Not available'
        try:
            responsibilities = driver.find_element(By.XPATH, "//h3[text()='Responsibilities']/following-sibling::ul").text
        except NoSuchElementException:
            responsibilities = 'Not available'
        try:
            about_you = driver.find_element(By.XPATH, "//h3[text()='About you']/following-sibling::ul").text
        except NoSuchElementException:
            about_you = 'Not available'
        # Merge the scraped details into the existing record
        job_data[i] = {
            **job_data[i],
            'Application': application_url,
            'Responsibilities': responsibilities,
            'About You': about_you,
        }
    except Exception as e:
        print(e)
# Convert to DataFrame
df = pd.DataFrame(job_data)
# Save to CSV
df.to_csv('jobs_detailed.csv', index=False)
# Close the WebDriver
driver.quit()
print("Job details including application URL, responsibilities, and about you have been saved to jobs_detailed.csv.")