Last active
August 10, 2024 08:56
-
-
Save fennecinspace/9af07a9fe64074c7c317f0a8c2422c54 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Configure Chrome. Headless mode is left commented out so the browser
# window stays visible while debugging; re-enable it for unattended runs.
chrome_options = Options()
# chrome_options.add_argument("--headless")  # run without a browser UI
chrome_options.add_argument("--disable-gpu")

# Launch Chrome, letting webdriver_manager download a matching chromedriver.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Open the job-search page and block (up to 10s) until at least one
# listing card has rendered.
driver.get("https://cmrsurgical.com/job-search")
wait = WebDriverWait(driver, 10)
listing_selector = (By.CSS_SELECTOR, '.search_result_list .list-item a.search-item')
wait.until(EC.presence_of_element_located(listing_selector))

# Pull title / location / discipline / link out of every listing card.
jobs = driver.find_elements(*listing_selector)
job_data = []
for card in jobs:
    record = {
        'Job Title': card.find_element(By.CLASS_NAME, 'search-item__title').text,
        'Location': card.find_element(By.CLASS_NAME, 'search-item__location').text,
        'Job Type': card.find_element(By.CLASS_NAME, 'search-item__discipline').text,
        'URL': card.get_attribute('href'),
    }
    job_data.append(record)
# Visit each job's detail page and enrich its record with the application
# URL. The page's "Responsibilities" / "About you" sections are scraped too
# but currently left out of the output (see commented keys below).
for idx, job in enumerate(job_data):
    try:
        driver.get(job['URL'])
        # The apply link can render late; fall back to a placeholder if it
        # never appears within the wait's timeout.
        try:
            application_url = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.apply-cta-container a'))).get_attribute('href')
        except TimeoutException:
            application_url = 'Not available'
        # These sections are optional on some postings, so a missing
        # element is expected — not an error.
        try:
            responsibilities = driver.find_element(
                By.XPATH, "//h3[text()='Responsibilities']/following-sibling::ul").text
        except NoSuchElementException:
            responsibilities = 'Not available'
        try:
            about_you = driver.find_element(
                By.XPATH, "//h3[text()='About you']/following-sibling::ul").text
        except NoSuchElementException:
            about_you = 'Not available'
        # Merge the detail fields into the existing record.
        job_data[idx] = {
            **job,
            'Application': application_url,
            # 'Responsibilities': responsibilities,
            # 'About You': about_you,
        }
    except Exception as e:
        # Best-effort: one broken posting should not abort the whole run.
        print(e)
# Persist the scraped records to CSV. The try/finally guarantees the
# browser is shut down even if the DataFrame build or file write fails,
# so no orphaned Chrome process is left behind.
try:
    df = pd.DataFrame(job_data)
    df.to_csv('jobs_detailed.csv', index=False)
finally:
    driver.quit()
print("Job details including application URL, responsibilities, and about you have been saved to jobs_detailed.csv.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment