Last active
July 10, 2025 12:19
-
-
Save GorgeousOne/89fda2203740d231487b92e80fe22949 to your computer and use it in GitHub Desktop.
Summaery 2023 calendar event scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Scraper to download the Summaery events into a .ics calender file for importing it into a calender app or something''' | |
import os | |
import pytz | |
import re | |
import requests | |
from bs4 import BeautifulSoup | |
from datetime import datetime | |
from icalendar import Calendar, Event | |
def _parse_cal_datetime(text, default_date):
    '''parse one calendar info line into a naive datetime, falling back to default_date if the line has no date of its own'''
    date_match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{4})', text)
    if date_match:
        date_part = date_match.group(1)
        # cut the date out so that its "dd.mm" part cannot be mistaken for a time
        time_text = text.replace(date_part, ' ')
    else:
        date_part = default_date
        time_text = text
    time_match = re.search(r'(\d{1,2}\.\d{2})', time_text)
    if time_match is None:
        raise ValueError(f'no time found in calendar info: {text!r}')
    return datetime.strptime(date_part + ' ' + time_match.group(1), '%d.%m.%Y %H.%M')


def get_event_start_end_place(event_entry):
    '''read the start, end time and possible place from the same calendar info element

    event_entry: the parsed html element of one event
    returns: (start, end, place), start and end are datetimes localized to
        Europe/Berlin, place is the location text or None if no third info item exists
    raises: ValueError if the start or the end item contains no time
    '''
    date_string = get_event_date(event_entry)
    infos = event_entry.find(class_='cal_maininfo').find_all('li')

    # the first item holds the start and the second one the end of the event.
    # an item that carries its own date (e.g. an end on another day) keeps it,
    # otherwise the date of the event entry is used
    start = _parse_cal_datetime(infos[0].text, date_string)
    end = _parse_cal_datetime(infos[1].text, date_string)
    place = re.sub(r'Ort:', '', infos[2].text).strip() if len(infos) > 2 else None

    germany = pytz.timezone('Europe/Berlin')
    return germany.localize(start), germany.localize(end), place
def get_event_date(event_entry):
    '''build the "day.month.year" date string of an event

    day and month are read from the big date element on the left and padded
    to two characters, the year is taken from the module level year setting
    '''
    month, day = (
        event_entry.find(class_=css_class).text.strip().zfill(2)
        for css_class in ('month', 'day_of_month')
    )
    return '.'.join((day, month, str(year)))
def get_event_title_url(event_entry):
    '''return the title of the event and the absolute url for more info, both taken from the link inside the h2 headline'''
    base_url = 'https://www.uni-weimar.de'
    link = event_entry.find('h2').find('a')
    # the href is appended to the site root to form the full url
    return link.text.strip(), base_url + link['href']
def get_event_teaser(event_entry):
    '''return the description text of the event with a trailing "mehr" word cut off'''
    teaser_elem = event_entry.find(class_='teasertext')
    raw_text = teaser_elem.text.strip()
    # only a standalone "mehr" at the very end of the text is removed
    without_more = re.sub(r'\bmehr\b$', '', raw_text)
    return without_more.strip()
def add_event_to_cal(cal, start, end, place, title, url, teaser):
    '''create and add a calendar event with all the info to the calendar

    cal: the calendar the new event is added to
    start, end: datetimes of the event start and end
    place: location text, left out of the event if empty or None
    title: used as the event summary
    url: link for more info about the event
    teaser: used as the event description
    '''
    import uuid  # local import, only needed for building the event uid

    event = Event()
    # UID and DTSTAMP are required properties of a VEVENT (RFC 5545).
    # the uid is derived from the event data so exporting again yields the same
    # ids instead of random new ones
    uid_seed = f'{url}#{title}#{start.isoformat()}'
    event.add('uid', str(uuid.uuid5(uuid.NAMESPACE_URL, uid_seed)))
    event.add('dtstamp', datetime.now(pytz.utc))
    event.add('summary', title)
    event.add('dtstart', start)
    event.add('dtend', end)
    if place:
        event.add('location', place)
    event.add('url', url)
    event.add('description', teaser)
    cal.add_component(event)
# year of the Summaery edition, also used for the cache and output file names
year = 2025
page_url = 'https://www.uni-weimar.de/de/universitaet/aktuell/veranstaltungskalender/highlights-des-jahres/2025/summaery/veranstaltungen/'


def main():
    '''load the event page (downloading it once) and write all events into summaery_<year>.ics'''
    filepath = f'website_{year}.html'

    # the page is kept on disk so repeated runs do not download it again
    if not os.path.exists(filepath):
        print('downloading website')
        response = requests.get(page_url, timeout=30)
        # fail before writing the cache file, otherwise an error page would be
        # saved and parsed again on every later run
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            f.write(response.content)
        print('saved website')

    print('loading html')
    with open(filepath, 'r', encoding='utf-8') as f:
        html_string = f.read()
    # collapse runs of whitespace into single spaces before parsing
    html_string = re.sub(r'\s\s+', ' ', html_string)
    dom = BeautifulSoup(html_string, 'html.parser')

    # find all events listed on the page
    event_entries = dom.find_all(class_='summaeryLiveEvent')
    print(f'Found {len(event_entries)} events')

    cal = Calendar()
    # PRODID and VERSION are required calendar properties (RFC 5545)
    cal.add('prodid', '-//Summaery event scraper//uni-weimar.de//')
    cal.add('version', '2.0')

    for i, entry in enumerate(event_entries, start=1):
        title, url = get_event_title_url(entry)
        add_event_to_cal(
            cal,
            *get_event_start_end_place(entry),
            title,
            url,
            get_event_teaser(entry))
        print(i, '/', len(event_entries), title)

    # write them into one big icalendar file
    with open(f'summaery_{year}.ics', 'wb') as f:
        f.write(cal.to_ical())


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment