Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion countries/ch.json
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,9 @@
{
"name": "Fresque de l'Economie Circulaire",
"url": "https://www.billetweb.fr/pro/lafresquedeleconomiecirculaire",
"language_code": "fr",
"language_code": "fr",
"type": "scraper",
"iframe": "event41148",
"id": 300
},
{
Expand Down
71 changes: 48 additions & 23 deletions src/trouver_une_fresque_scraper/scraper/eventbrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from selenium.webdriver.support import expected_conditions as EC

from trouver_une_fresque_scraper.db.records import get_record_dict
from trouver_une_fresque_scraper.utils.date_and_time import get_dates
from trouver_une_fresque_scraper.utils.date_and_time import get_dates_from_element
from trouver_une_fresque_scraper.utils.errors import (
FreskError,
FreskDateBadFormat,
Expand Down Expand Up @@ -184,12 +184,15 @@ def get_eventbrite_data(sources, service, options):
###########################################################
# Is it an online event?
################################################################
online = False
try:
online_el = driver.find_element(By.CSS_SELECTOR, "p.location-info__address-text")
online = is_online(online_el.text)
except NoSuchElementException:
pass
online = is_online(title)
if not online:
try:
short_location_el = driver.find_element(
By.CSS_SELECTOR, "span.start-date-and-location__location"
)
online = is_online(short_location_el.text)
except NoSuchElementException:
pass

################################################################
# Location data
Expand All @@ -205,11 +208,16 @@ def get_eventbrite_data(sources, service, options):
country_code = ""

if not online:
location_el = driver.find_element(By.CSS_SELECTOR, "div.location-info__address")
full_location_text = location_el.text.split("\n")
location_name = full_location_text[0]
address_and_city = full_location_text[1]
full_location = f"{location_name}, {address_and_city}"
try:
full_location_el = driver.find_element(
By.CSS_SELECTOR, 'div[class^="Location-module__addressWrapper___"'
)
except NoSuchElementException:
logging.error(
f"Location element not found for offline event {link}.",
)
continue
full_location = full_location_el.text.replace("\n", ", ")

try:
address_dict = get_address(full_location)
Expand All @@ -231,7 +239,7 @@ def get_eventbrite_data(sources, service, options):
# Description
################################################################
try:
description_title_el = driver.find_element(By.CSS_SELECTOR, "div.eds-text--left")
description_title_el = driver.find_element(By.CSS_SELECTOR, "div.event-description")
description = description_title_el.text
except NoSuchElementException:
logging.info("Rejecting record: Description not found.")
Expand Down Expand Up @@ -274,14 +282,15 @@ def get_eventbrite_data(sources, service, options):
try:
date_info_el = driver.find_element(
by=By.CSS_SELECTOR,
value="span.date-info__full-datetime",
value="time.start-date-and-location__date",
)
event_time = date_info_el.text
except NoSuchElementException:
raise FreskDateNotFound

try:
event_start_datetime, event_end_datetime = get_dates(event_time)
event_start_datetime, event_end_datetime = get_dates_from_element(
date_info_el
)
except FreskDateBadFormat as error:
logging.info(f"Reject record: {error}")
continue
Expand All @@ -307,26 +316,42 @@ def get_eventbrite_data(sources, service, options):

if not already_scanned:
event_info.append(
[uuid, event_start_datetime, event_end_datetime, tickets_link]
[
uuid,
event_start_datetime,
event_end_datetime,
tickets_link,
]
)

# There is only one event on this page.
except TimeoutException:
################################################################
# Single event with multiple dates (a "collection").
################################################################
try:
check_availability_btn = driver.find_element(
by=By.CSS_SELECTOR, value="button.check-availability-btn__button"
)
# TODO: add support for this.
logging.error(f"EventBrite collection not supported in event {link}.")
continue
except NoSuchElementException:
pass

################################################################
# Dates
################################################################
try:
date_info_el = driver.find_element(
by=By.CSS_SELECTOR,
value="span.date-info__full-datetime",
value="time.start-date-and-location__date",
)
event_time = date_info_el.text
except NoSuchElementException as error:
logging.info(f"Reject record: {error}")
continue
except NoSuchElementException:
raise FreskDateNotFound

try:
event_start_datetime, event_end_datetime = get_dates(event_time)
event_start_datetime, event_end_datetime = get_dates_from_element(date_info_el)
except FreskDateBadFormat as error:
logging.info(f"Reject record: {error}")
continue
Expand Down
85 changes: 84 additions & 1 deletion src/trouver_une_fresque_scraper/utils/date_and_time.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import traceback
import logging

from datetime import datetime, timedelta
from dateutil.parser import parse
Expand Down Expand Up @@ -272,7 +273,7 @@ def get_dates(event_time):
FRENCH_MONTHS[match.group("month")],
int(match.group("day")),
int(start_parts[0]),
int(start_parts[1]) if len(start_parts) > 1 and len(start_parts[1]) else 0,
(int(start_parts[1]) if len(start_parts) > 1 and len(start_parts[1]) else 0),
)
end_parts = match.group("end_time").split("h")
event_end_datetime = datetime(
Expand All @@ -290,4 +291,86 @@ def get_dates(event_time):
except Exception as e:
if not isinstance(e, FreskError):
traceback.print_exc()
logging.error(f"get_dates: {event_time}")
raise FreskDateBadFormat(event_time)


# ISO day as exposed by the "datetime" attribute, e.g. "2025-12-05".
_ISO_DAY_RE = re.compile(r"(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})")

# Afternoon markers (English/French pages use "pm", German pages use "nachm.").
_PM_MARKERS = ("pm", "nachm.")

# UTC offsets that are accepted; any other non-empty suffix is rejected.
_ACCEPTED_UTC_OFFSETS = ("+1", "+2")


def _clock_pattern(prefix):
    """Return a regex fragment matching a 12-hour clock time such as "9am" or "5:30nachm.".

    The groups are named "<prefix>_hour", "<prefix>_minute" and "<prefix>_meridiem".
    """
    return (
        rf"(?P<{prefix}_hour>\d{{1,2}})"
        rf"(?::(?P<{prefix}_minute>\d{{2}}))?"
        # The dots are escaped so that only the literal abbreviations match.
        rf"(?P<{prefix}_meridiem>am|pm|vorm\.|nachm\.)"
    )


# Matches e.g. "de 9am à 12pm UTC+1" or "von 5:30nachm. bis 8:30nachm. MEZ".
# It is used with search() so the leading textual date ("déc. 5 ") is skipped.
_TIME_RANGE_RE = re.compile(
    r"(?:de|von)\s"
    + _clock_pattern("start")
    + r"\s(?:à|bis)\s"
    + _clock_pattern("end")
    + r"\s(?:UTC|MEZ)(?P<timezone>.*)"
)


def _parse_clock(match_object, prefix):
    """Convert the "<prefix>_*" groups of a _TIME_RANGE_RE match to a 24-hour (hour, minute)."""
    hour = int(match_object.group(f"{prefix}_hour"))
    if match_object.group(f"{prefix}_meridiem") in _PM_MARKERS:
        # 12pm is noon and must stay 12; 1pm..11pm become 13..23.
        if hour < 12:
            hour += 12
    elif hour == 12:
        # 12am is midnight.
        hour = 0
    minute = int(match_object.group(f"{prefix}_minute") or 0)
    return hour, minute


def get_dates_from_element(el):
    """Return the (start, end) datetimes described by the element.

    When the element has a "datetime" attribute holding an ISO day
    (e.g. "2025-12-05") and its text holds a time range such as
    "déc. 5 de 9am à 12pm UTC+1", the day comes from the attribute and the
    hours from the text. In every other case the element text is handed to
    get_dates.

    Args:
        el: object exposing get_attribute(name) and a text attribute.

    Returns:
        A (start, end) tuple of naive datetime objects.

    Raises:
        FreskDateBadFormat: on any failure. Every exception raised while
            parsing, including FreskDateDifferentTimezone for an unsupported
            UTC offset and anything raised by get_dates, is converted into
            FreskDateBadFormat; the original exception is chained as the cause.
    """
    event_day = el.get_attribute("datetime")
    event_time = el.text

    try:
        if event_day:
            day_match = _ISO_DAY_RE.match(event_day)
            range_match = _TIME_RANGE_RE.search(event_time)
            if day_match and range_match:
                # TODO: add proper support for timezone.
                timezone = range_match.group("timezone").strip()
                if timezone and timezone not in _ACCEPTED_UTC_OFFSETS:
                    raise FreskDateDifferentTimezone(event_time)

                year = int(day_match.group("year"))
                month = int(day_match.group("month"))
                day = int(day_match.group("day"))
                start_hour, start_minute = _parse_clock(range_match, "start")
                end_hour, end_minute = _parse_clock(range_match, "end")
                return (
                    datetime(year, month, day, start_hour, start_minute),
                    datetime(year, month, day, end_hour, end_minute),
                )

        # No usable attribute or unrecognised text layout: parse the text only.
        return get_dates(event_time)

    except Exception as e:
        # Project errors are expected rejections; anything else is a parsing
        # bug worth a traceback.
        if not isinstance(e, FreskError):
            traceback.print_exc()
            logging.error(f"get_dates_from_element: {event_time}")
        raise FreskDateBadFormat(event_time) from e
85 changes: 84 additions & 1 deletion src/trouver_une_fresque_scraper/utils/date_and_time_test.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from datetime import datetime
import logging
from attrs import define


from trouver_une_fresque_scraper.utils import date_and_time


def run_tests():
def run_get_dates_tests():
# tuple fields:
# 1. Test case name or ID
# 2. Input date string
Expand Down Expand Up @@ -80,3 +81,85 @@ def run_tests():
logging.error(f"{test_case[0]}: expected {test_case[2]} but got {actual_start_time}")
if actual_end_time != test_case[3]:
logging.error(f"{test_case[0]}: expected {test_case[3]} but got {actual_end_time}")


@define
class MockWebDriverElement:
    """Test double for the element object consumed by get_dates_from_element.

    It exposes the two members that function reads: a ``text`` attribute and a
    ``get_attribute`` method.
    """

    # Text the fake element reports.
    text: str
    # Canned value returned by get_attribute; None simulates a missing attribute.
    dt: str | None

    def get_attribute(self, ignored: str) -> str | None:
        """Return the canned ``dt`` value, whichever attribute name is requested."""
        return self.dt


def run_get_dates_from_element_tests():
    """Run get_dates_from_element over a table of cases and log every mismatch.

    A mismatch is reported through logging.error; nothing is raised or returned.
    """
    # Each case is a 5-tuple:
    #   1. Test case name or ID
    #   2. Value of the element's "datetime" attribute (None when absent)
    #   3. Text of the element
    #   4. Expected output start datetime
    #   5. Expected output end datetime
    test_cases = [
        (
            "BilletWeb: no datetime, fallback on text parsing",
            None,
            "Thu Oct 19, 2023 from 01:00 PM to 02:00 PM",
            datetime(2023, 10, 19, 13, 0),
            datetime(2023, 10, 19, 14, 0),
        ),
        (
            "EventBrite: morning",
            "2025-12-05",
            "déc. 5 de 8am à 11am UTC",
            datetime(2025, 12, 5, 8, 0),
            datetime(2025, 12, 5, 11, 0),
        ),
        (
            "EventBrite: evening",
            "2025-12-12",
            "déc. 12 de 6pm à 9pm UTC+1",
            datetime(2025, 12, 12, 18, 0),
            datetime(2025, 12, 12, 21, 0),
        ),
        (
            "EventBrite: afternoon in German",
            "2024-12-16",
            "Dez. 16 von 5nachm. bis 8nachm. UTC",
            datetime(2024, 12, 16, 17, 0),
            datetime(2024, 12, 16, 20, 0),
        ),
        (
            "EventBrite: afternoon with minutes in German",
            "2024-12-03",
            "Dez. 3 von 5:30nachm. bis 8:30nachm. MEZ",
            datetime(2024, 12, 3, 17, 30),
            datetime(2024, 12, 3, 20, 30),
        ),
        (
            "EventBrite: PM adds 12 to the hours only from 1 PM onwards",
            "2025-12-14",
            "déc. 14 de 9:30am à 12:30pm UTC+1",
            datetime(2025, 12, 14, 9, 30),
            datetime(2025, 12, 14, 12, 30),
        ),
        (
            "EventBrite: start and end minutes differ",
            "2026-01-21",
            "janv. 21 de 9am à 12:30pm UTC+1",
            datetime(2026, 1, 21, 9, 0),
            datetime(2026, 1, 21, 12, 30),
        ),
    ]
    for case_name, datetime_attr, element_text, expected_start, expected_end in test_cases:
        logging.info(f"Running {case_name}")
        fake_element = MockWebDriverElement(dt=datetime_attr, text=element_text)
        actual_start, actual_end = date_and_time.get_dates_from_element(fake_element)
        if actual_start != expected_start:
            logging.error(f"{case_name}: expected {expected_start} but got {actual_start}")
        if actual_end != expected_end:
            logging.error(f"{case_name}: expected {expected_end} but got {actual_end}")


def run_tests():
    """Run every date-parsing test suite of this module, in order."""
    for suite in (run_get_dates_tests, run_get_dates_from_element_tests):
        suite()