Skip to content

Commit 0e08f64

Browse files
committed
Refactor date/time parsing for mitpe events (#1840)
1 parent b9a9d45 commit 0e08f64

File tree

4 files changed

+280
-12
lines changed

4 files changed

+280
-12
lines changed

news_events/etl/mitpe_events.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from main.utils import now_in_utc
1010
from news_events.constants import ALL_AUDIENCES, FeedType
11-
from news_events.etl.utils import fetch_data_by_page, parse_date
11+
from news_events.etl.utils import fetch_data_by_page, parse_date_time_range
1212

1313
log = logging.getLogger(__name__)
1414
MITPE_EVENTS_TITLE = "MIT Professional Education Events"
@@ -66,16 +66,9 @@ def transform_item(item: dict) -> dict:
6666
6767
"""
6868

69-
times = item.get("time_range", "").split("-")
70-
start_dt = parse_date(
71-
f"{item.get("start_date")} {times[0] if len(times) > 0 else ''}"
69+
start_dt, end_dt = parse_date_time_range(
70+
item.get("start_date"), item.get("end_date"), item.get("time_range")
7271
)
73-
if not start_dt:
74-
# Time range may be invalid, try without it
75-
start_dt = parse_date(f"{item.get("start_date")}")
76-
end_dt = parse_date(f"{item.get("end_date")} {times[1] if len(times) > 1 else ''}")
77-
if not end_dt:
78-
end_dt = parse_date(f"{item.get("end_date")}")
7972

8073
# Do not bother transforming past events
8174
now = now_in_utc()

news_events/etl/mitpe_events_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ def test_transform(mitpe_events_json_data):
6464
2024, 8, 15, 21, 0, 0, tzinfo=UTC
6565
)
6666
assert items[3]["detail"]["event_datetime"] == datetime(
67-
2023, 5, 12, 4, 0, 0, tzinfo=UTC
67+
2023, 5, 12, 16, 0, 0, tzinfo=UTC
6868
)
6969
assert items[3]["detail"]["event_end_datetime"] == datetime(
70-
2023, 5, 12, 4, 0, 0, tzinfo=UTC
70+
2023, 5, 12, 16, 0, 0, tzinfo=UTC
7171
)

news_events/etl/utils.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,33 @@
11
"""Utility functions for news/events ETL pipelines"""
22

33
import logging
4+
import re
5+
from dataclasses import dataclass
46
from datetime import UTC, datetime
57
from time import mktime, struct_time
8+
from typing import Optional
69
from zoneinfo import ZoneInfo
710

811
import dateparser
912
import requests
1013
from bs4 import BeautifulSoup as Soup
1114
from bs4 import Tag
15+
from dateparser import timezone_parser
1216
from django.conf import settings
1317

1418
from main.constants import ISOFORMAT
1519

1620
log = logging.getLogger(__name__)
1721

1822

23+
@dataclass
24+
class FormattedTime:
25+
hour: Optional[str]
26+
minute: Optional[str]
27+
ampm: Optional[str]
28+
tz: Optional[str]
29+
30+
1931
def get_soup(url: str) -> Soup:
2032
"""
2133
Get a BeautifulSoup object from a URL.
@@ -131,3 +143,132 @@ def parse_date(text_date: str) -> datetime:
131143
except: # noqa: E722
132144
logging.exception("unparsable date received - ignoring '%s'", text_date)
133145
return dt_utc
146+
147+
148+
def convert_to_utc(dt: datetime, known_tz: str) -> datetime:
149+
"""
150+
Convert a datetime object to UTC timezone. If its
151+
orignal timezone is not known, assume it is in US/Eastern.
152+
153+
Args:
154+
dt (datetime): The datetime object to convert
155+
known_tz (str): The timezone string if known
156+
157+
Returns:
158+
datetime: The datetime object in UTC timezone
159+
"""
160+
if not dt:
161+
return None
162+
if not known_tz:
163+
# Assume it is in US/Eastern where MIT is
164+
dt = dt.replace(tzinfo=ZoneInfo("US/Eastern"))
165+
return dt.astimezone(UTC)
166+
167+
168+
# AM/PM marker, optionally followed by a short timezone abbreviation.
# The word boundaries stop "am"/"pm" from matching inside ordinary words
# (e.g. "campus", "program") and stop the first letters of a longer word
# (e.g. "estimated" -> "est") from being mistaken for a timezone.
_AMPM_TZ_REGEX = re.compile(r"\b(am|pm)\s*([A-Za-z]{2,3})?\b", re.IGNORECASE)


def format_time(matched_time: re.Match) -> FormattedTime:
    """
    Format a time regex match into a standard format

    Args:
        matched_time (re.Match): A match with three groups: the hour digits,
            an optional ":MM" minute part, and the trailing text that may
            contain an AM/PM marker and a timezone abbreviation

    Returns:
        FormattedTime: A formatted time object; pieces that were not found
            are empty strings
    """
    hour = matched_time.group(1) or ""
    # Default the minutes to ":00" only when there is an hour to attach them to
    minute = matched_time.group(2) or (":00" if hour else "")
    ampm, tz = "", ""
    ampm_and_tz_match = _AMPM_TZ_REGEX.search(matched_time.group(3) or "")
    if ampm_and_tz_match:
        ampm = ampm_and_tz_match.group(1) or ""
        tz = ampm_and_tz_match.group(2) or ""
    # Discard the candidate unless dateparser recognizes it as a timezone
    if not (tz and timezone_parser.word_is_tz(tz.upper())):
        tz = ""
    return FormattedTime(hour, minute, ampm, tz)
190+
191+
192+
# One clock-time token: hour digits, optional ":MM", then trailing non-digits
_TIME_TOKEN_REGEX = re.compile(r"(\d{1,2})(:\d{2})?(\D*)")
# Time of day used when the time range is missing or cannot be parsed
_DEFAULT_TIME_STR = "12:00 PM"
# Named constants keep ruff's magic-value check quiet
_TWO = 2
_TWELVE = 12


def _fill_missing_ampm_and_tz(
    start_time: FormattedTime, end_time: FormattedTime
) -> None:
    """Fill in AM/PM and timezone missing from one end of a range, in place."""
    if start_time.hour and end_time.hour:
        # Times must at least have an hour to be valid
        start_hour, end_hour = int(start_time.hour), int(end_time.hour)
        if start_hour == _TWELVE and start_hour > end_hour:
            # Example: 12 - 1 PM; the 12 o'clock start shares the end's
            # AM/PM (noon - 1 PM), it is not midnight
            start_time.ampm = start_time.ampm or end_time.ampm or "PM"
            end_time.ampm = end_time.ampm or start_time.ampm
        elif start_hour > end_hour:
            # Example: 8 - 1 PM; 8 AM - 1
            start_time.ampm = start_time.ampm or "AM"
            end_time.ampm = end_time.ampm or "PM"
        elif end_hour == _TWELVE and start_hour < _TWELVE:
            # Example: 10 - 12 PM
            start_time.ampm = start_time.ampm or "AM"
            end_time.ampm = end_time.ampm or "PM"
        else:
            # Anything else, if AM/PM missing for one, set it to the other,
            # or "" if both are missing
            start_time.ampm = start_time.ampm or end_time.ampm or ""
            end_time.ampm = end_time.ampm or start_time.ampm or ""
    # If timezone missing for one, set it to the other,
    # or "" if both are missing
    start_time.tz = start_time.tz or end_time.tz or ""
    end_time.tz = end_time.tz or start_time.tz or ""


def _parse_date_with_time(date_str: str, time: FormattedTime) -> datetime | None:
    """Parse a date plus time; retry with the default time if that fails."""
    return dateparser.parse(
        f"{date_str} {time.hour}{time.minute} {time.ampm} {time.tz}"
    ) or dateparser.parse(f"{date_str} {_DEFAULT_TIME_STR}")


def _range_datetime_to_utc(dt: datetime | None, tz: str | None) -> datetime | None:
    """Convert to UTC, trusting the range timezone only if it was applied."""
    # After a fallback parse the datetime is naive even though the time
    # range named a timezone; treat that as "timezone unknown".
    known_tz = tz if dt is not None and dt.tzinfo is not None else ""
    return convert_to_utc(dt, known_tz)


def parse_date_time_range(
    start_date_str: str | None, end_date_str: str | None, time_range_str: str | None
) -> tuple[datetime | None, datetime | None]:
    """
    Attempt to parse the time range from the MITPE events API.
    If the time cannot be parsed, default to noon Eastern time,
    then convert to UTC.
    The field might not always contain a valid time/range.

    Exactly two time tokens are read as start and end, a single token is
    used for both, and any other count falls back to the default time.

    Args:
        start_date_str (str): start date string
        end_date_str (str): end date string
        time_range_str (str): time range string

    Returns:
        tuple(datetime, datetime): start and end datetimes in UTC timezone;
            an element is None when its date could not be parsed

    """
    # If one date is missing, set it to the other
    end_date_str = end_date_str or start_date_str
    start_date_str = start_date_str or end_date_str
    if not start_date_str:
        # Neither date was supplied; do not feed the literal "None" to the parser
        log.error("Failed to parse start date %s", start_date_str)
        return None, None

    # Set start/end times to noon as default
    start_time = end_time = FormattedTime("12", ":00", "PM", "")
    # Try to split the string into start and end times
    formatted_times = [
        format_time(time_match)
        for time_match in _TIME_TOKEN_REGEX.finditer(time_range_str or "")
    ]
    if len(formatted_times) == _TWO:
        # Both start and end times were found
        start_time, end_time = formatted_times
        _fill_missing_ampm_and_tz(start_time, end_time)
    elif len(formatted_times) == 1:
        # Only one time was found, set both start and end to that time
        start_time = end_time = formatted_times[0]

    # Ignore time range and use default time if dates aren't parsable with it
    start_date = _parse_date_with_time(start_date_str, start_time)
    end_date = _parse_date_with_time(end_date_str, end_time)
    if not start_date:
        log.error("Failed to parse start date %s", start_date_str)

    # Convert before comparing: both values are then timezone-aware, so the
    # comparison cannot fail on a naive/aware mix
    start_utc = _range_datetime_to_utc(start_date, start_time.tz)
    end_utc = _range_datetime_to_utc(end_date, end_time.tz)
    if start_utc and end_utc and end_utc < start_utc:
        # This is nonsensical, so just set the end date to the start date
        end_utc = start_utc
    return start_utc, end_utc

news_events/etl/utils_test.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Tests for utils functions"""
22

3+
from datetime import UTC, datetime
34
from pathlib import Path
45
from time import struct_time
56
from urllib.error import HTTPError
@@ -82,3 +83,136 @@ def test_get_request_json_error_raise(mocker):
8283
)
8384
with pytest.raises(HTTPError):
8485
utils.get_request_json("https://test.mit.edu", raise_on_error=True)
86+
87+
88+
def _utc(year, month, day, hour, minute=0):
    """Build a timezone-aware UTC datetime for the expected values below."""
    return datetime(year, month, day, hour, minute, 0, tzinfo=UTC)


# Each case: (start_date_str, end_date_str, time_range_str, start_dt, end_dt)
PARSE_DATE_TIME_RANGE_CASES = [
    ("2024-01-15", "2024-01-15", "9-10 AM", _utc(2024, 1, 15, 14), _utc(2024, 1, 15, 15)),
    ("2024-01-15", None, "9-10 AM", _utc(2024, 1, 15, 14), _utc(2024, 1, 15, 15)),
    ("2024-07-15", "2024-07-16", "9 - 12 PM", _utc(2024, 7, 15, 13), _utc(2024, 7, 16, 16)),
    (
        "2024-07-15",
        "2024-07-15",
        "3:30 PM and ends at 5:45 PM",
        _utc(2024, 7, 15, 19, 30),
        _utc(2024, 7, 15, 21, 45),
    ),
    (
        "2024-07-15",
        "2024-07-15",
        "3:30 PM - 5:30 PM pdt",  # Should figure out this is Pacific Daylight Time
        _utc(2024, 7, 15, 22, 30),
        _utc(2024, 7, 16, 0, 30),
    ),
    ("Future date tbd", None, None, None, None),
    (
        "2024-07-15",
        "2024-07-30",
        "Every afternoon after end of class",
        _utc(2024, 7, 15, 16),
        _utc(2024, 7, 30, 16),
    ),
    ("2024-07-15", "2024-07-15", "1pm", _utc(2024, 7, 15, 17), _utc(2024, 7, 15, 17)),
    (
        "2024-07-15",
        "2024-07-15",
        "8 to 1pm",  # Should correctly guess that 8 is AM
        _utc(2024, 7, 15, 12),
        _utc(2024, 7, 15, 17),
    ),
    (
        "2024-07-15",
        "2024-07-15",
        "8 AM to 1",  # Should correctly guess that 1 is PM
        _utc(2024, 7, 15, 12),
        _utc(2024, 7, 15, 17),
    ),
    (
        "2024-12-15",
        "2024-12-15",
        "11 to 12 pm",  # Should correctly guess that 11 is AM
        _utc(2024, 12, 15, 16),
        _utc(2024, 12, 15, 17),
    ),
    (
        "2024-07-15",
        "2024-07-15",
        "Beginning at 4:30 and ending at about 6pm",
        _utc(2024, 7, 15, 20, 30),
        _utc(2024, 7, 15, 22),
    ),
    (
        "2024-07-15",
        "2024-07-15",
        "3:00pm; weather permitting",
        _utc(2024, 7, 15, 19),
        _utc(2024, 7, 15, 19),
    ),
    (
        "2024-07-15",
        "2024-07-15",
        "3:00pm; doors open at 2:30pm",  # Ignore any end time before the start time
        _utc(2024, 7, 15, 19),
        _utc(2024, 7, 15, 19),
    ),
    (
        "2024-07-15",
        "2024-07-15",
        "Beginning at 4:30",  # No AM/PM, so take it literally as is
        _utc(2024, 7, 15, 8, 30),
        _utc(2024, 7, 15, 8, 30),
    ),
    (
        "2024-07-15",
        "2024-07-30",
        "Beginning at 16:30",  # No AM/PM, so take it literally as is
        _utc(2024, 7, 15, 20, 30),
        _utc(2024, 7, 30, 20, 30),
    ),
    (
        None,
        "2024-11-30",
        # Invalid time, default to noon Eastern time, convert to UTC
        "Bldg. 123, E52nd Street, Salon MIT",
        _utc(2024, 11, 30, 17),
        _utc(2024, 11, 30, 17),
    ),
]


@pytest.mark.parametrize(
    ("start_date_str", "end_date_str", "time_range_str", "start_dt", "end_dt"),
    PARSE_DATE_TIME_RANGE_CASES,
)
def test_parse_date_time_range(
    start_date_str, end_date_str, time_range_str, start_dt, end_dt
):
    """parse_date_time_range should return the expected start and end datetimes"""
    parsed_range = utils.parse_date_time_range(
        start_date_str, end_date_str, time_range_str
    )
    assert parsed_range == (start_dt, end_dt)

0 commit comments

Comments
 (0)