Skip to content

Commit d23473d

Browse files
committed
Handle more scenarios and add tests for them; default to noon eastern and convert to utc if times cannot be parsed
1 parent 4a7216f commit d23473d

File tree

3 files changed

+179
-35
lines changed

3 files changed

+179
-35
lines changed

news_events/etl/mitpe_events_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ def test_transform(mitpe_events_json_data):
6464
2024, 8, 15, 21, 0, 0, tzinfo=UTC
6565
)
6666
assert items[3]["detail"]["event_datetime"] == datetime(
67-
2023, 5, 12, 4, 0, 0, tzinfo=UTC
67+
2023, 5, 12, 16, 0, 0, tzinfo=UTC
6868
)
6969
assert items[3]["detail"]["event_end_datetime"] == datetime(
70-
2023, 5, 12, 4, 0, 0, tzinfo=UTC
70+
2023, 5, 12, 16, 0, 0, tzinfo=UTC
7171
)

news_events/etl/utils.py

Lines changed: 101 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,32 @@
22

33
import logging
44
import re
5+
from dataclasses import dataclass
56
from datetime import UTC, datetime
67
from time import mktime, struct_time
8+
from typing import Optional
79
from zoneinfo import ZoneInfo
810

911
import dateparser
1012
import requests
1113
from bs4 import BeautifulSoup as Soup
1214
from bs4 import Tag
15+
from dateparser import timezone_parser
1316
from django.conf import settings
1417

1518
from main.constants import ISOFORMAT
1619

1720
log = logging.getLogger(__name__)
1821

1922

23+
@dataclass
24+
class FormattedTime:
25+
hour: Optional[str]
26+
minute: Optional[str]
27+
ampm: Optional[str]
28+
tz: Optional[str]
29+
30+
2031
def get_soup(url: str) -> Soup:
2132
"""
2233
Get a BeautifulSoup object from a URL.
@@ -154,47 +165,110 @@ def convert_to_utc(dt: datetime, known_tz: str) -> datetime:
154165
return dt.astimezone(UTC)
155166

156167

168+
def format_time(matched_time: re.Match) -> FormattedTime:
169+
"""
170+
Format a time regex match group into a standard format
171+
172+
Args:
173+
matched_time (re.Match): Regex match whose groups hold the hour, minute, and trailing AM/PM/timezone text
174+
175+
Returns:
176+
FormattedTime: A formatted time object
177+
"""
178+
# Regex for AM/PM and timezone
179+
ampm_tz_regex = re.compile(r"(am|pm)\s*([A-Za-z]{2,3})?", re.IGNORECASE)
180+
ampm, tz = "", ""
181+
hour = matched_time.group(1) or ""
182+
minute = matched_time.group(2) or (":00" if hour else "")
183+
ampm_and_tz_match = re.search(ampm_tz_regex, matched_time.group(3) or "")
184+
if ampm_and_tz_match:
185+
ampm = ampm_and_tz_match.group(1) or ""
186+
tz = ampm_and_tz_match.group(2) or ""
187+
return FormattedTime(
188+
hour, minute, ampm, (tz if timezone_parser.word_is_tz(tz.upper()) else "")
189+
)
190+
191+
157192
def parse_date_time_range(
158193
start_date_str: str, end_date_str: str, time_range_str: str
159194
) -> tuple[datetime, datetime]:
160195
"""
161196
Attempt to parse the time range from the MITPE events API.
162-
The field might not actually contain a time or range.
197+
If the time cannot be parsed, default to noon Eastern time,
198+
then convert to UTC.
199+
The field might not always contain a valid time/range.
163200
164201
Args:
165202
start_date_str (str): start date string
166203
end_date_str (str): end date string
167204
time_range_str (str): time range string
168205
169206
Returns:
170-
tuple: start and end datetimes
207+
tuple(datetime, datetime): start and end datetimes in UTC timezone
171208
172209
"""
173-
start_time, start_ampm, end_time, end_ampm, tz = "", "", "", "", ""
174-
time_regex = re.compile(
175-
r"(\d{1,2})(:\d{2})?\s*(am|pm)?\s*-?\s*(\d{1,2})(:?\d{2})?\s*(am|pm)?\s*([A-Za-z]{2,3})?",
176-
re.IGNORECASE,
210+
# If one date is missing, set it to the other
211+
end_date_str = end_date_str or start_date_str
212+
start_date_str = start_date_str or end_date_str
213+
214+
default_time = FormattedTime("12", ":00", "PM", "")
215+
default_time_str = "12:00 PM"
216+
# Set start/end times to noon as default
217+
start_time, end_time = (default_time, default_time)
218+
# Try to split the string into start and end times
219+
split_times = list(
220+
re.finditer(
221+
re.compile(r"(\d{1,2})(:\d{2})?(\D*)", re.IGNORECASE), time_range_str or ""
222+
)
177223
)
178-
time_match = re.match(time_regex, time_range_str or "")
179-
if time_match:
180-
start_time = f"{time_match.group(1)}{time_match.group(2) or ':00'}" or ""
181-
start_ampm = time_match.group(3) or ""
182-
end_time = f"{time_match.group(4)}{time_match.group(5) or ':00'}" or start_time
183-
end_ampm = time_match.group(6) or ""
184-
tz = (time_match.group(7) or "").upper()
185-
start_date = dateparser.parse(
186-
f"{start_date_str} {start_time}{start_ampm} {tz}"
187-
) or dateparser.parse(start_date_str)
188-
else:
189-
start_date = dateparser.parse(start_date_str)
190-
if end_date_str:
191-
end_date = dateparser.parse(
192-
f"{end_date_str} {end_time}{end_ampm or ''} {tz}"
193-
) or dateparser.parse(end_date_str)
194-
else:
195-
end_date = dateparser.parse(
196-
f"{start_date_str} {end_time}{end_ampm or ""} {tz}"
197-
) or dateparser.parse(start_date_str)
224+
if split_times:
225+
# At least one time match was found
226+
formatted_times = [format_time(time_match) for time_match in split_times]
227+
# make ruff happy
228+
TWO = 2
229+
TWELVE = 12
230+
if len(formatted_times) == TWO:
231+
# Both start and end times were found
232+
start_time, end_time = formatted_times
233+
if start_time.hour and end_time.hour:
234+
# Times must at least have an hour to be valid
235+
if int(start_time.hour) > int(end_time.hour):
236+
# Example: 8 - 1 PM; 8 AM - 1
237+
start_time.ampm = start_time.ampm or "AM"
238+
end_time.ampm = end_time.ampm or "PM"
239+
elif int(end_time.hour) == TWELVE and int(start_time.hour) < TWELVE:
240+
# Example: 10 - 12 PM
241+
start_time.ampm = start_time.ampm or "AM"
242+
end_time.ampm = end_time.ampm or "PM"
243+
else:
244+
# Anything else, if AM/PM missing for one, set it to the other,
245+
# or "" if both are missing
246+
start_time.ampm = start_time.ampm or end_time.ampm or ""
247+
end_time.ampm = end_time.ampm or start_time.ampm or ""
248+
# If timezone missing for one, set it to the other,
249+
# or "" if both are missing
250+
start_time.tz = start_time.tz or end_time.tz or ""
251+
end_time.tz = end_time.tz or start_time.tz or ""
252+
elif len(formatted_times) == 1:
253+
# Only one time was found, set both start and end to that time
254+
start_time = formatted_times[0]
255+
end_time = start_time
256+
257+
# Ignore time range and use default time range if dates aren't parsable with it
258+
start_date = dateparser.parse(
259+
f"{start_date_str} {start_time.hour}{start_time.minute} "
260+
f"{start_time.ampm} {start_time.tz}"
261+
) or dateparser.parse(f"{start_date_str} {default_time_str}")
262+
end_date = dateparser.parse(
263+
f"{end_date_str} {end_time.hour}{end_time.minute} "
264+
f"{end_time.ampm} {end_time.tz}"
265+
) or dateparser.parse(f"{end_date_str} {default_time_str}")
266+
267+
if end_date and start_date and end_date < start_date:
268+
# This is nonsensical, so just set the end date to the start date
269+
end_date = start_date
198270
if not start_date:
199271
log.error("Failed to parse start date %s", start_date_str)
200-
return convert_to_utc(start_date, tz), convert_to_utc(end_date, tz)
272+
return convert_to_utc(start_date, start_time.tz), convert_to_utc(
273+
end_date, end_time.tz
274+
)

news_events/etl/utils_test.py

Lines changed: 76 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,16 +112,16 @@ def test_get_request_json_error_raise(mocker):
112112
(
113113
"2024-07-15",
114114
"2024-07-15",
115-
"3:30 PM - 5:45 PM",
115+
"3:30 PM and ends at 5:45 PM",
116116
datetime(2024, 7, 15, 19, 30, 0, tzinfo=UTC),
117117
datetime(2024, 7, 15, 21, 45, 0, tzinfo=UTC),
118118
),
119119
(
120120
"2024-07-15",
121121
"2024-07-15",
122-
"3:30 PM - 5:30 PM pst",
123-
datetime(2024, 7, 15, 23, 30, 0, tzinfo=UTC),
124-
datetime(2024, 7, 16, 1, 30, 0, tzinfo=UTC),
122+
"3:30 PM - 5:30 PM pdt", # Should figure out this is Pacific Daylight Time
123+
datetime(2024, 7, 15, 22, 30, 0, tzinfo=UTC),
124+
datetime(2024, 7, 16, 0, 30, 0, tzinfo=UTC),
125125
),
126126
(
127127
"Future date tbd",
@@ -134,8 +134,78 @@ def test_get_request_json_error_raise(mocker):
134134
"2024-07-15",
135135
"2024-07-30",
136136
"Every afternoon after end of class",
137-
datetime(2024, 7, 15, 4, 0, 0, tzinfo=UTC),
138-
datetime(2024, 7, 30, 4, 0, 0, tzinfo=UTC),
137+
datetime(2024, 7, 15, 16, 0, 0, tzinfo=UTC),
138+
datetime(2024, 7, 30, 16, 0, 0, tzinfo=UTC),
139+
),
140+
(
141+
"2024-07-15",
142+
"2024-07-15",
143+
"1pm",
144+
datetime(2024, 7, 15, 17, 0, 0, tzinfo=UTC),
145+
datetime(2024, 7, 15, 17, 0, 0, tzinfo=UTC),
146+
),
147+
(
148+
"2024-07-15",
149+
"2024-07-15",
150+
"8 to 1pm", # Should correctly guess that 8 is AM
151+
datetime(2024, 7, 15, 12, 0, 0, tzinfo=UTC),
152+
datetime(2024, 7, 15, 17, 0, 0, tzinfo=UTC),
153+
),
154+
(
155+
"2024-07-15",
156+
"2024-07-15",
157+
"8 AM to 1", # Should correctly guess that 1 is PM
158+
datetime(2024, 7, 15, 12, 0, 0, tzinfo=UTC),
159+
datetime(2024, 7, 15, 17, 0, 0, tzinfo=UTC),
160+
),
161+
(
162+
"2024-12-15",
163+
"2024-12-15",
164+
"11 to 12 pm", # Should correctly guess that 11 is AM
165+
datetime(2024, 12, 15, 16, 0, 0, tzinfo=UTC),
166+
datetime(2024, 12, 15, 17, 0, 0, tzinfo=UTC),
167+
),
168+
(
169+
"2024-07-15",
170+
"2024-07-15",
171+
"Beginning at 4:30 and ending at about 6pm",
172+
datetime(2024, 7, 15, 20, 30, 0, tzinfo=UTC),
173+
datetime(2024, 7, 15, 22, 0, 0, tzinfo=UTC),
174+
),
175+
(
176+
"2024-07-15",
177+
"2024-07-15",
178+
"3:00pm; weather permitting",
179+
datetime(2024, 7, 15, 19, 0, 0, tzinfo=UTC),
180+
datetime(2024, 7, 15, 19, 0, 0, tzinfo=UTC),
181+
),
182+
(
183+
"2024-07-15",
184+
"2024-07-15",
185+
"3:00pm; doors open at 2:30pm", # Ignore any end time before the start time
186+
datetime(2024, 7, 15, 19, 0, 0, tzinfo=UTC),
187+
datetime(2024, 7, 15, 19, 0, 0, tzinfo=UTC),
188+
),
189+
(
190+
"2024-07-15",
191+
"2024-07-15",
192+
"Beginning at 4:30", # No AM/PM, so take it literally as is
193+
datetime(2024, 7, 15, 8, 30, 0, tzinfo=UTC),
194+
datetime(2024, 7, 15, 8, 30, 0, tzinfo=UTC),
195+
),
196+
(
197+
"2024-07-15",
198+
"2024-07-30",
199+
"Beginning at 16:30", # No AM/PM, so take it literally as is
200+
datetime(2024, 7, 15, 20, 30, 0, tzinfo=UTC),
201+
datetime(2024, 7, 30, 20, 30, 0, tzinfo=UTC),
202+
),
203+
(
204+
None,
205+
"2024-11-30",
206+
"Bldg. 123, E52nd Street, Salon MIT", # Invalid time, default to noon Eastern time, convert to UTC
207+
datetime(2024, 11, 30, 17, 0, 0, tzinfo=UTC),
208+
datetime(2024, 11, 30, 17, 0, 0, tzinfo=UTC),
139209
),
140210
],
141211
)

0 commit comments

Comments
 (0)