|
2 | 2 |
|
3 | 3 | import logging
|
4 | 4 | import re
|
| 5 | +from dataclasses import dataclass |
5 | 6 | from datetime import UTC, datetime
|
6 | 7 | from time import mktime, struct_time
|
| 8 | +from typing import Optional |
7 | 9 | from zoneinfo import ZoneInfo
|
8 | 10 |
|
9 | 11 | import dateparser
|
|
17 | 19 | log = logging.getLogger(__name__)
|
18 | 20 |
|
19 | 21 |
|
@dataclass
class FormattedTime:
    """
    Components of a single clock time, kept as strings so they can be
    concatenated into a date/time string for parsing.
    """

    # Hour digits as matched, e.g. "8" or "12"
    hour: Optional[str]
    # Minute part including the leading colon, e.g. ":30"
    minute: Optional[str]
    # AM/PM marker, or "" when it is not known
    ampm: Optional[str]
    # Timezone abbreviation, or "" when it is not known
    tz: Optional[str]
| 28 | + |
| 29 | + |
20 | 30 | def get_soup(url: str) -> Soup:
|
21 | 31 | """
|
22 | 32 | Get a BeautifulSoup object from a URL.
|
@@ -154,47 +164,102 @@ def convert_to_utc(dt: datetime, known_tz: str) -> datetime:
|
154 | 164 | return dt.astimezone(UTC)
|
155 | 165 |
|
156 | 166 |
|
def format_time(matched_time: re.Match) -> FormattedTime:
    """
    Format a time regex match into a standard format

    The match must expose three groups: the hour digits, an optional
    ":MM" minute part, and the text trailing the time, which is searched
    for an AM/PM marker and an optional timezone abbreviation.

    Args:
        matched_time (re.Match): regex match for a single time

    Returns:
        FormattedTime: A formatted time object, with "" for any part
            that was not found
    """
    # Regex for AM/PM and timezone
    # NOTE(review): the pattern is unanchored, so "am"/"pm" inside a longer
    # word, and the first 2-3 letters of any word that follows, are also
    # accepted - confirm this is intended.
    ampm_tz_regex = re.compile(r"(am|pm)\s*([A-Za-z]{2,3})?", re.IGNORECASE)
    ampm, tz = "", ""
    hour = matched_time.group(1) or ""
    # Assume the top of the hour when only an hour was matched
    minute = matched_time.group(2) or (":00" if hour else "")
    ampm_and_tz_match = re.search(ampm_tz_regex, matched_time.group(3) or "")
    if ampm_and_tz_match:
        ampm = ampm_and_tz_match.group(1) or ""
        tz = ampm_and_tz_match.group(2) or ""
    return FormattedTime(hour, minute, ampm, tz)
| 187 | + |
| 188 | + |
def _parse_date_at_time(
    date_str: str, time: FormattedTime, fallback: FormattedTime
) -> tuple[Optional[datetime], str]:
    """Parse date_str at the given time, retrying once with the fallback time."""
    candidates = (time,) if time is fallback else (time, fallback)
    for candidate in candidates:
        parsed = dateparser.parse(
            f"{date_str} {candidate.hour}{candidate.minute} "
            f"{candidate.ampm} {candidate.tz}"
        )
        if parsed:
            # Report the timezone that belongs to the time actually used
            return parsed, candidate.tz
    return None, time.tz


def parse_date_time_range(
    start_date_str: str, end_date_str: str, time_range_str: str
) -> tuple[datetime, datetime]:
    """
    Attempt to parse the time range from the MITPE events API.
    If the time cannot be parsed, default to noon Eastern time,
    then convert to UTC.
    The field might not always contain a valid time/range.

    Args:
        start_date_str (str): start date string
        end_date_str (str): end date string
        time_range_str (str): time range string

    Returns:
        tuple(datetime, datetime): start and end datetimes in UTC timezone

    """
    # A missing start or end date falls back to the other one
    if not end_date_str:
        end_date_str = start_date_str
    if not start_date_str:
        start_date_str = end_date_str
    default_time = FormattedTime("12", ":00", "PM", "")
    # Set start/end times to noon as default
    start_time, end_time = (default_time, default_time)
    # Try to split the string into start and end times
    split_times = list(
        re.finditer(
            re.compile(r"(\d{1,2})(:\d{2})?(\D*)", re.IGNORECASE), time_range_str or ""
        )
    )
    if split_times:
        # At least one time match was found
        formatted_times = [format_time(time_match) for time_match in split_times]
        # make ruff happy
        TWO = 2
        TWELVE = 12
        if len(formatted_times) == TWO:
            # Both start and end times were found
            start_time, end_time = formatted_times
            if start_time.hour and end_time.hour:
                # Times must at least have an hour to be valid
                start_hour = int(start_time.hour)
                end_hour = int(end_time.hour)
                if start_hour > end_hour or (
                    end_hour == TWELVE and start_hour < TWELVE
                ):
                    # Examples: 8 - 1 PM, 10 - 12 PM
                    start_time.ampm = start_time.ampm or "AM"
                    end_time.ampm = end_time.ampm or "PM"
                else:
                    # Anything else, if AM/PM missing for one, set it to the other,
                    # or "" if both are missing
                    start_time.ampm = start_time.ampm or end_time.ampm or ""
                    end_time.ampm = end_time.ampm or start_time.ampm or ""
                # If timezone missing for one, set it to the other,
                # or "" if both are missing
                start_time.tz = start_time.tz or end_time.tz or ""
                end_time.tz = end_time.tz or start_time.tz or ""
        elif len(formatted_times) == 1:
            # Only one time was found, set both start and end to that time
            start_time = formatted_times[0]
            end_time = start_time
        # Any other number of matches is ambiguous, so the noon default is kept

    # If the extracted time makes the string unparseable, retry at noon
    start_date, start_tz = _parse_date_at_time(
        start_date_str, start_time, default_time
    )
    end_date, end_tz = _parse_date_at_time(end_date_str, end_time, default_time)
    if not start_date:
        log.error("Failed to parse start date %s", start_date_str)
    if not end_date:
        log.error("Failed to parse end date %s", end_date_str)
    return convert_to_utc(start_date, start_tz), convert_to_utc(end_date, end_tz)
0 commit comments