diff --git a/learning_resources/site_scrapers/utils.py b/learning_resources/site_scrapers/utils.py
index 102ba88dae..ff6fabbe18 100644
--- a/learning_resources/site_scrapers/utils.py
+++ b/learning_resources/site_scrapers/utils.py
@@ -5,6 +5,7 @@
 
 
 def scraper_for_site(url):
+    url = url.replace("http://", "https://")
     for pattern in SITE_SCRAPER_MAP:
         if re.search(pattern, url):
             return SITE_SCRAPER_MAP[pattern](url)
diff --git a/learning_resources/site_scrapers/utils_test.py b/learning_resources/site_scrapers/utils_test.py
index 181f963d0a..f15d8688a7 100644
--- a/learning_resources/site_scrapers/utils_test.py
+++ b/learning_resources/site_scrapers/utils_test.py
@@ -29,3 +29,22 @@ def test_scraper_for_site(mocker, url, expected_scraper_class):
 
     scraper = scraper_for_site(url)
     assert isinstance(scraper, expected_scraper_class)
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "http://example.com",
+        "http://micromasters.mit.edu/ds/",
+        "http://unknownsite.com",
+        "http://executive.mit.edu/course/innovation-executive-academy/a05U1000005l8nFIAQ.html",
+    ],
+)
+def test_scraper_forces_https(mocker, url):
+    """
+    Test that the scraper class forces https for the start url
+    """
+
+    scraper = scraper_for_site(url)
+    assert "http://" not in scraper.start_url
+    assert "https://" in scraper.start_url