Skip to content

Commit e41d4e5

Browse files
authored
Force https for urls in scraper (#2222)
* Force http to https
* Fix util method to convert to https
* Add test mode field
* Add filters to pull in test_mode resources
* Revert "adding test mode field" (this reverts commit 9af6840)
* Revert "adding filters to pull in test_mode resources" (this reverts commit 2ff2ac9)
1 parent e8596b6 commit e41d4e5

File tree

2 files changed

+20
-0
lines changed

2 files changed

+20
-0
lines changed

learning_resources/site_scrapers/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66

77
def scraper_for_site(url):
8+
url = url.replace("http://", "https://")
89
for pattern in SITE_SCRAPER_MAP:
910
if re.search(pattern, url):
1011
return SITE_SCRAPER_MAP[pattern](url)

learning_resources/site_scrapers/utils_test.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,22 @@ def test_scraper_for_site(mocker, url, expected_scraper_class):
2929

3030
scraper = scraper_for_site(url)
3131
assert isinstance(scraper, expected_scraper_class)
32+
33+
34+
@pytest.mark.parametrize(
    "url",
    [
        "http://example.com",
        "http://micromasters.mit.edu/ds/",
        "http://unknownsite.com",
        "http://executive.mit.edu/course/innovation-executive-academy/a05U1000005l8nFIAQ.html",
    ],
)
def test_scraper_forces_https(mocker, url):
    """
    Check that the scraper returned for an http:// url has an https:// start url
    """
    # Every parametrized url uses the plain http:// scheme; the scraper built
    # for it must expose a start_url that no longer contains that scheme.
    start_url = scraper_for_site(url).start_url
    assert "http://" not in start_url
    assert "https://" in start_url

0 commit comments

Comments
 (0)