Skip to content

Commit 4f82990

Browse files
authored
multi page marketing site scraping (#2196)
* adding initial base scraper * making scrape task use scraper class * adding utils file * adding utils file * fixing mock * test fixes * adding implicit wait * adding test for utils
1 parent d129de8 commit 4f82990

11 files changed

+165
-81
lines changed

learning_resources/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def add_file_to_bucket_recursive(bucket, file_base, s3_base, file_object):
8787
@pytest.fixture(autouse=True)
8888
def marketing_metadata_mocks(mocker):
8989
mocker.patch(
90-
"learning_resources.utils.fetch_page",
90+
"learning_resources.site_scrapers.base_scraper.BaseScraper.fetch_page",
9191
return_value="""
9292
<html>
9393
<body>

learning_resources/site_scrapers/__init__.py

Whitespace-only changes.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
import logging

import requests
from django.conf import settings
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

from learning_resources.utils import get_web_driver

logger = logging.getLogger(__name__)
10+
11+
12+
class BaseScraper:
    """Fetch marketing page content for a learning resource URL.

    Uses a Selenium webdriver (for js-rendered pages) when
    ``settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER`` is enabled,
    otherwise a plain HTTP GET via ``requests``.
    """

    # Populated in __init__ only when the webdriver is enabled.
    driver = None

    def __init__(self, start_url):
        self.start_url = start_url
        # Read the setting at instantiation time, not class-definition time,
        # so runtime/per-test overrides of settings actually take effect.
        self.use_webdriver = settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER
        if self.use_webdriver:
            self.driver = get_web_driver()

    def fetch_page(self, url):
        """Return the HTML of ``url`` as a string, or None on failure.

        Args:
            url: The page URL to fetch; falsy values short-circuit to None.
        """
        if not url:
            return None
        if self.driver:
            self.driver.get(url)
            try:
                WebDriverWait(self.driver, 10).until(
                    lambda d: d.execute_script("return document.readyState")
                    == "complete"
                )
            except TimeoutException:
                # The page never reported readyState == "complete"; fall
                # through and return whatever has rendered so far instead of
                # letting the exception abort the whole scrape.
                logger.warning("Timed out waiting for page load of %s", url)
            return self.driver.execute_script("return document.body.innerHTML")
        try:
            response = requests.get(url, timeout=10)
            if response.ok:
                return response.text
        except requests.exceptions.RequestException:
            logger.exception("Error fetching page from %s", url)
        return None

    def scrape(self):
        """Fetch and return the content of ``self.start_url`` (None on failure)."""
        page_content = self.fetch_page(self.start_url)
        if page_content:
            return page_content
        logger.error("Failed to fetch page content from %s", self.start_url)
        return None
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from learning_resources.site_scrapers.mitx_program_page_scraper import (
2+
MITXProgramPageScraper,
3+
)
4+
from learning_resources.site_scrapers.sloan_course_page_scraper import (
5+
SloanCoursePageScraper,
6+
)
7+
8+
# Maps URL regex patterns (matched with re.search) to the scraper class to use
# for that site. URLs matching no pattern fall back to the generic BaseScraper.
# Dots are escaped so e.g. "executiveXmit.edu" does not match.
SITE_SCRAPER_MAP = {
    # Sloan executive-education course pages
    r"^https://executive\.mit\.edu/course/": SloanCoursePageScraper,
    # MITx MicroMasters program pages (trailing slash required)
    r"https://micromasters\.mit\.edu/(.*?)/$": MITXProgramPageScraper,
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from selenium.webdriver.common.by import By
2+
3+
from learning_resources.site_scrapers.base_scraper import BaseScraper
4+
5+
6+
class MITXProgramPageScraper(BaseScraper):
    """Scraper for MITx MicroMasters program pages.

    In addition to the start page, follows each "tab-link" element (the
    program pages split content across js tabs) and appends that page's
    content so the full program description is captured.
    """

    def scrape(self, *args, **kwargs):
        """Return the start page content plus the content of every tab link."""
        # Guard against None from the base scrape: appending tab content to
        # None would raise TypeError.
        content = super().scrape(*args, **kwargs) or ""
        if self.driver:
            # Collect hrefs before navigating: driver.get() invalidates the
            # element handles returned by find_elements.
            extra_links = [
                link.get_attribute("href")
                for link in self.driver.find_elements(By.CLASS_NAME, "tab-link")
                if link.get_attribute("href") != self.start_url
            ]
            for link_url in extra_links:
                page_content = self.fetch_page(link_url)
                if page_content:
                    content += page_content
        # Preserve the None-on-total-failure contract of BaseScraper.scrape.
        return content or None
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from selenium.common.exceptions import (
2+
ElementNotInteractableException,
3+
JavascriptException,
4+
NoSuchElementException,
5+
TimeoutException,
6+
)
7+
from selenium.webdriver.common.by import By
8+
from selenium.webdriver.support import expected_conditions
9+
from selenium.webdriver.support.ui import WebDriverWait
10+
11+
from learning_resources.site_scrapers.base_scraper import BaseScraper
12+
13+
14+
class SloanCoursePageScraper(BaseScraper):
    """Scraper for Sloan executive-education course pages."""

    def webdriver_fetch_extra_elements(self):
        """
        Attempt to fetch any extra possible js loaded elements that
        require interaction to display.

        Clicks each known tab so its js-loaded panel renders, then returns
        the resulting page HTML.
        """
        errors = [
            NoSuchElementException,
            JavascriptException,
            ElementNotInteractableException,
            TimeoutException,
        ]
        wait = WebDriverWait(
            self.driver, timeout=0.1, poll_frequency=0.01, ignored_exceptions=errors
        )
        for tab_id in ["faculty-tab", "reviews-tab", "participants-tab"]:
            try:
                wait.until(
                    expected_conditions.visibility_of_element_located((By.ID, tab_id))
                )
            except TimeoutException:
                # ignored_exceptions only suppresses errors raised while
                # polling; wait.until still raises TimeoutException when the
                # element never appears. A missing tab must not abort the
                # whole scrape — skip it and try the next one.
                continue
            self.driver.execute_script(f"document.getElementById('{tab_id}').click()")
        return self.driver.execute_script("return document.body.innerHTML")

    def scrape(self, *args, **kwargs):
        """Return page content, expanding js tab panels when a driver is used."""
        content = super().scrape(*args, **kwargs)
        if self.driver:
            # Re-read the DOM after clicking the tabs so their panels are
            # included in the returned HTML.
            content = self.webdriver_fetch_extra_elements()
        return content
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import re
2+
3+
from learning_resources.site_scrapers.base_scraper import BaseScraper
4+
from learning_resources.site_scrapers.constants import SITE_SCRAPER_MAP
5+
6+
7+
def scraper_for_site(url):
    """Return a scraper instance appropriate for ``url``.

    The first pattern in SITE_SCRAPER_MAP that matches (via re.search)
    selects the scraper class; URLs matching no pattern get the generic
    BaseScraper.
    """
    for pattern, scraper_class in SITE_SCRAPER_MAP.items():
        if re.search(pattern, url):
            return scraper_class(url)
    return BaseScraper(url)
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pytest
2+
3+
from learning_resources.site_scrapers.base_scraper import BaseScraper
4+
from learning_resources.site_scrapers.mitx_program_page_scraper import (
5+
MITXProgramPageScraper,
6+
)
7+
from learning_resources.site_scrapers.sloan_course_page_scraper import (
8+
SloanCoursePageScraper,
9+
)
10+
from learning_resources.site_scrapers.utils import scraper_for_site
11+
12+
13+
@pytest.mark.parametrize(
    ("url", "expected_scraper_class"),
    [
        ("https://example.com", BaseScraper),
        ("https://micromasters.mit.edu/ds/", MITXProgramPageScraper),
        ("https://unknownsite.com", BaseScraper),
        (
            "https://executive.mit.edu/course/innovation-executive-academy/a05U1000005l8nFIAQ.html",
            SloanCoursePageScraper,
        ),
    ],
)
def test_scraper_for_site(url, expected_scraper_class):
    """
    Test that scraper_for_site returns the correct scraper class based on the URL
    """
    # The unused `mocker` fixture argument was dropped; nothing here is mocked.
    scraper = scraper_for_site(url)
    assert isinstance(scraper, expected_scraper_class)

learning_resources/tasks.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@
2222
from learning_resources.etl.pipelines import ocw_courses_etl
2323
from learning_resources.etl.utils import get_learning_course_bucket_name
2424
from learning_resources.models import ContentFile, LearningResource
25-
from learning_resources.utils import fetch_page, html_to_markdown, load_course_blocklist
25+
from learning_resources.site_scrapers.utils import scraper_for_site
26+
from learning_resources.utils import html_to_markdown, load_course_blocklist
2627
from learning_resources_search.exceptions import RetryError
2728
from main.celery import app
2829
from main.constants import ISOFORMAT
@@ -503,7 +504,8 @@ def scrape_marketing_pages(self):
503504
def marketing_page_for_resources(resource_ids):
504505
for learning_resource in LearningResource.objects.filter(id__in=resource_ids):
505506
marketing_page_url = learning_resource.url
506-
page_content = fetch_page(marketing_page_url)
507+
scraper = scraper_for_site(marketing_page_url)
508+
page_content = scraper.scrape()
507509
if page_content:
508510
content_file, _ = ContentFile.objects.update_or_create(
509511
learning_resource=learning_resource,

learning_resources/tasks_test.py

Lines changed: 2 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,6 @@
2525
scrape_marketing_pages,
2626
update_next_start_date_and_prices,
2727
)
28-
from learning_resources.utils import (
29-
fetch_page,
30-
)
3128

3229
pytestmark = pytest.mark.django_db
3330
# pylint:disable=redefined-outer-name,unused-argument,too-many-arguments
@@ -468,31 +465,6 @@ def test_summarize_unprocessed_content(
468465
assert get_unprocessed_content_file_ids_mock.call_count == 0 if ids else 1
469466

470467

471-
@pytest.mark.parametrize("use_webdriver", [True], ids=["with_webdriver"])
472-
def test_fetch_page_with_webdriver(mocker, use_webdriver, settings):
473-
"""Test that fetch_page uses WebDriver when settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER is True"""
474-
475-
settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER = use_webdriver
476-
477-
mock_driver = mocker.MagicMock()
478-
mock_driver.execute_script.return_value = "<html><body>Page content</body></html>"
479-
mock_get_web_driver = mocker.patch(
480-
"learning_resources.utils._get_web_driver", return_value=mock_driver
481-
)
482-
mock_webdriver_fetch_extra = mocker.patch(
483-
"learning_resources.utils._webdriver_fetch_extra_elements"
484-
)
485-
486-
url = "https://example.com/course"
487-
result = fetch_page(url, use_webdriver=use_webdriver)
488-
489-
assert result == "<html><body>Page content</body></html>"
490-
mock_get_web_driver.assert_called_once()
491-
mock_driver.get.assert_called_once_with(url)
492-
mock_webdriver_fetch_extra.assert_called_once_with(mock_driver)
493-
mock_driver.execute_script.assert_called_once_with("return document.body.innerHTML")
494-
495-
496468
@pytest.mark.django_db
497469
def test_marketing_page_for_resources_with_webdriver(mocker, settings):
498470
"""Test that marketing_page_for_resources uses WebDriver to fetch content"""
@@ -508,7 +480,8 @@ def test_marketing_page_for_resources_with_webdriver(mocker, settings):
508480

509481
html_content = "<html><body><h1>Test Course</h1><p>Course content</p></body></html>"
510482
mock_fetch_page = mocker.patch(
511-
"learning_resources.tasks.fetch_page", return_value=html_content
483+
"learning_resources.site_scrapers.base_scraper.BaseScraper.fetch_page",
484+
return_value=html_content,
512485
)
513486

514487
markdown_content = "# Test Course\n\nCourse content"

learning_resources/utils.py

Lines changed: 1 addition & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,7 @@
1717
from django.db.models import Q
1818
from retry import retry
1919
from selenium import webdriver
20-
from selenium.common.exceptions import (
21-
ElementNotInteractableException,
22-
JavascriptException,
23-
NoSuchElementException,
24-
TimeoutException,
25-
)
2620
from selenium.webdriver.chrome.options import Options
27-
from selenium.webdriver.common.by import By
28-
from selenium.webdriver.support import expected_conditions
29-
from selenium.webdriver.support.ui import WebDriverWait
3021

3122
from learning_resources.constants import (
3223
GROUP_STAFF_LISTS_EDITORS,
@@ -619,7 +610,7 @@ def html_to_markdown(html):
619610

620611

621612
@cache
622-
def _get_web_driver():
613+
def get_web_driver():
623614
service = webdriver.ChromeService(executable_path=which("chromedriver"))
624615
chrome_options = Options()
625616
chrome_options.add_argument("--headless=new")
@@ -633,45 +624,6 @@ def _get_web_driver():
633624
return webdriver.Chrome(service=service, options=chrome_options)
634625

635626

636-
def _webdriver_fetch_extra_elements(driver):
637-
"""
638-
Attempt to Fetch any extra possible js loaded elements that
639-
require interaction to display
640-
"""
641-
errors = [
642-
NoSuchElementException,
643-
JavascriptException,
644-
ElementNotInteractableException,
645-
TimeoutException,
646-
]
647-
wait = WebDriverWait(
648-
driver, timeout=0.1, poll_frequency=0.01, ignored_exceptions=errors
649-
)
650-
for tab_id in ["faculty-tab", "reviews-tab", "participants-tab"]:
651-
wait.until(expected_conditions.visibility_of_element_located((By.ID, tab_id)))
652-
driver.execute_script(f"document.getElementById('{tab_id}').click()")
653-
654-
655-
def fetch_page(url, use_webdriver=settings.EMBEDDINGS_EXTERNAL_FETCH_USE_WEBDRIVER):
656-
if url:
657-
if use_webdriver:
658-
driver = _get_web_driver()
659-
driver.get(url)
660-
try:
661-
_webdriver_fetch_extra_elements(driver)
662-
except TimeoutException:
663-
log.warning("Error custom elements page from %s", url)
664-
return driver.execute_script("return document.body.innerHTML")
665-
else:
666-
try:
667-
response = requests.get(url, timeout=10)
668-
if response.ok:
669-
return response.text
670-
except requests.exceptions.RequestException:
671-
log.exception("Error fetching page from %s", url)
672-
return None
673-
674-
675627
def json_to_markdown(obj, indent=0):
676628
"""
677629
Recursively converts a JSON object into a readable

0 commit comments

Comments
 (0)