From d60f615d9a05f3489d7e48246cce371911991c65 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Wed, 30 Apr 2025 13:48:01 -0400 Subject: [PATCH 1/6] force http to https --- .../site_scrapers/base_scraper.py | 2 +- .../site_scrapers/utils_test.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/learning_resources/site_scrapers/base_scraper.py b/learning_resources/site_scrapers/base_scraper.py index 6276ee58a8..b980d194ac 100644 --- a/learning_resources/site_scrapers/base_scraper.py +++ b/learning_resources/site_scrapers/base_scraper.py @@ -14,7 +14,7 @@ class BaseScraper: driver = None def __init__(self, start_url): - self.start_url = start_url + self.start_url = start_url.replace("http://", "https://") if self.use_webdriver: self.driver = get_web_driver() diff --git a/learning_resources/site_scrapers/utils_test.py b/learning_resources/site_scrapers/utils_test.py index 181f963d0a..f15d8688a7 100644 --- a/learning_resources/site_scrapers/utils_test.py +++ b/learning_resources/site_scrapers/utils_test.py @@ -29,3 +29,22 @@ def test_scraper_for_site(mocker, url, expected_scraper_class): scraper = scraper_for_site(url) assert isinstance(scraper, expected_scraper_class) + + +@pytest.mark.parametrize( + "url", + [ + "http://example.com", + "http://micromasters.mit.edu/ds/", + "http://unknownsite.com", + "http://executive.mit.edu/course/innovation-executive-academy/a05U1000005l8nFIAQ.html", + ], +) +def test_scraper_forces_https(mocker, url): + """ + Test that the scraper class forces https for the start url + """ + + scraper = scraper_for_site(url) + assert "http://" not in scraper.start_url + assert "https://" in scraper.start_url From d51b7061fc09a68d92904ac09ce8646da1887a81 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Wed, 30 Apr 2025 14:36:04 -0400 Subject: [PATCH 2/6] fixing util method to convert to https --- learning_resources/site_scrapers/base_scraper.py | 2 +- learning_resources/site_scrapers/utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/learning_resources/site_scrapers/base_scraper.py b/learning_resources/site_scrapers/base_scraper.py index b980d194ac..6276ee58a8 100644 --- a/learning_resources/site_scrapers/base_scraper.py +++ b/learning_resources/site_scrapers/base_scraper.py @@ -14,7 +14,7 @@ class BaseScraper: driver = None def __init__(self, start_url): - self.start_url = start_url.replace("http://", "https://") + self.start_url = start_url if self.use_webdriver: self.driver = get_web_driver() diff --git a/learning_resources/site_scrapers/utils.py b/learning_resources/site_scrapers/utils.py index 102ba88dae..ff6fabbe18 100644 --- a/learning_resources/site_scrapers/utils.py +++ b/learning_resources/site_scrapers/utils.py @@ -5,6 +5,7 @@ def scraper_for_site(url): + url = url.replace("http://", "https://") for pattern in SITE_SCRAPER_MAP: if re.search(pattern, url): return SITE_SCRAPER_MAP[pattern](url) From 9af68402ba4d41924f5d73b0ff7f8b883f8b082e Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 1 May 2025 11:59:16 -0400 Subject: [PATCH 3/6] adding test mode field --- .../0089_learningresource_test_mode.py | 17 +++++++++++++++++ learning_resources/models.py | 3 +++ 2 files changed, 20 insertions(+) create mode 100644 learning_resources/migrations/0089_learningresource_test_mode.py diff --git a/learning_resources/migrations/0089_learningresource_test_mode.py b/learning_resources/migrations/0089_learningresource_test_mode.py new file mode 100644 index 0000000000..d753bd8d7b --- /dev/null +++ b/learning_resources/migrations/0089_learningresource_test_mode.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.20 on 2025-04-30 19:13 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("learning_resources", "0088_add_content_summarizer_config"), + ] + + operations = [ + migrations.AddField( + model_name="learningresource", + name="test_mode", + field=models.BooleanField(default=False), + ), + ] diff --git a/learning_resources/models.py b/learning_resources/models.py index 7814aff8da..a6a6df593b 100644 --- a/learning_resources/models.py +++ b/learning_resources/models.py @@ -467,6 +467,9 @@ class LearningResource(TimestampedModel): default=default_delivery, ) license_cc = models.BooleanField(default=False) + + test_mode = models.BooleanField(default=False) + continuing_ed_credits = models.DecimalField( max_digits=5, decimal_places=2, null=True, blank=True ) From 2ff2ac9785a8ee72c69757b7d2e38dee738849cd Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 1 May 2025 12:02:12 -0400 Subject: [PATCH 4/6] adding filters to pull in test_mode resources --- learning_resources/etl/edx_shared.py | 6 +++++- .../commands/backpopulate_mit_edx_files.py | 5 ++++- .../commands/backpopulate_mitxonline_files.py | 5 ++++- .../commands/backpopulate_oll_files.py | 4 +++- .../commands/backpopulate_xpro_files.py | 5 ++++- .../management/commands/mixins.py | 17 +++++++++++++++++ learning_resources/tasks.py | 9 +++++---- 7 files changed, 42 insertions(+), 9 deletions(-) create mode 100644 learning_resources/management/commands/mixins.py diff --git a/learning_resources/etl/edx_shared.py b/learning_resources/etl/edx_shared.py index c992d4af3b..15a0076847 100644 --- a/learning_resources/etl/edx_shared.py +++ b/learning_resources/etl/edx_shared.py @@ -6,6 +6,8 @@ from tarfile import ReadError from tempfile import TemporaryDirectory +from django.db.models import Q + from learning_resources.etl.constants import ETLSource from learning_resources.etl.loaders import load_content_files from learning_resources.etl.utils import ( @@ -112,9 +114,11 @@ def sync_edx_course_files( runs = LearningResourceRun.objects.filter( learning_resource__etl_source=etl_source, learning_resource_id__in=ids, - learning_resource__published=True, published=True, + ).filter( + Q(learning_resource__published=True) | Q(learning_resource__test_mode=True) ) + if etl_source == ETLSource.mit_edx.name: # Additional processing of run ids and tarfile names, # because edx data is structured differently diff --git a/learning_resources/management/commands/backpopulate_mit_edx_files.py b/learning_resources/management/commands/backpopulate_mit_edx_files.py index de6a57098a..ada83afd3b 100644 --- a/learning_resources/management/commands/backpopulate_mit_edx_files.py +++ b/learning_resources/management/commands/backpopulate_mit_edx_files.py @@ -3,16 +3,18 @@ from django.conf import settings from django.core.management import BaseCommand +from learning_resources.management.commands.mixins import BaseCommandMixin from learning_resources.tasks import import_all_mit_edx_files from main.utils import now_in_utc -class Command(BaseCommand): +class Command(BaseCommandMixin, BaseCommand): """Populate MIT edX course run files""" help = "Populate MIT edX course run files" def add_arguments(self, parser): + super().add_arguments(parser) parser.add_argument( "-c", "--chunk-size", @@ -43,6 +45,7 @@ def handle(self, *args, **options): # noqa: ARG002 if options["learning_resource_ids"] else None ) + self.configure_test_resources(options) task = import_all_mit_edx_files.delay( chunk_size=chunk_size, overwrite=options["force_overwrite"], diff --git a/learning_resources/management/commands/backpopulate_mitxonline_files.py b/learning_resources/management/commands/backpopulate_mitxonline_files.py index f772fb6a9a..a07ec98a56 100644 --- a/learning_resources/management/commands/backpopulate_mitxonline_files.py +++ b/learning_resources/management/commands/backpopulate_mitxonline_files.py @@ -2,17 +2,19 @@ from django.core.management import BaseCommand +from learning_resources.management.commands.mixins import BaseCommandMixin from learning_resources.tasks import import_all_mitxonline_files from main import settings from main.utils import now_in_utc -class Command(BaseCommand): +class Command(BaseCommandMixin, BaseCommand): """Populate mitxonline course run files""" help = "Populate mitxonline course run files" def add_arguments(self, parser): + super().add_arguments(parser) parser.add_argument( "-c", "--chunk-size", @@ -43,6 +45,7 @@ def handle(self, *args, **options): # noqa: ARG002 if options["learning_resource_ids"] else None ) + self.configure_test_resources(options) task = import_all_mitxonline_files.delay( chunk_size=chunk_size, overwrite=options["force_overwrite"], diff --git a/learning_resources/management/commands/backpopulate_oll_files.py b/learning_resources/management/commands/backpopulate_oll_files.py index bb6c1ca911..e4987ce72f 100644 --- a/learning_resources/management/commands/backpopulate_oll_files.py +++ b/learning_resources/management/commands/backpopulate_oll_files.py @@ -2,12 +2,13 @@ from django.core.management import BaseCommand +from learning_resources.management.commands.mixins import BaseCommandMixin from learning_resources.tasks import import_all_oll_files from main import settings from main.utils import now_in_utc -class Command(BaseCommand): +class Command(BaseCommandMixin, BaseCommand): """Populate OLL course run files""" help = "Populate OLL course run files" @@ -46,6 +47,7 @@ def handle(self, *args, **options): # noqa: ARG002 if options["learning_resource_ids"] else None ) + self.configure_test_resources(options) task = import_all_oll_files.delay( chunk_size=chunk_size, overwrite=options["force_overwrite"], diff --git a/learning_resources/management/commands/backpopulate_xpro_files.py b/learning_resources/management/commands/backpopulate_xpro_files.py index 88d92b97b4..4996e5a709 100644 --- a/learning_resources/management/commands/backpopulate_xpro_files.py +++ b/learning_resources/management/commands/backpopulate_xpro_files.py @@ -3,16 +3,18 @@ from django.conf import settings from django.core.management import BaseCommand +from learning_resources.management.commands.mixins import BaseCommandMixin from learning_resources.tasks import import_all_xpro_files from main.utils import now_in_utc -class Command(BaseCommand): +class Command(BaseCommandMixin, BaseCommand): """Populate xpro course run files""" help = "Populate xpro course run files" def add_arguments(self, parser): + super().add_arguments(parser) parser.add_argument( "-c", "--chunk-size", @@ -42,6 +44,7 @@ def handle(self, *args, **options): # noqa: ARG002 if options["learning_resource_ids"] else None ) + self.configure_test_resources(options) task = import_all_xpro_files.delay( chunk_size=chunk_size, overwrite=options["force_overwrite"], diff --git a/learning_resources/management/commands/mixins.py b/learning_resources/management/commands/mixins.py new file mode 100644 index 0000000000..316d986f9e --- /dev/null +++ b/learning_resources/management/commands/mixins.py @@ -0,0 +1,17 @@ +from learning_resources.models import LearningResource + + +class BaseCommandMixin: + def add_arguments(self, parser): + parser.add_argument( + "--test-ids", + dest="test_ids", + help="List of readable IDs to use for testing", + ) + + def configure_test_resources(self, options): + if options["test_ids"]: + test_ids = options["test_ids"].split(",") + LearningResource.objects.filter(id__in=test_ids).update( + test_mode=True, published=False + ) diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py index d2dd0c7d3d..2f59453ad6 100644 --- a/learning_resources/tasks.py +++ b/learning_resources/tasks.py @@ -9,6 +9,7 @@ import boto3 import celery from django.conf import settings +from django.db.models import Q from django.utils import timezone from learning_resources.content_summarizer import ContentSummarizer @@ -52,7 +53,8 @@ def get_micromasters_data(): @app.task def get_mit_edx_data( - api_course_datafile: str | None = None, api_program_datafile: str | None = None + api_course_datafile: str | None = None, + api_program_datafile: str | None = None, ) -> int: """Task to sync MIT edX data with the database @@ -169,9 +171,8 @@ def get_content_tasks( # noqa: PLR0913 ).values_list("id", flat=True) else: learning_resources = ( - LearningResource.objects.filter( - published=True, course__isnull=False, etl_source=etl_source - ) + LearningResource.objects.filter(Q(published=True) | Q(test_mode=True)) + .filter(course__isnull=False, etl_source=etl_source) .exclude(readable_id__in=blocklisted_ids) .order_by("-id") .values_list("id", flat=True) From 2646c1bac6c4cf9d22a60f534055cc234a1eb9bb Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 1 May 2025 12:16:33 -0400 Subject: [PATCH 5/6] Revert "adding test mode field" This reverts commit 9af68402ba4d41924f5d73b0ff7f8b883f8b082e. --- .../0089_learningresource_test_mode.py | 17 ----------------- learning_resources/models.py | 3 --- 2 files changed, 20 deletions(-) delete mode 100644 learning_resources/migrations/0089_learningresource_test_mode.py diff --git a/learning_resources/migrations/0089_learningresource_test_mode.py b/learning_resources/migrations/0089_learningresource_test_mode.py deleted file mode 100644 index d753bd8d7b..0000000000 --- a/learning_resources/migrations/0089_learningresource_test_mode.py +++ /dev/null @@ -1,17 +0,0 @@ -# Generated by Django 4.2.20 on 2025-04-30 19:13 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("learning_resources", "0088_add_content_summarizer_config"), - ] - - operations = [ - migrations.AddField( - model_name="learningresource", - name="test_mode", - field=models.BooleanField(default=False), - ), - ] diff --git a/learning_resources/models.py b/learning_resources/models.py index a6a6df593b..7814aff8da 100644 --- a/learning_resources/models.py +++ b/learning_resources/models.py @@ -467,9 +467,6 @@ class LearningResource(TimestampedModel): default=default_delivery, ) license_cc = models.BooleanField(default=False) - - test_mode = models.BooleanField(default=False) - continuing_ed_credits = models.DecimalField( max_digits=5, decimal_places=2, null=True, blank=True ) From 18d49ec0ac1afa7fd8163a58147fd8e9ceb5f1f9 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 1 May 2025 12:17:05 -0400 Subject: [PATCH 6/6] Revert "adding filters to pull in test_mode resources" This reverts commit 2ff2ac9785a8ee72c69757b7d2e38dee738849cd. --- learning_resources/etl/edx_shared.py | 6 +----- .../commands/backpopulate_mit_edx_files.py | 5 +---- .../commands/backpopulate_mitxonline_files.py | 5 +---- .../commands/backpopulate_oll_files.py | 4 +--- .../commands/backpopulate_xpro_files.py | 5 +---- .../management/commands/mixins.py | 17 ----------------- learning_resources/tasks.py | 9 ++++----- 7 files changed, 9 insertions(+), 42 deletions(-) delete mode 100644 learning_resources/management/commands/mixins.py diff --git a/learning_resources/etl/edx_shared.py b/learning_resources/etl/edx_shared.py index 15a0076847..c992d4af3b 100644 --- a/learning_resources/etl/edx_shared.py +++ b/learning_resources/etl/edx_shared.py @@ -6,8 +6,6 @@ from tarfile import ReadError from tempfile import TemporaryDirectory -from django.db.models import Q - from learning_resources.etl.constants import ETLSource from learning_resources.etl.loaders import load_content_files from learning_resources.etl.utils import ( @@ -114,11 +112,9 @@ def sync_edx_course_files( runs = LearningResourceRun.objects.filter( learning_resource__etl_source=etl_source, learning_resource_id__in=ids, + learning_resource__published=True, published=True, - ).filter( - Q(learning_resource__published=True) | Q(learning_resource__test_mode=True) ) - if etl_source == ETLSource.mit_edx.name: # Additional processing of run ids and tarfile names, # because edx data is structured differently diff --git a/learning_resources/management/commands/backpopulate_mit_edx_files.py b/learning_resources/management/commands/backpopulate_mit_edx_files.py index ada83afd3b..de6a57098a 100644 --- a/learning_resources/management/commands/backpopulate_mit_edx_files.py +++ b/learning_resources/management/commands/backpopulate_mit_edx_files.py @@ -3,18 +3,16 @@ from django.conf import settings from django.core.management import BaseCommand -from learning_resources.management.commands.mixins import BaseCommandMixin from learning_resources.tasks import import_all_mit_edx_files from main.utils import now_in_utc -class Command(BaseCommandMixin, BaseCommand): +class Command(BaseCommand): """Populate MIT edX course run files""" help = "Populate MIT edX course run files" def add_arguments(self, parser): - super().add_arguments(parser) parser.add_argument( "-c", "--chunk-size", @@ -45,7 +43,6 @@ def handle(self, *args, **options): # noqa: ARG002 if options["learning_resource_ids"] else None ) - self.configure_test_resources(options) task = import_all_mit_edx_files.delay( chunk_size=chunk_size, overwrite=options["force_overwrite"], diff --git a/learning_resources/management/commands/backpopulate_mitxonline_files.py b/learning_resources/management/commands/backpopulate_mitxonline_files.py index a07ec98a56..f772fb6a9a 100644 --- a/learning_resources/management/commands/backpopulate_mitxonline_files.py +++ b/learning_resources/management/commands/backpopulate_mitxonline_files.py @@ -2,19 +2,17 @@ from django.core.management import BaseCommand -from learning_resources.management.commands.mixins import BaseCommandMixin from learning_resources.tasks import import_all_mitxonline_files from main import settings from main.utils import now_in_utc -class Command(BaseCommandMixin, BaseCommand): +class Command(BaseCommand): """Populate mitxonline course run files""" help = "Populate mitxonline course run files" def add_arguments(self, parser): - super().add_arguments(parser) parser.add_argument( "-c", "--chunk-size", @@ -45,7 +43,6 @@ def handle(self, *args, **options): # noqa: ARG002 if options["learning_resource_ids"] else None ) - self.configure_test_resources(options) task = import_all_mitxonline_files.delay( chunk_size=chunk_size, overwrite=options["force_overwrite"], diff --git a/learning_resources/management/commands/backpopulate_oll_files.py b/learning_resources/management/commands/backpopulate_oll_files.py index e4987ce72f..bb6c1ca911 100644 --- a/learning_resources/management/commands/backpopulate_oll_files.py +++ b/learning_resources/management/commands/backpopulate_oll_files.py @@ -2,13 +2,12 @@ from django.core.management import BaseCommand -from learning_resources.management.commands.mixins import BaseCommandMixin from learning_resources.tasks import import_all_oll_files from main import settings from main.utils import now_in_utc -class Command(BaseCommandMixin, BaseCommand): +class Command(BaseCommand): """Populate OLL course run files""" help = "Populate OLL course run files" @@ -47,7 +46,6 @@ def handle(self, *args, **options): # noqa: ARG002 if options["learning_resource_ids"] else None ) - self.configure_test_resources(options) task = import_all_oll_files.delay( chunk_size=chunk_size, overwrite=options["force_overwrite"], diff --git a/learning_resources/management/commands/backpopulate_xpro_files.py b/learning_resources/management/commands/backpopulate_xpro_files.py index 4996e5a709..88d92b97b4 100644 --- a/learning_resources/management/commands/backpopulate_xpro_files.py +++ b/learning_resources/management/commands/backpopulate_xpro_files.py @@ -3,18 +3,16 @@ from django.conf import settings from django.core.management import BaseCommand -from learning_resources.management.commands.mixins import BaseCommandMixin from learning_resources.tasks import import_all_xpro_files from main.utils import now_in_utc -class Command(BaseCommandMixin, BaseCommand): +class Command(BaseCommand): """Populate xpro course run files""" help = "Populate xpro course run files" def add_arguments(self, parser): - super().add_arguments(parser) parser.add_argument( "-c", "--chunk-size", @@ -44,7 +42,6 @@ def handle(self, *args, **options): # noqa: ARG002 if options["learning_resource_ids"] else None ) - self.configure_test_resources(options) task = import_all_xpro_files.delay( chunk_size=chunk_size, overwrite=options["force_overwrite"], diff --git a/learning_resources/management/commands/mixins.py b/learning_resources/management/commands/mixins.py deleted file mode 100644 index 316d986f9e..0000000000 --- a/learning_resources/management/commands/mixins.py +++ /dev/null @@ -1,17 +0,0 @@ -from learning_resources.models import LearningResource - - -class BaseCommandMixin: - def add_arguments(self, parser): - parser.add_argument( - "--test-ids", - dest="test_ids", - help="List of readable IDs to use for testing", - ) - - def configure_test_resources(self, options): - if options["test_ids"]: - test_ids = options["test_ids"].split(",") - LearningResource.objects.filter(id__in=test_ids).update( - test_mode=True, published=False - ) diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py index 2f59453ad6..d2dd0c7d3d 100644 --- a/learning_resources/tasks.py +++ b/learning_resources/tasks.py @@ -9,7 +9,6 @@ import boto3 import celery from django.conf import settings -from django.db.models import Q from django.utils import timezone from learning_resources.content_summarizer import ContentSummarizer @@ -53,8 +52,7 @@ def get_micromasters_data(): @app.task def get_mit_edx_data( - api_course_datafile: str | None = None, - api_program_datafile: str | None = None, + api_course_datafile: str | None = None, api_program_datafile: str | None = None ) -> int: """Task to sync MIT edX data with the database @@ -171,8 +169,9 @@ def get_content_tasks( # noqa: PLR0913 ).values_list("id", flat=True) else: learning_resources = ( - LearningResource.objects.filter(Q(published=True) | Q(test_mode=True)) - .filter(course__isnull=False, etl_source=etl_source) + LearningResource.objects.filter( + published=True, course__isnull=False, etl_source=etl_source + ) .exclude(readable_id__in=blocklisted_ids) .order_by("-id") .values_list("id", flat=True)