diff --git a/osf/external/spam/tasks.py b/osf/external/spam/tasks.py index e00df54d237..9c2a348cdcd 100644 --- a/osf/external/spam/tasks.py +++ b/osf/external/spam/tasks.py @@ -1,12 +1,14 @@ import re import logging import requests +from framework import sentry from framework.celery_tasks import app as celery_app from framework.postcommit_tasks.handlers import run_postcommit from django.contrib.contenttypes.models import ContentType from django.db import transaction from osf.external.askismet.client import AkismetClient from osf.external.oopspam.client import OOPSpamClient +from osf.utils.fields import ensure_str from website import settings logger = logging.getLogger(__name__) @@ -38,11 +40,8 @@ def reclassify_domain_references(notable_domain_id, current_note, previous_note) item.referrer.save() -def _check_resource_for_domains(guid, content): - from osf.models import Guid, NotableDomain, DomainReference - resource, _ = Guid.load_referent(guid) - if not resource: - return f'{guid} not found' +def _check_resource_for_domains(resource, content): + from osf.models import NotableDomain, DomainReference spammy_domains = [] referrer_content_type = ContentType.objects.get_for_model(resource) @@ -51,7 +50,7 @@ def _check_resource_for_domains(guid, content): domain=domain, defaults={'note': note} ) - if notable_domain.note == NotableDomain.Note.EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT: + if notable_domain.note == NotableDomain.Note.EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT.value: spammy_domains.append(notable_domain.domain) DomainReference.objects.get_or_create( domain=notable_domain, @@ -61,19 +60,8 @@ def _check_resource_for_domains(guid, content): 'is_triaged': notable_domain.note not in (NotableDomain.Note.UNKNOWN, NotableDomain.Note.UNVERIFIED) } ) - if spammy_domains: - resource.confirm_spam(save=True, domains=list(spammy_domains)) - - -@run_postcommit(once_per_request=False, celery=True) -@celery_app.task(ignore_results=False, max_retries=5, default_retry_delay=60) -def check_resource_for_domains_postcommit(guid, content): - _check_resource_for_domains(guid, content) - -@celery_app.task(ignore_results=False, max_retries=5, default_retry_delay=60) -def check_resource_for_domains_async(guid, content): - _check_resource_for_domains(guid, content) + return spammy_domains def _extract_domains(content): @@ -111,16 +99,11 @@ def _extract_domains(content): yield domain, note -@run_postcommit(once_per_request=False, celery=True) -@celery_app.task(ignore_results=False, max_retries=5, default_retry_delay=60) -def check_resource_with_spam_services(guid, content, author, author_email, request_kwargs): +def check_resource_with_spam_services(resource, content, author, author_email, request_kwargs): """ Return statements used only for debugging and recording keeping """ any_is_spam = False - from osf.models import Guid, OSFUser - guid = Guid.load(guid) - resource = guid.referent kwargs = dict( user_ip=request_kwargs.get('remote_addr'), @@ -163,10 +146,54 @@ def check_resource_with_spam_services(guid, content, author, author_email, reque resource.spam_data['author_email'] = author_email resource.flag_spam() - if hasattr(resource, 'check_spam_user'): - user = OSFUser.objects.get(username=author_email) - resource.check_spam_user(user) + return any_is_spam + + +@run_postcommit(once_per_request=False, celery=True) +@celery_app.task(ignore_results=False, max_retries=5, default_retry_delay=60) +def check_resource_for_spam_postcommit(guid, content, author, author_email, request_headers): + from osf.models import Guid, OSFUser + + resource, _ = Guid.load_referent(guid) + if not resource: + return f'{guid} not found' + + spammy_domains = _check_resource_for_domains(resource, content) + if spammy_domains: + sentry.log_message(f"Spammy domains detected for {guid}: {spammy_domains}") + resource.confirm_spam(save=False, domains=list(spammy_domains)) + elif settings.SPAM_SERVICES_ENABLED: + request_kwargs = { + 'remote_addr': request_headers.get('Remote-Addr') or request_headers.get('Host'), # for local testing + 'user_agent': request_headers.get('User-Agent'), + 'referer': request_headers.get('Referer'), + } + for key, value in request_kwargs.items(): + request_kwargs[key] = ensure_str(value) + + check_resource_with_spam_services( + resource, + content, + author, + author_email, + request_kwargs, + ) resource.save() - return f'{resource} is spam: {any_is_spam} {resource.spam_data.get("content")}' + if hasattr(resource, 'check_spam_user'): + user = OSFUser.objects.get(username=author_email) + resource.check_spam_user(user) + + +@celery_app.task(ignore_results=False, max_retries=5, default_retry_delay=60) +def check_resource_for_domains_async(guid, content): + from osf.models import Guid + + resource, _ = Guid.load_referent(guid) + if not resource: + return f'{guid} not found' + + spammy_domains = _check_resource_for_domains(resource, content) + if spammy_domains: + resource.confirm_spam(save=True, domains=list(spammy_domains)) diff --git a/osf/models/spam.py b/osf/models/spam.py index cc7d86474e8..d2f5946533c 100644 --- a/osf/models/spam.py +++ b/osf/models/spam.py @@ -6,10 +6,9 @@ from osf.exceptions import ValidationValueError, ValidationTypeError from osf.external.askismet import tasks as akismet_tasks -from osf.external.spam.tasks import check_resource_for_domains_postcommit, check_resource_with_spam_services +from osf.external.spam.tasks import check_resource_for_spam_postcommit from osf.utils.datetime_aware_jsonfield import DateTimeAwareJSONField -from osf.utils.fields import ensure_str, NonNaiveDateTimeField -from website import settings +from osf.utils.fields import NonNaiveDateTimeField logger = logging.getLogger(__name__) @@ -197,28 +196,15 @@ def do_check_spam(self, author, author_email, content, request_headers): if self.is_spammy: return - request_kwargs = { - 'remote_addr': request_headers.get('Remote-Addr') or request_headers.get('Host'), # for local testing - 'user_agent': request_headers.get('User-Agent'), - 'referer': request_headers.get('Referer'), - } if isinstance(self, Preprint): guid__id = self._id else: guid__id = self.guids.first()._id - check_resource_for_domains_postcommit( + + check_resource_for_spam_postcommit( guid__id, content, + author, + author_email, + request_headers, ) - - if settings.SPAM_SERVICES_ENABLED: - for key, value in request_kwargs.items(): - request_kwargs[key] = ensure_str(value) - - check_resource_with_spam_services( - guid__id, - content, - author, - author_email, - request_kwargs, - ) diff --git a/osf_tests/test_node.py b/osf_tests/test_node.py index a84a9565356..e5bae2d866a 100644 --- a/osf_tests/test_node.py +++ b/osf_tests/test_node.py @@ -67,6 +67,8 @@ from .factories import get_default_metaschema from addons.wiki.tests.factories import WikiVersionFactory, WikiFactory from osf_tests.utils import capture_signals, assert_datetime_equal, mock_archive +from osf.models.spam import SpamStatus +from osf.external.spam import tasks as spam_tasks pytestmark = pytest.mark.django_db @@ -2335,6 +2337,38 @@ def test_check_spam_only_public_node_by_default(self, project, user): project.check_spam(user, None, None) assert not project.is_public + @mock.patch('osf.models.node.get_request_and_user_id') + def test_do_check_spam_called_on_set_public(self, mock_get_request, project, user): + mock_request = { + 'headers': { + 'Remote-Addr': '1.2.3.4', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)', + 'Referer': 'https://osf.io' + } + } + mock_get_request.return_value = (mock_request, user._id) + + project.title = 'Spam' + project.description = 'spammy content' + project.is_public = False + project.save() + + wiki = WikiFactory(node=project, user=user) + WikiVersionFactory(wiki_page=wiki, content='Some wiki content') + + with mock.patch.object(Node, 'do_check_spam') as mock_do_check_spam: + mock_do_check_spam.return_value = False + project.set_privacy('public', auth=Auth(user)) + + mock_do_check_spam.assert_called_once() + args = mock_do_check_spam.call_args[0] + assert args[0] == user.fullname # author + assert args[1] == user.username # author email + # content + assert 'Some wiki content' in args[2] + assert project.title in args[2] + assert project.description in args[2] + @mock.patch.object(settings, 'SPAM_SERVICES_ENABLED', True) def test_check_spam_skips_ham_user(self, project, user): with mock.patch('osf.models.AbstractNode._get_spam_content', mock.Mock(return_value='some content!')): @@ -2464,6 +2498,107 @@ def test_multiple_privacy_changing(self, project): project.confirm_ham() assert not project.is_public + def test_get_spam_content_includes_wiki_content(self, project, user): + wiki = WikiFactory(node=project, user=user) + WikiVersionFactory(wiki_page=wiki, content='Some wiki content') + + content = project._get_spam_content() + assert 'Some wiki content' in content + + project.title = 'Test Title' + project.save() + content = project._get_spam_content() + assert 'Test Title' in content + assert 'Some wiki content' in content + + +class TestCheckResourceForSpamPostcommit: + + @pytest.fixture() + def user(self): + return UserFactory() + + @pytest.fixture() + def project(self, user): + return ProjectFactory(creator=user) + + @pytest.fixture() + def request_headers(self): + return { + 'Remote-Addr': '1.2.3.4', + 'User-Agent': 'Mozilla/5.0', + 'Referer': 'https://osf.io' + } + + @mock.patch.object(settings, 'SPAM_SERVICES_ENABLED', True) + @mock.patch('osf.external.spam.tasks._check_resource_for_domains') + def test_check_resource_for_spam_postcommit_with_spammy_domains(self, mock_check_domains, project, user): + mock_check_domains.return_value = ['spam.com'] + with mock.patch('osf.external.spam.tasks.check_resource_with_spam_services') as mock_check_services: + spam_tasks.check_resource_for_spam_postcommit( + guid=project._id, + content='Check me for spam at spam.com', + author=user.fullname, + author_email=user.username, + request_headers={} + ) + project.reload() + assert project.spam_status == SpamStatus.SPAM + assert project.spam_data['domains'] == ['spam.com'] + mock_check_services.assert_not_called() + + @mock.patch.object(settings, 'SPAM_SERVICES_ENABLED', True) + @mock.patch('osf.external.spam.tasks._check_resource_for_domains') + def test_check_resource_for_spam_postcommit_no_spammy_domains_checks_services(self, mock_check_domains, project, user, request_headers): + mock_check_domains.return_value = [] + with mock.patch('osf.external.spam.tasks.check_resource_with_spam_services') as mock_check_services: + mock_check_services.return_value = True + spam_tasks.check_resource_for_spam_postcommit( + guid=project._id, + content='Check me for spam', + author=user.fullname, + author_email=user.username, + request_headers=request_headers + ) + mock_check_services.assert_called_once_with( + project, + 'Check me for spam', + user.fullname, + user.username, + { + 'remote_addr': '1.2.3.4', + 'user_agent': 'Mozilla/5.0', + 'referer': 'https://osf.io' + } + ) + + @mock.patch('osf.external.spam.tasks._check_resource_for_domains') + @mock.patch.object(settings, 'SPAM_SERVICES_ENABLED', False) + def test_check_resource_for_spam_postcommit_no_spammy_domains_services_disabled(self, mock_check_domains, project, user): + mock_check_domains.return_value = [] + with mock.patch('osf.external.spam.tasks.check_resource_with_spam_services') as mock_check_services: + spam_tasks.check_resource_for_spam_postcommit( + guid=project._id, + content='Check me for spam', + author=user.fullname, + author_email=user.username, + request_headers={} + ) + mock_check_services.assert_not_called() + + @mock.patch('osf.external.spam.tasks._check_resource_for_domains') + def test_check_resource_for_spam_postcommit_checks_user(self, mock_check_domains, project, user, request_headers): + mock_check_domains.return_value = [] + with mock.patch.object(Node, 'check_spam_user') as mock_check_user: + spam_tasks.check_resource_for_spam_postcommit( + guid=project._id, + content='Check me for spam', + author=user.fullname, + author_email=user.username, + request_headers=request_headers + ) + mock_check_user.assert_called_once_with(user) + # copied from tests/test_models.py class TestPrivateLinks: diff --git a/osf_tests/test_notable_domains.py b/osf_tests/test_notable_domains.py index 68e39912a65..256cb56767f 100644 --- a/osf_tests/test_notable_domains.py +++ b/osf_tests/test_notable_domains.py @@ -153,7 +153,7 @@ def test_check_resource_for_domains_moderation_queue(self, spam_domain, factory) obj = factory() with mock.patch.object(spam_tasks.requests, 'head'): spam_tasks._check_resource_for_domains( - guid=obj.guids.first()._id, + resource=obj, content=spam_domain.geturl(), ) @@ -169,19 +169,17 @@ def test_check_resource_for_domains_moderation_queue(self, spam_domain, factory) def test_check_resource_for_domains_spam(self, spam_domain, marked_as_spam_domain, factory): obj = factory() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj, content=spam_domain.geturl(), ) - obj.reload() + assert spammy_domains == [spam_domain.netloc] + assert NotableDomain.objects.filter( domain=spam_domain.netloc, note=NotableDomain.Note.EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT ).count() == 1 - obj.reload() - assert obj.spam_status == SpamStatus.SPAM - assert obj.spam_data['domains'] == [spam_domain.netloc] assert DomainReference.objects.filter( referrer_object_id=obj.id, referrer_content_type=ContentType.objects.get_for_model(obj), @@ -233,19 +231,17 @@ def test_check_resource_for_duplicate_spam_domains(self, factory, spam_domain, m obj.spam_data['domains'] = [spam_domain.netloc] obj.save() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj, content=f'{spam_domain.geturl()}', ) - obj.reload() + assert spammy_domains == [spam_domain.netloc] + assert NotableDomain.objects.filter( domain=spam_domain.netloc, note=NotableDomain.Note.EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT ).count() == 1 - obj.reload() - assert obj.spam_status == SpamStatus.SPAM - assert obj.spam_data['domains'] == [spam_domain.netloc] assert DomainReference.objects.filter( referrer_object_id=obj.id, referrer_content_type=ContentType.objects.get_for_model(obj), @@ -326,10 +322,12 @@ def ignored_notable_domain(self): def test_from_spam_to_unknown_one_spam_domain(self, factory, spam_notable_domain_one, spam_notable_domain_two, unknown_notable_domain, ignored_notable_domain): obj_one = factory() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj_one.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj_one, content=f'{self.spam_domain_one.geturl()} {self.unknown_domain.geturl()} {self.ignored_domain.geturl()}', ) + if spammy_domains: + obj_one.confirm_spam(save=True, domains=spammy_domains) obj_one.reload() assert obj_one.spam_status == SpamStatus.SPAM @@ -344,10 +342,12 @@ def test_from_spam_to_unknown_one_spam_domain(self, factory, spam_notable_domain def test_from_spam_to_unknown_two_spam_domains(self, factory, spam_notable_domain_one, spam_notable_domain_two, unknown_notable_domain, ignored_notable_domain): obj_two = factory() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj_two.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj_two, content=f'{self.spam_domain_one.geturl()} {self.spam_domain_two.geturl()} {self.unknown_domain.geturl()} {self.ignored_domain.geturl()}', ) + if spammy_domains: + obj_two.confirm_spam(save=True, domains=spammy_domains) obj_two.reload() assert obj_two.spam_status == SpamStatus.SPAM @@ -364,10 +364,12 @@ def test_from_spam_to_unknown_marked_by_external(self, factory, spam_notable_dom obj_three.spam_data['who_flagged'] = 'some external spam checker' obj_three.save() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj_three.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj_three, content=f'{self.spam_domain_one.geturl()} {self.unknown_domain.geturl()} {self.ignored_domain.geturl()}', ) + if spammy_domains: + obj_three.confirm_spam(save=True, domains=spammy_domains) obj_three.reload() assert obj_three.spam_status == SpamStatus.SPAM @@ -382,10 +384,12 @@ def test_from_spam_to_unknown_marked_by_external(self, factory, spam_notable_dom def test_from_spam_to_ignored_one_spam_domain(self, factory, spam_notable_domain_one, spam_notable_domain_two, unknown_notable_domain, ignored_notable_domain): obj_one = factory() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj_one.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj_one, content=f'{self.spam_domain_one.geturl()} {self.unknown_domain.geturl()} {self.ignored_domain.geturl()}', ) + if spammy_domains: + obj_one.confirm_spam(save=True, domains=spammy_domains) obj_one.reload() assert obj_one.spam_status == SpamStatus.SPAM @@ -400,10 +404,12 @@ def test_from_spam_to_ignored_one_spam_domain(self, factory, spam_notable_domain def test_from_spam_to_ignored_two_spam_domains(self, factory, spam_notable_domain_one, spam_notable_domain_two, unknown_notable_domain, ignored_notable_domain): obj_two = factory() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj_two.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj_two, content=f'{self.spam_domain_one.geturl()} {self.spam_domain_two.geturl()} {self.unknown_domain.geturl()} {self.ignored_domain.geturl()}', ) + if spammy_domains: + obj_two.confirm_spam(save=True, domains=spammy_domains) obj_two.reload() assert obj_two.spam_status == SpamStatus.SPAM @@ -420,10 +426,12 @@ def test_from_spam_to_ignored_makred_by_external(self, factory, spam_notable_dom obj_three.spam_data['who_flagged'] = 'some external spam checker' obj_three.save() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj_three.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj_three, content=f'{self.spam_domain_one.geturl()} {self.unknown_domain.geturl()} {self.ignored_domain.geturl()}', ) + if spammy_domains: + obj_three.confirm_spam(save=True, domains=spammy_domains) obj_three.reload() assert obj_three.spam_status == SpamStatus.SPAM @@ -438,10 +446,12 @@ def test_from_spam_to_ignored_makred_by_external(self, factory, spam_notable_dom def test_from_unknown_to_spam_unknown_plus_ignored(self, factory, unknown_notable_domain, ignored_notable_domain): obj_one = factory() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj_one.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj_one, content=f'{self.unknown_domain.geturl()} {self.ignored_domain.geturl()}', ) + if spammy_domains: + obj_one.confirm_spam(save=True, domains=spammy_domains) obj_one.reload() assert obj_one.spam_status == SpamStatus.UNKNOWN @@ -456,10 +466,12 @@ def test_from_unknown_to_spam_unknown_plus_ignored(self, factory, unknown_notabl def test_from_unknown_to_spam_unknown_only(self, factory, unknown_notable_domain, ignored_notable_domain): obj_two = factory() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj_two.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj_two, content=f'{self.unknown_domain.geturl()}', ) + if spammy_domains: + obj_two.confirm_spam(save=True, domains=spammy_domains) obj_two.reload() assert obj_two.spam_status == SpamStatus.UNKNOWN @@ -474,10 +486,12 @@ def test_from_unknown_to_spam_unknown_only(self, factory, unknown_notable_domain def test_from_ignored_to_spam_unknown_plus_ignored(self, factory, unknown_notable_domain, ignored_notable_domain): obj_one = factory() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj_one.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj_one, content=f'{self.unknown_domain.geturl()} {self.ignored_domain.geturl()}', ) + if spammy_domains: + obj_one.confirm_spam(save=True, domains=spammy_domains) obj_one.reload() assert obj_one.spam_status == SpamStatus.UNKNOWN @@ -492,10 +506,12 @@ def test_from_ignored_to_spam_unknown_plus_ignored(self, factory, unknown_notabl def test_from_ignored_to_spam_ignored_only(self, factory, unknown_notable_domain, ignored_notable_domain): obj_two = factory() with mock.patch.object(spam_tasks.requests, 'head'): - spam_tasks._check_resource_for_domains( - guid=obj_two.guids.first()._id, + spammy_domains = spam_tasks._check_resource_for_domains( + resource=obj_two, content=f'{self.ignored_domain.geturl()}', ) + if spammy_domains: + obj_two.confirm_spam(save=True, domains=spammy_domains) obj_two.reload() assert obj_two.spam_status == SpamStatus.UNKNOWN