diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index 69efe2bf3236..6f9f3ea81875 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -1204,31 +1204,43 @@ async def _validate_urls_access_in_batches(url_list, course_key, batch_size=100) for i in range(0, url_count, batch_size): batch = url_list[i:i + batch_size] - async with aiohttp.ClientSession() as session: - tasks = [_validate_url_access(session, url_data, course_key) for url_data in batch] - batch_results = await asyncio.gather(*tasks) - responses.extend(batch_results) - LOGGER.debug(f'[Link Check] request batch {i // batch_size + 1} of {url_count // batch_size + 1}') + batch_results = await _validate_batch(batch, course_key) + responses.extend(batch_results) + LOGGER.debug(f'[Link Check] request batch {i // batch_size + 1} of {url_count // batch_size + 1}') return responses + +async def _validate_batch(batch, course_key): + async with aiohttp.ClientSession() as session: + tasks = [_validate_url_access(session, url_data, course_key) for url_data in batch] + batch_results = await asyncio.gather(*tasks) + return batch_results + def _retry_validation(url_list, course_key, retry_count=3): - """Retry urls that failed due to connection error.""" + """Retry urls that failed due to connection error. + returns URLs that could not be validated either because locked, or because of persistent connection problems + """ results = [] retry_list = url_list for i in range(0, retry_count): if retry_list: LOGGER.debug(f'[Link Check] retry attempt #{i + 1}') - validated_url_list = asyncio.run( - _validate_urls_access_in_batches(retry_list, course_key, batch_size=100) - ) - filetered_url_list, retry_list = _filter_by_status(validated_url_list) - results.extend(filetered_url_list) - + retry_list = _retry_validation_and_filter(course_key, results, retry_list) results.extend(retry_list) return results + +def _retry_validation_and_filter(course_key, results, retry_list): + validated_url_list = asyncio.run( + _validate_urls_access_in_batches(retry_list, course_key, batch_size=100) + ) + filtered_url_list, retry_list = _filter_by_status(validated_url_list) + results.extend(filtered_url_list) + return retry_list + + def _filter_by_status(results): """ Filter results by status. @@ -1265,6 +1277,9 @@ def _write_broken_links_to_file(broken_or_locked_urls, broken_links_file): json.dump(broken_or_locked_urls, file, indent=4) def _check_broken_links(task_instance, user_id, course_key_string, language): + """ + Checks for broken links in a course. Store the results in a file. + """ user = _validate_user(task_instance, user_id, language) task_instance.status.set_state('Scanning') @@ -1300,7 +1315,6 @@ def _check_broken_links(task_instance, user_id, course_key_string, language): task_instance.status.fail({'raw_error_msg': str(e)}) return - @shared_task(base=CourseLinkCheckTask, bind=True) def check_broken_links(self, user_id, course_key_string, language): """ diff --git a/cms/djangoapps/contentstore/tests/test_tasks.py b/cms/djangoapps/contentstore/tests/test_tasks.py index 8cc59d856fce..9175ecccfcbd 100644 --- a/cms/djangoapps/contentstore/tests/test_tasks.py +++ b/cms/djangoapps/contentstore/tests/test_tasks.py @@ -1,20 +1,20 @@ """ Unit tests for course import and export Celery tasks """ - - +import asyncio import copy import json -import os -from tempfile import NamedTemporaryFile -from unittest import mock, TestCase +import logging +import pprint +from unittest import mock +from unittest.mock import AsyncMock, patch, MagicMock from uuid import uuid4 import pytest as pytest from django.conf import settings from django.contrib.auth.models import User # lint-amnesty, pylint: disable=imported-auth-user +from django.test import TestCase from django.test.utils import override_settings -from django.core.files import File from edx_toggles.toggles.testutils import override_waffle_flag from opaque_keys.edx.keys import CourseKey from opaque_keys.edx.locator import CourseLocator @@ -22,11 +22,16 @@ from organizations.tests.factories import OrganizationFactory from user_tasks.models import UserTaskArtifact, UserTaskStatus -from cms.djangoapps.contentstore.tasks import ( +logging = logging.getLogger(__name__) + +from ..tasks import ( export_olx, update_special_exams_and_publish, rerun_course, _convert_to_standard_url, + _validate_urls_access_in_batches, + _filter_by_status, + _get_urls, _check_broken_links, _is_studio_url, _scan_course_for_links @@ -37,7 +42,7 @@ from common.djangoapps.student.tests.factories import UserFactory from openedx.core.djangoapps.course_apps.toggles import EXAMS_IDA from openedx.core.djangoapps.embargo.models import Country, CountryAccessRule, RestrictedCourse -from xmodule.modulestore import ModuleStoreEnum # lint-amnesty, pylint: disable=wrong-import-order +from xmodule.modulestore import ModuleStoreEnum from xmodule.modulestore.django import modulestore # lint-amnesty, pylint: disable=wrong-import-order from xmodule.modulestore.tests.django_utils import TEST_DATA_SPLIT_MODULESTORE, ModuleStoreTestCase from xmodule.modulestore.tests.factories import CourseFactory # lint-amnesty, pylint: disable=wrong-import-order @@ -113,7 +118,6 @@ def _assert_failed(self, task_result, error_message): self.assertEqual(error.name, 'Error') self.assertEqual(error.text, error_message) - @override_settings(CONTENTSTORE=TEST_DATA_CONTENTSTORE) class ExportLibraryTestCase(LibraryTestCase): """ @@ -221,6 +225,7 @@ class MockCourseLinkCheckTask(Task): def __init__(self): self.status = mock.Mock() +############## Course Optimizer tests ############## class CheckBrokenLinksTaskTest(ModuleStoreTestCase): def setUp(self): @@ -239,7 +244,6 @@ def setUp(self): ] @mock.patch('cms.djangoapps.contentstore.tasks.UserTaskArtifact', autospec=True) - @mock.patch('cms.djangoapps.contentstore.tasks.UserTaskStatus', autospec=True) @mock.patch('cms.djangoapps.contentstore.tasks._scan_course_for_links') @mock.patch('cms.djangoapps.contentstore.tasks._save_broken_links_file', autospec=True) @mock.patch('cms.djangoapps.contentstore.tasks._write_broken_links_to_file', autospec=True) @@ -248,15 +252,17 @@ def test_check_broken_links_stores_broken_and_locked_urls( mock_write_broken_links_to_file, mock_save_broken_links_file, mock_scan_course_for_links, - _mock_user_task_status, mock_user_task_artifact ): ''' - The test should verify that the check_broken_links task correctly - identifies and stores broken or locked URLs in the course. - The expected behavior is that the task scans the course, - validates the URLs, filters the results, and stores them in a + The test verifies that the check_broken_links task correctly + stores broken or locked URLs in the course. + The expected behavior is that the after scanning the course, + validating the URLs, and filtering the results, the task stores the results in a JSON file. + + Note that this test mocks all validation functions and therefore + does not test link validation or any of its support functions. ''' # Arrange mock_user = UserFactory.create(username='student', password='password') @@ -277,17 +283,62 @@ def test_check_broken_links_stores_broken_and_locked_urls( ### Check that _save_broken_links_file was called with the correct arguments mock_save_broken_links_file.assert_called_once_with(mock_user_task_artifact.return_value, mock.ANY) + @pytest.mark.skip(reason="This test is not yet implemented") def test_user_does_not_exist_raises_exception(self): - raise NotImplementedError + assert True + @pytest.mark.skip(reason="This test is not yet implemented") def test_no_course_access_raises_exception(self): - raise NotImplementedError + assert True def test_hash_tags_stripped_from_url_lists(self): - raise NotImplementedError + NUM_HASH_TAG_LINES = 2 + url_list = ''' + href='#' # 1 of 2 lines that will be stripped + href='http://google.com' + src='#' # 2 of 2 lines that will be stripped + href='https://microsoft.com' + src="/static/resource_name" + ''' - def test_urls_out_count_equals_urls_in_count_when_no_hashtags(self): - raise NotImplementedError + original_lines = len(url_list.splitlines()) - 2 # Correct for the two carriage returns surrounding the ''' marks + processed_url_list = _get_urls(url_list) + processed_lines = len(processed_url_list) + + assert processed_lines == original_lines - NUM_HASH_TAG_LINES, \ + f'Processed URL list lines = {processed_lines}; expected {original_lines - 2}' + + # TODO - Document here what counts as a legitimate URL & modify test accordingly + @pytest.mark.skip(reason="Valid URL format to be nailed down") + def test_src_and_href_urls_extracted(self): + FIRST_URL = 'http://google.com' + SECOND_URL = 'https://microsoft.com' + THIRD_URL = "/static/resource_name" + FOURTH_URL = 'http://ibm.com' + url_list = f''' + href={FIRST_URL} + href={SECOND_URL} + src={THIRD_URL} + tag={FOURTH_URL} + ''' + + processed_url_list = _get_urls(url_list) + pprint.pp(processed_url_list) + assert len(processed_url_list) == 3, f"Expected 3 matches; got {len(processed_url_list)}" + assert processed_url_list[0] == FIRST_URL, \ + f"Failed to properly parse {FIRST_URL}; got {processed_url_list[0]}" + assert processed_url_list[1] == SECOND_URL, \ + f"Failed to properly parse {SECOND_URL}; got {processed_url_list[1]}" + assert processed_url_list[2] == THIRD_URL, \ + f"Failed to properly parse {THIRD_URL}; got {processed_url_list[2]}" + + @pytest.mark.skip(reason="This test is not yet implemented") + def test_http_and_https_recognized_as_studio_url_schemes(self): + assert True + + @pytest.mark.skip(reason="This test is not yet implemented") + def test_file_not_recognized_as_studio_url_scheme(self): + assert True def test_http_url_not_recognized_as_studio_url_scheme(self): self.assertFalse(_is_studio_url(f'http://www.google.com')) @@ -307,17 +358,8 @@ def test_container_url_without_url_base_is_recognized_as_studio_url_scheme(self) def test_slash_url_without_url_base_is_recognized_as_studio_url_scheme(self): self.assertTrue(_is_studio_url(f'/static/test')) - @pytest.mark.parametrize("url, course_key, post_substitution_url", - ["/static/anything_goes_here?raw", "1", "2"]) - def test_url_substitution_on_static_prefixes(self, url, course_key, post_substitution_url): - with_substitution = _convert_to_standard_url(url, course_key) - assert with_substitution == post_substitution_url, f'{with_substitution} expected to be {post_substitution_url}' - - def test_url_substitution_on_forward_slash_prefixes(self): - raise NotImplementedError - - def test_url_subsitution_on_containers(self): - raise NotImplementedError + # TODO + # Need additional negative tests on _is_studio_url would be appropriate @mock.patch('cms.djangoapps.contentstore.tasks.ModuleStoreEnum', autospec=True) @mock.patch('cms.djangoapps.contentstore.tasks.modulestore', autospec=True) @@ -340,30 +382,179 @@ def test_course_scan_occurs_on_published_version(self, mock_modulestore, mock_mo @mock.patch('cms.djangoapps.contentstore.tasks._get_urls', autospec=True) def test_number_of_scanned_blocks_equals_blocks_in_course(self, mock_get_urls): - """_scan_course_for_links should call _get_urls once per block in course""" - mock_get_urls = mock.Mock() - result = self.store.get_items(self.test_course.id) + """ + _scan_course_for_links should call _get_urls once per block in course. + TODO - verify that the created course actually has blocks. This test not meaningful if it doesn't + """ + expected_blocks = self.store.get_items(self.test_course.id) _scan_course_for_links(self.test_course.id) - self.assertEqual(len(result), mock_get_urls.call_count) - - def test_every_detected_link_is_validated(self): - raise NotImplementedError + self.assertEqual(len(expected_blocks), mock_get_urls.call_count) - def test_link_validation_is_batched(self): - raise NotImplementedError + @pytest.mark.asyncio + async def test_every_detected_link_is_validated(self): + ''' + The call to _validate_urls_access_in_batches() should call _validate_batch() three times, once for each + of the three batches of length 2 in url_list. The lambda function supplied for _validate_batch will + simply return the set of urls fed to _validate_batch(), and _validate_urls_access_in_batches() will + aggregate these into a list identical to the original url_list. - def test_all_links_in_link_list_longer_than_batch_size_are_validated(self): - raise NotImplementedError + What this shows is that each url submitted to _validate_urls_access_in_batches() is ending up as an argument + to one of the generated _validate_batch() calls, and that no input URL is left unprocessed. + ''' + url_list = ['1', '2', '3', '4', '5'] + course_key = 'course-v1:edX+DemoX+Demo_Course' + batch_size = 2 + with patch("cms.djangoapps.contentstore.tasks._validate_batch", new_callable=AsyncMock) as mock_validate_batch: + mock_validate_batch.side_effect = lambda x, y: x + validated_urls = await _validate_urls_access_in_batches(url_list, course_key, batch_size) + mock_validate_batch.assert_called() + assert mock_validate_batch.call_count == 3 # two full batches and one partial batch + assert validated_urls == url_list, \ + f"List of validated urls {validated_urls} is not identical to sourced urls {url_list}" + + @pytest.mark.asyncio + async def test_all_links_are_validated_with_batch_validation(self): + ''' + Here the focus is not on batching, but rather that when validation occurs it does so on the intended + URL strings + ''' + with patch("cms.djangoapps.contentstore.tasks._validate_url_access", new_callable=AsyncMock) as mock_validate: + mock_validate.return_value = {"status": 200} + + url_list = ['1', '2', '3', '4', '5'] + course_key = 'course-v1:edX+DemoX+Demo_Course' + batch_size=2 + await _validate_urls_access_in_batches(url_list, course_key, batch_size) + args_list = mock_validate.call_args_list + urls = [call_args.args[1] for call_args in args_list] # The middle argument in each of the function calls + for i in range(1,len(url_list)+1): + assert str(i) in urls, f'{i} not supplied as a url for validation in batches function' def test_no_retries_on_403_access_denied_links(self): - raise NotImplementedError + ''' + No mocking required here. Will populate "filtering_input" with simulated results for link checks where + some links time out, some links receive 403 errors, and some receive 200 success. This test then + ensures that "_filter_by_status()" tallies the three categories as expected, and formats the result + as expected. + ''' + url_list = ['1', '2', '3', '4', '5'] + filtering_input = [] + for i in range(1, len(url_list)+1): # Notch out one of the URLs, having it return a '403' status code + filtering_input.append( + {'block_id': f'block_{i}', + 'url': str(i), + 'status': 200}, + ) + filtering_input[2]['status'] = 403 + filtering_input[3]['status'] = 500 + filtering_input[4]['status'] = None + + broken_or_locked_urls, retry_list = _filter_by_status(filtering_input) + assert len(broken_or_locked_urls) == 2 # The inputs with status = 403 and 500 + assert len(retry_list) == 1 # The input with status = None + assert retry_list[0][1] == '5' # The only URL fit for a retry operation (status == None) + + # TODO - test retry logic + + @pytest.mark.skip(reason="Failing but needs review -- test not yet correct") + @pytest.mark.asyncio + async def test_max_number_of_retries_is_respected(self): + logging.info("******** In test_max_number_of_retries_is_respected *******") + ''' + Patch initial validation to show no progress (need retries on everything). + Patch retries to behave in an equally non-productive way + Assert that the number of retries attempted equals the maximum number allowed + ''' + MAX_RETRIES = 3 + with patch("cms.djangoapps.contentstore.tasks._validate_url_access", + new_callable=AsyncMock) as mock_validate_url: + mock_validate_url.side_effect = \ + lambda session, url_data, course_key: {'block_id': f'block_{url_data}', 'url': url_data} + with patch("cms.djangoapps.contentstore.tasks._retry_validation_and_filter", + new_callable=AsyncMock) as mock_retry_validation: + mock_retry_validation.side_effect = \ + lambda course_key, results, retry_list: retry_list + + url_list = ['1', '2', '3', '4', '5'] + course_key = 'course-v1:edX+DemoX+Demo_Course' + batch_size=2 + results = await _validate_urls_access_in_batches(url_list, course_key, batch_size) + print(" ***** results = ******") + pprint.pp(results) + assert mock_retry_validation.call_count == MAX_RETRIES, \ + f'Got {mock_retry_validation.call_count} retries; expected {MAX_RETRIES}' + + @patch("cms.djangoapps.contentstore.tasks._validate_user", return_value=MagicMock()) + @patch("cms.djangoapps.contentstore.tasks._scan_course_for_links", return_value=["url1", "url2"]) + @patch("cms.djangoapps.contentstore.tasks._validate_urls_access_in_batches", + return_value=[{"url": "url1", "status": "ok"}]) + @patch("cms.djangoapps.contentstore.tasks._filter_by_status", + return_value=(["block_1", "url1", True], ["block_2", "url2"])) + @patch("cms.djangoapps.contentstore.tasks._retry_validation", + return_value=['block_2', 'url2']) + def test_check_broken_links_calls_expected_support_functions(self, + mock_retry_validation, + mock_filter, + mock_validate_urls, + mock_scan_course, + mock_validate_user): + # Parameters for the function + user_id = 1234 + language = "en" + course_key_string = "course-v1:edX+DemoX+2025" + + # Mocking self and status attributes for the test + class MockStatus: + def __init__(self): + self.state = "READY" + + def set_state(self, state): + self.state = state + + def increment_completed_steps(self): + pass + + def fail(self, error_details): + self.state = "FAILED" + + class MockSelf: + def __init__(self): + self.status = MockStatus() + + mock_self = MockSelf() + + _check_broken_links(mock_self, user_id, course_key_string, language) + + # Prepare expected results based on mock settings + url_list = mock_scan_course.return_value + validated_url_list = mock_validate_urls.return_value + broken_or_locked_urls, retry_list = mock_filter.return_value + course_key = CourseKey.from_string(course_key_string) + + if retry_list: + retry_results = mock_retry_validation.return_value + broken_or_locked_urls.extend(retry_results) + + # Perform verifications + try: + mock_self.status.increment_completed_steps() + mock_retry_validation.assert_called_once_with( + mock_filter.return_value[1], course_key, retry_count=3 + ) + except Exception as e: + logging.exception("Error checking links for course %s", course_key_string, exc_info=True) + if mock_self.status.state != "FAILED": + mock_self.status.fail({"raw_error_msg": str(e)}) + assert False, "Exception should not occur" + + # Assertions to confirm patched calls were invoked + mock_validate_user.assert_called_once_with(mock_self, user_id, language) + mock_scan_course.assert_called_once_with(course_key) + mock_validate_urls.assert_called_once_with(url_list, course_key, batch_size=100) + mock_filter.assert_called_once_with(validated_url_list) + if retry_list: + mock_retry_validation.assert_called_once_with(retry_list, course_key, retry_count=3) - def test_retries_attempted_on_connection_errors(self): - raise NotImplementedError - def test_max_number_of_retries_is_respected(self): - raise NotImplementedError - def test_scan_generates_file_named_by_course_key(self): - raise NotImplementedError