From 162510b48fe1083c5dec0048e6ef485974f3714b Mon Sep 17 00:00:00 2001 From: Raymond Zhou <56318341+rayzhou-bit@users.noreply.github.com> Date: Thu, 19 Dec 2024 19:52:16 -0500 Subject: [PATCH 1/7] fix: do not require output or error (#36052) --- .../contentstore/rest_api/v0/serializers/course_optimizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cms/djangoapps/contentstore/rest_api/v0/serializers/course_optimizer.py b/cms/djangoapps/contentstore/rest_api/v0/serializers/course_optimizer.py index fa78b4710ff4..f86947270660 100644 --- a/cms/djangoapps/contentstore/rest_api/v0/serializers/course_optimizer.py +++ b/cms/djangoapps/contentstore/rest_api/v0/serializers/course_optimizer.py @@ -39,4 +39,5 @@ class LinkCheckSerializer(serializers.Serializer): """ Serializer for broken links """ LinkCheckStatus = serializers.CharField(required=True) LinkCheckCreatedAt = serializers.DateTimeField(required=False) - LinkCheckOutput = LinkCheckOutputSerializer(required=True) + LinkCheckOutput = LinkCheckOutputSerializer(required=False) + LinkCheckError = serializers.CharField(required=False) From 067e1b08781c7a6b432059dcd3a7abddf7a40664 Mon Sep 17 00:00:00 2001 From: Jesper Hodge Date: Thu, 9 Jan 2025 17:00:12 -0500 Subject: [PATCH 2/7] fix: broken links not showing up --- .../contentstore/core/course_optimizer_provider.py | 5 ++++- cms/djangoapps/contentstore/tasks.py | 11 ++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cms/djangoapps/contentstore/core/course_optimizer_provider.py b/cms/djangoapps/contentstore/core/course_optimizer_provider.py index 9a6ecfdaebeb..b4121642a96d 100644 --- a/cms/djangoapps/contentstore/core/course_optimizer_provider.py +++ b/cms/djangoapps/contentstore/core/course_optimizer_provider.py @@ -55,7 +55,10 @@ def generate_broken_links_descriptor(json_content, request_user): for item in json_content: block_id, link, *rest = item - is_locked_flag = bool(rest[0]) + if rest: + is_locked_flag = bool(rest[0]) + else: + is_locked_flag = False usage_key = usage_key_with_run(block_id) block = get_xblock(usage_key, request_user) diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index 451bc9221ea0..b047f3618b68 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -1214,12 +1214,13 @@ def retry_validation(url_list, course_key, retry_count=3): for i in range(0, retry_count): if retry_list: LOGGER.debug(f'[Link Check] retry attempt #{i+1}') - validated_url_list = asyncio.run(validate_urls_access_in_batches(retry_list, course_key, batch_size=100)) + validated_url_list = asyncio.run( + validate_urls_access_in_batches(retry_list, course_key, batch_size=100) + ) filetered_url_list, retry_list = filter_by_status(validated_url_list) results.extend(filetered_url_list) - - if retry_list: - LOGGER.debug(f'[Link Check] {len(retry_list)} requests failed due to connection error') + + results.extend(retry_list) return results @@ -1237,7 +1238,7 @@ def filter_by_status(results): filtered_results = [] retry_list = [] for result in results: - if result['status'] == None: + if result['status'] is None: retry_list.append([result['block_id'], result['url']]) elif result['status'] == 200: continue From 51176cb086eb2d61ef88588f882de2e0ff3b65de Mon Sep 17 00:00:00 2001 From: Bernard Szabo Date: Fri, 10 Jan 2025 15:14:31 -0500 Subject: [PATCH 3/7] feat: TNL-11812 no nested course optimizer functions for unit testing ease --- cms/djangoapps/contentstore/tasks.py | 276 ++++++++++++++------------- 1 file changed, 139 insertions(+), 137 deletions(-) diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index b047f3618b68..ebe043b5647a 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -1104,161 +1104,163 @@ def generate_name(cls, arguments_dict): key = arguments_dict['course_key_string'] return f'Broken link check of {key}' +# -------------- Course optimizer functions ------------------ -@shared_task(base=CourseLinkCheckTask, bind=True) -def check_broken_links(self, user_id, course_key_string, language): +def _validate_user(task, user_id, language): + """Validate if the user exists. Otherwise log error. """ + try: + return User.objects.get(pk=user_id) + except User.DoesNotExist as exc: + with translation_language(language): + task.status.fail(UserErrors.UNKNOWN_USER_ID.format(user_id)) + return + +def _get_urls(content): """ - Checks for broken links in a course. Store the results in a file. + Returns all urls found after href and src in content. + Excludes urls that are only '#'. """ - def validate_user(): - """Validate if the user exists. Otherwise log error. """ - try: - return User.objects.get(pk=user_id) - except User.DoesNotExist as exc: - with translation_language(language): - self.status.fail(UserErrors.UNKNOWN_USER_ID.format(user_id)) - return + regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']' + url_list = re.findall(regex, content) + return url_list - def get_urls(content): - """ - Returns all urls found after href and src in content. - Excludes urls that are only '#'. - """ - regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']' - url_list = re.findall(regex, content) - return url_list - - def is_studio_url(url): - """Returns True if url is a studio url.""" - return not url.startswith('http://') and not url.startswith('https://') - - def convert_to_standard_url(url, course_key): - """ - Returns standard urls when given studio urls. Otherwise return url as is. - Example urls: - /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png - /static/getting-started_x250.png - /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7 - """ - if is_studio_url(url): - if url.startswith('/static/'): - processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1] - return 'http://' + settings.CMS_BASE + processed_url - elif url.startswith('/'): - return 'http://' + settings.CMS_BASE + url - else: - return 'http://' + settings.CMS_BASE + '/container/' + url +def _is_studio_url(url): + """Returns True if url is a studio url.""" + return not url.startswith('http://') and not url.startswith('https://') + +def _convert_to_standard_url(url, course_key): + """ + Returns standard urls when given studio urls. Otherwise return url as is. + Example urls: + /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png + /static/getting-started_x250.png + /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7 + """ + if _is_studio_url(url): + if url.startswith('/static/'): + processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1] + return 'http://' + settings.CMS_BASE + processed_url + elif url.startswith('/'): + return 'http://' + settings.CMS_BASE + url else: - return url + return 'http://' + settings.CMS_BASE + '/container/' + url + else: + return url - def scan_course_for_links(course_key): - """ - Returns a list of all urls in a course. - Returns: [ [block_id1, url1], [block_id2, url2], ... ] - """ - verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only) - blocks = [] - urls_to_validate = [] +def _scan_course_for_links(course_key): + """ + Returns a list of all urls in a course. + Returns: [ [block_id1, url1], [block_id2, url2], ... ] + """ + verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, + revision=ModuleStoreEnum.RevisionOption.published_only) + blocks = [] + urls_to_validate = [] - for vertical in verticals: - blocks.extend(vertical.get_children()) + for vertical in verticals: + blocks.extend(vertical.get_children()) - for block in blocks: - block_id = str(block.usage_key) - block_info = get_block_info(block) - block_data = block_info['data'] + for block in blocks: + block_id = str(block.usage_key) + block_info = get_block_info(block) + block_data = block_info['data'] - url_list = get_urls(block_data) - urls_to_validate += [[block_id, url] for url in url_list] + url_list = _get_urls(block_data) + urls_to_validate += [[block_id, url] for url in url_list] - return urls_to_validate + return urls_to_validate - async def validate_url_access(session, url_data, course_key): - """ - Returns the status of a url request - Returns: {block_id1, url1, status} - """ - block_id, url = url_data - result = {'block_id': block_id, 'url': url} - standardized_url = convert_to_standard_url(url, course_key) - try: - async with session.get(standardized_url, timeout=5) as response: - result.update({'status': response.status}) - except Exception as e: - result.update({'status': None}) - LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}') - return result - - async def validate_urls_access_in_batches(url_list, course_key, batch_size=100): - """ - Returns the statuses of a list of url requests. - Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ] - """ - responses = [] - url_count = len(url_list) - - for i in range(0, url_count, batch_size): - batch = url_list[i:i + batch_size] - async with aiohttp.ClientSession() as session: - tasks = [validate_url_access(session, url_data, course_key) for url_data in batch] - batch_results = await asyncio.gather(*tasks) - responses.extend(batch_results) - LOGGER.debug(f'[Link Check] request batch {i // batch_size+1} of {url_count // batch_size + 1}') - - return responses - - def retry_validation(url_list, course_key, retry_count=3): - """Retry urls that failed due to connection error.""" - results = [] - retry_list = url_list - for i in range(0, retry_count): - if retry_list: - LOGGER.debug(f'[Link Check] retry attempt #{i+1}') - validated_url_list = asyncio.run( - validate_urls_access_in_batches(retry_list, course_key, batch_size=100) - ) - filetered_url_list, retry_list = filter_by_status(validated_url_list) - results.extend(filetered_url_list) +async def _validate_url_access(session, url_data, course_key): + """ + Returns the status of a url request + Returns: {block_id1, url1, status} + """ + block_id, url = url_data + result = {'block_id': block_id, 'url': url} + standardized_url = _convert_to_standard_url(url, course_key) + try: + async with session.get(standardized_url, timeout=5) as response: + result.update({'status': response.status}) + except Exception as e: + result.update({'status': None}) + LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}') + return result + +async def _validate_urls_access_in_batches(url_list, course_key, batch_size=100): + """ + Returns the statuses of a list of url requests. + Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ] + """ + responses = [] + url_count = len(url_list) + + for i in range(0, url_count, batch_size): + batch = url_list[i:i + batch_size] + async with aiohttp.ClientSession() as session: + tasks = [_validate_url_access(session, url_data, course_key) for url_data in batch] + batch_results = await asyncio.gather(*tasks) + responses.extend(batch_results) + LOGGER.debug(f'[Link Check] request batch {i // batch_size + 1} of {url_count // batch_size + 1}') + + return responses + +def _retry_validation(url_list, course_key, retry_count=3): + """Retry urls that failed due to connection error.""" + results = [] + retry_list = url_list + for i in range(0, retry_count): + if retry_list: + LOGGER.debug(f'[Link Check] retry attempt #{i + 1}') + validated_url_list = asyncio.run( + _validate_urls_access_in_batches(retry_list, course_key, batch_size=100) + ) + filetered_url_list, retry_list = _filter_by_status(validated_url_list) + results.extend(filetered_url_list) - results.extend(retry_list) + results.extend(retry_list) - return results + return results - def filter_by_status(results): - """ - Filter results by status. - 200: OK. No need to do more - 403: Forbidden. Record as locked link. - None: Error. Retry up to 3 times. - Other: Failure. Record as broken link. - Returns: - filtered_results: [ [block_id1, url1, is_locked], ... ] - retry_list: [ [block_id1, url1], ... ] - """ - filtered_results = [] - retry_list = [] - for result in results: - if result['status'] is None: - retry_list.append([result['block_id'], result['url']]) - elif result['status'] == 200: - continue - elif result['status'] == 403 and is_studio_url(result['url']): - filtered_results.append([result['block_id'], result['url'], True]) - else: - filtered_results.append([result['block_id'], result['url'], False]) - - return filtered_results, retry_list +def _filter_by_status(results): + """ + Filter results by status. + 200: OK. No need to do more + 403: Forbidden. Record as locked link. + None: Error. Retry up to 3 times. + Other: Failure. Record as broken link. + Returns: + filtered_results: [ [block_id1, url1, is_locked], ... ] + retry_list: [ [block_id1, url1], ... ] + """ + filtered_results = [] + retry_list = [] + for result in results: + if result['status'] is None: + retry_list.append([result['block_id'], result['url']]) + elif result['status'] == 200: + continue + elif result['status'] == 403 and _is_studio_url(result['url']): + filtered_results.append([result['block_id'], result['url'], True]) + else: + filtered_results.append([result['block_id'], result['url'], False]) - user = validate_user() + return filtered_results, retry_list + +@shared_task(base=CourseLinkCheckTask, bind=True) +def check_broken_links(self, user_id, course_key_string, language): + """ + Checks for broken links in a course. Store the results in a file. + """ + user = _validate_user(self, user_id, language) self.status.set_state('Scanning') course_key = CourseKey.from_string(course_key_string) - url_list = scan_course_for_links(course_key) - validated_url_list = asyncio.run(validate_urls_access_in_batches(url_list, course_key, batch_size=100)) - broken_or_locked_urls, retry_list = filter_by_status(validated_url_list) - + url_list = _scan_course_for_links(course_key) + validated_url_list = asyncio.run(_validate_urls_access_in_batches(url_list, course_key, batch_size=100)) + broken_or_locked_urls, retry_list = _filter_by_status(validated_url_list) + if retry_list: - retry_results = retry_validation(retry_list, course_key, retry_count=3) + retry_results = _retry_validation(retry_list, course_key, retry_count=3) broken_or_locked_urls.extend(retry_results) try: @@ -1274,7 +1276,7 @@ def filter_by_status(results): artifact = UserTaskArtifact(status=self.status, name='BrokenLinks') artifact.file.save(name=os.path.basename(broken_links_file.name), content=File(broken_links_file)) artifact.save() - + # catch all exceptions so we can record useful error messages except Exception as e: # pylint: disable=broad-except LOGGER.exception('Error checking links for course %s', course_key, exc_info=True) From 8862d696efdaf70fa1521217b597c97f706624da Mon Sep 17 00:00:00 2001 From: bszabo Date: Sun, 12 Jan 2025 11:17:03 -0500 Subject: [PATCH 4/7] stubbed course optimizer tests --- .../contentstore/tests/test_tasks.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/cms/djangoapps/contentstore/tests/test_tasks.py b/cms/djangoapps/contentstore/tests/test_tasks.py index cf82a6d16571..7e5dfd22a4c4 100644 --- a/cms/djangoapps/contentstore/tests/test_tasks.py +++ b/cms/djangoapps/contentstore/tests/test_tasks.py @@ -199,3 +199,59 @@ def test_register_exams_failure(self, _mock_register_exams_proctoring, _mock_reg _mock_register_exams_proctoring.side_effect = Exception('boom!') update_special_exams_and_publish(str(self.course.id)) course_publish.assert_called() + + +class CourseOptimizerTestCase(): + def test_user_does_not_exist_raises_exception(self): + raise NotImplementedError + + def test_no_course_access_raises_exception(self): + raise NotImplementedError + + def test_hash_tags_stripped_from_url_lists(self): + raise NotImplementedError + + def test_urls_out_count_equals_urls_in_count_when_no_hashtags(self): + raise NotImplementedError + + def test_http_and_https_recognized_as_studio_url_schemes(self): + raise NotImplementedError + + def test_file_not_recognized_as_studio_url_scheme(self): + raise NotImplementedError + + def test_url_substitution_on_static_prefixes(self): + raise NotImplementedError + + def test_url_substitution_on_forward_slash_prefixes(self): + raise NotImplementedError + + def test_url_subsitution_on_containers(self): + raise NotImplementedError + + def test_optimization_occurs_on_published_version(self): + raise NotImplementedError + + def test_number_of_scanned_blocks_equals_blocks_in_course(self): + raise NotImplementedError + + def test_every_detected_link_is_validated(self): + raise NotImplementedError + + def test_link_validation_is_batched(self): + raise NotImplementedError + + def test_all_links_in_link_list_longer_than_batch_size_are_validated(self): + raise NotImplementedError + + def test_no_retries_on_403_access_denied_links(self): + raise NotImplementedError + + def test_retries_attempted_on_connection_errors(self): + raise NotImplementedError + + def test_max_number_of_retries_is_respected(self): + raise NotImplementedError + + def test_scan_generates_file_named_by_course_key(self): + raise NotImplementedError From e4787e70ac5d78b88ef5117b6fa2ec57ee13a547 Mon Sep 17 00:00:00 2001 From: Bernard Szabo Date: Sun, 12 Jan 2025 11:23:54 -0500 Subject: [PATCH 5/7] feat: TNL-11812 Use TestCase base class --- cms/djangoapps/contentstore/tests/test_tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cms/djangoapps/contentstore/tests/test_tasks.py b/cms/djangoapps/contentstore/tests/test_tasks.py index 7e5dfd22a4c4..a998e6fa2a4c 100644 --- a/cms/djangoapps/contentstore/tests/test_tasks.py +++ b/cms/djangoapps/contentstore/tests/test_tasks.py @@ -5,7 +5,7 @@ import copy import json -from unittest import mock +from unittest import mock, TestCase from uuid import uuid4 from django.conf import settings @@ -201,7 +201,7 @@ def test_register_exams_failure(self, _mock_register_exams_proctoring, _mock_reg course_publish.assert_called() -class CourseOptimizerTestCase(): +class CourseOptimizerTestCase(TestCase): def test_user_does_not_exist_raises_exception(self): raise NotImplementedError @@ -254,4 +254,4 @@ def test_max_number_of_retries_is_respected(self): raise NotImplementedError def test_scan_generates_file_named_by_course_key(self): - raise NotImplementedError + raise NotImplementedErro From e09781bf3fb1b74ec8263733597642949fd891e4 Mon Sep 17 00:00:00 2001 From: Bernard Szabo Date: Sun, 12 Jan 2025 11:58:05 -0500 Subject: [PATCH 6/7] feat: TNL-11812 Try static substitution --- cms/djangoapps/contentstore/tests/test_tasks.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/cms/djangoapps/contentstore/tests/test_tasks.py b/cms/djangoapps/contentstore/tests/test_tasks.py index a998e6fa2a4c..84071083dbc8 100644 --- a/cms/djangoapps/contentstore/tests/test_tasks.py +++ b/cms/djangoapps/contentstore/tests/test_tasks.py @@ -8,6 +8,7 @@ from unittest import mock, TestCase from uuid import uuid4 +import pytest as pytest from django.conf import settings from django.contrib.auth.models import User # lint-amnesty, pylint: disable=imported-auth-user from django.test.utils import override_settings @@ -17,7 +18,12 @@ from organizations.tests.factories import OrganizationFactory from user_tasks.models import UserTaskArtifact, UserTaskStatus -from cms.djangoapps.contentstore.tasks import export_olx, update_special_exams_and_publish, rerun_course +from cms.djangoapps.contentstore.tasks import ( + export_olx, + update_special_exams_and_publish, + rerun_course, + _convert_to_standard_url +) from cms.djangoapps.contentstore.tests.test_libraries import LibraryTestCase from cms.djangoapps.contentstore.tests.utils import CourseTestCase from common.djangoapps.course_action_state.models import CourseRerunState @@ -202,6 +208,8 @@ def test_register_exams_failure(self, _mock_register_exams_proctoring, _mock_reg class CourseOptimizerTestCase(TestCase): + + def test_user_does_not_exist_raises_exception(self): raise NotImplementedError @@ -220,8 +228,11 @@ def test_http_and_https_recognized_as_studio_url_schemes(self): def test_file_not_recognized_as_studio_url_scheme(self): raise NotImplementedError - def test_url_substitution_on_static_prefixes(self): - raise NotImplementedError + @pytest.mark.parametrize("url, course_key, post_substitution_url", + ["/static/anything_goes_here?raw", "1", "2"]) + def test_url_substitution_on_static_prefixes(self, url, course_key, post_substitution_url): + with_substitution = _convert_to_standard_url(url, course_key) + assert with_substitution == post_substitution_url def test_url_substitution_on_forward_slash_prefixes(self): raise NotImplementedError From de1aa1d40460fe81792410e53f600bd4a5931e6f Mon Sep 17 00:00:00 2001 From: Bernard Szabo Date: Sun, 12 Jan 2025 12:01:15 -0500 Subject: [PATCH 7/7] feat: TNL-11812 msg for assert --- cms/djangoapps/contentstore/tests/test_tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cms/djangoapps/contentstore/tests/test_tasks.py b/cms/djangoapps/contentstore/tests/test_tasks.py index 84071083dbc8..bd0e93711e67 100644 --- a/cms/djangoapps/contentstore/tests/test_tasks.py +++ b/cms/djangoapps/contentstore/tests/test_tasks.py @@ -232,7 +232,7 @@ def test_file_not_recognized_as_studio_url_scheme(self): ["/static/anything_goes_here?raw", "1", "2"]) def test_url_substitution_on_static_prefixes(self, url, course_key, post_substitution_url): with_substitution = _convert_to_standard_url(url, course_key) - assert with_substitution == post_substitution_url + assert with_substitution == post_substitution_url, f'{with_substitution} expected to be {post_substitution_url}' def test_url_substitution_on_forward_slash_prefixes(self): raise NotImplementedError