def generate_broken_links_descriptor(json_content, request_user):
    """
    Returns a Data Transfer Object for frontend given a list of broken links.

    Arguments:
        json_content (list): entries of the form [block_id, link, is_locked].
            is_locked is true if the link is a studio link and returns 403 on
            request. Entries that carry only [block_id, link] are treated as
            not locked.
        request_user: the user on whose behalf each xblock is loaded.

    Returns:
        dict | None: the DTO described below, or None when json_content
        yields no tree (for example when it is empty).

    ** Example DTO structure **
    {
        'sections': [
            {
                'id': 'section_id',
                'displayName': 'section name',
                'subsections': [
                    {
                        'id': 'subsection_id',
                        'displayName': 'subsection name',
                        'units': [
                            {
                                'id': 'unit_id',
                                'displayName': 'unit name',
                                'blocks': [
                                    {
                                        'id': 'block_id',
                                        'displayName': 'block name',
                                        'url': 'url/to/block',
                                        'brokenLinks': [],
                                        'lockedLinks': [],
                                    },
                                    ...,
                                ]
                            },
                            ...,
                        ]
                    },
                    ...,
                ]
            },
            ...,
        ]
    }
    """
    xblock_node_tree = {}  # tree representation of xblock relationships
    xblock_dictionary = {}  # dictionary of xblock attributes
    blocks_by_id = {}  # a block holding several links is only loaded once

    for block_id, link, *rest in json_content:
        is_locked_flag = bool(rest[0]) if rest else False

        if block_id not in blocks_by_id:
            usage_key = usage_key_with_run(block_id)
            blocks_by_id[block_id] = get_xblock(usage_key, request_user)

        _update_node_tree_and_dictionary(
            block=blocks_by_id[block_id],
            link=link,
            is_locked=is_locked_flag,
            node_tree=xblock_node_tree,
            dictionary=xblock_dictionary
        )

    return _create_dto_from_node_tree_recursive(xblock_node_tree, xblock_dictionary)
is_locked, node_tree, dictionary): + """ + Inserts a block into the node tree and add its attributes to the dictionary. + + ** Example node tree structure ** + { + 'section_id1': { + 'subsection_id1': { + 'unit_id1': { + 'block_id1': {}, + 'block_id2': {}, + ..., + }, + 'unit_id2': { + 'block_id3': {}, + ..., + }, + ..., + }, + ..., + }, + ..., + } + + ** Example dictionary structure ** + { + 'xblock_id: { + 'display_name': 'xblock name' + 'category': 'html' + }, + ..., + } + """ + path = _get_node_path(block) + current_node = node_tree + xblock_id = '' + + # Traverse the path and build the tree structure + for xblock in path: + xblock_id = xblock.location.block_id + dictionary.setdefault(xblock_id, + { + 'display_name': xblock.display_name, + 'category': getattr(xblock, 'category', ''), + } + ) + # Sets new current node and creates the node if it doesn't exist + current_node = current_node.setdefault(xblock_id, {}) + + # Add block-level details for the last xblock in the path (URL and broken/locked links) + dictionary[xblock_id].setdefault('url', + f'/course/{block.course_id}/editor/{block.category}/{block.location}' + ) + if is_locked: + dictionary[xblock_id].setdefault('locked_links', []).append(link) + else: + dictionary[xblock_id].setdefault('broken_links', []).append(link) + + +def _get_node_path(block): + """ + Retrieves the path frmo the course root node to a specific block, excluding the root. + + ** Example Path structure ** + [chapter_node, sequential_node, vertical_node, html_node] + """ + path = [] + current_node = block + + while current_node.get_parent(): + path.append(current_node) + current_node = current_node.get_parent() + + return list(reversed(path)) + + +CATEGORY_TO_LEVEL_MAP = { + "chapter": "sections", + "sequential": "subsections", + "vertical": "units" +} + + +def _create_dto_from_node_tree_recursive(xblock_node, xblock_dictionary): + """ + Recursively build the Data Transfer Object from the node tree and dictionary. 
+ """ + # Exit condition when there are no more child nodes (at block level) + if not xblock_node: + return None + + level = None + xblock_children = [] + + for xblock_id, node in xblock_node.items(): + child_blocks = _create_dto_from_node_tree_recursive(node, xblock_dictionary) + xblock_data = xblock_dictionary.get(xblock_id, {}) + + xblock_entry = { + 'id': xblock_id, + 'displayName': xblock_data.get('display_name', ''), + } + if child_blocks == None: # Leaf node + level = 'blocks' + xblock_entry.update({ + 'url': xblock_data.get('url', ''), + 'brokenLinks': xblock_data.get('broken_links', []), + 'lockedLinks': xblock_data.get('locked_links', []), + }) + else: # Non-leaf node + category = xblock_data.get('category', None) + level = CATEGORY_TO_LEVEL_MAP.get(category, None) + xblock_entry.update(child_blocks) + + xblock_children.append(xblock_entry) + + return {level: xblock_children} if level else None diff --git a/cms/djangoapps/contentstore/rest_api/v0/serializers/__init__.py b/cms/djangoapps/contentstore/rest_api/v0/serializers/__init__.py index 33931a4a199a..171f746be438 100644 --- a/cms/djangoapps/contentstore/rest_api/v0/serializers/__init__.py +++ b/cms/djangoapps/contentstore/rest_api/v0/serializers/__init__.py @@ -4,6 +4,7 @@ from .advanced_settings import AdvancedSettingsFieldSerializer, CourseAdvancedSettingsSerializer from .assets import AssetSerializer from .authoring_grading import CourseGradingModelSerializer +from .course_optimizer import LinkCheckSerializer from .tabs import CourseTabSerializer, CourseTabUpdateSerializer, TabIDLocatorSerializer from .transcripts import TranscriptSerializer, YoutubeTranscriptCheckSerializer, YoutubeTranscriptUploadSerializer from .xblock import XblockSerializer diff --git a/cms/djangoapps/contentstore/rest_api/v0/serializers/course_optimizer.py b/cms/djangoapps/contentstore/rest_api/v0/serializers/course_optimizer.py new file mode 100644 index 000000000000..f86947270660 --- /dev/null +++ 
class _LinkCheckNodeSerializer(serializers.Serializer):
    """ Fields common to every node of the broken links tree: its id and display name """
    id = serializers.CharField(required=True, allow_null=False, allow_blank=False)
    displayName = serializers.CharField(required=True, allow_null=False, allow_blank=True)


class LinkCheckBlockSerializer(_LinkCheckNodeSerializer):
    """ Serializer for broken links block model data """
    url = serializers.CharField(required=True, allow_null=False, allow_blank=False)
    brokenLinks = serializers.ListField(required=False)
    lockedLinks = serializers.ListField(required=False)


class LinkCheckUnitSerializer(_LinkCheckNodeSerializer):
    """ Serializer for broken links unit model data """
    blocks = LinkCheckBlockSerializer(many=True)


class LinkCheckSubsectionSerializer(_LinkCheckNodeSerializer):
    """ Serializer for broken links subsection model data """
    units = LinkCheckUnitSerializer(many=True)


class LinkCheckSectionSerializer(_LinkCheckNodeSerializer):
    """ Serializer for broken links section model data """
    subsections = LinkCheckSubsectionSerializer(many=True)


class LinkCheckOutputSerializer(serializers.Serializer):
    """ Serializer for broken links output model data """
    sections = LinkCheckSectionSerializer(many=True)


class LinkCheckSerializer(serializers.Serializer):
    """ Serializer for the link check response: task status plus optional timestamp, output and error """
    LinkCheckStatus = serializers.CharField(required=True)
    LinkCheckCreatedAt = serializers.DateTimeField(required=False)
    LinkCheckOutput = LinkCheckOutputSerializer(required=False)
    LinkCheckError = serializers.CharField(required=False)
class TestCourseOptimizer(AuthorizeStaffTestCase, ModuleStoreTestCase, TestCase):
    '''
    Tests for CourseOptimizer
    '''
    # Course id used when a caller does not supply one.
    DEFAULT_COURSE_ID = 'course-v1:someOrg+someCourse+someRun'

    def test_inherited(self):
        # This method ensures that pytest recognizes this class as containing tests
        pass

    def make_request(self, course_id=None, data=None):
        """Issue a GET against the link check status endpoint for course_id."""
        return self.client.get(self.get_url(course_id), data)

    def get_url(self, course_key):
        """
        Return the link check status URL for course_key.

        The course key that is passed in is honoured, so a request built for a
        given course actually targets that course. DEFAULT_COURSE_ID is used
        only when no key is given.
        """
        course_id = str(course_key) if course_key else self.DEFAULT_COURSE_ID
        return reverse(
            'cms.djangoapps.contentstore:v0:link_check_status',
            kwargs={'course_id': course_id}
        )
@view_auth_classes(is_authenticated=True)
class LinkCheckView(DeveloperErrorViewMixin, APIView):
    """
    View for queueing a celery task to scan a course for broken links.
    """
    @apidocs.schema(
        parameters=[
            apidocs.string_parameter("course_id", apidocs.ParameterLocation.PATH, description="Course ID"),
        ],
        responses={
            200: "Celery task queued.",
            401: "The requester is not authenticated.",
            403: "The requester cannot access the specified course.",
            404: "The requested course does not exist.",
        },
    )
    @verify_course_exists()
    def post(self, request: Request, course_id: str):
        """
        Queue celery task to scan a course for broken links.

        The request body is not read; the task is queued with the requesting
        user's id, the course id from the URL and the request's language code.

        **Example Request**
            POST /api/contentstore/v0/link_check/{course_id}

        **Example Response**
        ```json
        {
            "LinkCheckStatus": "Pending"
        }
        ```
        """
        course_key = CourseKey.from_string(course_id)

        # Queueing a scan requires the same author access as reading its
        # results in LinkCheckStatusView; read-only access is not enough.
        if not has_course_author_access(request.user, course_key):
            self.permission_denied(request)

        check_broken_links.delay(request.user.id, course_id, request.LANGUAGE_CODE)
        return JsonResponse({'LinkCheckStatus': UserTaskStatus.PENDING})
@view_auth_classes()
class LinkCheckStatusView(DeveloperErrorViewMixin, APIView):
    """
    View for checking the status of the celery task and returning the results.
    """
    @apidocs.schema(
        parameters=[
            apidocs.string_parameter("course_id", apidocs.ParameterLocation.PATH, description="Course ID"),
        ],
        responses={
            200: "OK",
            401: "The requester is not authenticated.",
            403: "The requester cannot access the specified course.",
            404: "The requested course does not exist.",
        },
    )
    def get(self, request: Request, course_id: str):
        """
        GET handler to return the status of the link_check task from UserTaskStatus.

        If no task status is found for the course, the status stored in the
        session under 'link_check_status' is used, falling back to 'Uninitiated'.
        If the task succeeded, the broken links output is returned as well.
        If the task failed or was canceled, the recorded error (if any) is returned.

        For reference, the following statuses are in UserTaskStatus:
            'Pending', 'In Progress', 'Succeeded', 'Failed', 'Canceled', 'Retrying'
        This function adds a status for when no UserTaskStatus exists:
            'Uninitiated'

        **Example Request**
            GET /api/contentstore/v0/link_check_status/{course_id}

        **Example Response**
        ```json
        {
            "LinkCheckStatus": "Succeeded",
            "LinkCheckCreatedAt": "<timestamp>",
            "LinkCheckOutput": {
                "sections": [
                    {
                        "id": "<section id>",
                        "displayName": "<section name>",
                        "subsections": [
                            {
                                "id": "<subsection id>",
                                "displayName": "<subsection name>",
                                "units": [
                                    {
                                        "id": "<unit id>",
                                        "displayName": "<unit name>",
                                        "blocks": [
                                            {
                                                "id": "<block id>",
                                                "displayName": "<block name>",
                                                "url": "<url to block>",
                                                "brokenLinks": ["<url>", "..."],
                                                "lockedLinks": ["<url>", "..."]
                                            }
                                        ]
                                    }
                                ]
                            }
                        ]
                    }
                ]
            }
        }
        ```
        """
        course_key = CourseKey.from_string(course_id)
        if not has_course_author_access(request.user, course_key):
            self.permission_denied(request)

        task_status = _latest_task_status(request, course_id)
        # Named link_check_status so the imported rest_framework `status` module is not shadowed.
        link_check_status = None
        created_at = None
        broken_links_dto = None
        error = None
        if task_status is None:
            # The task hasn't been initialized yet; did we store info in the session already?
            try:
                session_status = request.session['link_check_status']
                link_check_status = session_status[course_id]
            except KeyError:
                link_check_status = 'Uninitiated'
        else:
            link_check_status = task_status.state
            created_at = task_status.created
            if task_status.state == UserTaskStatus.SUCCEEDED:
                artifact = UserTaskArtifact.objects.get(status=task_status, name='BrokenLinks')
                with artifact.file as file:
                    content = file.read()
                    json_content = json.loads(content)
                    broken_links_dto = generate_broken_links_descriptor(json_content, request.user)
            elif task_status.state in (UserTaskStatus.FAILED, UserTaskStatus.CANCELED):
                errors = UserTaskArtifact.objects.filter(status=task_status, name='Error')
                if errors:
                    error = errors[0].text
                    try:
                        decoded_error = json.loads(error)
                    except ValueError:
                        # Wasn't JSON, just use the value as a string
                        pass
                    else:
                        # LinkCheckError is validated as a character field, so a decoded
                        # object or list would be rejected; keep the raw text in that case.
                        if isinstance(decoded_error, str):
                            error = decoded_error

        data = {
            'LinkCheckStatus': link_check_status,
            **({'LinkCheckCreatedAt': created_at} if created_at else {}),
            **({'LinkCheckOutput': broken_links_dto} if broken_links_dto else {}),
            **({'LinkCheckError': error} if error else {})
        }

        serializer = LinkCheckSerializer(data=data)
        serializer.is_valid(raise_exception=True)

        return Response(serializer.data)


def _latest_task_status(request, course_key_string, view_func=None):
    """
    Get the most recent link check status update for the specified course
    key.

    The queryset of UserTaskStatus rows named after the course is narrowed by
    every filter class in STATUS_FILTERS before the newest row is picked.
    Returns None when no row remains.
    """
    args = {'course_key_string': course_key_string}
    name = CourseLinkCheckTask.generate_name(args)
    task_status = UserTaskStatus.objects.filter(name=name)
    for status_filter in STATUS_FILTERS:
        task_status = status_filter().filter_queryset(request, task_status, view_func)
    return task_status.order_by('-created').first()
class CourseLinkCheckTask(UserTask):  # pylint: disable=abstract-method
    """
    Base class for course link check tasks.
    """

    @staticmethod
    def calculate_total_steps(arguments_dict):
        """
        Get the number of in-progress steps in the link check process, as shown in the UI.

        The process has a single in-progress step: Scanning.
        """
        in_progress_steps = ('Scanning',)
        return len(in_progress_steps)

    @classmethod
    def generate_name(cls, arguments_dict):
        """
        Create a name for this particular task instance.

        Arguments:
            arguments_dict (dict): The arguments given to the task function

        Returns:
            str: The generated name
        """
        key = arguments_dict['course_key_string']
        task_name = f'Broken link check of {key}'
        return task_name
+ + Arguments: + arguments_dict (dict): The arguments given to the task function + + Returns: + str: The generated name + """ + key = arguments_dict['course_key_string'] + return f'Broken link check of {key}' + +# -------------- Course optimizer functions ------------------ + +def _validate_user(task, user_id, language): + """Validate if the user exists. Otherwise log error. """ + try: + return User.objects.get(pk=user_id) + except User.DoesNotExist as exc: + with translation_language(language): + task.status.fail(UserErrors.UNKNOWN_USER_ID.format(user_id)) + return + +def _get_urls(content): + """ + Returns all urls found after href and src in content. + Excludes urls that are only '#'. + """ + regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']' + url_list = re.findall(regex, content) + return url_list + +def _is_studio_url(url): + """Returns True if url is a studio url.""" + return _is_studio_url_with_base(url) or _is_studio_url_without_base(url) + +def _is_studio_url_with_base(url): + """Returns True if url is a studio url with cms base.""" + return url.startswith('http://' + settings.CMS_BASE) or url.startswith('https://' + settings.CMS_BASE) + +def _is_studio_url_without_base(url): + """Returns True if url is a studio url without cms base.""" + return not url.startswith('http://') and not url.startswith('https://') + +def _convert_to_standard_url(url, course_key): + """ + Returns standard urls when given studio urls. Otherwise return url as is. 
+ Example urls: + /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png + /static/getting-started_x250.png + /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7 + """ + if _is_studio_url_without_base(url): + if url.startswith('/static/'): + processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1] + return 'https://' + settings.CMS_BASE + processed_url + elif url.startswith('/'): + return 'https://' + settings.CMS_BASE + url + else: + return 'https://' + settings.CMS_BASE + '/container/' + url + else: + return url + +def _scan_course_for_links(course_key): + """ + Returns a list of all urls in a course. + Returns: [ [block_id1, url1], [block_id2, url2], ... ] + """ + verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, + revision=ModuleStoreEnum.RevisionOption.published_only) + blocks = [] + urls_to_validate = [] + + for vertical in verticals: + blocks.extend(vertical.get_children()) + + for block in blocks: + block_id = str(block.usage_key) + block_info = get_block_info(block) + block_data = block_info['data'] + + url_list = _get_urls(block_data) + urls_to_validate += [[block_id, url] for url in url_list] + + return urls_to_validate + +async def _validate_url_access(session, url_data, course_key): + """ + Returns the status of a url request + Returns: {block_id1, url1, status} + """ + block_id, url = url_data + result = {'block_id': block_id, 'url': url} + standardized_url = _convert_to_standard_url(url, course_key) + try: + async with session.get(standardized_url, timeout=5) as response: + result.update({'status': response.status}) + except Exception as e: + result.update({'status': None}) + LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}') + return result + +async def _validate_urls_access_in_batches(url_list, course_key, batch_size=100): + """ + Returns the statuses of 
a list of url requests. + Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ] + """ + responses = [] + url_count = len(url_list) + + for i in range(0, url_count, batch_size): + batch = url_list[i:i + batch_size] + async with aiohttp.ClientSession() as session: + tasks = [_validate_url_access(session, url_data, course_key) for url_data in batch] + batch_results = await asyncio.gather(*tasks) + responses.extend(batch_results) + LOGGER.debug(f'[Link Check] request batch {i // batch_size + 1} of {url_count // batch_size + 1}') + + return responses + +def _retry_validation(url_list, course_key, retry_count=3): + """Retry urls that failed due to connection error.""" + results = [] + retry_list = url_list + for i in range(0, retry_count): + if retry_list: + LOGGER.debug(f'[Link Check] retry attempt #{i + 1}') + validated_url_list = asyncio.run( + _validate_urls_access_in_batches(retry_list, course_key, batch_size=100) + ) + filetered_url_list, retry_list = _filter_by_status(validated_url_list) + results.extend(filetered_url_list) + + results.extend(retry_list) + + return results + +def _filter_by_status(results): + """ + Filter results by status. + 200: OK. No need to do more + 403: Forbidden. Record as locked link. + None: Error. Retry up to 3 times. + Other: Failure. Record as broken link. + Returns: + filtered_results: [ [block_id1, url1, is_locked], ... ] + retry_list: [ [block_id1, url1], ... 
@shared_task(base=CourseLinkCheckTask, bind=True)
def check_broken_links(self, user_id, course_key_string, language):
    """
    Checks for broken links in a course. Store the results in a file.

    Arguments:
        user_id: primary key of the user who requested the scan
        course_key_string (str): key of the course to scan
        language: language used to render the unknown-user failure message

    The broken and locked links are saved as JSON in a 'BrokenLinks'
    UserTaskArtifact attached to this task's status.
    """
    if _validate_user(self, user_id, language) is None:
        # The task has already been marked as failed; there is nothing left to do
        return

    self.status.set_state('Scanning')
    course_key = CourseKey.from_string(course_key_string)
    url_list = _scan_course_for_links(course_key)
    validated_url_list = asyncio.run(_validate_urls_access_in_batches(url_list, course_key, batch_size=100))
    broken_or_locked_urls, retry_list = _filter_by_status(validated_url_list)

    if retry_list:
        retry_results = _retry_validation(retry_list, course_key, retry_count=3)
        broken_or_locked_urls.extend(retry_results)

    try:
        self.status.increment_completed_steps()

        file_name = str(course_key)
        # The context manager closes (and removes) the temporary file once the artifact is stored
        with NamedTemporaryFile(prefix=file_name + '.', suffix='.json') as broken_links_file:
            LOGGER.debug(f'[Link Check] json file being generated at {broken_links_file.name}')

            # Write through the handle that is handed to the artifact, then rewind it for reading
            broken_links_file.write(json.dumps(broken_or_locked_urls, indent=4).encode('utf-8'))
            broken_links_file.flush()
            broken_links_file.seek(0)

            artifact = UserTaskArtifact(status=self.status, name='BrokenLinks')
            artifact.file.save(name=os.path.basename(broken_links_file.name), content=File(broken_links_file))
            artifact.save()

    # catch all exceptions so we can record useful error messages
    except Exception as e:  # pylint: disable=broad-except
        LOGGER.exception('Error checking links for course %s', course_key)
        if self.status.state != UserTaskStatus.FAILED:
            self.status.fail({'raw_error_msg': str(e)})
        return
return diff --git a/cms/djangoapps/contentstore/tests/test_tasks.py b/cms/djangoapps/contentstore/tests/test_tasks.py index cf82a6d16571..bd0e93711e67 100644 --- a/cms/djangoapps/contentstore/tests/test_tasks.py +++ b/cms/djangoapps/contentstore/tests/test_tasks.py @@ -5,9 +5,10 @@ import copy import json -from unittest import mock +from unittest import mock, TestCase from uuid import uuid4 +import pytest as pytest from django.conf import settings from django.contrib.auth.models import User # lint-amnesty, pylint: disable=imported-auth-user from django.test.utils import override_settings @@ -17,7 +18,12 @@ from organizations.tests.factories import OrganizationFactory from user_tasks.models import UserTaskArtifact, UserTaskStatus -from cms.djangoapps.contentstore.tasks import export_olx, update_special_exams_and_publish, rerun_course +from cms.djangoapps.contentstore.tasks import ( + export_olx, + update_special_exams_and_publish, + rerun_course, + _convert_to_standard_url +) from cms.djangoapps.contentstore.tests.test_libraries import LibraryTestCase from cms.djangoapps.contentstore.tests.utils import CourseTestCase from common.djangoapps.course_action_state.models import CourseRerunState @@ -199,3 +205,64 @@ def test_register_exams_failure(self, _mock_register_exams_proctoring, _mock_reg _mock_register_exams_proctoring.side_effect = Exception('boom!') update_special_exams_and_publish(str(self.course.id)) course_publish.assert_called() + + +class CourseOptimizerTestCase(TestCase): + + + def test_user_does_not_exist_raises_exception(self): + raise NotImplementedError + + def test_no_course_access_raises_exception(self): + raise NotImplementedError + + def test_hash_tags_stripped_from_url_lists(self): + raise NotImplementedError + + def test_urls_out_count_equals_urls_in_count_when_no_hashtags(self): + raise NotImplementedError + + def test_http_and_https_recognized_as_studio_url_schemes(self): + raise NotImplementedError + + def 
test_file_not_recognized_as_studio_url_scheme(self): + raise NotImplementedError + + @pytest.mark.parametrize("url, course_key, post_substitution_url", + ["/static/anything_goes_here?raw", "1", "2"]) + def test_url_substitution_on_static_prefixes(self, url, course_key, post_substitution_url): + with_substitution = _convert_to_standard_url(url, course_key) + assert with_substitution == post_substitution_url, f'{with_substitution} expected to be {post_substitution_url}' + + def test_url_substitution_on_forward_slash_prefixes(self): + raise NotImplementedError + + def test_url_subsitution_on_containers(self): + raise NotImplementedError + + def test_optimization_occurs_on_published_version(self): + raise NotImplementedError + + def test_number_of_scanned_blocks_equals_blocks_in_course(self): + raise NotImplementedError + + def test_every_detected_link_is_validated(self): + raise NotImplementedError + + def test_link_validation_is_batched(self): + raise NotImplementedError + + def test_all_links_in_link_list_longer_than_batch_size_are_validated(self): + raise NotImplementedError + + def test_no_retries_on_403_access_denied_links(self): + raise NotImplementedError + + def test_retries_attempted_on_connection_errors(self): + raise NotImplementedError + + def test_max_number_of_retries_is_respected(self): + raise NotImplementedError + + def test_scan_generates_file_named_by_course_key(self): + raise NotImplementedErro